Commit e8b5d11b authored by Amy Zhuang, committed by Robert Kimball

Create mkldnn primitives at first iteration for codegen - part1. (#2806)

* Create mkldnn primitives at first iteration for CODEGEN.

 Ops: add, lstm, and rnn.

* Ops: batchnorm.

* Ops: concat and lrn.

Remove dead code.

* Skip in-place concat, relu, reshape, and slice when building the node_primitive_string_deps_index map.

* Change NGRAPH_ASSERT to NGRAPH_CHECK.

* Ops: Qconv.

* Ops: convolutions.

* Address PR Feedback.

* Dynamic scale support for qconvs.

* Update to Amy's recent change.

* GroupConv; clean up dead code.

* Address PR Feedback.

* Remove unused variable.

* Fix a bug.

* Fix style error.
parent d77ace68
@@ -62,7 +62,7 @@ namespace ngraph
auto lstm_desc =
mkldnn_emitter->get_rnn_forward_desc<ngraph::op::Lstm>(node, args, out);
// Lstm needs 9 primitives: src_layer, src_iter, weights_layer, weights_iter, bias,
// dst_layer, dst_iter, and rnn_forward.
// dst_layer, dst_iter, workspace, and rnn_forward.
// It needs a new workspace.
auto lstm_index =
mkldnn_emitter->reserve_primitive_space(9, true /* new workspace */);
......
@@ -57,7 +57,7 @@ namespace ngraph
auto rnn_desc =
mkldnn_emitter->get_rnn_forward_desc<ngraph::op::Rnn>(node, args, out);
// Rnn needs 9 primitives: src_layer, src_iter, weights_layer, weights_iter, bias,
// dst_layer, dst_iter, and rnn_forward.
// dst_layer, dst_iter, workspace, and rnn_forward.
// It needs a new workspace.
auto rnn_index =
mkldnn_emitter->reserve_primitive_space(9, true /* new workspace */);
......
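For reference, the nine primitives named in the corrected comments above can be pictured as a fixed slot layout plus the forward primitive itself. A minimal sketch, assuming a slot ordering for readability only (the enum name and ordering are illustrative, not the emitter's actual identifiers):

#include <cstddef>

// Illustrative only: one possible layout of the nine slots reserved by
// reserve_primitive_space(9, true /* new workspace */) for Lstm and Rnn.
enum class RnnPrimitiveSlot : std::size_t
{
    src_layer = 0,
    src_iter,
    weights_layer,
    weights_iter,
    bias,
    dst_layer,
    dst_iter,
    workspace,   // the extra buffer that motivates new_workspace = true
    rnn_forward  // the rnn_forward primitive itself
};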
@@ -155,6 +155,7 @@
#include "ngraph/runtime/cpu/cpu_tensor_view.hpp"
#include "ngraph/runtime/cpu/cpu_tracing.hpp"
#include "ngraph/runtime/cpu/cpu_visualize_tree.hpp"
#include "ngraph/runtime/cpu/mkldnn_emitter.hpp"
#include "ngraph/runtime/cpu/mkldnn_utils.hpp"
#include "ngraph/runtime/cpu/op/batch_mat_mul_transpose.hpp"
#include "ngraph/runtime/cpu/op/batch_norm_relu.hpp"
@@ -473,7 +474,10 @@ void runtime::cpu::CPU_ExternalFunction::compile(ngraph::pass::PassConfig& pass_
// Build mkldnn primitives for codegen.
pass_manager.register_pass<runtime::cpu::pass::MKLDNNPrimitiveBuildPass>(
*m_mkldnn_emitter, m_node_primitive_idx_map);
m_desc_filename,
*m_mkldnn_emitter,
m_node_primitive_idx_map,
m_node_primitive_string_deps_index_map);
unordered_map<Node*, Node*> node_function_map;
string common_function_string;
@@ -510,13 +514,17 @@ void runtime::cpu::CPU_ExternalFunction::compile(ngraph::pass::PassConfig& pass_
writer +=
R"(
#include <cmath>
#include <fstream>
#include <mkldnn.hpp>
#include "ngraph/distributed.hpp"
#include "ngraph/except.hpp"
#include "ngraph/runtime/aligned_buffer.hpp"
#include "ngraph/runtime/cpu/cpu_eigen_utils.hpp"
#include "ngraph/runtime/cpu/cpu_executor.hpp"
#include "ngraph/runtime/cpu/cpu_kernels.hpp"
#include "ngraph/runtime/cpu/cpu_runtime_context.hpp"
#include "ngraph/runtime/cpu/mkldnn_invoke.hpp"
#include "ngraph/runtime/cpu/mkldnn_utils.hpp"
#include "ngraph/runtime/reference/all.hpp"
#include "ngraph/runtime/reference/and.hpp"
#include "ngraph/runtime/reference/any.hpp"
@@ -668,6 +676,14 @@ using namespace ngraph::runtime;
writer << common_function_string << "\n";
//initiate mkldnn_primitives for CPURuntimeContextCG
writer << "void inline CPURuntimeContextCG::init_mkldnn_primitives()\n";
writer.block_begin();
writer << "mkldnn_primitives = std::vector<mkldnn::primitive*>("
<< to_string(m_mkldnn_emitter->get_mkldnn_primitives_cg().size()) << ");\n";
writer.block_end();
writer << "\n";
for (shared_ptr<Function> current_function : pass_manager.get_state().get_functions())
{
auto ordered_ops = function_ordered_ops.at(current_function);
@@ -722,6 +738,16 @@ using namespace ngraph::runtime;
writer << "extern \"C\" void " << current_function->get_name() << func_params << "\n";
writer << "{\n";
writer.indent++;
writer << "std::ifstream desc_file (\"" << m_desc_filename << "\", std::ios::binary);\n";
//deserialize and build mkldnn primitives
writer << "if (ctx->first_iteration)\n";
writer.block_begin();
writer << "// read in memory descriptors and build mkldnn primitives\n";
writer << "deserialize_memory_descs_and_build_memory_primitives(" << m_desc_filename
<< ", cg_ctx, " << to_string(m_mkldnn_emitter->get_mkldnn_descriptors_size())
<< ");\n";
writer.block_end();
// Execution tracing support
if (runtime::cpu::IsTracingEnabled() && current_function->get_name() == m_function_name)
......
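Taken together, the writer statements above expand into generated source of roughly the following shape; the numeric counts are placeholders, and the generated function's parameter list (func_params) is not shown in this hunk, so ctx and cg_ctx are assumed to come from it:

// Sketch of the emitted text; numeric values are placeholders.
void inline CPURuntimeContextCG::init_mkldnn_primitives()
{
    mkldnn_primitives = std::vector<mkldnn::primitive*>(16);
}

// Per-function prologue emitted into each generated entry point:
std::ifstream desc_file ("desc_file", std::ios::binary);
if (ctx->first_iteration)
{
    // read in memory descriptors and build mkldnn primitives
    deserialize_memory_descs_and_build_memory_primitives(desc_file, cg_ctx, 16);
}

Note that because m_desc_filename defaults to "desc_file" (see mkldnn_emitter.hpp below), the unquoted filename written into the deserialize call names the std::ifstream declared on the line above, which is what that function's std::ifstream& parameter expects.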
@@ -125,6 +125,18 @@ namespace ngraph
return it->second;
}
// Return the tuple including the string to create mkldnn primitive, the deps and the index in CODEGEN
const std::tuple<std::string, std::vector<size_t>, size_t>&
get_primitive_build_tuple(const Node* node) const
{
auto it = m_node_primitive_string_deps_index_map.find(node);
NGRAPH_CHECK(it != m_node_primitive_string_deps_index_map.end(),
"Primitive build tuple not found for node ",
node->description());
return it->second;
}
size_t add_state(ngraph::State* state)
{
m_states.push_back(state);
@@ -318,6 +330,11 @@ namespace ngraph
/// Map each node with mkldnn implementation to its mkldnn primitive index.
std::unordered_map<const Node*, size_t> m_node_primitive_idx_map;
/// Map each node with mkldnn implementation to its mkldnn primitive creating string, deps, and mkldnn primitive index.
std::map<const Node*, std::tuple<std::string, std::vector<size_t>, size_t>>
m_node_primitive_string_deps_index_map;
/// Name of the file to store descriptors for mkldnn_primitives
const std::string m_desc_filename = "desc_file";
};
}
}
......
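A hedged sketch of how codegen might consume the new accessor; the helper name, the writer type (a plain std::ostream stands in for the codegen writer), and the call site are assumptions, while the tuple layout comes from the declaration above:

// Illustrative consumer of get_primitive_build_tuple. Assumes the usual
// project includes for MKLDNNEmitter and Node.
#include <cstddef>
#include <ostream>
#include <tuple>
#include <vector>

void emit_primitive_build_code(std::ostream& writer,
                               const ngraph::runtime::cpu::MKLDNNEmitter& mkldnn_emitter,
                               const ngraph::Node* node)
{
    const auto& build_tuple = mkldnn_emitter.get_primitive_build_tuple(node);
    writer << std::get<0>(build_tuple);                          // code string that creates the primitive
    const std::vector<size_t>& deps = std::get<1>(build_tuple);  // dependent memory-primitive indices
    size_t primitive_index = std::get<2>(build_tuple);           // slot in cg_ctx->mkldnn_primitives
    // deps and primitive_index would then drive the emitted set_memory_ptr /
    // mkldnn_invoke_primitive calls for this node.
    (void)deps;
    (void)primitive_index;
}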
@@ -18,6 +18,7 @@
#include "ngraph/pass/pass.hpp"
#include <fstream>
#include <functional>
#include <typeindex>
#include <unordered_map>
@@ -26,6 +27,15 @@
build_primitive<op_name>(ngraph::runtime::cpu::MKLDNNEmitter & mkldnn_emitter, \
ngraph::Node * node)
#define CONSTRUCT_PRIMITIVE_BUILD_STRING_DECL(op_name) \
construct_primitive_build_string<op_name>(ngraph::runtime::cpu::MKLDNNEmitter & \
mkldnn_emitter, \
ngraph::Node * node, \
std::string & construct_string, \
std::vector<size_t> & deps, \
size_t & index, \
std::ofstream & desc_file)
namespace mkldnn
{
class primitive;
@@ -48,23 +58,46 @@ namespace ngraph
using PrimitiveBuildOpMap =
std::unordered_map<std::type_index, PrimitiveBuildFunction>;
using PrimitiveBuildStringConstructFunction =
std::function<void(ngraph::runtime::cpu::MKLDNNEmitter&,
ngraph::Node*,
std::string&,
std::vector<size_t>&,
size_t&,
std::ofstream&)>;
using PrimitiveBuildStringConstructOpMap =
std::unordered_map<std::type_index, PrimitiveBuildStringConstructFunction>;
/// This pass traverses the call graph and creates MKLDNN primitives for those ops
/// that have been assigned to MKLDNN.
class MKLDNNPrimitiveBuildPass : public ngraph::pass::CallGraphPass
{
private:
std::string m_desc_filename;
ngraph::runtime::cpu::MKLDNNEmitter& m_mkldnn_emitter;
/// External map to store each node with mkldnn implementation and its mkldnn
/// associated primitive index.
std::unordered_map<const Node*, size_t>& m_node_primitive_idx_map;
/// External map to store each node with mkldnn implementation and its mkldnn
/// creation string, deps, and mkldnn primitive index.
std::map<const Node*, std::tuple<std::string, std::vector<size_t>, size_t>>&
m_node_primitive_string_deps_index_map;
public:
MKLDNNPrimitiveBuildPass(
std::string filename,
ngraph::runtime::cpu::MKLDNNEmitter& mkldnn_emitter,
std::unordered_map<const Node*, size_t>& node_primitive_idx_map)
: m_mkldnn_emitter(mkldnn_emitter)
std::unordered_map<const Node*, size_t>& node_primitive_idx_map,
std::map<const Node*, std::tuple<std::string, std::vector<size_t>, size_t>>&
node_primitive_string_deps_index_map)
: m_desc_filename(filename)
, m_mkldnn_emitter(mkldnn_emitter)
, m_node_primitive_idx_map(node_primitive_idx_map)
, m_node_primitive_string_deps_index_map(
node_primitive_string_deps_index_map)
{
}
@@ -78,6 +111,19 @@ namespace ngraph
throw std::runtime_error("Unimplemented op '" + node->description() +
"' in MKLDNNPrimitiveBuildPass");
}
template <typename OP>
static void construct_primitive_build_string(
ngraph::runtime::cpu::MKLDNNEmitter& mkldnn_emitter,
ngraph::Node* node,
std::string& construct_string,
std::vector<size_t>& deps,
size_t& index,
std::ofstream& desc_file)
{
throw std::runtime_error("Unimplemented op '" + node->description() +
"' in MKLDNNPrimitiveBuildPass");
}
};
}
}
......
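As a sketch of how an individual op could hook into the new string-construction path, a specialization might be declared with the macro above and dispatched through PrimitiveBuildStringConstructOpMap. The LRN example and the dispatcher variable below are illustrative, not the pass's actual registration code (assumes this header and ngraph/op/lrn.hpp are included):

namespace ngraph { namespace runtime { namespace cpu { namespace pass {

// Hypothetical specialization declaration via the new macro; the definition
// would live in the pass's .cpp file.
template <>
void MKLDNNPrimitiveBuildPass::CONSTRUCT_PRIMITIVE_BUILD_STRING_DECL(ngraph::op::LRN);

// Hypothetical dispatch table keyed by type_index, mirroring PrimitiveBuildOpMap.
static const PrimitiveBuildStringConstructOpMap prim_string_construct_dispatcher{
    {std::type_index(typeid(ngraph::op::LRN)),
     &MKLDNNPrimitiveBuildPass::construct_primitive_build_string<ngraph::op::LRN>}};

}}}}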
@@ -26,8 +26,35 @@ struct CPURuntimeContextCG
std::unique_ptr<tbb::flow::graph> tbb_graph;
std::unique_ptr<tbb::global_control> tbb_gcontrol;
CPURuntimeContextCG() { init_tbb(); }
~CPURuntimeContextCG() { cleanup_tbb(); }
CPURuntimeContextCG() { init_tbb(); init_mkldnn_primitives();}
~CPURuntimeContextCG() { cleanup_tbb(); cleanup_mkldnn_primitives();}
std::vector<mkldnn::primitive*> mkldnn_primitives;
std::vector<char*> mkldnn_workspaces;
std::vector<mkldnn::memory::desc*> mkldnn_descriptors;
mkldnn::engine global_cpu_engine = mkldnn::engine(mkldnn::engine::cpu, 0);
void set_memory_ptr(size_t primitive_index,
void* ptr)
{
auto primitive = static_cast<mkldnn::memory*>(mkldnn_primitives[primitive_index]);
primitive->set_data_handle(ptr);
}
void mkldnn_invoke_primitive(size_t primitive_index)
{
mkldnn::stream s(mkldnn::stream::kind::eager);
try
{
s.submit({*mkldnn_primitives[primitive_index]}).wait();
}
catch (const mkldnn::error& e)
{
throw std::runtime_error("Could not run mkldnn primitive " + e.message);
}
}
private:
inline void init_tbb()
@@ -59,6 +86,35 @@ private:
}
}
}
void init_mkldnn_primitives();
inline void cleanup_mkldnn_primitives()
{
for (auto p : mkldnn_primitives)
{
delete p;
}
#ifndef _WIN32
//To avoid memory leak in mkldnn, release any buffers that are not free'd yet.
//https://software.intel.com/en-us/mkl-linux-developer-guide-avoiding-memory-leaks-in-intel-mkl
//mkl_free_buffers() is not exposed at this point, hence using mkl_serv_free_buffers()
ngraph::runtime::cpu::mkldnn_utils::mkl_serv_free_buffers();
#endif
for (auto w : mkldnn_workspaces)
{
free(w);
}
}
inline void cleanup_mkldnn_descriptors()
{
for (auto d : mkldnn_descriptors)
{
free(d);
}
}
};
extern "C" CPURuntimeContextCG* init_cg_ctx()
@@ -70,4 +126,25 @@ extern "C" void destroy_cg_ctx(CPURuntimeContextCG* cg_ctx)
{
delete cg_ctx;
}
static void
deserialize_memory_descs_and_build_memory_primitives(std::ifstream& desc_file,
CPURuntimeContextCG* cg_ctx,
size_t descs_count)
{
cg_ctx->mkldnn_descriptors = std::vector<mkldnn::memory::desc*>(descs_count);
for (auto i = 0; i < descs_count; i++)
{
size_t primitive_index;
desc_file >> primitive_index;
auto desc = (mkldnn::memory::desc*)malloc(sizeof(mkldnn::memory::desc));
if (!desc)
{
throw std::bad_alloc();
}
desc_file.read(reinterpret_cast<char*>(desc), sizeof(mkldnn::memory::desc));
cg_ctx->mkldnn_descriptors[i] = desc;
cg_ctx->mkldnn_primitives[primitive_index] = new mkldnn::memory({*cg_ctx->mkldnn_descriptors[i], cg_ctx->global_cpu_engine}, nullptr);
}
};
)"