Commit 5df0e17e authored by Jaikrishnan Menon

Merge branch 'master' into dex2

parents c829a9c7 f75b8006
...@@ -68,16 +68,16 @@ public: ...@@ -68,16 +68,16 @@ public:
std::string generate_temporary_name(std::string prefix = "tempvar"); std::string generate_temporary_name(std::string prefix = "tempvar");
void block_begin(std::string block_prefix = "") void block_begin()
{ {
*this << "{" << block_prefix << "\n"; *this << "{\n";
indent++; indent++;
} }
void block_end(std::string block_suffix = "") void block_end()
{ {
indent--; indent--;
*this << "}" << block_suffix << "\n"; *this << "}\n";
} }
private: private:
......
...@@ -265,7 +265,6 @@ void codegen::StaticCompiler::add_header_search_path(const string& p) ...@@ -265,7 +265,6 @@ void codegen::StaticCompiler::add_header_search_path(const string& p)
vector<string> paths = split(p, ';'); vector<string> paths = split(p, ';');
for (const string& path : paths) for (const string& path : paths)
{ {
NGRAPH_INFO << path;
if (!contains(m_extra_search_path_list, path)) if (!contains(m_extra_search_path_list, path))
{ {
m_extra_search_path_list.push_back(path); m_extra_search_path_list.push_back(path);
......
...@@ -344,9 +344,10 @@ void runtime::cpu::CPU_ExternalFunction::compile() ...@@ -344,9 +344,10 @@ void runtime::cpu::CPU_ExternalFunction::compile()
pass_manager.register_pass<ngraph::pass::NopElimination>(); pass_manager.register_pass<ngraph::pass::NopElimination>();
pass_manager.register_pass<runtime::cpu::pass::LSTMFusion>(); pass_manager.register_pass<runtime::cpu::pass::LSTMFusion>();
pass_manager.register_pass<runtime::cpu::pass::RNNFusion>(); pass_manager.register_pass<runtime::cpu::pass::RNNFusion>();
pass_manager.register_pass<runtime::cpu::pass::CPUBatchFusion>();
pass_manager.register_pass<runtime::cpu::pass::ConcatInputs>();
pass_manager.register_pass<ngraph::pass::AlgebraicSimplification>(); pass_manager.register_pass<ngraph::pass::AlgebraicSimplification>();
pass_manager.register_pass<runtime::cpu::pass::MultiLayerRNNFusion>();
pass_manager.register_pass<runtime::cpu::pass::ConcatInputs>();
pass_manager.register_pass<runtime::cpu::pass::CPUBatchFusion>();
pass_manager.register_pass<ngraph::pass::CommonSubexpressionElimination>(); pass_manager.register_pass<ngraph::pass::CommonSubexpressionElimination>();
pass_manager.register_pass<ngraph::pass::CoreFusion>(); pass_manager.register_pass<ngraph::pass::CoreFusion>();
pass_manager.register_pass<runtime::cpu::pass::CPUFusion>(); pass_manager.register_pass<runtime::cpu::pass::CPUFusion>();
......
...@@ -29,6 +29,7 @@ namespace ngraph ...@@ -29,6 +29,7 @@ namespace ngraph
{ {
class LSTMFusion; class LSTMFusion;
class RNNFusion; class RNNFusion;
class MultiLayerRNNFusion;
} }
} }
} }
...@@ -61,3 +62,16 @@ public: ...@@ -61,3 +62,16 @@ public:
private: private:
void construct_rnn_lstm_fprop(); void construct_rnn_lstm_fprop();
}; };
// Graph-rewrite pass for the CPU backend, derived from
// ngraph::pass::RecurrentGraphRewrite.
// NOTE(review): judging by the name, this presumably fuses RNN ops across
// stacked layers; the matching logic lives in the out-of-line definition of
// construct_multi_layer_rnn_fusion_fprop(), which is not visible here -- confirm.
class ngraph::runtime::cpu::pass::MultiLayerRNNFusion : public ngraph::pass::RecurrentGraphRewrite
{
public:
// Constructs the base rewrite pass with no arguments, then calls
// construct_multi_layer_rnn_fusion_fprop() so that set-up happens at
// instantiation time.
MultiLayerRNNFusion()
: RecurrentGraphRewrite()
{
construct_multi_layer_rnn_fusion_fprop();
}
private:
// Set-up routine invoked once from the constructor; defined elsewhere.
void construct_multi_layer_rnn_fusion_fprop();
};
...@@ -268,8 +268,8 @@ size_t runtime::gpu::CUDAEmitter::build_pad_dynamic(const runtime::gpu::GPURunti ...@@ -268,8 +268,8 @@ size_t runtime::gpu::CUDAEmitter::build_pad_dynamic(const runtime::gpu::GPURunti
compiled_kernel = ctx->compiled_kernel_pool->set(kernel_name.str(), writer.get_code()); compiled_kernel = ctx->compiled_kernel_pool->set(kernel_name.str(), writer.get_code());
} }
unsigned int rank = static_cast<unsigned int>(input_shape.size()); uint32_t rank = static_cast<uint32_t>(input_shape.size());
unsigned int nthreads = static_cast<unsigned int>(shape_size(input_shape)); uint32_t nthreads = static_cast<uint32_t>(shape_size(input_shape));
GPUShape pad_below(input_shape.size(), 0); GPUShape pad_below(input_shape.size(), 0);
GPUShape pad_interior(input_shape.size(), 1); GPUShape pad_interior(input_shape.size(), 1);
...@@ -286,14 +286,14 @@ size_t runtime::gpu::CUDAEmitter::build_pad_dynamic(const runtime::gpu::GPURunti ...@@ -286,14 +286,14 @@ size_t runtime::gpu::CUDAEmitter::build_pad_dynamic(const runtime::gpu::GPURunti
// get an allocator for transient per kernel gpu memory // get an allocator for transient per kernel gpu memory
GPUAllocator allocator = this->m_primitive_emitter->get_memory_allocator(); GPUAllocator allocator = this->m_primitive_emitter->get_memory_allocator();
size_t idx_input_strides = allocator.reserve_argspace( size_t idx_input_strides =
input_strides.data(), input_strides.size() * sizeof(unsigned int)); allocator.reserve_argspace(input_strides.data(), input_strides.size() * sizeof(uint32_t));
size_t idx_output_strides = allocator.reserve_argspace( size_t idx_output_strides =
output_strides.data(), output_strides.size() * sizeof(unsigned int)); allocator.reserve_argspace(output_strides.data(), output_strides.size() * sizeof(uint32_t));
size_t idx_padding_below = size_t idx_padding_below =
allocator.reserve_argspace(pad_below.data(), pad_below.size() * sizeof(unsigned int)); allocator.reserve_argspace(pad_below.data(), pad_below.size() * sizeof(uint32_t));
size_t idx_padding_interior = size_t idx_padding_interior =
allocator.reserve_argspace(pad_interior.data(), pad_interior.size() * sizeof(unsigned int)); allocator.reserve_argspace(pad_interior.data(), pad_interior.size() * sizeof(uint32_t));
// create the launch primitive // create the launch primitive
std::unique_ptr<gpu::primitive> pad_dynamic(new gpu::primitive{[=](void** inputs, std::unique_ptr<gpu::primitive> pad_dynamic(new gpu::primitive{[=](void** inputs,
...@@ -1015,7 +1015,7 @@ size_t runtime::gpu::CUDAEmitter::build_reduce_window(const GPURuntimeContext* c ...@@ -1015,7 +1015,7 @@ size_t runtime::gpu::CUDAEmitter::build_reduce_window(const GPURuntimeContext* c
args_list[6] = &nthreads; args_list[6] = &nthreads;
CUDA_SAFE_CALL(cuLaunchKernel(*compiled_kernel.get(), CUDA_SAFE_CALL(cuLaunchKernel(*compiled_kernel.get(),
static_cast<unsigned int>(nthreads), static_cast<uint32_t>(nthreads),
1, 1,
1, // grid dim 1, // grid dim
1, 1,
......
...@@ -285,19 +285,19 @@ void runtime::gpu::CudaKernelBuilder::get_pad_dynamic_op( ...@@ -285,19 +285,19 @@ void runtime::gpu::CudaKernelBuilder::get_pad_dynamic_op(
const std::array<std::string, 2>& data_types) const std::array<std::string, 2>& data_types)
{ {
writer << "extern \"C\" __global__ void cuda_" << name << "(" << data_types[0] << "* in, " writer << "extern \"C\" __global__ void cuda_" << name << "(" << data_types[0] << "* in, "
<< data_types[1] << "* out, unsigned int* input_strides, unsigned int* output_strides, " << data_types[1] << "* out, uint32_t* input_strides, uint32_t* output_strides, "
"unsigned int* padding_below, unsigned int* " "uint32_t* padding_below, uint32_t* "
"padding_interior, unsigned int rank, unsigned int n)\n"; "padding_interior, uint32_t rank, uint32_t n)\n";
writer.block_begin(); writer.block_begin();
{ {
writer << "unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;\n"; writer << "uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n";
writer << "if (tid < n)\n"; writer << "if (tid < n)\n";
writer.block_begin(); writer.block_begin();
{ {
writer << "unsigned int output_idx = 0;\n"; writer << "uint32_t output_idx = 0;\n";
writer << "unsigned int input_idx = tid;\n"; writer << "uint32_t input_idx = tid;\n";
writer << "for(unsigned int i = 0; i < rank; i++)\n"; writer << "for(uint32_t i = 0; i < rank; i++)\n";
writer.block_begin(); writer.block_begin();
{ {
writer << "output_idx += (input_idx / input_strides[i] * padding_interior[i] + " writer << "output_idx += (input_idx / input_strides[i] * padding_interior[i] + "
......
...@@ -47,7 +47,7 @@ void runtime::gpu::emit_onehot(const std::string& name, ...@@ -47,7 +47,7 @@ void runtime::gpu::emit_onehot(const std::string& name,
void* args_list[] = {&in, &out, &repeat_size, &repeat_times, &count}; void* args_list[] = {&in, &out, &repeat_size, &repeat_times, &count};
CUDA_SAFE_CALL(cuLaunchKernel(*compiled_kernel.get(), CUDA_SAFE_CALL(cuLaunchKernel(*compiled_kernel.get(),
static_cast<unsigned int>(count), static_cast<uint32_t>(count),
1, 1,
1, // grid dim 1, // grid dim
1, 1,
...@@ -84,7 +84,7 @@ void runtime::gpu::emit_reshape(const std::string& name, ...@@ -84,7 +84,7 @@ void runtime::gpu::emit_reshape(const std::string& name,
void* args_list[] = {&in, &out, &input_strides, &trans_strides, &rank, &count}; void* args_list[] = {&in, &out, &input_strides, &trans_strides, &rank, &count};
CUDA_SAFE_CALL(cuLaunchKernel(*compiled_kernel.get(), CUDA_SAFE_CALL(cuLaunchKernel(*compiled_kernel.get(),
static_cast<unsigned int>(count), static_cast<uint32_t>(count),
1, 1,
1, // grid dim 1, // grid dim
1, 1,
...@@ -124,7 +124,7 @@ void runtime::gpu::emit_slice(const std::string& name, ...@@ -124,7 +124,7 @@ void runtime::gpu::emit_slice(const std::string& name,
void* args_list[] = { void* args_list[] = {
&in, &out, &input_strides, &lower_bounds, &slice_strides, &output_strides, &rank, &count}; &in, &out, &input_strides, &lower_bounds, &slice_strides, &output_strides, &rank, &count};
CUDA_SAFE_CALL(cuLaunchKernel(*compiled_kernel.get(), CUDA_SAFE_CALL(cuLaunchKernel(*compiled_kernel.get(),
static_cast<unsigned int>(count), static_cast<uint32_t>(count),
1, 1,
1, // grid dim 1, // grid dim
1, 1,
...@@ -161,7 +161,7 @@ void runtime::gpu::emit_reverse(const std::string& name, ...@@ -161,7 +161,7 @@ void runtime::gpu::emit_reverse(const std::string& name,
void* args_list[] = {&in, &out, &input_shapes, &reverse_axes, &rank, &count}; void* args_list[] = {&in, &out, &input_shapes, &reverse_axes, &rank, &count};
CUDA_SAFE_CALL(cuLaunchKernel(*compiled_kernel.get(), CUDA_SAFE_CALL(cuLaunchKernel(*compiled_kernel.get(),
static_cast<unsigned int>(count), static_cast<uint32_t>(count),
1, 1,
1, // grid dim 1, // grid dim
1, 1,
......
This diff is collapsed.
...@@ -77,7 +77,7 @@ namespace ngraph ...@@ -77,7 +77,7 @@ namespace ngraph
auto& cuda_emitter = auto& cuda_emitter =
external_function->get_primitive_emitter()->get_cuda_emitter(); external_function->get_primitive_emitter()->get_cuda_emitter();
writer.block_begin(" // " + node->get_name()); writer.block_begin();
{ {
std::vector<std::string> dtypes; std::vector<std::string> dtypes;
for (auto& arg : args) for (auto& arg : args)
......
...@@ -83,6 +83,13 @@ namespace ngraph ...@@ -83,6 +83,13 @@ namespace ngraph
const Node&, const Node&,
const std::unordered_map<descriptor::TensorView*, std::vector<size_t>>&); const std::unordered_map<descriptor::TensorView*, std::vector<size_t>>&);
void release_function() { m_function = nullptr; } void release_function() { m_function = nullptr; }
std::string emit_op_as_function(const Node& node, const std::string& function_name);
std::string strip_comments(const std::string& s) const;
bool is_functionally_identical(
const Node& n1,
const Node& n2,
const std::unordered_map<const Node*, std::string>& node_cache) const;
std::unique_ptr<codegen::Compiler> m_compiler; std::unique_ptr<codegen::Compiler> m_compiler;
std::unique_ptr<codegen::ExecutionEngine> m_execution_engine; std::unique_ptr<codegen::ExecutionEngine> m_execution_engine;
bool m_emit_timing; bool m_emit_timing;
......
...@@ -21,7 +21,6 @@ divide_by_zero_float32 ...@@ -21,7 +21,6 @@ divide_by_zero_float32
divide_by_zero_int32 divide_by_zero_int32
dot_4d_5d_multi_axis_big_fp64_VERY_SLOW dot_4d_5d_multi_axis_big_fp64_VERY_SLOW
dot_matrix_vector_int64 dot_matrix_vector_int64
function_call
mkldnn_layouts mkldnn_layouts
numeric_double_nan numeric_double_nan
numeric_float_inf numeric_float_inf
......
...@@ -35,6 +35,7 @@ ...@@ -35,6 +35,7 @@
#include "ngraph/op/relu.hpp" #include "ngraph/op/relu.hpp"
#include "ngraph/op/sum.hpp" #include "ngraph/op/sum.hpp"
#include "ngraph/op/tanh.hpp" #include "ngraph/op/tanh.hpp"
#include "ngraph/pass/algebraic_simplification.hpp"
#include "ngraph/pass/graph_rewrite.hpp" #include "ngraph/pass/graph_rewrite.hpp"
#include "ngraph/pass/manager.hpp" #include "ngraph/pass/manager.hpp"
#include "ngraph/pass/reshape_elimination.hpp" #include "ngraph/pass/reshape_elimination.hpp"
...@@ -2197,3 +2198,45 @@ TEST(cpu_fusion, fuse_batch_dot_forward) ...@@ -2197,3 +2198,45 @@ TEST(cpu_fusion, fuse_batch_dot_forward)
EXPECT_TRUE(test::all_close(cpu_results.at(i), int_results.at(i), 1.0e-4f, 1.0e-4f)); EXPECT_TRUE(test::all_close(cpu_results.at(i), int_results.at(i), 1.0e-4f, 1.0e-4f));
} }
} }
// Runs LSTMFusion, RNNFusion, AlgebraicSimplification and MultiLayerRNNFusion
// over the serialized "mxnet/2rnn_layer_1timestep.json" model and expects
// exactly one op::Rnn node to remain in the function afterwards.
TEST(cpu_fusion, fuse_rnn_across_layer)
{
    // Pass pipeline under test.
    pass::Manager fusion_passes;
    fusion_passes.register_pass<runtime::cpu::pass::LSTMFusion>();
    fusion_passes.register_pass<runtime::cpu::pass::RNNFusion>();
    fusion_passes.register_pass<ngraph::pass::AlgebraicSimplification>();
    fusion_passes.register_pass<runtime::cpu::pass::MultiLayerRNNFusion>();

    // Load the serialized model from the model zoo and deserialize it.
    const string model_path =
        file_util::path_join(SERIALIZED_ZOO, "mxnet/2rnn_layer_1timestep.json");
    stringstream model_stream(file_util::read_file_to_string(model_path));
    shared_ptr<Function> func = ngraph::deserialize(model_stream);

    fusion_passes.run_passes(func);

    // After the passes have run, a single Rnn op is expected.
    const size_t ref_rnn_count = 1;
    const auto rnn_count = count_ops_of_type<op::Rnn>(func);
    EXPECT_EQ(ref_rnn_count, rnn_count);
}
// Builds the "mxnet/2rnn_layer_1timestep.json" model twice, feeds both copies
// the same uniformly random inputs, executes one with "INTERPRETER" and the
// other with "CPU", and checks that:
//   * the CPU-compiled function contains exactly one op::Rnn, and
//   * every output of the CPU run is all_close to the matching INTERPRETER output.
TEST(cpu_fusion, fuse_rnn_across_2layer_1timestep)
{
    const std::string file_name("mxnet/2rnn_layer_1timestep.json");
    auto cpu_f = make_function(file_name);
    auto int_f = make_function(file_name);

    // One random tensor per parameter, sized from the parameter's shape.
    test::Uniform<float> rng(0.0f, 1.0f);
    vector<vector<float>> args;
    for (const shared_ptr<op::Parameter>& param : int_f->get_parameters())
    {
        vector<float> tensor_val(shape_size(param->get_shape()));
        rng.initialize(tensor_val);
        args.push_back(tensor_val);
    }

    auto int_results = execute(int_f, args, "INTERPRETER");
    auto cpu_results = execute(cpu_f, args, "CPU");

    EXPECT_EQ(1, count_ops_of_type<op::Rnn>(cpu_f));

    // Stop here on a size mismatch so the indexed comparison below cannot
    // run past the end of the shorter result list.
    ASSERT_EQ(cpu_results.size(), int_results.size());
    for (size_t i = 0; i < cpu_results.size(); i++)
    {
        // Compare the i-th output of each run. The previous code indexed with a
        // literal 1, which re-checked the same pair on every iteration and threw
        // std::out_of_range when there was only a single output.
        EXPECT_TRUE(test::all_close(cpu_results.at(i), int_results.at(i), 1.0e-4f, 1.0e-4f));
    }
}
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment