clang format

:

clang format
:
b5414ba5 · fenglei.tian · 6204a154 · b5414ba5
Commit b5414ba5 authored Mar 08, 2018 by fenglei.tian
Hide whitespace changes
Inline Side-by-side

Showing with 605 additions and 583 deletions

gpu_external_function.cpp src/ngraph/runtime/gpu/gpu_external_function.cpp +605 -583

No files found.
--- a/src/ngraph/runtime/gpu/gpu_external_function.cpp
+++ b/src/ngraph/runtime/gpu/gpu_external_function.cpp
@@ -165,118 +165,113 @@ namespace ngraph
    {
        namespace gpu
        {
-static const OpMap dispatcher{
-    {TI(ngraph::op::Add), &GPU_Emitter::emit<ngraph::op::Add>},
-    {TI(ngraph::op::Dot), &GPU_Emitter::emit<ngraph::op::Dot>},
-    {TI(ngraph::op::Multiply), &GPU_Emitter::emit<ngraph::op::Multiply>},
-    {TI(ngraph::op::Parameter), &GPU_Emitter::nop},
-    {TI(ngraph::op::Abs), &GPU_Emitter::EmitUnaryElementwise},
-    {TI(ngraph::op::Concat), &GPU_Emitter::emit<ngraph::op::Concat>},
-    {TI(ngraph::op::Divide), &GPU_Emitter::emit<ngraph::op::Divide>},
-    {TI(ngraph::op::Equal), &GPU_Emitter::emit<ngraph::op::Equal>},
-    {TI(ngraph::op::GetOutputElement),
-     &GPU_Emitter::emit<ngraph::op::GetOutputElement>},
-    {TI(ngraph::op::Greater), &GPU_Emitter::emit<ngraph::op::Greater>},
-    {TI(ngraph::op::GreaterEq), &GPU_Emitter::emit<ngraph::op::GreaterEq>},
-    {TI(ngraph::op::Less), &GPU_Emitter::emit<ngraph::op::Less>},
-    {TI(ngraph::op::LessEq), &GPU_Emitter::emit<ngraph::op::LessEq>},
-    {TI(ngraph::op::Log), &GPU_Emitter::EmitUnaryElementwise},
-    {TI(ngraph::op::Maximum), &GPU_Emitter::emit<ngraph::op::Maximum>},
-    {TI(ngraph::op::Minimum), &GPU_Emitter::emit<ngraph::op::Minimum>},
-    {TI(ngraph::op::Negative), &GPU_Emitter::emit<ngraph::op::Negative>},
-    {TI(ngraph::op::NotEqual), &GPU_Emitter::emit<ngraph::op::NotEqual>},
-    {TI(ngraph::op::Power), &GPU_Emitter::emit<ngraph::op::Power>},
-    {TI(ngraph::op::Select), &GPU_Emitter::emit<ngraph::op::Select>},
-    {TI(ngraph::op::Subtract), &GPU_Emitter::emit<ngraph::op::Subtract>},
-    {TI(ngraph::op::Broadcast), &GPU_Emitter::emit<ngraph::op::Broadcast>},
-    {TI(ngraph::op::Convert), &GPU_Emitter::emit<ngraph::op::Convert>},
-    {TI(ngraph::op::Constant), &GPU_Emitter::emit<ngraph::op::Constant>},
-    {TI(ngraph::op::Reshape), &GPU_Emitter::emit<ngraph::op::Reshape>},
-    {TI(ngraph::op::FunctionCall),
-     &GPU_Emitter::emit<ngraph::op::FunctionCall>},
-    {TI(ngraph::op::Reduce), &GPU_Emitter::emit<ngraph::op::Reduce>},
-    {TI(ngraph::op::Sign), &GPU_Emitter::EmitUnaryElementwise},
-    {TI(ngraph::op::Slice), &GPU_Emitter::emit<ngraph::op::Slice>},
-    {TI(ngraph::op::Sum), &GPU_Emitter::emit<ngraph::op::Sum>},
-    {TI(ngraph::op::Exp), &GPU_Emitter::EmitUnaryElementwise},
-    {TI(ngraph::op::Sin), &GPU_Emitter::EmitUnaryElementwise},
-    {TI(ngraph::op::Sinh), &GPU_Emitter::EmitUnaryElementwise},
-    {TI(ngraph::op::Cos), &GPU_Emitter::EmitUnaryElementwise},
-    {TI(ngraph::op::Cosh), &GPU_Emitter::EmitUnaryElementwise},
-    {TI(ngraph::op::Tan), &GPU_Emitter::EmitUnaryElementwise},
-    {TI(ngraph::op::Tanh), &GPU_Emitter::EmitUnaryElementwise},
-    {TI(ngraph::op::Asin), &GPU_Emitter::EmitUnaryElementwise},
-    {TI(ngraph::op::Acos), &GPU_Emitter::EmitUnaryElementwise},
-    {TI(ngraph::op::Atan), &GPU_Emitter::EmitUnaryElementwise},
-    {TI(ngraph::op::ReplaceSlice),
-     &GPU_Emitter::emit<ngraph::op::ReplaceSlice>},
-    {TI(ngraph::op::OneHot), &GPU_Emitter::emit<ngraph::op::OneHot>},
-    {TI(ngraph::op::Floor), &GPU_Emitter::EmitUnaryElementwise},
-    {TI(ngraph::op::Ceiling), &GPU_Emitter::EmitUnaryElementwise},
-    {TI(ngraph::op::Sqrt), &GPU_Emitter::emit<ngraph::op::Sqrt>},
-    {TI(ngraph::op::Convolution),
-     &GPU_Emitter::emit<ngraph::op::Convolution>},
-    {TI(ngraph::op::ConvolutionBackpropFilters),
-     &GPU_Emitter::emit<ngraph::op::ConvolutionBackpropFilters>},
-    {TI(ngraph::op::ConvolutionBackpropData),
-     &GPU_Emitter::emit<ngraph::op::ConvolutionBackpropData>},
-    {TI(ngraph::op::Not), &GPU_Emitter::EmitUnaryElementwise},
-    {TI(ngraph::op::MaxPool), &GPU_Emitter::emit<ngraph::op::MaxPool>},
-    {TI(ngraph::op::Reverse), &GPU_Emitter::emit<ngraph::op::Reverse>},
-    {TI(ngraph::op::Result), &GPU_Emitter::emit<ngraph::op::Result>},
-    {TI(ngraph::op::ReduceWindow),
-     &GPU_Emitter::emit<ngraph::op::ReduceWindow>},
-    {TI(ngraph::op::SelectAndScatter),
-     &GPU_Emitter::emit<ngraph::op::SelectAndScatter>},
-    {TI(ngraph::op::AvgPool), &GPU_Emitter::emit<ngraph::op::AvgPool>},
-    {TI(ngraph::op::AvgPoolBackprop),
-     &GPU_Emitter::emit<ngraph::op::AvgPoolBackprop>},
-    {TI(ngraph::op::Pad), &GPU_Emitter::emit<ngraph::op::Pad>},
-    {TI(ngraph::op::BatchNorm), &GPU_Emitter::emit<ngraph::op::BatchNorm>},
-    {TI(ngraph::op::BatchNormBackprop),
-     &GPU_Emitter::emit<ngraph::op::BatchNormBackprop>},
-    {TI(ngraph::op::MaxPoolBackprop),
-     &GPU_Emitter::emit<ngraph::op::MaxPoolBackprop>},
-    {TI(ngraph::op::Product), &GPU_Emitter::emit<ngraph::op::Product>},
-    {TI(ngraph::op::Max), &GPU_Emitter::emit<ngraph::op::Max>},
-    {TI(ngraph::op::Min), &GPU_Emitter::emit<ngraph::op::Min>},
-    {TI(ngraph::op::Relu), &GPU_Emitter::emit<ngraph::op::Relu>},
-    {TI(ngraph::op::ReluBackprop),
-     &GPU_Emitter::emit<ngraph::op::ReluBackprop>},
-    {TI(ngraph::op::Softmax), &GPU_Emitter::emit<ngraph::op::Softmax>},
-};
-
-GPU_ExternalFunction::GPU_ExternalFunction(
-    const shared_ptr<ngraph::Function>& function, bool release_function)
-    : ngraph::runtime::ExternalFunction(function, release_function)
-    , m_compiled_function(nullptr)
-    , m_emit_timing(std::getenv("NGRAPH_GPU_EMIT_TIMING") != nullptr)
-{
-}
+            static const OpMap dispatcher{
+                {TI(ngraph::op::Add), &GPU_Emitter::emit<ngraph::op::Add>},
+                {TI(ngraph::op::Dot), &GPU_Emitter::emit<ngraph::op::Dot>},
+                {TI(ngraph::op::Multiply), &GPU_Emitter::emit<ngraph::op::Multiply>},
+                {TI(ngraph::op::Parameter), &GPU_Emitter::nop},
+                {TI(ngraph::op::Abs), &GPU_Emitter::EmitUnaryElementwise},
+                {TI(ngraph::op::Concat), &GPU_Emitter::emit<ngraph::op::Concat>},
+                {TI(ngraph::op::Divide), &GPU_Emitter::emit<ngraph::op::Divide>},
+                {TI(ngraph::op::Equal), &GPU_Emitter::emit<ngraph::op::Equal>},
+                {TI(ngraph::op::GetOutputElement),
+                 &GPU_Emitter::emit<ngraph::op::GetOutputElement>},
+                {TI(ngraph::op::Greater), &GPU_Emitter::emit<ngraph::op::Greater>},
+                {TI(ngraph::op::GreaterEq), &GPU_Emitter::emit<ngraph::op::GreaterEq>},
+                {TI(ngraph::op::Less), &GPU_Emitter::emit<ngraph::op::Less>},
+                {TI(ngraph::op::LessEq), &GPU_Emitter::emit<ngraph::op::LessEq>},
+                {TI(ngraph::op::Log), &GPU_Emitter::EmitUnaryElementwise},
+                {TI(ngraph::op::Maximum), &GPU_Emitter::emit<ngraph::op::Maximum>},
+                {TI(ngraph::op::Minimum), &GPU_Emitter::emit<ngraph::op::Minimum>},
+                {TI(ngraph::op::Negative), &GPU_Emitter::emit<ngraph::op::Negative>},
+                {TI(ngraph::op::NotEqual), &GPU_Emitter::emit<ngraph::op::NotEqual>},
+                {TI(ngraph::op::Power), &GPU_Emitter::emit<ngraph::op::Power>},
+                {TI(ngraph::op::Select), &GPU_Emitter::emit<ngraph::op::Select>},
+                {TI(ngraph::op::Subtract), &GPU_Emitter::emit<ngraph::op::Subtract>},
+                {TI(ngraph::op::Broadcast), &GPU_Emitter::emit<ngraph::op::Broadcast>},
+                {TI(ngraph::op::Convert), &GPU_Emitter::emit<ngraph::op::Convert>},
+                {TI(ngraph::op::Constant), &GPU_Emitter::emit<ngraph::op::Constant>},
+                {TI(ngraph::op::Reshape), &GPU_Emitter::emit<ngraph::op::Reshape>},
+                {TI(ngraph::op::FunctionCall), &GPU_Emitter::emit<ngraph::op::FunctionCall>},
+                {TI(ngraph::op::Reduce), &GPU_Emitter::emit<ngraph::op::Reduce>},
+                {TI(ngraph::op::Sign), &GPU_Emitter::EmitUnaryElementwise},
+                {TI(ngraph::op::Slice), &GPU_Emitter::emit<ngraph::op::Slice>},
+                {TI(ngraph::op::Sum), &GPU_Emitter::emit<ngraph::op::Sum>},
+                {TI(ngraph::op::Exp), &GPU_Emitter::EmitUnaryElementwise},
+                {TI(ngraph::op::Sin), &GPU_Emitter::EmitUnaryElementwise},
+                {TI(ngraph::op::Sinh), &GPU_Emitter::EmitUnaryElementwise},
+                {TI(ngraph::op::Cos), &GPU_Emitter::EmitUnaryElementwise},
+                {TI(ngraph::op::Cosh), &GPU_Emitter::EmitUnaryElementwise},
+                {TI(ngraph::op::Tan), &GPU_Emitter::EmitUnaryElementwise},
+                {TI(ngraph::op::Tanh), &GPU_Emitter::EmitUnaryElementwise},
+                {TI(ngraph::op::Asin), &GPU_Emitter::EmitUnaryElementwise},
+                {TI(ngraph::op::Acos), &GPU_Emitter::EmitUnaryElementwise},
+                {TI(ngraph::op::Atan), &GPU_Emitter::EmitUnaryElementwise},
+                {TI(ngraph::op::ReplaceSlice), &GPU_Emitter::emit<ngraph::op::ReplaceSlice>},
+                {TI(ngraph::op::OneHot), &GPU_Emitter::emit<ngraph::op::OneHot>},
+                {TI(ngraph::op::Floor), &GPU_Emitter::EmitUnaryElementwise},
+                {TI(ngraph::op::Ceiling), &GPU_Emitter::EmitUnaryElementwise},
+                {TI(ngraph::op::Sqrt), &GPU_Emitter::emit<ngraph::op::Sqrt>},
+                {TI(ngraph::op::Convolution), &GPU_Emitter::emit<ngraph::op::Convolution>},
+                {TI(ngraph::op::ConvolutionBackpropFilters),
+                 &GPU_Emitter::emit<ngraph::op::ConvolutionBackpropFilters>},
+                {TI(ngraph::op::ConvolutionBackpropData),
+                 &GPU_Emitter::emit<ngraph::op::ConvolutionBackpropData>},
+                {TI(ngraph::op::Not), &GPU_Emitter::EmitUnaryElementwise},
+                {TI(ngraph::op::MaxPool), &GPU_Emitter::emit<ngraph::op::MaxPool>},
+                {TI(ngraph::op::Reverse), &GPU_Emitter::emit<ngraph::op::Reverse>},
+                {TI(ngraph::op::Result), &GPU_Emitter::emit<ngraph::op::Result>},
+                {TI(ngraph::op::ReduceWindow), &GPU_Emitter::emit<ngraph::op::ReduceWindow>},
+                {TI(ngraph::op::SelectAndScatter),
+                 &GPU_Emitter::emit<ngraph::op::SelectAndScatter>},
+                {TI(ngraph::op::AvgPool), &GPU_Emitter::emit<ngraph::op::AvgPool>},
+                {TI(ngraph::op::AvgPoolBackprop), &GPU_Emitter::emit<ngraph::op::AvgPoolBackprop>},
+                {TI(ngraph::op::Pad), &GPU_Emitter::emit<ngraph::op::Pad>},
+                {TI(ngraph::op::BatchNorm), &GPU_Emitter::emit<ngraph::op::BatchNorm>},
+                {TI(ngraph::op::BatchNormBackprop),
+                 &GPU_Emitter::emit<ngraph::op::BatchNormBackprop>},
+                {TI(ngraph::op::MaxPoolBackprop), &GPU_Emitter::emit<ngraph::op::MaxPoolBackprop>},
+                {TI(ngraph::op::Product), &GPU_Emitter::emit<ngraph::op::Product>},
+                {TI(ngraph::op::Max), &GPU_Emitter::emit<ngraph::op::Max>},
+                {TI(ngraph::op::Min), &GPU_Emitter::emit<ngraph::op::Min>},
+                {TI(ngraph::op::Relu), &GPU_Emitter::emit<ngraph::op::Relu>},
+                {TI(ngraph::op::ReluBackprop), &GPU_Emitter::emit<ngraph::op::ReluBackprop>},
+                {TI(ngraph::op::Softmax), &GPU_Emitter::emit<ngraph::op::Softmax>},
+            };
+
+            GPU_ExternalFunction::GPU_ExternalFunction(const shared_ptr<ngraph::Function>& function,
+                                                       bool release_function)
+                : ngraph::runtime::ExternalFunction(function, release_function)
+                , m_compiled_function(nullptr)
+                , m_emit_timing(std::getenv("NGRAPH_GPU_EMIT_TIMING") != nullptr)
+            {
+            }

-void GPU_ExternalFunction::compile()
-{
-    if (m_is_compiled)
-    {
-        return;
-    }
+            void GPU_ExternalFunction::compile()
+            {
+                if (m_is_compiled)
+                {
+                    return;
+                }

-    string function_name = m_function->get_name();
-    string dump_filename = file_util::path_join(s_output_dir, function_name + "_ops.txt");
+                string function_name = m_function->get_name();
+                string dump_filename =
+                    file_util::path_join(s_output_dir, function_name + "_ops.txt");

-    pass::Manager pass_manager;
-    // pass_manager.register_pass<pass::TopologicalSort>();
-    // For now, just make everyone row-major.
-    pass_manager.register_pass<pass::AssignLayout<descriptor::layout::DenseTensorViewLayout>>();
-    pass_manager.register_pass<pass::Liveness>();
-    pass_manager.register_pass<pass::MemoryLayout>(64);
-    pass_manager.register_pass<pass::DumpSorted>(dump_filename);
-    pass_manager.run_passes(m_function);
+                pass::Manager pass_manager;
+                // pass_manager.register_pass<pass::TopologicalSort>();
+                // For now, just make everyone row-major.
+                pass_manager
+                    .register_pass<pass::AssignLayout<descriptor::layout::DenseTensorViewLayout>>();
+                pass_manager.register_pass<pass::Liveness>();
+                pass_manager.register_pass<pass::MemoryLayout>(64);
+                pass_manager.register_pass<pass::DumpSorted>(dump_filename);
+                pass_manager.run_passes(m_function);

-    codegen::CodeWriter writer;
+                codegen::CodeWriter writer;

-    writer +=
-        R"(// Generated by the NGraph GPU backend
+                writer +=
+                    R"(// Generated by the NGraph GPU backend
    #include <cublas_v2.h>
    #include <cuda.h>
    #include <cuda_runtime.h>
@@ -302,532 +297,560 @@ void GPU_ExternalFunction::compile()
    #include "ngraph/util.hpp"
 )";

-    string pch_header_source = writer.get_code();
+                string pch_header_source = writer.get_code();

-    writer += R"(
+                writer += R"(
 using namespace ngraph;
 using namespace std;
    )";

-    if (m_emit_timing)
-    {
-        writer << "// Declare debug timers\n";
-        vector<string> names;
-        for (shared_ptr<Function> current_function : pass_manager.get_state().get_functions())
-        {
-            for (shared_ptr<Node> node : current_function->get_ordered_ops())
-            {
-                if (!node->is_parameter() && !node->is_constant())
+                if (m_emit_timing)
                {
-                    names.push_back(node->get_name());
+                    writer << "// Declare debug timers\n";
+                    vector<string> names;
+                    for (shared_ptr<Function> current_function :
+                         pass_manager.get_state().get_functions())
+                    {
+                        for (shared_ptr<Node> node : current_function->get_ordered_ops())
+                        {
+                            if (!node->is_parameter() && !node->is_constant())
+                            {
+                                names.push_back(node->get_name());
+                            }
+                        }
+                    }
+                    for (const string& s : names)
+                    {
+                        writer << "ngraph::stopwatch timer_" << s << ";\n";
+                    }
+                    writer << "extern \"C\" size_t get_debug_timer_count() { return "
+                           << names.size() << "; }\n";
+                    writer << "extern \"C\" const char* get_debug_timer_name(size_t index)\n";
+                    writer << "{\n";
+                    writer.indent++;
+                    writer << "const char* rc;\n";
+                    writer << "switch(index)\n";
+                    writer << "{\n";
+                    for (size_t i = 0; i < names.size(); i++)
+                    {
+                        writer << "case " << i << ": rc = \"" << names[i] << "\"; break;\n";
+                    }
+                    writer << "default: rc = \"\";\n";
+                    writer << "}\n";
+                    writer << "return rc;\n";
+                    writer.indent--;
+                    writer << "}\n";
+                    writer
+                        << "extern \"C\" const size_t get_debug_timer_microseconds(size_t index)\n";
+                    writer << "{\n";
+                    writer.indent++;
+                    writer << "size_t rc;\n";
+                    writer << "switch(index)\n";
+                    writer << "{\n";
+                    for (size_t i = 0; i < names.size(); i++)
+                    {
+                        writer << "case " << i << ": rc = timer_" << names[i]
+                               << ".get_total_microseconds(); break;\n";
+                    }
+                    writer << "default: rc = 0;\n";
+                    writer << "}\n";
+                    writer << "return rc;\n";
+                    writer.indent--;
+                    writer << "}\n";
+                    writer
+                        << "extern \"C\" const size_t get_debug_timer_call_count(size_t index)\n";
+                    writer << "{\n";
+                    writer.indent++;
+                    writer << "size_t rc;\n";
+                    writer << "switch(index)\n";
+                    writer << "{\n";
+                    for (size_t i = 0; i < names.size(); i++)
+                    {
+                        writer << "case " << i << ": rc = timer_" << names[i]
+                               << ".get_call_count(); break;\n";
+                    }
+                    writer << "default: rc = 0;\n";
+                    writer << "}\n";
+                    writer << "return rc;\n";
+                    writer.indent--;
+                    writer << "}\n";
+                    writer << "\n";
+                }
+                //     // The "dso_handle" symbol is required by __cxa_atexit()
+                //     // which is enabled because the JIT uses it as the default mechanism
+                //     // to register cleanup handlers. We use it, and not atexit(), because
+                //     // atexit() happens too late, when the JIT is no longer alive
+
+                writer << "void *__dso_handle = 0;\n\n";
+                writer << "// Declare all constants\n";
+                for (shared_ptr<Function> current_function :
+                     pass_manager.get_state().get_functions())
+                {
+                    for (shared_ptr<Node> node : current_function->get_ordered_ops())
+                    {
+                        const op::Constant* c = dynamic_cast<ngraph::op::Constant*>(node.get());
+                        if (c)
+                        {
+                            shared_ptr<descriptor::TensorView> tv =
+                                node->get_outputs()[0].get_tensor_view();
+                            auto c_value_strings = c->get_value_strings();
+                            writer << "static "
+                                   << tv->get_tensor().get_element_type().c_type_string() << " "
+                                   << tv->get_tensor().get_name() << "_cpu["
+                                   << c_value_strings.size() << "] =\n";
+                            writer << "{\n";
+                            writer.indent++;
+                            writer << emit_string_array(c_value_strings, 100 - writer.indent * 4);
+                            writer.indent--;
+                            writer << "\n};\n\n";
+                            writer << "static "
+                                   << tv->get_tensor().get_element_type().c_type_string() << " *"
+                                   << tv->get_tensor().get_name() << ";\n";
+                            m_variable_name_map[tv->get_tensor().get_name()] =
+                                tv->get_tensor().get_name();
+                        }
+                    }
                }
-            }
-        }
-        for (const string& s : names)
-        {
-            writer << "ngraph::stopwatch timer_" << s << ";\n";
-        }
-        writer << "extern \"C\" size_t get_debug_timer_count() { return " << names.size()
-               << "; }\n";
-        writer << "extern \"C\" const char* get_debug_timer_name(size_t index)\n";
-        writer << "{\n";
-        writer.indent++;
-        writer << "const char* rc;\n";
-        writer << "switch(index)\n";
-        writer << "{\n";
-        for (size_t i = 0; i < names.size(); i++)
-        {
-            writer << "case " << i << ": rc = \"" << names[i] << "\"; break;\n";
-        }
-        writer << "default: rc = \"\";\n";
-        writer << "}\n";
-        writer << "return rc;\n";
-        writer.indent--;
-        writer << "}\n";
-        writer << "extern \"C\" const size_t get_debug_timer_microseconds(size_t index)\n";
-        writer << "{\n";
-        writer.indent++;
-        writer << "size_t rc;\n";
-        writer << "switch(index)\n";
-        writer << "{\n";
-        for (size_t i = 0; i < names.size(); i++)
-        {
-            writer << "case " << i << ": rc = timer_" << names[i]
-                   << ".get_total_microseconds(); break;\n";
-        }
-        writer << "default: rc = 0;\n";
-        writer << "}\n";
-        writer << "return rc;\n";
-        writer.indent--;
-        writer << "}\n";
-        writer << "extern \"C\" const size_t get_debug_timer_call_count(size_t index)\n";
-        writer << "{\n";
-        writer.indent++;
-        writer << "size_t rc;\n";
-        writer << "switch(index)\n";
-        writer << "{\n";
-        for (size_t i = 0; i < names.size(); i++)
-        {
-            writer << "case " << i << ": rc = timer_" << names[i] << ".get_call_count(); break;\n";
-        }
-        writer << "default: rc = 0;\n";
-        writer << "}\n";
-        writer << "return rc;\n";
-        writer.indent--;
-        writer << "}\n";
-        writer << "\n";
-    }
-    //     // The "dso_handle" symbol is required by __cxa_atexit()
-    //     // which is enabled because the JIT uses it as the default mechanism
-    //     // to register cleanup handlers. We use it, and not atexit(), because
-    //     // atexit() happens too late, when the JIT is no longer alive
-
-    writer << "void *__dso_handle = 0;\n\n";
-    writer << "// Declare all constants\n";
-    for (shared_ptr<Function> current_function : pass_manager.get_state().get_functions())
-    {
-        for (shared_ptr<Node> node : current_function->get_ordered_ops())
-        {
-            const op::Constant* c = dynamic_cast<ngraph::op::Constant*>(node.get());
-            if (c)
-            {
-                shared_ptr<descriptor::TensorView> tv = node->get_outputs()[0].get_tensor_view();
-                auto c_value_strings = c->get_value_strings();
-                writer << "static " << tv->get_tensor().get_element_type().c_type_string() << " "
-                       << tv->get_tensor().get_name() << "_cpu[" << c_value_strings.size()
-                       << "] =\n";
-                writer << "{\n";
-                writer.indent++;
-                writer << emit_string_array(c_value_strings, 100 - writer.indent * 4);
-                writer.indent--;
-                writer << "\n};\n\n";
-                writer << "static " << tv->get_tensor().get_element_type().c_type_string() << " *"
-                       << tv->get_tensor().get_name() << ";\n";
-                m_variable_name_map[tv->get_tensor().get_name()] = tv->get_tensor().get_name();
-            }
-        }
-    }
-
-    writer << "// Declare all functions\n";
-    for (shared_ptr<Function> f : pass_manager.get_state().get_functions())
-    {
-        writer << "extern \"C\" void " << f->get_name() << "(void** inputs, void** outputs, "
-                                                           "cublasHandle_t& cublas_handle, "
-                                                           "cudnnHandle_t& cudnn_handle);\n";
-    }

-    writer << "\n";
+                writer << "// Declare all functions\n";
+                for (shared_ptr<Function> f : pass_manager.get_state().get_functions())
+                {
+                    writer << "extern \"C\" void " << f->get_name()
+                           << "(void** inputs, void** outputs, "
+                              "cublasHandle_t& cublas_handle, "
+                              "cudnnHandle_t& cudnn_handle);\n";
+                }

-    unordered_map<Node*, string> match_functions;
-    for (shared_ptr<Function> current_function : pass_manager.get_state().get_functions())
-    {
-        bool temporaries_used = false;
-        size_t worst_case_tmp_size = 0;
+                writer << "\n";

-        set<string> output_names;
-        for (shared_ptr<Node> op : current_function->get_results())
-        {
-            shared_ptr<descriptor::TensorView> tv = op->get_output_tensor_view();
-            output_names.insert(tv->get_tensor().get_name());
-        }
-        const list<shared_ptr<Node>>& tmp = current_function->get_ordered_ops();
-        if (tmp.size() < 2)
-        {
-            // Since we are comparing ops there must be at least two ops to proceed.
-            continue;
-        }
-        vector<shared_ptr<Node>> op_list{tmp.begin(), tmp.end()};
-        for (size_t i = 0; i < op_list.size() - 1; i++)
-        {
-            if (op_list[i]->is_constant() || op_list[i]->is_parameter())
-            {
-                continue;
-            }
-            if (contains_key(match_functions, op_list[i].get()))
-            {
-                continue;
-            }
-            string match_function_name;
-            for (size_t j = i + 1; j < op_list.size(); j++)
-            {
-                if (0) //op_list[i]->is_functionally_identical(*op_list[j]))
+                unordered_map<Node*, string> match_functions;
+                for (shared_ptr<Function> current_function :
+                     pass_manager.get_state().get_functions())
                {
-                    if (match_function_name.empty())
+                    bool temporaries_used = false;
+                    size_t worst_case_tmp_size = 0;
+
+                    set<string> output_names;
+                    for (shared_ptr<Node> op : current_function->get_results())
                    {
-                        match_function_name = "func_" + op_list[i]->get_name();
-                        match_functions.insert({op_list[i].get(), match_function_name});
+                        shared_ptr<descriptor::TensorView> tv = op->get_output_tensor_view();
+                        output_names.insert(tv->get_tensor().get_name());
                    }
-                    match_functions.insert({op_list[j].get(), match_function_name});
-                }
-            }
-            if (!match_function_name.empty())
-            {
-                writer << "static void " << match_function_name << "(";
-                writer.indent++;
-                // Work around a compiler warning (*node inside typeid may have effects
-                // with shared pointers, which is fine here but clang doesn't like it.)
-                auto& n = *op_list[i];
-                auto handler = dispatcher.find(type_index(typeid(n)));
-                vector<GPU_TensorViewWrapper> in;
-                size_t arg_index = 0;
-                set<string> arg_names;
-                for (const descriptor::Input& input : n.get_inputs())
-                {
-                    const descriptor::Output& output = input.get_output();
-                    shared_ptr<descriptor::TensorView> tv = output.get_tensor_view();
-                    GPU_TensorViewWrapper tvw{tv, "_arg" + to_string(arg_index)};
-                    if (!contains(arg_names, tvw.get_name()))
+                    const list<shared_ptr<Node>>& tmp = current_function->get_ordered_ops();
+                    if (tmp.size() < 2)
                    {
-                        arg_names.insert(tvw.get_name());
-                        if (arg_index++ > 0)
+                        // Since we are comparing ops there must be at least two ops to proceed.
+                        continue;
+                    }
+                    vector<shared_ptr<Node>> op_list{tmp.begin(), tmp.end()};
+                    for (size_t i = 0; i < op_list.size() - 1; i++)
+                    {
+                        if (op_list[i]->is_constant() || op_list[i]->is_parameter())
+                        {
+                            continue;
+                        }
+                        if (contains_key(match_functions, op_list[i].get()))
+                        {
+                            continue;
+                        }
+                        string match_function_name;
+                        for (size_t j = i + 1; j < op_list.size(); j++)
+                        {
+                            if (0) //op_list[i]->is_functionally_identical(*op_list[j]))
+                            {
+                                if (match_function_name.empty())
+                                {
+                                    match_function_name = "func_" + op_list[i]->get_name();
+                                    match_functions.insert({op_list[i].get(), match_function_name});
+                                }
+                                match_functions.insert({op_list[j].get(), match_function_name});
+                            }
+                        }
+                        if (!match_function_name.empty())
                        {
-                            writer << ",";
+                            writer << "static void " << match_function_name << "(";
+                            writer.indent++;
+                            // Work around a compiler warning (*node inside typeid may have effects
+                            // with shared pointers, which is fine here but clang doesn't like it.)
+                            auto& n = *op_list[i];
+                            auto handler = dispatcher.find(type_index(typeid(n)));
+                            vector<GPU_TensorViewWrapper> in;
+                            size_t arg_index = 0;
+                            set<string> arg_names;
+                            for (const descriptor::Input& input : n.get_inputs())
+                            {
+                                const descriptor::Output& output = input.get_output();
+                                shared_ptr<descriptor::TensorView> tv = output.get_tensor_view();
+                                GPU_TensorViewWrapper tvw{tv, "_arg" + to_string(arg_index)};
+                                if (!contains(arg_names, tvw.get_name()))
+                                {
+                                    arg_names.insert(tvw.get_name());
+                                    if (arg_index++ > 0)
+                                    {
+                                        writer << ",";
+                                    }
+                                    writer << "\n";
+                                    writer << tvw.get_type() << "* " << tvw.get_name();
+                                }
+                                in.push_back(tvw);
+                            }
+                            vector<GPU_TensorViewWrapper> out;
+                            for (const descriptor::Output& output : n.get_outputs())
+                            {
+                                shared_ptr<descriptor::TensorView> tv = output.get_tensor_view();
+                                GPU_TensorViewWrapper tvw{tv, "_out" + to_string(arg_index)};
+                                if (arg_index++ > 0)
+                                {
+                                    writer << ",";
+                                }
+                                writer << "\n";
+                                writer << tvw.get_type() << "* " << tvw.get_name();
+                                out.push_back(tvw);
+                            }
+                            writer.indent--;
+                            writer << "\n)\n";
+                            writer << "{\n";
+                            writer.indent++;
+                            handler->second(this, writer, &n, in, out);
+                            writer.indent--;
+                            writer << "}\n";
                        }
-                        writer << "\n";
-                        writer << tvw.get_type() << "* " << tvw.get_name();
                    }
-                    in.push_back(tvw);
                }
-                vector<GPU_TensorViewWrapper> out;
-                for (const descriptor::Output& output : n.get_outputs())
+
+                for (shared_ptr<Function> current_function :
+                     pass_manager.get_state().get_functions())
                {
-                    shared_ptr<descriptor::TensorView> tv = output.get_tensor_view();
-                    GPU_TensorViewWrapper tvw{tv, "_out" + to_string(arg_index)};
-                    if (arg_index++ > 0)
+                    set<string> output_names;
+                    for (shared_ptr<Node> op : current_function->get_results())
                    {
-                        writer << ",";
+                        shared_ptr<descriptor::TensorView> tv = op->get_output_tensor_view();
+                        output_names.insert(tv->get_tensor().get_name());
+                    }
+                    set<descriptor::TensorView*> constants;
+                    for (shared_ptr<Node> node : current_function->get_ordered_ops())
+                    {
+                        if (dynamic_cast<ngraph::op::Constant*>(node.get()))
+                        {
+                            shared_ptr<descriptor::TensorView> tv =
+                                node->get_outputs()[0].get_tensor_view();
+                            constants.insert(tv.get());
+                        }
                    }
-                    writer << "\n";
-                    writer << tvw.get_type() << "* " << tvw.get_name();
-                    out.push_back(tvw);
-                }
-                writer.indent--;
-                writer << "\n)\n";
-                writer << "{\n";
-                writer.indent++;
-                handler->second(this, writer, &n, in, out);
-                writer.indent--;
-                writer << "}\n";
-            }
-        }
-    }
-
-    for (shared_ptr<Function> current_function : pass_manager.get_state().get_functions())
-    {
-        set<string> output_names;
-        for (shared_ptr<Node> op : current_function->get_results())
-        {
-            shared_ptr<descriptor::TensorView> tv = op->get_output_tensor_view();
-            output_names.insert(tv->get_tensor().get_name());
-        }
-        set<descriptor::TensorView*> constants;
-        for (shared_ptr<Node> node : current_function->get_ordered_ops())
-        {
-            if (dynamic_cast<ngraph::op::Constant*>(node.get()))
-            {
-                shared_ptr<descriptor::TensorView> tv = node->get_outputs()[0].get_tensor_view();
-                constants.insert(tv.get());
-            }
-        }
-
-        writer << "extern \"C\" void " << current_function->get_name();
-        writer << "(void** inputs, void** outputs, cublasHandle_t& cublas_handle, cudnnHandle_t& "
-                  "cudnn_handle)\n";
-        writer << "{\n";
-        writer.indent++;

-        for (shared_ptr<Function> current_function : pass_manager.get_state().get_functions())
-        {
-            for (shared_ptr<Node> node : current_function->get_ordered_ops())
-            {
-                const op::Constant* c = dynamic_cast<op::Constant*>(node.get());
-                if (c)
-                {
-                    shared_ptr<descriptor::TensorView> tv =
-                        node->get_outputs()[0].get_tensor_view();
-                    writer << "if(" << tv->get_tensor().get_name() << " == NULL)\n";
+                    writer << "extern \"C\" void " << current_function->get_name();
+                    writer << "(void** inputs, void** outputs, cublasHandle_t& cublas_handle, "
+                              "cudnnHandle_t& "
+                              "cudnn_handle)\n";
                    writer << "{\n";
                    writer.indent++;
-                    writer << "runtime::gpu::cuda_memcpyHtD(" << tv->get_tensor().get_name() << ", "
-                           << tv->get_tensor().get_name() << "_cpu, " << tv->get_tensor().size()
-                           << ");\n";
-                    writer.indent--;
-                    writer << "}\n";
-                }
-            }
-        }
-        bool temporaries_used = false;
-        size_t worst_case_tmp_size = 0;
-        for (shared_ptr<Node> node : current_function->get_ordered_ops())
-        {
-            if (node->liveness_new_list.size() > 0)
-            {
-                temporaries_used = true;
-                for (descriptor::Tensor* tensor : node->liveness_new_list)
-                {
-                    worst_case_tmp_size += tensor->size();
-                }
-            }
-        }
-        if (temporaries_used)
-        {
-            size_t temp_pool_size = current_function->get_temporary_pool_size();
-            writer << "// Allocate the memory pool\n";
-            // TODO memory pool malloc.
-            writer << "void* pool_base_ptr = ngraph::runtime::gpu::create_gpu_buffer("
-                   << temp_pool_size << ");\n";
-
-            // Add temporaries to the variable name map
-            for (shared_ptr<Node> node : current_function->get_ordered_ops())
-            {
-                for (descriptor::Tensor* tensor : node->liveness_new_list)
-                {
-                    stringstream ss;
-                    ss << "((" << tensor->get_element_type().c_type_string()
-                       << "*)((char *)pool_base_ptr + " << tensor->get_pool_offset() << "))";
-                    m_variable_name_map[tensor->get_name()] = ss.str();
-                }
-            }
-        }

-        // Add inputs to the variable name map
-        size_t arg_index = 0;
-        for (shared_ptr<ngraph::op::Parameter> param : current_function->get_parameters())
-        {
-            for (size_t i = 0; i < param->get_output_size(); ++i)
-            {
-                shared_ptr<descriptor::TensorView> tv = param->get_output_tensor_view(i);
-                const element::Type& et = tv->get_tensor_view_type()->get_element_type();
-                string type = et.c_type_string();
-                stringstream ss;
-                ss << "((" << type << "*)(inputs[" << arg_index << "]))";
-                m_variable_name_map[tv->get_tensor().get_name()] = ss.str();
-                arg_index++;
-            }
-        }
+                    for (shared_ptr<Function> current_function :
+                         pass_manager.get_state().get_functions())
+                    {
+                        for (shared_ptr<Node> node : current_function->get_ordered_ops())
+                        {
+                            const op::Constant* c = dynamic_cast<op::Constant*>(node.get());
+                            if (c)
+                            {
+                                shared_ptr<descriptor::TensorView> tv =
+                                    node->get_outputs()[0].get_tensor_view();
+                                writer << "if(" << tv->get_tensor().get_name() << " == NULL)\n";
+                                writer << "{\n";
+                                writer.indent++;
+                                writer << "runtime::gpu::cuda_memcpyHtD("
+                                       << tv->get_tensor().get_name() << ", "
+                                       << tv->get_tensor().get_name() << "_cpu, "
+                                       << tv->get_tensor().size() << ");\n";
+                                writer.indent--;
+                                writer << "}\n";
+                            }
+                        }
+                    }
+                    bool temporaries_used = false;
+                    size_t worst_case_tmp_size = 0;
+                    for (shared_ptr<Node> node : current_function->get_ordered_ops())
+                    {
+                        if (node->liveness_new_list.size() > 0)
+                        {
+                            temporaries_used = true;
+                            for (descriptor::Tensor* tensor : node->liveness_new_list)
+                            {
+                                worst_case_tmp_size += tensor->size();
+                            }
+                        }
+                    }
+                    if (temporaries_used)
+                    {
+                        size_t temp_pool_size = current_function->get_temporary_pool_size();
+                        writer << "// Allocate the memory pool\n";
+                        // TODO memory pool malloc.
+                        writer << "void* pool_base_ptr = ngraph::runtime::gpu::create_gpu_buffer("
+                               << temp_pool_size << ");\n";
+
+                        // Add temporaries to the variable name map
+                        for (shared_ptr<Node> node : current_function->get_ordered_ops())
+                        {
+                            for (descriptor::Tensor* tensor : node->liveness_new_list)
+                            {
+                                stringstream ss;
+                                ss << "((" << tensor->get_element_type().c_type_string()
+                                   << "*)((char *)pool_base_ptr + " << tensor->get_pool_offset()
+                                   << "))";
+                                m_variable_name_map[tensor->get_name()] = ss.str();
+                            }
+                        }
+                    }

-        // create output alias map
-        size_t output_index = 0;
-        unordered_map<descriptor::TensorView*, vector<size_t>> output_alias_map;
-        vector<size_t> aliases;
-        for (size_t i = 0; i < current_function->get_output_size(); ++i)
-        {
-            shared_ptr<Node> op = current_function->get_output_op(i);
-            shared_ptr<descriptor::TensorView> otv = op->get_output_tensor_view();
-            vector<size_t>& al = output_alias_map[otv.get()];
-            al.push_back(output_index);
-            if (al.size() > 1)
-            {
-                aliases.push_back(output_index);
-            }
-            output_index++;
-        }
+                    // Add inputs to the variable name map
+                    size_t arg_index = 0;
+                    for (shared_ptr<ngraph::op::Parameter> param :
+                         current_function->get_parameters())
+                    {
+                        for (size_t i = 0; i < param->get_output_size(); ++i)
+                        {
+                            shared_ptr<descriptor::TensorView> tv =
+                                param->get_output_tensor_view(i);
+                            const element::Type& et =
+                                tv->get_tensor_view_type()->get_element_type();
+                            string type = et.c_type_string();
+                            stringstream ss;
+                            ss << "((" << type << "*)(inputs[" << arg_index << "]))";
+                            m_variable_name_map[tv->get_tensor().get_name()] = ss.str();
+                            arg_index++;
+                        }
+                    }

-        // Add outputs to the variable name map
-        output_index = 0;
-        for (size_t i = 0; i < current_function->get_output_size(); ++i)
-        {
-            shared_ptr<Node> op = current_function->get_output_op(i);
-            shared_ptr<descriptor::TensorView> tv = op->get_output_tensor_view();
-            const element::Type& et = tv->get_tensor_view_type()->get_element_type();
-            bool parameter_as_output = false;
-            for (shared_ptr<ngraph::op::Parameter> param : current_function->get_parameters())
-            {
-                for (const descriptor::Output& pout : param->get_outputs())
-                {
-                    shared_ptr<descriptor::TensorView> ptv = pout.get_tensor_view();
-                    if (tv == ptv)
+                    // create output alias map
+                    size_t output_index = 0;
+                    unordered_map<descriptor::TensorView*, vector<size_t>> output_alias_map;
+                    vector<size_t> aliases;
+                    for (size_t i = 0; i < current_function->get_output_size(); ++i)
                    {
-                        parameter_as_output = true;
-                        writer << "ngraph::runtime::gpu::cuda_memcpyDtD(reinterpret_cast<"
-                               << et.c_type_string() << "*>(outputs[" << output_index << "]), "
-                               << m_variable_name_map[ptv->get_tensor().get_name()] << ", "
-                               << ptv->get_tensor().size() << ");\n";
-                        break;
+                        shared_ptr<Node> op = current_function->get_output_op(i);
+                        shared_ptr<descriptor::TensorView> otv = op->get_output_tensor_view();
+                        vector<size_t>& al = output_alias_map[otv.get()];
+                        al.push_back(output_index);
+                        if (al.size() > 1)
+                        {
+                            aliases.push_back(output_index);
+                        }
+                        output_index++;
+                    }
+
+                    // Add outputs to the variable name map
+                    output_index = 0;
+                    for (size_t i = 0; i < current_function->get_output_size(); ++i)
+                    {
+                        shared_ptr<Node> op = current_function->get_output_op(i);
+                        shared_ptr<descriptor::TensorView> tv = op->get_output_tensor_view();
+                        const element::Type& et = tv->get_tensor_view_type()->get_element_type();
+                        bool parameter_as_output = false;
+                        for (shared_ptr<ngraph::op::Parameter> param :
+                             current_function->get_parameters())
+                        {
+                            for (const descriptor::Output& pout : param->get_outputs())
+                            {
+                                shared_ptr<descriptor::TensorView> ptv = pout.get_tensor_view();
+                                if (tv == ptv)
+                                {
+                                    parameter_as_output = true;
+                                    writer
+                                        << "ngraph::runtime::gpu::cuda_memcpyDtD(reinterpret_cast<"
+                                        << et.c_type_string() << "*>(outputs[" << output_index
+                                        << "]), "
+                                        << m_variable_name_map[ptv->get_tensor().get_name()] << ", "
+                                        << ptv->get_tensor().size() << ");\n";
+                                    break;
+                                }
+                            }
+                        }
+                        if (!parameter_as_output && !contains(aliases, output_index))
+                        {
+                            if (contains(constants, tv.get()))
+                            {
+                                writer << "ngraph::runtime::gpu::cuda_memcpyHtD(outputs["
+                                       << output_index << "], " << tv->get_tensor().get_name()
+                                       << ", " << tv->get_tensor().size() << ");\n";
+                            }
+                            else
+                            {
+                                string type = et.c_type_string();
+                                stringstream ss;
+                                ss << "((" << type << "*)(outputs[" << output_index << "]))";
+                                m_variable_name_map[tv->get_tensor().get_name()] = ss.str();
+                            }
+                        }
+                        output_index++;
                    }
+
+                    for (shared_ptr<Node> node : current_function->get_ordered_ops())
+                    {
+                        auto& n =
+                            *node; // Work around a clang warning: typeid(*node) on a shared_ptr is
+                        // flagged as having side effects (harmless here), so bind a reference first.
+                        auto handler = dispatcher.find(type_index(typeid(n)));
+                        if (handler == dispatcher.end())
+                        {
+                            throw ngraph_error("Unhandled op during code generation : " +
+                                               node->description());
+                        }
+                        vector<GPU_TensorViewWrapper> in;
+                        for (const descriptor::Input& input : node->get_inputs())
+                        {
+                            const descriptor::Output& output = input.get_output();
+                            shared_ptr<descriptor::TensorView> tv = output.get_tensor_view();
+                            in.push_back(GPU_TensorViewWrapper(
+                                tv, m_variable_name_map[tv->get_tensor().get_name()]));
+                        }
+                        vector<GPU_TensorViewWrapper> out;
+                        for (const descriptor::Output& output : node->get_outputs())
+                        {
+                            shared_ptr<descriptor::TensorView> tv = output.get_tensor_view();
+                            out.push_back(GPU_TensorViewWrapper(
+                                tv, m_variable_name_map[tv->get_tensor().get_name()]));
+                        }
+
+                        // Emit operation prologue
+                        if (!node->is_parameter() && !node->is_constant())
+                        {
+                            if (m_emit_timing)
+                            {
+                                emit_debug_function_entry(writer, node.get(), in, out);
+                            }
+                        }
+
+                        // Emit operation body
+                        string func_name;
+                        auto it = match_functions.find(node.get());
+                        if (it != match_functions.end())
+                        {
+                            func_name = it->second;
+                        }
+                        if (func_name.empty())
+                        {
+                            handler->second(this, writer, node.get(), in, out);
+                        }
+                        else
+                        {
+                            vector<string> names;
+                            for (const GPU_TensorViewWrapper& tv : in)
+                            {
+                                names.push_back(tv.get_name());
+                            }
+                            for (const GPU_TensorViewWrapper& tv : out)
+                            {
+                                names.push_back(tv.get_name());
+                            }
+                            writer << func_name << "(" << join(names) << ");\n";
+                        }
+
+                        // Emit operation epilogue
+                        if (!node->is_parameter() && !node->is_constant())
+                        {
+                            if (m_emit_timing)
+                            {
+                                emit_debug_function_exit(writer, node.get(), in, out);
+                            }
+                        }
+                    }
+
+                    writer.indent--;
+                    // End generated function
+                    writer += "}\n\n";
                }
-            }
-            if (!parameter_as_output && !contains(aliases, output_index))
-            {
-                if (contains(constants, tv.get()))
+                // TODO: Clean up and make this a utility function
+
+                file_util::make_directory(s_output_dir);
+                string filename =
+                    file_util::path_join(s_output_dir, function_name + "_codegen.cpp");
+                ofstream out(filename);
+                string code = writer.get_code();
+                out << code;
+                out.close();
+
+                m_compiler.reset(new codegen::Compiler());
+                m_execution_engine.reset(new codegen::ExecutionEngine());
+
+                m_compiler->set_precompiled_header_source(pch_header_source);
+
+                auto codegen_module = m_compiler->compile(code);
+
+                if (codegen_module == nullptr)
                {
-                    writer << "ngraph::runtime::gpu::cuda_memcpyHtD(outputs[" << output_index
-                           << "], " << tv->get_tensor().get_name() << ", "
-                           << tv->get_tensor().size() << ");\n";
+                    throw runtime_error("function failed to compile");
                }
-                else
+                m_execution_engine->add_module(codegen_module);
+                m_execution_engine->finalize();
+                m_compiled_function =
+                    m_execution_engine->find_function<EntryPoint_t>(function_name);
+                assert(m_compiled_function);
+
+                m_is_compiled = true;
+                if (m_release_function)
                {
-                    string type = et.c_type_string();
-                    stringstream ss;
-                    ss << "((" << type << "*)(outputs[" << output_index << "]))";
-                    m_variable_name_map[tv->get_tensor().get_name()] = ss.str();
+                    release_function();
                }
            }
-            output_index++;
-        }

-        for (shared_ptr<Node> node : current_function->get_ordered_ops())
-        {
-            auto& n = *node; // Work around a compiler warning (*node inside typeid may have effects
-            // with shared pointers, which is fine here but clang doesn't like it.)
-            auto handler = dispatcher.find(type_index(typeid(n)));
-            if (handler == dispatcher.end())
+            void GPU_ExternalFunction::handle_output_alias(
+                codegen::CodeWriter& writer,
+                const Node& node,
+                const unordered_map<descriptor::TensorView*, vector<size_t>>& output_alias_map)
            {
-                throw ngraph_error("Unhandled op during code generation : " + node->description());
-            }
-            vector<GPU_TensorViewWrapper> in;
-            for (const descriptor::Input& input : node->get_inputs())
-            {
-                const descriptor::Output& output = input.get_output();
-                shared_ptr<descriptor::TensorView> tv = output.get_tensor_view();
-                in.push_back(
-                    GPU_TensorViewWrapper(tv, m_variable_name_map[tv->get_tensor().get_name()]));
-            }
-            vector<GPU_TensorViewWrapper> out;
-            for (const descriptor::Output& output : node->get_outputs())
-            {
-                shared_ptr<descriptor::TensorView> tv = output.get_tensor_view();
-                out.push_back(
-                    GPU_TensorViewWrapper(tv, m_variable_name_map[tv->get_tensor().get_name()]));
-            }
-
-            // Emit operation prologue
-            if (!node->is_parameter() && !node->is_constant())
-            {
-                if (m_emit_timing)
+                for (const descriptor::Output& output : node.get_outputs())
                {
-                    emit_debug_function_entry(writer, node.get(), in, out);
+                    shared_ptr<descriptor::TensorView> otv = output.get_tensor_view();
+                    auto it = output_alias_map.find(otv.get());
+                    if (it != output_alias_map.end())
+                    {
+                        const vector<size_t>& outputs = it->second;
+                        if (outputs.size() > 1)
+                        {
+                            writer << "{    // handle output alias for previous op\n";
+                            writer.indent++;
+                            for (size_t i = 1; i < outputs.size(); i++)
+                            {
+                                writer << "ngraph::runtime::gpu::cuda_memcpyDtD(static_cast<void*>("
+                                          "outputs["
+                                       << outputs[i] << "]), static_cast<void*>(outputs["
+                                       << outputs[0] << "]), " << otv->get_tensor().size()
+                                       << ");\n";
+                            }
+                            writer.indent--;
+                            writer << "}\n";
+                        }
+                    }
                }
            }

-            // Emit operation body
-            string func_name;
-            auto it = match_functions.find(node.get());
-            if (it != match_functions.end())
-            {
-                func_name = it->second;
-            }
-            if (func_name.empty())
-            {
-                handler->second(this, writer, node.get(), in, out);
-            }
-            else
+            shared_ptr<ngraph::runtime::CallFrame> GPU_ExternalFunction::make_call_frame()
            {
-                vector<string> names;
-                for (const GPU_TensorViewWrapper& tv : in)
+                if (!m_is_compiled)
                {
-                    names.push_back(tv.get_name());
+                    compile();
                }
-                for (const GPU_TensorViewWrapper& tv : out)
-                {
-                    names.push_back(tv.get_name());
-                }
-                writer << func_name << "(" << join(names) << ");\n";
+
+                return make_shared<GPU_CallFrame>(shared_from_this(), m_compiled_function);
            }

-            // Emit operation epilogue
-            if (!node->is_parameter() && !node->is_constant())
+            void GPU_ExternalFunction::emit_debug_function_entry(
+                codegen::CodeWriter& writer,
+                Node* node,
+                const std::vector<GPU_TensorViewWrapper>& in,
+                const std::vector<GPU_TensorViewWrapper>& out)
            {
-                if (m_emit_timing)
-                {
-                    emit_debug_function_exit(writer, node.get(), in, out);
-                }
+                writer << "timer_" << node->get_name() << ".start();\n";
            }
-        }

-        writer.indent--;
-        // End generated function
-        writer += "}\n\n";
-    }
-    // TODO: Cleanup and make this a utility function
-
-    file_util::make_directory(s_output_dir);
-    string filename = file_util::path_join(s_output_dir, function_name + "_codegen.cpp");
-    ofstream out(filename);
-    string code = writer.get_code();
-    out << code;
-    out.close();
-
-    m_compiler.reset(new codegen::Compiler());
-    m_execution_engine.reset(new codegen::ExecutionEngine());
-
-    m_compiler->set_precompiled_header_source(pch_header_source);
-
-    auto codegen_module = m_compiler->compile(code);
-
-    if (codegen_module == nullptr)
-    {
-        throw runtime_error("function failed to compile");
-    }
-    m_execution_engine->add_module(codegen_module);
-    m_execution_engine->finalize();
-    m_compiled_function = m_execution_engine->find_function<EntryPoint_t>(function_name);
-    assert(m_compiled_function);
-
-    m_is_compiled = true;
-    if (m_release_function)
-    {
-        release_function();
-    }
-}
-
-void GPU_ExternalFunction::handle_output_alias(
-    codegen::CodeWriter& writer,
-    const Node& node,
-    const unordered_map<descriptor::TensorView*, vector<size_t>>& output_alias_map)
-{
-    for (const descriptor::Output& output : node.get_outputs())
-    {
-        shared_ptr<descriptor::TensorView> otv = output.get_tensor_view();
-        auto it = output_alias_map.find(otv.get());
-        if (it != output_alias_map.end())
-        {
-            const vector<size_t>& outputs = it->second;
-            if (outputs.size() > 1)
+            void GPU_ExternalFunction::emit_debug_function_exit(
+                codegen::CodeWriter& writer,
+                Node* node,
+                const std::vector<GPU_TensorViewWrapper>& in,
+                const std::vector<GPU_TensorViewWrapper>& out)
            {
-                writer << "{    // handle output alias for previous op\n";
-                writer.indent++;
-                for (size_t i = 1; i < outputs.size(); i++)
-                {
-                    writer << "ngraph::runtime::gpu::cuda_memcpyDtD(static_cast<void*>(outputs["
-                           << outputs[i] << "]), static_cast<void*>(outputs[" << outputs[0]
-                           << "]), " << otv->get_tensor().size() << ");\n";
-                }
-                writer.indent--;
-                writer << "}\n";
+                writer << "timer_" << node->get_name() << ".stop();\n";
            }
        }
    }
-}
-
-shared_ptr<ngraph::runtime::CallFrame> GPU_ExternalFunction::make_call_frame()
-{
-    if (!m_is_compiled)
-    {
-        compile();
-    }
-
-    return make_shared<GPU_CallFrame>(shared_from_this(),
-                                                            m_compiled_function);
-}
-
-void GPU_ExternalFunction::emit_debug_function_entry(
-    codegen::CodeWriter& writer,
-    Node* node,
-    const std::vector<GPU_TensorViewWrapper>& in,
-    const std::vector<GPU_TensorViewWrapper>& out)
-{
-    writer << "timer_" << node->get_name() << ".start();\n";
-}
-
-void GPU_ExternalFunction::emit_debug_function_exit(
-    codegen::CodeWriter& writer,
-    Node* node,
-    const std::vector<GPU_TensorViewWrapper>& in,
-    const std::vector<GPU_TensorViewWrapper>& out)
-{
-    writer << "timer_" << node->get_name() << ".stop();\n";
-}
-        }
-    }
 }
\ No newline at end of file