gpu_external_function and gpu constant memory refactor (#1189)

* refactor external function * working version * fix bug * add emit_functions, emit_declare_constants, emit_declare_functions * add std:: * add functions declaration * fix bugs * fix bugs * separate temp memory allocation and release * add invoke_constant_ptr function, clean up outputs for function * fix bugs, compiled ok * add ctx to emit_declare_constant * cleanup code, code style * remove using std, code style * revert std changes * change function names based on Chris's comments * add ResultCopyElimination to pass_manager * clang format

gpu_external_function and gpu constant memory refactor (#1189)
* refactor external function * working version * fix bug * add emit_functions, emit_declare_constants, emit_declare_functions * add std:: * add functions declaration * fix bugs * fix bugs * separate temp memory allocation and release * add invoke_constant_ptr function, clean up outputs for function * fix bugs, compiled ok * add ctx to emit_declare_constant * cleanup code, code style * remove using std, code style * revert std changes * change function names based on Chris's comments * add ResultCopyElimination to pass_manager * clang format
260cb90d · Fenglei · Scott Cyphers · 2c345798 · 260cb90d · 260cb90d
Commit 260cb90d authored Jul 13, 2018 by Fenglei Committed by Scott Cyphers Jul 13, 2018
Hide whitespace changes
Inline Side-by-side

Showing with 335 additions and 374 deletions

gpu_external_function.cpp src/ngraph/runtime/gpu/gpu_external_function.cpp +299 -352

gpu_external_function.hpp src/ngraph/runtime/gpu/gpu_external_function.hpp +36 -22

No files found.
--- a/src/ngraph/runtime/gpu/gpu_external_function.cpp
+++ b/src/ngraph/runtime/gpu/gpu_external_function.cpp
@@ -20,16 +20,9 @@
 #include <cuda_runtime.h>
 #include <cudnn.h>
 #include <fstream>
-#include <memory>
 #include <string>
 #include <tuple>
-#include <typeindex>
-#include <typeinfo>
-#include <unordered_map>
-#include "ngraph/codegen/code_writer.hpp"
-#include "ngraph/codegen/compiler.hpp"
-#include "ngraph/codegen/execution_engine.hpp"
 #include "ngraph/descriptor/input.hpp"
 #include "ngraph/descriptor/layout/dense_tensor_view_layout.hpp"
 #include "ngraph/descriptor/output.hpp"
@@ -104,13 +97,7 @@
 #include "ngraph/op/sum.hpp"
 #include "ngraph/op/tan.hpp"
 #include "ngraph/op/tanh.hpp"
-#include "ngraph/pass/assign_layout.hpp"
-#include "ngraph/pass/dump_sorted.hpp"
-#include "ngraph/pass/liveness.hpp"
-#include "ngraph/pass/manager.hpp"
-#include "ngraph/pass/memory_layout.hpp"
 #include "ngraph/runtime/gpu/gpu_backend.hpp"
-#include "ngraph/runtime/gpu/gpu_call_frame.hpp"
 #include "ngraph/runtime/gpu/gpu_cuda_kernel_emitters.hpp"
 #include "ngraph/runtime/gpu/gpu_emitter.hpp"
 #include "ngraph/runtime/gpu/gpu_external_function.hpp"
@@ -252,11 +239,12 @@ static const runtime::gpu::OpMap dispatcher{
 runtime::gpu::GPU_ExternalFunction::GPU_ExternalFunction(
    const shared_ptr<ngraph::Function>& function, bool release_function)
    : m_compiled_function(nullptr)
-    , m_emit_timing(false)
+    , m_ctx(new GPURuntimeContext)
    , m_function(function)
-    , m_release_function(release_function)
+    , m_emit_timing(false)
    , m_is_compiled(false)
-    , m_ctx(new GPURuntimeContext)
+    , m_release_function(release_function)
+    , m_temporaries_used(false)
 {
    // Create context use driver API and make it current, the runtime call will pickup the context
    // http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html
@@ -289,37 +277,10 @@ runtime::gpu::GPU_ExternalFunction::~GPU_ExternalFunction()
    delete m_ctx->compiled_kernel_pool;
 }
-void runtime::gpu::GPU_ExternalFunction::compile()
+void runtime::gpu::GPU_ExternalFunction::emit_header()
 {
-    if (m_is_compiled)
+    m_writer += R"(
-    {
+// Generated by the nGraph GPU backend
-        return;
-    }
-    m_primitive_emitter.reset(new GPUPrimitiveEmitter());
-    string function_name = m_function->get_name();
-    string dump_filename = file_util::path_join(s_output_dir, function_name + "_ops.txt");
-    pass::Manager pass_manager;
-    // pass_manager.register_pass<pass::TopologicalSort>();
-    // For now, just make everyone row-major.
-    pass_manager.register_pass<pass::AssignLayout<descriptor::layout::DenseTensorViewLayout>>();
-    pass_manager.register_pass<pass::Liveness>();
-    pass_manager.register_pass<pass::MemoryLayout>(64);
-    pass_manager.register_pass<pass::DumpSorted>(dump_filename);
-    pass_manager.run_passes(m_function);
-    unordered_map<shared_ptr<Function>, list<shared_ptr<Node>>> function_ordered_ops;
-    for (shared_ptr<Function> current_function : pass_manager.get_state().get_functions())
-    {
-        function_ordered_ops.insert({current_function, current_function->get_ordered_ops()});
-    }
-    codegen::CodeWriter writer;
-    writer +=
-        R"(// Generated by the nGraph GPU backend
 #include <cublas_v2.h>
 #include <cuda.h>
 #include <cuda_runtime.h>
@@ -348,9 +309,9 @@ void runtime::gpu::GPU_ExternalFunction::compile()
 #include "ngraph/util.hpp"
 )";
-    string pch_header_source = writer.get_code();
+    m_pch_header_source = m_writer.get_code();
-    writer += R"(
+    m_writer += R"(
 using namespace ngraph;
 using namespace ngraph::runtime;
 using namespace std;
@@ -360,15 +321,19 @@ using namespace std;
    // which is enabled because the JIT uses it as the default mechanism
    // to register cleanup handlers. We use it, and not atexit(), because
    // atexit() happens too late, when the JIT is no longer alive
-    writer << "void *__dso_handle = 0;\n\n";
+    m_writer << "void *__dso_handle = 0;\n\n";
+}
+void runtime::gpu::GPU_ExternalFunction::emit_timer_functions()
+{
    if (m_emit_timing)
    {
-        writer << "// Declare debug timers\n";
+        m_writer << "// Declare debug timers\n";
        vector<string> names;
        size_t index = 0;
-        for (shared_ptr<Function> current_function : pass_manager.get_state().get_functions())
+        for (shared_ptr<Function> current_function : m_pass_manager.get_state().get_functions())
        {
-            for (shared_ptr<Node> node : function_ordered_ops.at(current_function))
+            for (shared_ptr<Node> node : m_function_ordered_ops.at(current_function))
            {
                if (!node->is_parameter() && !node->is_constant())
                {
@@ -377,77 +342,114 @@ using namespace std;
                }
            }
        }
-        writer << "ngraph::stopwatch timers[" << names.size() << "];\n";
+        m_writer << "ngraph::stopwatch timers[" << names.size() << "];\n";
-        writer << "extern \"C\" size_t get_debug_timer_count() { return " << names.size()
+        m_writer << "extern \"C\" size_t get_debug_timer_count() { return " << names.size()
-               << "; }\n";
+                 << "; }\n";
-        writer << "extern \"C\" const char* get_debug_timer_name(size_t index)\n";
+        m_writer << "extern \"C\" const char* get_debug_timer_name(size_t index)\n";
-        writer.block_begin();
+        m_writer.block_begin();
-        writer << "static const char* timer_names[" << names.size() << "] =\n";
+        m_writer << "static const char* timer_names[" << names.size() << "] =\n";
-        writer.block_begin();
+        m_writer.block_begin();
        vector<string> quoted_names;
        for (const string& name : names)
        {
            quoted_names.push_back("\"" + name + "\"");
        }
-        writer << emit_string_array(quoted_names, 100 - (4 * 2 + 1));
+        m_writer << emit_string_array(quoted_names, 100 - (4 * 2 + 1));
-        writer.indent--;
+        m_writer.indent--;
-        writer << "\n};\n";
+        m_writer << "\n};\n";
-        writer << "return timer_names[index];\n";
+        m_writer << "return timer_names[index];\n";
-        writer.block_end();
+        m_writer.block_end();
-        writer << "extern \"C\" const size_t get_debug_timer_microseconds(size_t index)\n";
+        m_writer << "extern \"C\" const size_t get_debug_timer_microseconds(size_t index)\n";
-        writer.block_begin();
+        m_writer.block_begin();
-        writer << "return (index < " << names.size()
+        m_writer << "return (index < " << names.size()
-               << " ? timers[index].get_total_microseconds() : 0);\n";
+                 << " ? timers[index].get_total_microseconds() : 0);\n";
-        writer.block_end();
+        m_writer.block_end();
-        writer << "extern \"C\" const size_t get_debug_timer_call_count(size_t index)\n";
+        m_writer << "extern \"C\" const size_t get_debug_timer_call_count(size_t index)\n";
-        writer.block_begin();
+        m_writer.block_begin();
-        writer << "return (index < " << names.size() << " ? timers[index].get_call_count() : 0);\n";
+        m_writer << "return (index < " << names.size()
-        writer.block_end();
+                 << " ? timers[index].get_call_count() : 0);\n";
-        writer << "\n";
+        m_writer.block_end();
+        m_writer << "\n";
    }
-    writer << "// Declare all constants\n";
+}
-    for (shared_ptr<Function> current_function : pass_manager.get_state().get_functions())
+void runtime::gpu::GPU_ExternalFunction::emit_constant_declarations()
+{
+    m_writer << "// Declare all constants\n";
+    for (shared_ptr<Function> current_function : m_pass_manager.get_state().get_functions())
    {
-        for (shared_ptr<Node> node : current_function->get_ordered_ops())
+        for (shared_ptr<Node> node : m_function_ordered_ops.at(current_function))
        {
            const op::Constant* c = dynamic_cast<ngraph::op::Constant*>(node.get());
            if (c)
            {
                shared_ptr<descriptor::TensorView> tv = node->get_outputs()[0].get_tensor_view();
-                auto c_value_strings = c->get_value_strings();
+                // get an allocator for transient per kernel gpu memory
-                writer << "static " << tv->get_tensor().get_element_type().c_type_string() << " *"
+                GPUAllocator allocator = this->m_primitive_emitter->get_memory_allocator();
-                       << tv->get_tensor().get_name() << ";\n";
+                size_t idx = allocator.reserve_argspace(
-                writer << "static " << tv->get_tensor().get_element_type().c_type_string() << " "
+                    c->get_data_ptr(),
-                       << tv->get_tensor().get_name() << "_cpu[" << c_value_strings.size()
+                    tv->get_tensor().size() * tv->get_tensor().get_element_type().size());
-                       << "] =\n";
+                m_writer << "static size_t " << tv->get_tensor().get_name() << "_idx = " << idx
-                writer << "{\n";
+                         << ";\n";
-                writer.indent++;
+                m_writer << "static " << tv->get_tensor().get_element_type().c_type_string() << "* "
-                writer << emit_string_array(c_value_strings, 100 - writer.indent * 4);
+                         << tv->get_tensor().get_name() << " = nullptr;\n";
-                writer.indent--;
-                writer << "\n};\n\n";
                m_variable_name_map[tv->get_tensor().get_name()] = tv->get_tensor().get_name();
            }
        }
    }
-    writer << "// Declare all functions\n";
+    m_writer << "\nstatic bool is_constant_mem_ptr_null = true;\n\n";
-    for (shared_ptr<Function> f : pass_manager.get_state().get_functions())
+    m_writer << "static void invoke_constant_mem_ptr(gpu::GPURuntimeContext* ctx)\n";
+    m_writer.block_begin();
    {
-        writer << "extern \"C\" void " << f->get_name() << "(void** inputs, void** outputs, "
+        m_writer << "if(is_constant_mem_ptr_null)\n";
-               << "gpu::GPURuntimeContext* ctx);\n";
+        m_writer.block_begin();
+        {
+            for (shared_ptr<Function> current_function : m_pass_manager.get_state().get_functions())
+            {
+                for (shared_ptr<Node> node : m_function_ordered_ops.at(current_function))
+                {
+                    const op::Constant* c = dynamic_cast<ngraph::op::Constant*>(node.get());
+                    if (c)
+                    {
+                        shared_ptr<descriptor::TensorView> tv =
+                            node->get_outputs()[0].get_tensor_view();
+                        m_writer << tv->get_tensor().get_name() << " = reinterpret_cast<"
+                                 << tv->get_tensor().get_element_type().c_type_string()
+                                 << "*>(runtime::gpu::invoke_memory_primitive(ctx, "
+                                 << tv->get_tensor().get_name() << "_idx));\n";
+                    }
+                }
+            }
+            m_writer << "is_constant_mem_ptr_null = false;\n";
+        }
+        m_writer.block_end();
    }
-    writer << "\n";
+    m_writer.block_end();
+}
+void runtime::gpu::GPU_ExternalFunction::emit_function_declarations()
+{
+    m_writer << "// Declare all functions\n";
+    for (shared_ptr<Function> f : m_pass_manager.get_state().get_functions())
+    {
+        m_writer << "extern \"C\" void " << f->get_name() << "(void** inputs, void** outputs, "
+                 << "gpu::GPURuntimeContext* ctx);\n";
+    }
+    m_writer << "\n";
+}
+void runtime::gpu::GPU_ExternalFunction::collect_unique_functions()
+{
    // This for loop creates a collection of functions that are called more than once
    // and emitting them as globally callable functions.
    // ops implement the is_functionally_identical method
    unordered_map<string, string> match_function_map;
-    unordered_map<const Node*, string> node_function_map;
+    for (shared_ptr<Function> current_function : m_pass_manager.get_state().get_functions())
-    for (shared_ptr<Function> current_function : pass_manager.get_state().get_functions())
    {
-        list<shared_ptr<Node>> tmp = current_function->get_ordered_ops();
+        list<shared_ptr<Node>> tmp = m_function_ordered_ops.at(current_function);
        if (tmp.size() < 2)
        {
            // Since we are comparing ops there must be at least two ops to proceed.
@@ -481,13 +483,62 @@ using namespace std;
                match_function_name = "func_" + node.get_name();
                emitted_function.replace(offset, 5, match_function_name);
                match_function_map.insert({match_function, match_function_name});
-                writer << emitted_function << "\n";
+                m_writer << emitted_function << "\n";
            }
-            node_function_map.insert({&node, match_function_name});
+            m_node_function_map.insert({&node, match_function_name});
        }
    }
+}
-    for (shared_ptr<Function> current_function : pass_manager.get_state().get_functions())
+void runtime::gpu::GPU_ExternalFunction::emit_temp_mem_pool_allocation(
+    shared_ptr<Function> current_function)
+{
+    m_temporaries_used = false;
+    size_t worst_case_tmp_size = 0;
+    for (shared_ptr<Node> node : m_function_ordered_ops.at(current_function))
+    {
+        if (node->liveness_new_list.size() > 0)
+        {
+            m_temporaries_used = true;
+            for (descriptor::Tensor* tensor : node->liveness_new_list)
+            {
+                worst_case_tmp_size += tensor->size();
+            }
+        }
+    }
+    if (m_temporaries_used)
+    {
+        size_t temp_pool_size = current_function->get_temporary_pool_size();
+        m_writer << "// Allocate the memory pool\n";
+        // TODO memory pool malloc.
+        m_writer << "void* pool_base_ptr = ngraph::runtime::gpu::create_gpu_buffer("
+                 << temp_pool_size << ");\n";
+        // Add temporaries to the variable name map
+        for (shared_ptr<Node> node : m_function_ordered_ops.at(current_function))
+        {
+            for (descriptor::Tensor* tensor : node->liveness_new_list)
+            {
+                stringstream ss;
+                ss << "((" << tensor->get_element_type().c_type_string()
+                   << "*)((char *)pool_base_ptr + " << tensor->get_pool_offset() << "))";
+                m_variable_name_map[tensor->get_name()] = ss.str();
+            }
+        }
+    }
+}
+void runtime::gpu::GPU_ExternalFunction::emit_temp_mem_pool_release()
+{
+    if (m_temporaries_used)
+    {
+        m_writer << "ngraph::runtime::gpu::free_gpu_buffer(pool_base_ptr);\n";
+    }
+}
+void runtime::gpu::GPU_ExternalFunction::emit_functions()
+{
+    for (shared_ptr<Function> current_function : m_pass_manager.get_state().get_functions())
    {
        set<string> output_names;
        for (shared_ptr<Node> op : current_function->get_results())
@@ -496,7 +547,7 @@ using namespace std;
            output_names.insert(tv->get_tensor().get_name());
        }
        set<descriptor::TensorView*> constants;
-        for (shared_ptr<Node> node : current_function->get_ordered_ops())
+        for (shared_ptr<Node> node : m_function_ordered_ops.at(current_function))
        {
            if (dynamic_cast<ngraph::op::Constant*>(node.get()))
            {
@@ -505,258 +556,192 @@ using namespace std;
            }
        }
-        writer << "extern \"C\" void " << current_function->get_name();
+        m_writer << "extern \"C\" void " << current_function->get_name();
-        writer << "(void** inputs, void** outputs, "
+        m_writer << "(void** inputs, void** outputs, "
-               << "gpu::GPURuntimeContext* ctx)\n";
+                 << "gpu::GPURuntimeContext* ctx)\n";
-        writer << "{\n";
+        m_writer.block_begin();
-        writer.indent++;
-        for (shared_ptr<Node> node : current_function->get_ordered_ops())
        {
-            const op::Constant* c = dynamic_cast<op::Constant*>(node.get());
+            //set constant pointers during the first run
-            if (c)
+            m_writer << "invoke_constant_mem_ptr(ctx);\n";
-            {
-                shared_ptr<descriptor::TensorView> tv = node->get_outputs()[0].get_tensor_view();
-                writer << "if(" << tv->get_tensor().get_name() << " == NULL)\n";
-                writer << "{\n";
-                writer.indent++;
-                writer << tv->get_tensor().get_name() << " = ("
-                       << tv->get_tensor().get_element_type().c_type_string()
-                       << " *) runtime::gpu::create_gpu_buffer(" << tv->get_tensor().size()
-                       << ");\n";
-                writer << "runtime::gpu::cuda_memcpyHtD(" << tv->get_tensor().get_name() << ", "
-                       << tv->get_tensor().get_name() << "_cpu, " << tv->get_tensor().size()
-                       << ");\n";
-                writer.indent--;
-                writer << "}\n";
-            }
-        }
-        bool temporaries_used = false;
+            //alocate temp memory pool
-        size_t worst_case_tmp_size = 0;
+            emit_temp_mem_pool_allocation(current_function);
-        for (shared_ptr<Node> node : current_function->get_ordered_ops())
-        {
+            // Add inputs to the variable name map
-            if (node->liveness_new_list.size() > 0)
+            size_t arg_index = 0;
-            {
+            for (shared_ptr<ngraph::op::Parameter> param : current_function->get_parameters())
-                temporaries_used = true;
-                for (descriptor::Tensor* tensor : node->liveness_new_list)
-                {
-                    worst_case_tmp_size += tensor->size();
-                }
-            }
-        }
-        if (temporaries_used)
-        {
-            size_t temp_pool_size = current_function->get_temporary_pool_size();
-            writer << "// Allocate the memory pool\n";
-            // TODO memory pool malloc.
-            writer << "void* pool_base_ptr = ngraph::runtime::gpu::create_gpu_buffer("
-                   << temp_pool_size << ");\n";
-            // Add temporaries to the variable name map
-            for (shared_ptr<Node> node : current_function->get_ordered_ops())
            {
-                for (descriptor::Tensor* tensor : node->liveness_new_list)
+                for (size_t i = 0; i < param->get_output_size(); ++i)
                {
+                    shared_ptr<descriptor::TensorView> tv = param->get_output_tensor_view(i);
+                    const element::Type& et = tv->get_tensor_view_type()->get_element_type();
+                    string type = et.c_type_string();
                    stringstream ss;
-                    ss << "((" << tensor->get_element_type().c_type_string()
+                    ss << "((" << type << "*)(inputs[" << arg_index << "]))";
-                       << "*)((char *)pool_base_ptr + " << tensor->get_pool_offset() << "))";
+                    m_variable_name_map[tv->get_tensor().get_name()] = ss.str();
-                    m_variable_name_map[tensor->get_name()] = ss.str();
+                    arg_index++;
                }
            }
-        }
-        // Add inputs to the variable name map
+            // Add outputs to the variable name map
-        size_t arg_index = 0;
+            for (size_t i = 0; i < current_function->get_output_size(); ++i)
-        for (shared_ptr<ngraph::op::Parameter> param : current_function->get_parameters())
-        {
-            for (size_t i = 0; i < param->get_output_size(); ++i)
            {
-                shared_ptr<descriptor::TensorView> tv = param->get_output_tensor_view(i);
+                shared_ptr<Node> op = current_function->get_output_op(i);
-                const element::Type& et = tv->get_tensor_view_type()->get_element_type();
+                shared_ptr<descriptor::TensorView> tv = op->get_output_tensor_view();
-                string type = et.c_type_string();
+                string type = tv->get_tensor_view_type()->get_element_type().c_type_string();
                stringstream ss;
-                ss << "((" << type << "*)(inputs[" << arg_index << "]))";
+                ss << "((" << type << "*)(outputs[" << i << "]))";
                m_variable_name_map[tv->get_tensor().get_name()] = ss.str();
-                arg_index++;
-            }
-        }
-        // create output alias map
+                //it should be safe to assign both descriptors to one output*
-        size_t output_index = 0;
+                //since needs_copy == false makes `op::Result` an nop
-        unordered_map<descriptor::TensorView*, vector<size_t>> output_alias_map;
+                auto res = dynamic_pointer_cast<ngraph::op::Result>(op);
-        vector<size_t> aliases;
+                if (!res->needs_copy())
-        for (size_t i = 0; i < current_function->get_output_size(); ++i)
+                {
-        {
+                    shared_ptr<descriptor::TensorView> itv =
-            shared_ptr<Node> op = current_function->get_output_op(i);
+                        res->get_inputs().at(0).get_output().get_tensor_view();
-            shared_ptr<descriptor::TensorView> otv = op->get_output_tensor_view();
+                    m_variable_name_map[itv->get_tensor().get_name()] = ss.str();
-            vector<size_t>& al = output_alias_map[otv.get()];
+                }
-            al.push_back(output_index);
-            if (al.size() > 1)
-            {
-                aliases.push_back(output_index);
            }
-            output_index++;
-        }
-        // Add outputs to the variable name map
+            for (shared_ptr<Node> node : m_function_ordered_ops.at(current_function))
-        output_index = 0;
-        for (size_t i = 0; i < current_function->get_output_size(); ++i)
-        {
-            shared_ptr<Node> op = current_function->get_output_op(i);
-            shared_ptr<descriptor::TensorView> tv = op->get_output_tensor_view();
-            const element::Type& et = tv->get_tensor_view_type()->get_element_type();
-            bool parameter_as_output = false;
-            for (shared_ptr<ngraph::op::Parameter> param : current_function->get_parameters())
            {
-                for (const descriptor::Output& pout : param->get_outputs())
+                auto& n =
+                    *node; // Work around a compiler warning (*node inside typeid may have effects
+                // with shared pointers, which is fine here but clang doesn't like it.)
+                auto handler = dispatcher.find(type_index(typeid(n)));
+                if (handler == dispatcher.end())
                {
-                    shared_ptr<descriptor::TensorView> ptv = pout.get_tensor_view();
+                    throw ngraph_error("Unhandled op during code generation : " +
-                    if (tv == ptv)
+                                       node->description());
-                    {
-                        parameter_as_output = true;
-                        writer << "ngraph::runtime::gpu::cuda_memcpyDtD(reinterpret_cast<"
-                               << et.c_type_string() << "*>(outputs[" << output_index << "]), "
-                               << m_variable_name_map[ptv->get_tensor().get_name()] << ", "
-                               << ptv->get_tensor().size() << ");\n";
-                        break;
-                    }
                }
-            }
+                vector<GPU_TensorViewWrapper> in;
-            if (!parameter_as_output && !contains(aliases, output_index))
+                vector<string> node_input_names;
-            {
+                vector<string> node_output_names;
-                if (contains(constants, tv.get()))
+                for (const descriptor::Input& input : node->get_inputs())
                {
-                    writer << "ngraph::runtime::gpu::cuda_memcpyHtD(outputs[" << output_index
+                    const descriptor::Output& output = input.get_output();
-                           << "], " << tv->get_tensor().get_name() << ", "
+                    shared_ptr<descriptor::TensorView> tv = output.get_tensor_view();
-                           << tv->get_tensor().size() << ");\n";
+                    in.push_back(GPU_TensorViewWrapper(
+                        tv, m_variable_name_map[tv->get_tensor().get_name()]));
+                    node_input_names.emplace_back(tv->get_tensor().get_name());
                }
-                else
+                vector<GPU_TensorViewWrapper> out;
+                for (const descriptor::Output& output : node->get_outputs())
                {
-                    string type = et.c_type_string();
+                    shared_ptr<descriptor::TensorView> tv = output.get_tensor_view();
-                    stringstream ss;
+                    out.push_back(GPU_TensorViewWrapper(
-                    ss << "((" << type << "*)(outputs[" << output_index << "]))";
+                        tv, m_variable_name_map[tv->get_tensor().get_name()]));
-                    m_variable_name_map[tv->get_tensor().get_name()] = ss.str();
+                    node_output_names.emplace_back(tv->get_tensor().get_name());
                }
-            }
-            output_index++;
-        }
-        for (shared_ptr<Node> node : current_function->get_ordered_ops())
-        {
-            auto& n = *node; // Work around a compiler warning (*node inside typeid may have effects
-            // with shared pointers, which is fine here but clang doesn't like it.)
-            auto handler = dispatcher.find(type_index(typeid(n)));
-            if (handler == dispatcher.end())
-            {
-                throw ngraph_error("Unhandled op during code generation : " + node->description());
-            }
-            vector<GPU_TensorViewWrapper> in;
-            vector<string> node_input_names;
-            vector<string> node_output_names;
-            for (const descriptor::Input& input : node->get_inputs())
-            {
-                const descriptor::Output& output = input.get_output();
-                shared_ptr<descriptor::TensorView> tv = output.get_tensor_view();
-                in.push_back(
-                    GPU_TensorViewWrapper(tv, m_variable_name_map[tv->get_tensor().get_name()]));
-                node_input_names.emplace_back(tv->get_tensor().get_name());
-            }
-            vector<GPU_TensorViewWrapper> out;
-            for (const descriptor::Output& output : node->get_outputs())
-            {
-                shared_ptr<descriptor::TensorView> tv = output.get_tensor_view();
-                out.push_back(
-                    GPU_TensorViewWrapper(tv, m_variable_name_map[tv->get_tensor().get_name()]));
-                node_output_names.emplace_back(tv->get_tensor().get_name());
-            }
-            // Emit function description comment
-            if (!node->is_parameter() && !node->is_constant())
-            {
-                writer << "\n// " << node->get_name() << "(";
-                vector<string> parameter_nodes = node_input_names;
-                parameter_nodes.insert(
-                    parameter_nodes.end(), node_output_names.begin(), node_output_names.end());
-                writer << join(parameter_nodes);
-                writer << ")\n";
-            }
-            // Emit operation prologue
+                // Emit function description comment
-            if (!node->is_parameter() && !node->is_constant())
+                if (!node->is_parameter() && !node->is_constant())
-            {
-                if (m_emit_timing)
                {
-                    emit_debug_function_entry(writer, node.get(), in, out);
+                    m_writer << "\n// " << node->get_name() << "(";
+                    vector<string> parameter_nodes = node_input_names;
+                    parameter_nodes.insert(
+                        parameter_nodes.end(), node_output_names.begin(), node_output_names.end());
+                    m_writer << join(parameter_nodes);
+                    m_writer << ")\n";
+                    emit_debug_function_entry(node.get());
                }
-            }
-            // Emit operation body
+                // Emit operation body
-            string func_name;
+                string func_name;
-            func_name = node_function_map[node.get()];
+                func_name = m_node_function_map[node.get()];
-            if (func_name.empty())
+                if (func_name.empty())
-            {
-                //throw runtime_error("No matching function found for '" + node->get_name() + "'");
-                handler->second(this, writer, node.get(), in, out);
-            }
-            else
-            {
-                vector<string> names;
-                for (const GPU_TensorViewWrapper& tv : in)
                {
-                    names.push_back(tv.get_name());
+                    //throw runtime_error("No matching function found for '" + node->get_name() + "'");
+                    handler->second(this, m_writer, node.get(), in, out);
                }
-                for (const GPU_TensorViewWrapper& tv : out)
+                else
                {
-                    names.push_back(tv.get_name());
+                    vector<string> names;
+                    for (const GPU_TensorViewWrapper& tv : in)
+                    {
+                        names.push_back(tv.get_name());
+                    }
+                    for (const GPU_TensorViewWrapper& tv : out)
+                    {
+                        names.push_back(tv.get_name());
+                    }
+                    names.push_back("ctx");
+                    m_writer << func_name << "(" << join(names) << ");\n";
                }
-                names.push_back("ctx");
-                writer << func_name << "(" << join(names) << ");\n";
-            }
-            // Emit operation epilogue
+                // Emit operation epilogue
-            if (!node->is_parameter() && !node->is_constant())
+                if (!node->is_parameter() && !node->is_constant())
-            {
-                if (m_emit_timing)
                {
-                    emit_debug_function_exit(writer, node.get(), in, out);
+                    emit_debug_function_exit(node.get());
                }
            }
+            emit_temp_mem_pool_release();
        }
-        if (temporaries_used)
+        m_writer.block_end(); // End generated function
-        {
-            writer << "ngraph::runtime::gpu::free_gpu_buffer(pool_base_ptr);\n";
-        }
-        writer.indent--;
-        // End generated function
-        writer += "}\n\n";
    }
+}
-    // allocate device buffers for primitive arguments and workspace
+void runtime::gpu::GPU_ExternalFunction::store_emitted_functions(const string& code)
-    m_primitive_emitter->allocate_primitive_memory();
+{
    // TODO: Cleanup and make this a utility function
-    string filename = file_util::path_join(s_output_dir, function_name + "_codegen.cpp");
+    string filename = file_util::path_join(s_output_dir, m_function_name + "_codegen.cpp");
    ofstream out(filename);
-    string code = writer.get_code();
    out << code;
    out.close();
+}
+void runtime::gpu::GPU_ExternalFunction::compile()
+{
+    if (m_is_compiled)
+    {
+        return;
+    }
+    m_primitive_emitter.reset(new GPUPrimitiveEmitter());
+    m_function_name = m_function->get_name();
+    string dump_filename = file_util::path_join(s_output_dir, m_function_name + "_ops.txt");
+    // For now, just make everyone row-major.
+    m_pass_manager.register_pass<pass::ResultCopyElimination>();
+    m_pass_manager.register_pass<pass::AssignLayout<descriptor::layout::DenseTensorViewLayout>>();
+    m_pass_manager.register_pass<pass::Liveness>();
+    m_pass_manager.register_pass<pass::MemoryLayout>(64);
+    m_pass_manager.register_pass<pass::DumpSorted>(dump_filename);
+    m_pass_manager.run_passes(m_function);
+    for (shared_ptr<Function> current_function : m_pass_manager.get_state().get_functions())
+    {
+        m_function_ordered_ops.insert({current_function, current_function->get_ordered_ops()});
+    }
+    emit_header();
+    emit_timer_functions();
+    emit_constant_declarations();
+    emit_function_declarations();
+    collect_unique_functions();
+    emit_functions();
+    // allocate device buffers for primitive arguments and workspace
+    m_primitive_emitter->allocate_primitive_memory();
+    string code = m_writer.get_code();
+    store_emitted_functions(code);
    m_compiler.reset(new codegen::Compiler());
    m_execution_engine.reset(new codegen::ExecutionEngine());
+    m_compiler->set_precompiled_header_source(m_pch_header_source);
-    m_compiler->set_precompiled_header_source(pch_header_source);
    auto codegen_module = m_compiler->compile(code);
    if (codegen_module == nullptr)
    {
        throw runtime_error("Function failed to compile to bitcode");
    }
    m_execution_engine->add_module(codegen_module);
    m_execution_engine->finalize();
-    m_compiled_function = m_execution_engine->find_function<EntryPoint_t>(function_name);
+    m_compiled_function = m_execution_engine->find_function<EntryPoint_t>(m_function_name);
    if (!m_compiled_function)
    {
        throw runtime_error("Function failed to compile");
@@ -769,36 +754,6 @@ using namespace std;
    }
 }
-void runtime::gpu::GPU_ExternalFunction::handle_output_alias(
-    codegen::CodeWriter& writer,
-    const Node& node,
-    const unordered_map<descriptor::TensorView*, vector<size_t>>& output_alias_map)
-{
    // This whole function is deleted by the change shown here.
    // For each output of `node` it looks up the output's tensor view in
    // `output_alias_map`. When the mapped index list holds more than one entry,
    // it writes a braced block of generated code containing one cuda_memcpyDtD
    // call per extra index (the loop starts at 1, so the first index is never a
    // first argument). Each call receives outputs[<extra index>], then
    // outputs[<first index>], then the value of otv->get_tensor().size().
    // NOTE(review): the argument order suggests (destination, source, size) --
    // confirm against the cuda_memcpyDtD declaration.
-    for (const descriptor::Output& output : node.get_outputs())
-    {
-        shared_ptr<descriptor::TensorView> otv = output.get_tensor_view();
-        auto it = output_alias_map.find(otv.get());
-        if (it != output_alias_map.end())
-        {
-            const vector<size_t>& outputs = it->second;
-            if (outputs.size() > 1)
-            {
-                writer << "{    // handle output alias for previous op\n";
-                writer.indent++;
-                for (size_t i = 1; i < outputs.size(); i++)
-                {
-                    writer << "ngraph::runtime::gpu::cuda_memcpyDtD(static_cast<void*>("
-                              "outputs["
-                           << outputs[i] << "]), static_cast<void*>(outputs[" << outputs[0]
-                           << "]), " << otv->get_tensor().size() << ");\n";
-                }
-                writer.indent--;
-                writer << "}\n";
-            }
-        }
-    }
-}
 shared_ptr<ngraph::runtime::gpu::GPU_CallFrame>
    runtime::gpu::GPU_ExternalFunction::make_call_frame()
 {
@@ -810,35 +765,27 @@ shared_ptr<ngraph::runtime::gpu::GPU_CallFrame>
    return make_shared<GPU_CallFrame>(shared_from_this(), m_compiled_function);
 }
-void runtime::gpu::GPU_ExternalFunction::emit_debug_function_entry(
+void runtime::gpu::GPU_ExternalFunction::emit_debug_function_entry(Node* node)
-    codegen::CodeWriter& writer,
-    Node* node,
-    const std::vector<GPU_TensorViewWrapper>& in,
-    const std::vector<GPU_TensorViewWrapper>& out)
 {
    // Writes a `timers[<index>].start();` statement into the generated code,
    // where <index> is the m_name_index_map entry for the node's name.
    // Old version: wrote unconditionally to the caller-supplied `writer`.
    // New version: writes to the member m_writer, and only when m_emit_timing
    // is set.
    // m_name_index_map is a std::map, so operator[] inserts and yields 0 for a
    // node name that has no entry yet.
-    writer << "timers[" << m_name_index_map[node->get_name()] << "].start();\n";
+    if (m_emit_timing)
+    {
+        m_writer << "timers[" << m_name_index_map[node->get_name()] << "].start();\n";
+    }
 }
-void runtime::gpu::GPU_ExternalFunction::emit_debug_function_exit(
+void runtime::gpu::GPU_ExternalFunction::emit_debug_function_exit(Node* node)
-    codegen::CodeWriter& writer,
-    Node* node,
-    const std::vector<GPU_TensorViewWrapper>& in,
-    const std::vector<GPU_TensorViewWrapper>& out)
 {
    // Writes a `timers[<index>].stop();` statement into the generated code,
    // where <index> is the m_name_index_map entry for the node's name.
    // Old version: wrote unconditionally to the caller-supplied `writer`.
    // New version: writes to the member m_writer, and only when m_emit_timing
    // is set.
    // m_name_index_map is a std::map, so operator[] inserts and yields 0 for a
    // node name that has no entry yet.
-    writer << "timers[" << m_name_index_map[node->get_name()] << "].stop();\n";
+    if (m_emit_timing)
+    {
+        m_writer << "timers[" << m_name_index_map[node->get_name()] << "].stop();\n";
+    }
 }
-std::unique_ptr<runtime::gpu::GPURuntimeContext>& runtime::gpu::GPU_ExternalFunction::ctx()
+unique_ptr<runtime::gpu::GPURuntimeContext>& runtime::gpu::GPU_ExternalFunction::ctx()
 {
    // Returns a non-const reference to the m_ctx member itself (the unique_ptr,
    // not the pointee). The only difference between the old and new signature
    // lines is the dropped `std::` qualifier on the return type.
    return m_ctx;
 }
-bool runtime::gpu::GPU_ExternalFunction::is_functionally_identical(
-    const Node& n1, const Node& n2, const unordered_map<const Node*, string>& node_cache) const
-{
    // This whole function is deleted by the change shown here.
    // It reports whether the strings stored in `node_cache` for the two nodes
    // compare equal; nodes are keyed by address. unordered_map::at throws
    // std::out_of_range when either node has no entry in the cache.
-    return node_cache.at(&n1) == node_cache.at(&n2);
-}
 string runtime::gpu::GPU_ExternalFunction::emit_op_as_function(const Node& node,
                                                               const string& function_name)
 {

--- a/src/ngraph/runtime/gpu/gpu_external_function.hpp
+++ b/src/ngraph/runtime/gpu/gpu_external_function.hpp
@@ -26,6 +26,12 @@
 #include "ngraph/codegen/compiler.hpp"
 #include "ngraph/codegen/execution_engine.hpp"
 #include "ngraph/function.hpp"
+#include "ngraph/pass/assign_layout.hpp"
+#include "ngraph/pass/dump_sorted.hpp"
+#include "ngraph/pass/liveness.hpp"
+#include "ngraph/pass/manager.hpp"
+#include "ngraph/pass/memory_layout.hpp"
+#include "ngraph/pass/result_copy_elimination.hpp"
 #include "ngraph/runtime/gpu/gpu_call_frame.hpp"
 #include "ngraph/runtime/gpu/gpu_primitive_emitter.hpp"
 #include "ngraph/runtime/gpu/gpu_tensor_view_wrapper.hpp"
@@ -58,6 +64,7 @@ namespace ngraph
                GPU_ExternalFunction(const std::shared_ptr<ngraph::Function>& function,
                                     bool release_function = true);
                ~GPU_ExternalFunction();
                std::shared_ptr<ngraph::runtime::gpu::GPU_CallFrame> make_call_frame();
                std::unique_ptr<runtime::gpu::GPURuntimeContext>& ctx();
                const std::unique_ptr<GPUPrimitiveEmitter>& get_primitive_emitter() const
@@ -71,39 +78,46 @@ namespace ngraph
                EntryPoint m_compiled_function;
            private:
-                void emit_debug_function_entry(codegen::CodeWriter& writer,
+                void collect_unique_functions();
-                                               Node* node,
+                void emit_header();
-                                               const std::vector<GPU_TensorViewWrapper>& in,
+                void emit_timer_functions();
-                                               const std::vector<GPU_TensorViewWrapper>& out);
+                void emit_constant_declarations();
-                void emit_debug_function_exit(codegen::CodeWriter& writer,
+                void emit_function_declarations();
-                                              Node* node,
+                void emit_functions();
-                                              const std::vector<GPU_TensorViewWrapper>& in,
+                void emit_debug_function_entry(Node* node);
-                                              const std::vector<GPU_TensorViewWrapper>& out);
+                void emit_debug_function_exit(Node* node);
-                void handle_output_alias(
+                void emit_temp_mem_pool_allocation(std::shared_ptr<Function> current_function);
-                    codegen::CodeWriter& writer,
+                void emit_temp_mem_pool_release();
-                    const Node&,
-                    const std::unordered_map<descriptor::TensorView*, std::vector<size_t>>&);
                // Resets the m_function shared_ptr to null, dropping this object's
                // reference to the ngraph::Function it was holding.
                void release_function() { m_function = nullptr; }
+                void store_emitted_functions(const std::string& code);
                std::string emit_op_as_function(const Node& node, const std::string& function_name);
                std::string strip_comments(const std::string& s) const;
-                bool is_functionally_identical(
-                    const Node& n1,
+                codegen::CodeWriter m_writer;
-                    const Node& n2,
+                pass::Manager m_pass_manager;
-                    const std::unordered_map<const Node*, std::string>& node_cache) const;
                std::unique_ptr<codegen::Compiler> m_compiler;
                std::unique_ptr<codegen::ExecutionEngine> m_execution_engine;
-                bool m_emit_timing;
+                std::unique_ptr<GPUPrimitiveEmitter> m_primitive_emitter;
-                std::unordered_map<std::string, std::string> m_variable_name_map;
+                std::unique_ptr<GPURuntimeContext> m_ctx;
-                std::map<std::string, size_t> m_name_index_map;
                std::shared_ptr<ngraph::Function> m_function;
-                bool m_release_function;
+                std::map<std::string, size_t> m_name_index_map;
+                std::unordered_map<std::string, std::string> m_variable_name_map;
+                std::unordered_map<const Node*, std::string> m_node_function_map;
+                std::unordered_map<std::shared_ptr<Function>, std::list<std::shared_ptr<Node>>>
+                    m_function_ordered_ops;
+                bool m_emit_timing;
                bool m_is_compiled;
+                bool m_release_function;
+                bool m_temporaries_used;
+                std::string m_function_name;
+                std::string m_pch_header_source;
                cublasHandle_t m_cublas_handle;
                cudnnHandle_t m_cudnn_handle;
-                std::unique_ptr<GPUPrimitiveEmitter> m_primitive_emitter;
-                std::unique_ptr<GPURuntimeContext> m_ctx;
            };
        }
    }