Commit 260cb90d authored by Fenglei, committed by Scott Cyphers

gpu_external_function and gpu constant memory refactor (#1189)

* refactor external function

* working version

* fix bug

* add emit_functions, emit_declare_constants, emit_declare_functions

* add std::

* add functions declaration

* fix bugs

* fix bugs

* separate temp memory allocation and release

* add invoke_constant_ptr function, clean up outputs for function (see the sketch after the commit header)

* fix bugs, compiled ok

* add ctx to emit_declare_constant

* cleanup code, code style

* remove using std, code style

* revert std changes

* change function names based on Chris's comments

* add ResultCopyElimination to pass_manager

* clang format
parent 2c345798
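
The core of the constant-memory refactor is that the generated code no longer embeds host-side value arrays and copies them on demand inside each function; instead, emit_constant_declarations reserves argspace for each op::Constant up front, and the generated invoke_constant_mem_ptr resolves the device pointers once, on the first call into a compiled function. Below is a minimal standalone sketch of that lazy-initialization pattern, not the nGraph sources: the names constant_12/constant_12_cpu are hypothetical, and std::malloc/std::memcpy stand in for runtime::gpu::invoke_memory_primitive and the host-to-device copy used in the real generated code.

#include <cstdlib>
#include <cstring>
#include <iostream>

// Host copy of a (hypothetical) constant tensor, and the pointer that the
// generated code would aim at device memory on first use.
static const float constant_12_cpu[4] = {1.f, 2.f, 3.f, 4.f};
static float* constant_12 = nullptr;
static bool is_constant_mem_ptr_null = true;

static void invoke_constant_mem_ptr()
{
    if (is_constant_mem_ptr_null)
    {
        // Stand-in for fetching the reserved argspace slot and filling it with
        // the host values (invoke_memory_primitive + HtD copy in the real code).
        constant_12 = static_cast<float*>(std::malloc(sizeof(constant_12_cpu)));
        std::memcpy(constant_12, constant_12_cpu, sizeof(constant_12_cpu));
        is_constant_mem_ptr_null = false;
    }
}

int main()
{
    invoke_constant_mem_ptr(); // first call resolves and fills the pointer
    invoke_constant_mem_ptr(); // subsequent calls are no-ops
    std::cout << constant_12[2] << "\n"; // prints 3
    std::free(constant_12);
    return 0;
}

In the diff below, the same guard appears in the emitted code as is_constant_mem_ptr_null and invoke_constant_mem_ptr(ctx), called at the top of every generated entry-point function.
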
@@ -20,16 +20,9 @@
#include <cuda_runtime.h>
#include <cudnn.h>
#include <fstream>
#include <memory>
#include <string>
#include <tuple>
#include <typeindex>
#include <typeinfo>
#include <unordered_map>
#include "ngraph/codegen/code_writer.hpp"
#include "ngraph/codegen/compiler.hpp"
#include "ngraph/codegen/execution_engine.hpp"
#include "ngraph/descriptor/input.hpp"
#include "ngraph/descriptor/layout/dense_tensor_view_layout.hpp"
#include "ngraph/descriptor/output.hpp"
@@ -104,13 +97,7 @@
#include "ngraph/op/sum.hpp"
#include "ngraph/op/tan.hpp"
#include "ngraph/op/tanh.hpp"
#include "ngraph/pass/assign_layout.hpp"
#include "ngraph/pass/dump_sorted.hpp"
#include "ngraph/pass/liveness.hpp"
#include "ngraph/pass/manager.hpp"
#include "ngraph/pass/memory_layout.hpp"
#include "ngraph/runtime/gpu/gpu_backend.hpp"
#include "ngraph/runtime/gpu/gpu_call_frame.hpp"
#include "ngraph/runtime/gpu/gpu_cuda_kernel_emitters.hpp"
#include "ngraph/runtime/gpu/gpu_emitter.hpp"
#include "ngraph/runtime/gpu/gpu_external_function.hpp"
@@ -252,11 +239,12 @@ static const runtime::gpu::OpMap dispatcher{
runtime::gpu::GPU_ExternalFunction::GPU_ExternalFunction(
const shared_ptr<ngraph::Function>& function, bool release_function)
: m_compiled_function(nullptr)
, m_emit_timing(false)
, m_ctx(new GPURuntimeContext)
, m_function(function)
, m_release_function(release_function)
, m_emit_timing(false)
, m_is_compiled(false)
, m_ctx(new GPURuntimeContext)
, m_release_function(release_function)
, m_temporaries_used(false)
{
// Create a context using the driver API and make it current; the runtime calls will pick up the context
// http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html
@@ -289,37 +277,10 @@ runtime::gpu::GPU_ExternalFunction::~GPU_ExternalFunction()
delete m_ctx->compiled_kernel_pool;
}
void runtime::gpu::GPU_ExternalFunction::compile()
void runtime::gpu::GPU_ExternalFunction::emit_header()
{
if (m_is_compiled)
{
return;
}
m_primitive_emitter.reset(new GPUPrimitiveEmitter());
string function_name = m_function->get_name();
string dump_filename = file_util::path_join(s_output_dir, function_name + "_ops.txt");
pass::Manager pass_manager;
// pass_manager.register_pass<pass::TopologicalSort>();
// For now, just make everyone row-major.
pass_manager.register_pass<pass::AssignLayout<descriptor::layout::DenseTensorViewLayout>>();
pass_manager.register_pass<pass::Liveness>();
pass_manager.register_pass<pass::MemoryLayout>(64);
pass_manager.register_pass<pass::DumpSorted>(dump_filename);
pass_manager.run_passes(m_function);
unordered_map<shared_ptr<Function>, list<shared_ptr<Node>>> function_ordered_ops;
for (shared_ptr<Function> current_function : pass_manager.get_state().get_functions())
{
function_ordered_ops.insert({current_function, current_function->get_ordered_ops()});
}
codegen::CodeWriter writer;
writer +=
R"(// Generated by the nGraph GPU backend
m_writer += R"(
// Generated by the nGraph GPU backend
#include <cublas_v2.h>
#include <cuda.h>
#include <cuda_runtime.h>
@@ -348,9 +309,9 @@ void runtime::gpu::GPU_ExternalFunction::compile()
#include "ngraph/util.hpp"
)";
string pch_header_source = writer.get_code();
m_pch_header_source = m_writer.get_code();
writer += R"(
m_writer += R"(
using namespace ngraph;
using namespace ngraph::runtime;
using namespace std;
@@ -360,15 +321,19 @@ using namespace std;
// which is enabled because the JIT uses it as the default mechanism
// to register cleanup handlers. We use it, and not atexit(), because
// atexit() happens too late, when the JIT is no longer alive
writer << "void *__dso_handle = 0;\n\n";
m_writer << "void *__dso_handle = 0;\n\n";
}
void runtime::gpu::GPU_ExternalFunction::emit_timer_functions()
{
if (m_emit_timing)
{
writer << "// Declare debug timers\n";
m_writer << "// Declare debug timers\n";
vector<string> names;
size_t index = 0;
for (shared_ptr<Function> current_function : pass_manager.get_state().get_functions())
for (shared_ptr<Function> current_function : m_pass_manager.get_state().get_functions())
{
for (shared_ptr<Node> node : function_ordered_ops.at(current_function))
for (shared_ptr<Node> node : m_function_ordered_ops.at(current_function))
{
if (!node->is_parameter() && !node->is_constant())
{
@@ -377,77 +342,114 @@ using namespace std;
}
}
}
writer << "ngraph::stopwatch timers[" << names.size() << "];\n";
writer << "extern \"C\" size_t get_debug_timer_count() { return " << names.size()
m_writer << "ngraph::stopwatch timers[" << names.size() << "];\n";
m_writer << "extern \"C\" size_t get_debug_timer_count() { return " << names.size()
<< "; }\n";
writer << "extern \"C\" const char* get_debug_timer_name(size_t index)\n";
writer.block_begin();
writer << "static const char* timer_names[" << names.size() << "] =\n";
writer.block_begin();
m_writer << "extern \"C\" const char* get_debug_timer_name(size_t index)\n";
m_writer.block_begin();
m_writer << "static const char* timer_names[" << names.size() << "] =\n";
m_writer.block_begin();
vector<string> quoted_names;
for (const string& name : names)
{
quoted_names.push_back("\"" + name + "\"");
}
writer << emit_string_array(quoted_names, 100 - (4 * 2 + 1));
writer.indent--;
writer << "\n};\n";
writer << "return timer_names[index];\n";
writer.block_end();
m_writer << emit_string_array(quoted_names, 100 - (4 * 2 + 1));
m_writer.indent--;
m_writer << "\n};\n";
m_writer << "return timer_names[index];\n";
m_writer.block_end();
writer << "extern \"C\" const size_t get_debug_timer_microseconds(size_t index)\n";
writer.block_begin();
writer << "return (index < " << names.size()
m_writer << "extern \"C\" const size_t get_debug_timer_microseconds(size_t index)\n";
m_writer.block_begin();
m_writer << "return (index < " << names.size()
<< " ? timers[index].get_total_microseconds() : 0);\n";
writer.block_end();
m_writer.block_end();
writer << "extern \"C\" const size_t get_debug_timer_call_count(size_t index)\n";
writer.block_begin();
writer << "return (index < " << names.size() << " ? timers[index].get_call_count() : 0);\n";
writer.block_end();
writer << "\n";
m_writer << "extern \"C\" const size_t get_debug_timer_call_count(size_t index)\n";
m_writer.block_begin();
m_writer << "return (index < " << names.size()
<< " ? timers[index].get_call_count() : 0);\n";
m_writer.block_end();
m_writer << "\n";
}
writer << "// Declare all constants\n";
for (shared_ptr<Function> current_function : pass_manager.get_state().get_functions())
}
void runtime::gpu::GPU_ExternalFunction::emit_constant_declarations()
{
m_writer << "// Declare all constants\n";
for (shared_ptr<Function> current_function : m_pass_manager.get_state().get_functions())
{
for (shared_ptr<Node> node : current_function->get_ordered_ops())
for (shared_ptr<Node> node : m_function_ordered_ops.at(current_function))
{
const op::Constant* c = dynamic_cast<ngraph::op::Constant*>(node.get());
if (c)
{
shared_ptr<descriptor::TensorView> tv = node->get_outputs()[0].get_tensor_view();
auto c_value_strings = c->get_value_strings();
writer << "static " << tv->get_tensor().get_element_type().c_type_string() << " *"
<< tv->get_tensor().get_name() << ";\n";
writer << "static " << tv->get_tensor().get_element_type().c_type_string() << " "
<< tv->get_tensor().get_name() << "_cpu[" << c_value_strings.size()
<< "] =\n";
writer << "{\n";
writer.indent++;
writer << emit_string_array(c_value_strings, 100 - writer.indent * 4);
writer.indent--;
writer << "\n};\n\n";
// get an allocator for transient per-kernel GPU memory
GPUAllocator allocator = this->m_primitive_emitter->get_memory_allocator();
size_t idx = allocator.reserve_argspace(
c->get_data_ptr(),
tv->get_tensor().size() * tv->get_tensor().get_element_type().size());
m_writer << "static size_t " << tv->get_tensor().get_name() << "_idx = " << idx
<< ";\n";
m_writer << "static " << tv->get_tensor().get_element_type().c_type_string() << "* "
<< tv->get_tensor().get_name() << " = nullptr;\n";
m_variable_name_map[tv->get_tensor().get_name()] = tv->get_tensor().get_name();
}
}
}
writer << "// Declare all functions\n";
for (shared_ptr<Function> f : pass_manager.get_state().get_functions())
m_writer << "\nstatic bool is_constant_mem_ptr_null = true;\n\n";
m_writer << "static void invoke_constant_mem_ptr(gpu::GPURuntimeContext* ctx)\n";
m_writer.block_begin();
{
m_writer << "if(is_constant_mem_ptr_null)\n";
m_writer.block_begin();
{
for (shared_ptr<Function> current_function : m_pass_manager.get_state().get_functions())
{
writer << "extern \"C\" void " << f->get_name() << "(void** inputs, void** outputs, "
for (shared_ptr<Node> node : m_function_ordered_ops.at(current_function))
{
const op::Constant* c = dynamic_cast<ngraph::op::Constant*>(node.get());
if (c)
{
shared_ptr<descriptor::TensorView> tv =
node->get_outputs()[0].get_tensor_view();
m_writer << tv->get_tensor().get_name() << " = reinterpret_cast<"
<< tv->get_tensor().get_element_type().c_type_string()
<< "*>(runtime::gpu::invoke_memory_primitive(ctx, "
<< tv->get_tensor().get_name() << "_idx));\n";
}
}
}
m_writer << "is_constant_mem_ptr_null = false;\n";
}
m_writer.block_end();
}
m_writer.block_end();
}
void runtime::gpu::GPU_ExternalFunction::emit_function_declarations()
{
m_writer << "// Declare all functions\n";
for (shared_ptr<Function> f : m_pass_manager.get_state().get_functions())
{
m_writer << "extern \"C\" void " << f->get_name() << "(void** inputs, void** outputs, "
<< "gpu::GPURuntimeContext* ctx);\n";
}
writer << "\n";
m_writer << "\n";
}
void runtime::gpu::GPU_ExternalFunction::collect_unique_functions()
{
// This loop creates a collection of functions that are called more than once
// and emits them as globally callable functions.
// Ops implement the is_functionally_identical method
unordered_map<string, string> match_function_map;
unordered_map<const Node*, string> node_function_map;
for (shared_ptr<Function> current_function : pass_manager.get_state().get_functions())
for (shared_ptr<Function> current_function : m_pass_manager.get_state().get_functions())
{
list<shared_ptr<Node>> tmp = current_function->get_ordered_ops();
list<shared_ptr<Node>> tmp = m_function_ordered_ops.at(current_function);
if (tmp.size() < 2)
{
// Since we are comparing ops there must be at least two ops to proceed.
@@ -481,80 +483,39 @@ using namespace std;
match_function_name = "func_" + node.get_name();
emitted_function.replace(offset, 5, match_function_name);
match_function_map.insert({match_function, match_function_name});
writer << emitted_function << "\n";
}
node_function_map.insert({&node, match_function_name});
}
m_writer << emitted_function << "\n";
}
for (shared_ptr<Function> current_function : pass_manager.get_state().get_functions())
{
set<string> output_names;
for (shared_ptr<Node> op : current_function->get_results())
{
shared_ptr<descriptor::TensorView> tv = op->get_output_tensor_view();
output_names.insert(tv->get_tensor().get_name());
}
set<descriptor::TensorView*> constants;
for (shared_ptr<Node> node : current_function->get_ordered_ops())
{
if (dynamic_cast<ngraph::op::Constant*>(node.get()))
{
shared_ptr<descriptor::TensorView> tv = node->get_outputs()[0].get_tensor_view();
constants.insert(tv.get());
}
}
writer << "extern \"C\" void " << current_function->get_name();
writer << "(void** inputs, void** outputs, "
<< "gpu::GPURuntimeContext* ctx)\n";
writer << "{\n";
writer.indent++;
for (shared_ptr<Node> node : current_function->get_ordered_ops())
{
const op::Constant* c = dynamic_cast<op::Constant*>(node.get());
if (c)
{
shared_ptr<descriptor::TensorView> tv = node->get_outputs()[0].get_tensor_view();
writer << "if(" << tv->get_tensor().get_name() << " == NULL)\n";
writer << "{\n";
writer.indent++;
writer << tv->get_tensor().get_name() << " = ("
<< tv->get_tensor().get_element_type().c_type_string()
<< " *) runtime::gpu::create_gpu_buffer(" << tv->get_tensor().size()
<< ");\n";
writer << "runtime::gpu::cuda_memcpyHtD(" << tv->get_tensor().get_name() << ", "
<< tv->get_tensor().get_name() << "_cpu, " << tv->get_tensor().size()
<< ");\n";
writer.indent--;
writer << "}\n";
m_node_function_map.insert({&node, match_function_name});
}
}
}
bool temporaries_used = false;
void runtime::gpu::GPU_ExternalFunction::emit_temp_mem_pool_allocation(
shared_ptr<Function> current_function)
{
m_temporaries_used = false;
size_t worst_case_tmp_size = 0;
for (shared_ptr<Node> node : current_function->get_ordered_ops())
for (shared_ptr<Node> node : m_function_ordered_ops.at(current_function))
{
if (node->liveness_new_list.size() > 0)
{
temporaries_used = true;
m_temporaries_used = true;
for (descriptor::Tensor* tensor : node->liveness_new_list)
{
worst_case_tmp_size += tensor->size();
}
}
}
if (temporaries_used)
if (m_temporaries_used)
{
size_t temp_pool_size = current_function->get_temporary_pool_size();
writer << "// Allocate the memory pool\n";
m_writer << "// Allocate the memory pool\n";
// TODO memory pool malloc.
writer << "void* pool_base_ptr = ngraph::runtime::gpu::create_gpu_buffer("
m_writer << "void* pool_base_ptr = ngraph::runtime::gpu::create_gpu_buffer("
<< temp_pool_size << ");\n";
// Add temporaries to the variable name map
for (shared_ptr<Node> node : current_function->get_ordered_ops())
for (shared_ptr<Node> node : m_function_ordered_ops.at(current_function))
{
for (descriptor::Tensor* tensor : node->liveness_new_list)
{
@@ -565,6 +526,46 @@ using namespace std;
}
}
}
}
void runtime::gpu::GPU_ExternalFunction::emit_temp_mem_pool_release()
{
if (m_temporaries_used)
{
m_writer << "ngraph::runtime::gpu::free_gpu_buffer(pool_base_ptr);\n";
}
}
void runtime::gpu::GPU_ExternalFunction::emit_functions()
{
for (shared_ptr<Function> current_function : m_pass_manager.get_state().get_functions())
{
set<string> output_names;
for (shared_ptr<Node> op : current_function->get_results())
{
shared_ptr<descriptor::TensorView> tv = op->get_output_tensor_view();
output_names.insert(tv->get_tensor().get_name());
}
set<descriptor::TensorView*> constants;
for (shared_ptr<Node> node : m_function_ordered_ops.at(current_function))
{
if (dynamic_cast<ngraph::op::Constant*>(node.get()))
{
shared_ptr<descriptor::TensorView> tv = node->get_outputs()[0].get_tensor_view();
constants.insert(tv.get());
}
}
m_writer << "extern \"C\" void " << current_function->get_name();
m_writer << "(void** inputs, void** outputs, "
<< "gpu::GPURuntimeContext* ctx)\n";
m_writer.block_begin();
{
// set constant pointers during the first run
m_writer << "invoke_constant_mem_ptr(ctx);\n";
// allocate the temp memory pool
emit_temp_mem_pool_allocation(current_function);
// Add inputs to the variable name map
size_t arg_index = 0;
@@ -582,74 +583,37 @@ using namespace std;
}
}
// create output alias map
size_t output_index = 0;
unordered_map<descriptor::TensorView*, vector<size_t>> output_alias_map;
vector<size_t> aliases;
for (size_t i = 0; i < current_function->get_output_size(); ++i)
{
shared_ptr<Node> op = current_function->get_output_op(i);
shared_ptr<descriptor::TensorView> otv = op->get_output_tensor_view();
vector<size_t>& al = output_alias_map[otv.get()];
al.push_back(output_index);
if (al.size() > 1)
{
aliases.push_back(output_index);
}
output_index++;
}
// Add outputs to the variable name map
output_index = 0;
for (size_t i = 0; i < current_function->get_output_size(); ++i)
{
shared_ptr<Node> op = current_function->get_output_op(i);
shared_ptr<descriptor::TensorView> tv = op->get_output_tensor_view();
const element::Type& et = tv->get_tensor_view_type()->get_element_type();
bool parameter_as_output = false;
for (shared_ptr<ngraph::op::Parameter> param : current_function->get_parameters())
{
for (const descriptor::Output& pout : param->get_outputs())
{
shared_ptr<descriptor::TensorView> ptv = pout.get_tensor_view();
if (tv == ptv)
{
parameter_as_output = true;
writer << "ngraph::runtime::gpu::cuda_memcpyDtD(reinterpret_cast<"
<< et.c_type_string() << "*>(outputs[" << output_index << "]), "
<< m_variable_name_map[ptv->get_tensor().get_name()] << ", "
<< ptv->get_tensor().size() << ");\n";
break;
}
}
}
if (!parameter_as_output && !contains(aliases, output_index))
{
if (contains(constants, tv.get()))
{
writer << "ngraph::runtime::gpu::cuda_memcpyHtD(outputs[" << output_index
<< "], " << tv->get_tensor().get_name() << ", "
<< tv->get_tensor().size() << ");\n";
}
else
{
string type = et.c_type_string();
string type = tv->get_tensor_view_type()->get_element_type().c_type_string();
stringstream ss;
ss << "((" << type << "*)(outputs[" << output_index << "]))";
ss << "((" << type << "*)(outputs[" << i << "]))";
m_variable_name_map[tv->get_tensor().get_name()] = ss.str();
// it should be safe to assign both descriptors to one output
// since needs_copy == false makes `op::Result` a no-op
auto res = dynamic_pointer_cast<ngraph::op::Result>(op);
if (!res->needs_copy())
{
shared_ptr<descriptor::TensorView> itv =
res->get_inputs().at(0).get_output().get_tensor_view();
m_variable_name_map[itv->get_tensor().get_name()] = ss.str();
}
}
output_index++;
}
for (shared_ptr<Node> node : current_function->get_ordered_ops())
for (shared_ptr<Node> node : m_function_ordered_ops.at(current_function))
{
auto& n = *node; // Work around a compiler warning (*node inside typeid may have effects
auto& n =
*node; // Work around a compiler warning (*node inside typeid may have effects
// with shared pointers, which is fine here but clang doesn't like it.)
auto handler = dispatcher.find(type_index(typeid(n)));
if (handler == dispatcher.end())
{
throw ngraph_error("Unhandled op during code generation : " + node->description());
throw ngraph_error("Unhandled op during code generation : " +
node->description());
}
vector<GPU_TensorViewWrapper> in;
vector<string> node_input_names;
@@ -658,46 +622,38 @@ using namespace std;
{
const descriptor::Output& output = input.get_output();
shared_ptr<descriptor::TensorView> tv = output.get_tensor_view();
in.push_back(
GPU_TensorViewWrapper(tv, m_variable_name_map[tv->get_tensor().get_name()]));
in.push_back(GPU_TensorViewWrapper(
tv, m_variable_name_map[tv->get_tensor().get_name()]));
node_input_names.emplace_back(tv->get_tensor().get_name());
}
vector<GPU_TensorViewWrapper> out;
for (const descriptor::Output& output : node->get_outputs())
{
shared_ptr<descriptor::TensorView> tv = output.get_tensor_view();
out.push_back(
GPU_TensorViewWrapper(tv, m_variable_name_map[tv->get_tensor().get_name()]));
out.push_back(GPU_TensorViewWrapper(
tv, m_variable_name_map[tv->get_tensor().get_name()]));
node_output_names.emplace_back(tv->get_tensor().get_name());
}
// Emit function description comment
if (!node->is_parameter() && !node->is_constant())
{
writer << "\n// " << node->get_name() << "(";
m_writer << "\n// " << node->get_name() << "(";
vector<string> parameter_nodes = node_input_names;
parameter_nodes.insert(
parameter_nodes.end(), node_output_names.begin(), node_output_names.end());
writer << join(parameter_nodes);
writer << ")\n";
}
// Emit operation prologue
if (!node->is_parameter() && !node->is_constant())
{
if (m_emit_timing)
{
emit_debug_function_entry(writer, node.get(), in, out);
}
m_writer << join(parameter_nodes);
m_writer << ")\n";
emit_debug_function_entry(node.get());
}
// Emit operation body
string func_name;
func_name = node_function_map[node.get()];
func_name = m_node_function_map[node.get()];
if (func_name.empty())
{
//throw runtime_error("No matching function found for '" + node->get_name() + "'");
handler->second(this, writer, node.get(), in, out);
handler->second(this, m_writer, node.get(), in, out);
}
else
{
@@ -711,52 +667,81 @@ using namespace std;
names.push_back(tv.get_name());
}
names.push_back("ctx");
writer << func_name << "(" << join(names) << ");\n";
m_writer << func_name << "(" << join(names) << ");\n";
}
// Emit operation epilogue
if (!node->is_parameter() && !node->is_constant())
{
if (m_emit_timing)
{
emit_debug_function_exit(writer, node.get(), in, out);
emit_debug_function_exit(node.get());
}
}
emit_temp_mem_pool_release();
}
m_writer.block_end(); // End generated function
}
if (temporaries_used)
}
void runtime::gpu::GPU_ExternalFunction::store_emitted_functions(const string& code)
{
// TODO: Cleanup and make this a utility function
string filename = file_util::path_join(s_output_dir, m_function_name + "_codegen.cpp");
ofstream out(filename);
out << code;
out.close();
}
void runtime::gpu::GPU_ExternalFunction::compile()
{
if (m_is_compiled)
{
writer << "ngraph::runtime::gpu::free_gpu_buffer(pool_base_ptr);\n";
return;
}
writer.indent--;
// End generated function
writer += "}\n\n";
m_primitive_emitter.reset(new GPUPrimitiveEmitter());
m_function_name = m_function->get_name();
string dump_filename = file_util::path_join(s_output_dir, m_function_name + "_ops.txt");
// For now, just make everyone row-major.
m_pass_manager.register_pass<pass::ResultCopyElimination>();
m_pass_manager.register_pass<pass::AssignLayout<descriptor::layout::DenseTensorViewLayout>>();
m_pass_manager.register_pass<pass::Liveness>();
m_pass_manager.register_pass<pass::MemoryLayout>(64);
m_pass_manager.register_pass<pass::DumpSorted>(dump_filename);
m_pass_manager.run_passes(m_function);
for (shared_ptr<Function> current_function : m_pass_manager.get_state().get_functions())
{
m_function_ordered_ops.insert({current_function, current_function->get_ordered_ops()});
}
emit_header();
emit_timer_functions();
emit_constant_declarations();
emit_function_declarations();
collect_unique_functions();
emit_functions();
// allocate device buffers for primitive arguments and workspace
m_primitive_emitter->allocate_primitive_memory();
// TODO: Cleanup and make this a utility function
string filename = file_util::path_join(s_output_dir, function_name + "_codegen.cpp");
ofstream out(filename);
string code = writer.get_code();
out << code;
out.close();
string code = m_writer.get_code();
store_emitted_functions(code);
m_compiler.reset(new codegen::Compiler());
m_execution_engine.reset(new codegen::ExecutionEngine());
m_compiler->set_precompiled_header_source(pch_header_source);
m_compiler->set_precompiled_header_source(m_pch_header_source);
auto codegen_module = m_compiler->compile(code);
if (codegen_module == nullptr)
{
throw runtime_error("Function failed to compile to bitcode");
}
m_execution_engine->add_module(codegen_module);
m_execution_engine->finalize();
m_compiled_function = m_execution_engine->find_function<EntryPoint_t>(function_name);
m_compiled_function = m_execution_engine->find_function<EntryPoint_t>(m_function_name);
if (!m_compiled_function)
{
throw runtime_error("Function failed to compile");
@@ -769,36 +754,6 @@ using namespace std;
}
}
void runtime::gpu::GPU_ExternalFunction::handle_output_alias(
codegen::CodeWriter& writer,
const Node& node,
const unordered_map<descriptor::TensorView*, vector<size_t>>& output_alias_map)
{
for (const descriptor::Output& output : node.get_outputs())
{
shared_ptr<descriptor::TensorView> otv = output.get_tensor_view();
auto it = output_alias_map.find(otv.get());
if (it != output_alias_map.end())
{
const vector<size_t>& outputs = it->second;
if (outputs.size() > 1)
{
writer << "{ // handle output alias for previous op\n";
writer.indent++;
for (size_t i = 1; i < outputs.size(); i++)
{
writer << "ngraph::runtime::gpu::cuda_memcpyDtD(static_cast<void*>("
"outputs["
<< outputs[i] << "]), static_cast<void*>(outputs[" << outputs[0]
<< "]), " << otv->get_tensor().size() << ");\n";
}
writer.indent--;
writer << "}\n";
}
}
}
}
shared_ptr<ngraph::runtime::gpu::GPU_CallFrame>
runtime::gpu::GPU_ExternalFunction::make_call_frame()
{
@@ -810,35 +765,27 @@ shared_ptr<ngraph::runtime::gpu::GPU_CallFrame>
return make_shared<GPU_CallFrame>(shared_from_this(), m_compiled_function);
}
void runtime::gpu::GPU_ExternalFunction::emit_debug_function_entry(
codegen::CodeWriter& writer,
Node* node,
const std::vector<GPU_TensorViewWrapper>& in,
const std::vector<GPU_TensorViewWrapper>& out)
void runtime::gpu::GPU_ExternalFunction::emit_debug_function_entry(Node* node)
{
writer << "timers[" << m_name_index_map[node->get_name()] << "].start();\n";
if (m_emit_timing)
{
m_writer << "timers[" << m_name_index_map[node->get_name()] << "].start();\n";
}
}
void runtime::gpu::GPU_ExternalFunction::emit_debug_function_exit(
codegen::CodeWriter& writer,
Node* node,
const std::vector<GPU_TensorViewWrapper>& in,
const std::vector<GPU_TensorViewWrapper>& out)
void runtime::gpu::GPU_ExternalFunction::emit_debug_function_exit(Node* node)
{
writer << "timers[" << m_name_index_map[node->get_name()] << "].stop();\n";
if (m_emit_timing)
{
m_writer << "timers[" << m_name_index_map[node->get_name()] << "].stop();\n";
}
}
std::unique_ptr<runtime::gpu::GPURuntimeContext>& runtime::gpu::GPU_ExternalFunction::ctx()
unique_ptr<runtime::gpu::GPURuntimeContext>& runtime::gpu::GPU_ExternalFunction::ctx()
{
return m_ctx;
}
bool runtime::gpu::GPU_ExternalFunction::is_functionally_identical(
const Node& n1, const Node& n2, const unordered_map<const Node*, string>& node_cache) const
{
return node_cache.at(&n1) == node_cache.at(&n2);
}
string runtime::gpu::GPU_ExternalFunction::emit_op_as_function(const Node& node,
const string& function_name)
{
@@ -26,6 +26,12 @@
#include "ngraph/codegen/compiler.hpp"
#include "ngraph/codegen/execution_engine.hpp"
#include "ngraph/function.hpp"
#include "ngraph/pass/assign_layout.hpp"
#include "ngraph/pass/dump_sorted.hpp"
#include "ngraph/pass/liveness.hpp"
#include "ngraph/pass/manager.hpp"
#include "ngraph/pass/memory_layout.hpp"
#include "ngraph/pass/result_copy_elimination.hpp"
#include "ngraph/runtime/gpu/gpu_call_frame.hpp"
#include "ngraph/runtime/gpu/gpu_primitive_emitter.hpp"
#include "ngraph/runtime/gpu/gpu_tensor_view_wrapper.hpp"
@@ -58,6 +64,7 @@ namespace ngraph
GPU_ExternalFunction(const std::shared_ptr<ngraph::Function>& function,
bool release_function = true);
~GPU_ExternalFunction();
std::shared_ptr<ngraph::runtime::gpu::GPU_CallFrame> make_call_frame();
std::unique_ptr<runtime::gpu::GPURuntimeContext>& ctx();
const std::unique_ptr<GPUPrimitiveEmitter>& get_primitive_emitter() const
@@ -71,39 +78,46 @@ namespace ngraph
EntryPoint m_compiled_function;
private:
void emit_debug_function_entry(codegen::CodeWriter& writer,
Node* node,
const std::vector<GPU_TensorViewWrapper>& in,
const std::vector<GPU_TensorViewWrapper>& out);
void emit_debug_function_exit(codegen::CodeWriter& writer,
Node* node,
const std::vector<GPU_TensorViewWrapper>& in,
const std::vector<GPU_TensorViewWrapper>& out);
void handle_output_alias(
codegen::CodeWriter& writer,
const Node&,
const std::unordered_map<descriptor::TensorView*, std::vector<size_t>>&);
void collect_unique_functions();
void emit_header();
void emit_timer_functions();
void emit_constant_declarations();
void emit_function_declarations();
void emit_functions();
void emit_debug_function_entry(Node* node);
void emit_debug_function_exit(Node* node);
void emit_temp_mem_pool_allocation(std::shared_ptr<Function> current_function);
void emit_temp_mem_pool_release();
void release_function() { m_function = nullptr; }
void store_emitted_functions(const std::string& code);
std::string emit_op_as_function(const Node& node, const std::string& function_name);
std::string strip_comments(const std::string& s) const;
bool is_functionally_identical(
const Node& n1,
const Node& n2,
const std::unordered_map<const Node*, std::string>& node_cache) const;
codegen::CodeWriter m_writer;
pass::Manager m_pass_manager;
std::unique_ptr<codegen::Compiler> m_compiler;
std::unique_ptr<codegen::ExecutionEngine> m_execution_engine;
bool m_emit_timing;
std::unordered_map<std::string, std::string> m_variable_name_map;
std::map<std::string, size_t> m_name_index_map;
std::unique_ptr<GPUPrimitiveEmitter> m_primitive_emitter;
std::unique_ptr<GPURuntimeContext> m_ctx;
std::shared_ptr<ngraph::Function> m_function;
bool m_release_function;
std::map<std::string, size_t> m_name_index_map;
std::unordered_map<std::string, std::string> m_variable_name_map;
std::unordered_map<const Node*, std::string> m_node_function_map;
std::unordered_map<std::shared_ptr<Function>, std::list<std::shared_ptr<Node>>>
m_function_ordered_ops;
bool m_emit_timing;
bool m_is_compiled;
bool m_release_function;
bool m_temporaries_used;
std::string m_function_name;
std::string m_pch_header_source;
cublasHandle_t m_cublas_handle;
cudnnHandle_t m_cudnn_handle;
std::unique_ptr<GPUPrimitiveEmitter> m_primitive_emitter;
std::unique_ptr<GPURuntimeContext> m_ctx;
};
}
}