Unverified Commit c5549682 authored by Tristan Webb, committed by GitHub

Drwebb/gpu external function (#367)

* Initial GPU_ExternalFunction implementation

Other changes:

- Add the GPU runtime to the same cmake block as the GPU, and include CUDA headers if the GPU is enabled
- Initial passing (a+b)*c test
- Properly link CUDA libraries
- Simple GPUTensorView implementation
- Initial GPU emitter
- GPU codegen: initial function generation, no kernels yet
- Rename GPU emitter and tensor_view_wrapper to match the naming convention

* GPU external function based on BASE

* Fix stray base -> gpu

* TensorViewWrapper -> GPU_TensorViewWrapper

* Copy over emitter from base transformer

* Fix for naming dense layout

* Copy kernel emitters from base -> gpu and strip out kernel_utils

* Add aliases to GPU_TensorViewWrappers

* More fixes for naming descriptor::TensorViews

* Move in call_frame implementation from base -> gpu

* apply code format

* GPU codegen running (A+B)*C

- GPU emitters
- GPU context setup and cuda_module kernels
- Remove GPU_CF perf counters
- Use GPU kernels in the external function
- Add a GPU 1d dot test

Review changes:
* Remove CPU-specific kernel-emitting method bodies

* Use copy_data from test/util.cpp, uncomment compileTest

* Use the test_utils copy_data function

* Grab the function name from the pass manager for the function definition, clean up indentation
parent e433e55a
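Note: the (a+b)*c test itself is not part of this excerpt. As a rough sketch of how the new backend is exercised (the Manager/compile/make_call_frame/call sequence mirrors the code in this diff; copy_data comes from test/util.cpp as mentioned above, while read_vector, element::f32 and the parameter-list type are assumptions based on the CPU backend tests of the same period):

// Hypothetical sketch of the (a+b)*c GPU test; helper names are assumptions.
#include "gtest/gtest.h"
#include "ngraph/ngraph.hpp"
#include "util/test_tools.hpp" // assumed home of copy_data / read_vector

using namespace std;
using namespace ngraph;

TEST(gpu, abc)
{
    auto shape = Shape{2, 2};
    auto A = make_shared<op::Parameter>(element::f32, shape);
    auto B = make_shared<op::Parameter>(element::f32, shape);
    auto C = make_shared<op::Parameter>(element::f32, shape);
    auto f = make_shared<Function>(make_shared<op::Multiply>(make_shared<op::Add>(A, B), C),
                                   op::Parameters{A, B, C});

    auto manager = runtime::Manager::get("GPU");
    auto external = manager->compile(f);
    auto backend = manager->allocate_backend();
    auto cf = backend->make_call_frame(external);

    // Tensor views are host-resident; the generated code copies them to and
    // from device buffers (see gpu_external_function.cpp below).
    auto a = backend->make_primary_tensor_view(element::f32, shape);
    auto b = backend->make_primary_tensor_view(element::f32, shape);
    auto c = backend->make_primary_tensor_view(element::f32, shape);
    auto result = backend->make_primary_tensor_view(element::f32, shape);

    copy_data(a, vector<float>{1, 2, 3, 4});
    copy_data(b, vector<float>{5, 6, 7, 8});
    copy_data(c, vector<float>{9, 10, 11, 12});

    cf->call({a, b, c}, {result});
    EXPECT_EQ((vector<float>{54, 80, 110, 144}), read_vector<float>(result));
}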
@@ -190,9 +190,12 @@ endif()
set(SRC ${SRC}
    runtime/gpu/gpu_call_frame.cpp
    runtime/gpu/gpu_backend.cpp
-   runtime/gpu/gpu_manager.cpp
+   runtime/gpu/gpu_emitter.cpp
    runtime/gpu/gpu_external_function.cpp
+   runtime/gpu/gpu_kernel_emitters.cpp
+   runtime/gpu/gpu_manager.cpp
    runtime/gpu/gpu_tensor_view.cpp
+   runtime/gpu/gpu_tensor_view_wrapper.cpp
)
set_property(SOURCE codegen/compiler.cpp APPEND_STRING PROPERTY COMPILE_DEFINITIONS
    "CUDA_HEADER_PATHS=\"${CUDA_INCLUDE_DIRS}\";")
@@ -265,7 +268,7 @@ if(NGRAPH_CPU_ENABLE)
endif()
if(NGRAPH_GPU_ENABLE AND CUDA_LIBRARIES)
-   target_link_libraries(ngraph PRIVATE ${CUDA_LIBRARIES} ${CUDNN_LIBRARIES})
+   target_link_libraries(ngraph PRIVATE cuda)
endif()
# Argon
...
@@ -13,6 +13,7 @@
// ----------------------------------------------------------------------------
#include "ngraph/runtime/gpu/gpu_backend.hpp"
+#include "ngraph/runtime/cpu/cpu_tensor_view.hpp"
#include "ngraph/runtime/external_function.hpp"
#include "ngraph/runtime/gpu/gpu_tensor_view.hpp"
@@ -29,6 +30,6 @@ std::shared_ptr<ngraph::runtime::TensorView>
runtime::gpu::GPU_Backend::make_primary_tensor_view(const ngraph::element::Type& element_type,
                                                     const Shape& shape)
{
-   auto rc = make_shared<runtime::gpu::GPU_TensorView>(element_type, shape);
+   auto rc = make_shared<runtime::cpu::CPU_TensorView>(element_type, shape);
    return dynamic_pointer_cast<runtime::TensorView>(rc);
}
@@ -12,25 +12,66 @@
// see the license for the specific language governing permissions and
// ----------------------------------------------------------------------------
+#include <cstdlib>
+#include <fstream>
+#include "ngraph/runtime/cpu/cpu_tensor_view.hpp"
#include "ngraph/runtime/gpu/gpu_call_frame.hpp"
+#include "ngraph/runtime/gpu/gpu_external_function.hpp"
+#include "ngraph/runtime/gpu/gpu_tensor_view.hpp"
using namespace std;
-using namespace ngraph::runtime::gpu;
+using namespace ngraph;
-GPU_CallFrame::GPU_CallFrame(shared_ptr<GPU_ExternalFunction> external_function,
-                             shared_ptr<Function> func)
+runtime::gpu::GPU_CallFrame::GPU_CallFrame(std::shared_ptr<GPU_ExternalFunction> external_function,
+                                           EntryPoint compiled_function)
    : m_external_function(external_function)
-   , m_function(func)
+   , m_compiled_function(compiled_function)
{
}
-void GPU_CallFrame::call(const vector<shared_ptr<Value>>& input_tvs,
-                         const vector<shared_ptr<Value>>& output_tvs)
+void runtime::gpu::GPU_CallFrame::tensor_call(
+    const std::vector<std::shared_ptr<ngraph::runtime::TensorView>>& input_tvs,
+    const std::vector<std::shared_ptr<ngraph::runtime::TensorView>>& output_tvs)
{
+   // Host tensors
+   vector<void*> inputs;
+   vector<void*> outputs;
+   for (size_t i = 0; i < input_tvs.size(); i++)
+   {
+       shared_ptr<runtime::cpu::CPU_TensorView> tv =
+           static_pointer_cast<runtime::cpu::CPU_TensorView>(input_tvs[i]);
+       inputs.push_back(tv->get_data_ptr());
+   }
+   for (size_t i = 0; i < output_tvs.size(); i++)
+   {
+       shared_ptr<runtime::cpu::CPU_TensorView> tv =
+           static_pointer_cast<runtime::cpu::CPU_TensorView>(output_tvs[i]);
+       outputs.push_back(tv->get_data_ptr());
+   }
+   // Invoke compiled computation
+   m_compiled_function(inputs.data(), outputs.data());
}
-void GPU_CallFrame::tensor_call(const std::vector<std::shared_ptr<TensorView>>& inputs,
-                                const std::vector<std::shared_ptr<TensorView>>& outputs)
+void runtime::gpu::GPU_CallFrame::call(
+    const std::vector<std::shared_ptr<runtime::TensorView>>& arguments,
+    const std::vector<std::shared_ptr<runtime::TensorView>>& results)
{
+   // TODO: Check types of args and result
+   vector<shared_ptr<runtime::TensorView>> inputs;
+   for (shared_ptr<runtime::TensorView> argument : arguments)
+   {
+       argument->collect_tensor_views(inputs, argument);
+   }
+   vector<shared_ptr<runtime::TensorView>> outputs;
+   for (shared_ptr<runtime::TensorView> result : results)
+   {
+       result->collect_tensor_views(outputs, result);
+   }
+   tensor_call(inputs, outputs);
}
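The EntryPoint type stored in m_compiled_function above is not defined in this hunk; judging from how it is invoked here (raw arrays of host pointers) and how the generated function is declared later (extern "C" void name(void** inputs, void** outputs)), it is presumably an alias along these lines (a sketch, not the actual typedef):

// Assumed reconstruction of the EntryPoint alias; the real definition lives
// in a header that is not part of this diff.
#include <functional>

using EntryPoint_t = void(void** inputs, void** outputs);
using EntryPoint = std::function<EntryPoint_t>;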
@@ -26,6 +26,8 @@ namespace ngraph
{
    namespace runtime
    {
+       class PrimaryTensorView;
        namespace gpu
        {
            class GPU_CallFrame;
@@ -40,22 +42,23 @@
            {
            public:
                GPU_CallFrame(std::shared_ptr<GPU_ExternalFunction> external_function,
-                             std::shared_ptr<Function> func);
+                             EntryPoint compiled_function);
                /// @brief Invoke the function with values matching the signature of the function.
                ///
                /// Tuples will be expanded into their tensor views to build the call frame.
-               void call(const std::vector<std::shared_ptr<ngraph::runtime::Value>>& inputs,
-                         const std::vector<std::shared_ptr<ngraph::runtime::Value>>& outputs);
+               void
+                   call(const std::vector<std::shared_ptr<runtime::TensorView>>& inputs,
+                        const std::vector<std::shared_ptr<runtime::TensorView>>& outputs) override;
                /// @brief Invoke the function with tuples pre-expanded to their underlying
                /// tensor views.
                void tensor_call(const std::vector<std::shared_ptr<TensorView>>& inputs,
-                                const std::vector<std::shared_ptr<TensorView>>& outputs);
+                                const std::vector<std::shared_ptr<TensorView>>& outputs) override;
            protected:
                std::shared_ptr<GPU_ExternalFunction> m_external_function;
-               std::shared_ptr<Function> m_function;
+               EntryPoint m_compiled_function;
            };
        }
    }
...
// ----------------------------------------------------------------------------
// Copyright 2017 Nervana Systems Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// ----------------------------------------------------------------------------
#include <algorithm>
#include <cmath>
#include <iostream>
#include <string>
#include <typeindex>
#include <unordered_map>
#include <vector>
#include "ngraph/node.hpp"
#include "ngraph/ops/broadcast.hpp"
#include "ngraph/ops/concatenate.hpp"
#include "ngraph/ops/constant.hpp"
#include "ngraph/ops/convolution.hpp"
#include "ngraph/ops/dot.hpp"
#include "ngraph/ops/function_call.hpp"
#include "ngraph/ops/get_output_element.hpp"
#include "ngraph/ops/max_pool.hpp"
#include "ngraph/ops/one_hot.hpp"
#include "ngraph/ops/reduce.hpp"
#include "ngraph/ops/replace_slice.hpp"
#include "ngraph/ops/reshape.hpp"
#include "ngraph/ops/reverse.hpp"
#include "ngraph/ops/slice.hpp"
#include "ngraph/ops/sum.hpp"
#include "ngraph/runtime/gpu/gpu_emitter.hpp"
#include "ngraph/runtime/gpu/gpu_kernel_emitters.hpp"
#include "ngraph/util.hpp"
using namespace std;
using namespace ngraph;
void runtime::gpu::GPU_Emitter::EmitNop(const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
}
void runtime::gpu::GPU_Emitter::EmitAdd(const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
}
void runtime::gpu::GPU_Emitter::EmitDot(const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
}
void runtime::gpu::GPU_Emitter::EmitDivide(const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
}
void runtime::gpu::GPU_Emitter::EmitEqual(const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
}
void runtime::gpu::GPU_Emitter::EmitGreater(const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
}
void runtime::gpu::GPU_Emitter::EmitGreaterEq(
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
}
void runtime::gpu::GPU_Emitter::EmitLess(const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
}
void runtime::gpu::GPU_Emitter::EmitLessEq(const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
}
void runtime::gpu::GPU_Emitter::EmitLog(const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
}
void runtime::gpu::GPU_Emitter::EmitMaximum(const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
}
void runtime::gpu::GPU_Emitter::EmitMinimum(const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
}
void runtime::gpu::GPU_Emitter::EmitNegative(
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
}
void runtime::gpu::GPU_Emitter::EmitNotEqual(
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
}
void runtime::gpu::GPU_Emitter::EmitSelect(const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
}
void runtime::gpu::GPU_Emitter::EmitSubtract(
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
}
void runtime::gpu::GPU_Emitter::EmitBroadcast(
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
}
void runtime::gpu::GPU_Emitter::EmitConvert(const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
}
void runtime::gpu::GPU_Emitter::EmitConstant(
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
}
void runtime::gpu::GPU_Emitter::EmitReshape(const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
}
void runtime::gpu::GPU_Emitter::EmitFunctionCall(
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
}
void runtime::gpu::GPU_Emitter::EmitReduce(const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
}
void runtime::gpu::GPU_Emitter::EmitSign(const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
}
void runtime::gpu::GPU_Emitter::EmitSlice(const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
}
void runtime::gpu::GPU_Emitter::EmitSum(const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
}
void runtime::gpu::GPU_Emitter::EmitExp(const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
}
void runtime::gpu::GPU_Emitter::EmitSin(const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
}
void runtime::gpu::GPU_Emitter::EmitSinh(const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
}
void runtime::gpu::GPU_Emitter::EmitCos(const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
}
void runtime::gpu::GPU_Emitter::EmitCosh(const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
}
void runtime::gpu::GPU_Emitter::EmitTan(const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
}
void runtime::gpu::GPU_Emitter::EmitTanh(const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
}
void runtime::gpu::GPU_Emitter::EmitAsin(const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
}
void runtime::gpu::GPU_Emitter::EmitAcos(const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
}
void runtime::gpu::GPU_Emitter::EmitAtan(const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
}
void runtime::gpu::GPU_Emitter::EmitPower(const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
}
void runtime::gpu::GPU_Emitter::EmitReplaceSlice(
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
}
void runtime::gpu::GPU_Emitter::EmitOneHot(const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
}
void runtime::gpu::GPU_Emitter::EmitCeiling(const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
}
void runtime::gpu::GPU_Emitter::EmitFloor(const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
}
void runtime::gpu::GPU_Emitter::EmitSqrt(const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
}
void runtime::gpu::GPU_Emitter::EmitConvolution(
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
}
void runtime::gpu::GPU_Emitter::EmitNot(const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
}
void runtime::gpu::GPU_Emitter::EmitMaxPool(const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
}
void runtime::gpu::GPU_Emitter::EmitReverse(const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
}
//------------------------------------------------------------------------------------------------
// Utility methods
//------------------------------------------------------------------------------------------------
void runtime::gpu::GPU_Emitter::generate_call(
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out,
shared_ptr<Function> function)
{
vector<string> input_names;
vector<string> output_names;
for (const runtime::gpu::GPU_TensorViewWrapper& input : args)
{
input_names.push_back(input.get_name());
}
for (const runtime::gpu::GPU_TensorViewWrapper& output : out)
{
output_names.push_back(output.get_name());
}
m_out << "void* args[] =\n{";
m_out.indent++;
m_out << "\n" << join(input_names, ",\n");
m_out.indent--;
m_out << "\n};\n";
m_out << "void* out[] =\n{";
m_out.indent++;
m_out << "\n" << join(output_names, ",\n");
m_out.indent--;
m_out << "\n};\n";
m_out << "\n";
m_out << function->get_name() << "(args, out);\n";
}
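For a FunctionCall with, say, two inputs named arg0 and arg1 and one output named out0, generate_call above writes a block of this shape into the generated source; the stand-alone program below illustrates the same calling convention with hypothetical names:

// Minimal illustration of the emitted calling convention: tensor pointers are
// packed into void* arrays and the callee is invoked as name(args, out).
#include <cstdio>

extern "C" void some_function(void** inputs, void** outputs)
{
    // Toy body: add the two float inputs into the single float output.
    float* a = static_cast<float*>(inputs[0]);
    float* b = static_cast<float*>(inputs[1]);
    float* r = static_cast<float*>(outputs[0]);
    *r = *a + *b;
}

int main()
{
    float x = 1.5f, y = 2.5f, result = 0.0f;
    float* arg0 = &x;
    float* arg1 = &y;
    float* out0 = &result;

    // This mirrors what generate_call emits for input names {arg0, arg1}
    // and output names {out0}.
    void* args[] =
    {
        arg0,
        arg1
    };
    void* out[] =
    {
        out0
    };

    some_function(args, out);
    std::printf("%g\n", result); // prints 4
}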
static string format_name(const string& name)
{
string rc;
if (!name.empty())
{
rc = " " + name;
}
return rc;
}
void runtime::gpu::GPU_Emitter::EmitAbs(const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
}
void runtime::gpu::GPU_Emitter::EmitConcat(const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
}
void runtime::gpu::GPU_Emitter::EmitMultiply(
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
}
// ----------------------------------------------------------------------------
// Copyright 2017 Nervana Systems Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// ----------------------------------------------------------------------------
#pragma once
#include <string>
#include <vector>
#include "ngraph/codegen/code_writer.hpp"
#include "ngraph/node.hpp"
#include "ngraph/runtime/gpu/gpu_external_function.hpp"
#include "ngraph/runtime/gpu/gpu_tensor_view_wrapper.hpp"
#define EMITTER_DECL(E) \
E(const ngraph::Node* n, \
const std::vector<ngraph::runtime::gpu::GPU_TensorViewWrapper>& args, \
const std::vector<ngraph::runtime::gpu::GPU_TensorViewWrapper>& out)
namespace ngraph
{
namespace runtime
{
namespace gpu
{
class GPU_Emitter
{
protected:
codegen::CodeWriter m_out;
bool m_use_ref_kernels;
public:
GPU_Emitter()
: m_out()
, m_use_ref_kernels(std::getenv("NGRAPH_GPU_USE_REF_KERNELS") != nullptr)
{
}
std::string get_code() { return m_out.get_code(); }
codegen::CodeWriter& get_code_writer() { return m_out; }
void EMITTER_DECL(EmitNop);
void EMITTER_DECL(EmitAdd);
void EMITTER_DECL(EmitDot);
void EMITTER_DECL(EmitMultiply);
void EMITTER_DECL(EmitGetOutputElement);
void EMITTER_DECL(EmitXLAGetTupleElement);
void EMITTER_DECL(EmitTuple);
void EMITTER_DECL(EmitAbs);
void EMITTER_DECL(EmitConcat);
void EMITTER_DECL(EmitDivide);
void EMITTER_DECL(EmitEqual);
void EMITTER_DECL(EmitGreater);
void EMITTER_DECL(EmitGreaterEq);
void EMITTER_DECL(EmitLess);
void EMITTER_DECL(EmitLessEq);
void EMITTER_DECL(EmitLog);
void EMITTER_DECL(EmitMaximum);
void EMITTER_DECL(EmitMinimum);
void EMITTER_DECL(EmitNegative);
void EMITTER_DECL(EmitNotEqual);
void EMITTER_DECL(EmitSelect);
void EMITTER_DECL(EmitSubtract);
void EMITTER_DECL(EmitBroadcast);
void EMITTER_DECL(EmitConvert);
void EMITTER_DECL(EmitConstant);
void EMITTER_DECL(EmitReshape);
void EMITTER_DECL(EmitFunctionCall);
void EMITTER_DECL(EmitReduce);
void EMITTER_DECL(EmitSign);
void EMITTER_DECL(EmitSlice);
void EMITTER_DECL(EmitSum);
void EMITTER_DECL(EmitExp);
void EMITTER_DECL(EmitSin);
void EMITTER_DECL(EmitSinh);
void EMITTER_DECL(EmitCos);
void EMITTER_DECL(EmitCosh);
void EMITTER_DECL(EmitTan);
void EMITTER_DECL(EmitTanh);
void EMITTER_DECL(EmitAsin);
void EMITTER_DECL(EmitAcos);
void EMITTER_DECL(EmitAtan);
void EMITTER_DECL(EmitPower);
void EMITTER_DECL(EmitReplaceSlice);
void EMITTER_DECL(EmitOneHot);
void EMITTER_DECL(EmitFloor);
void EMITTER_DECL(EmitCeiling);
void EMITTER_DECL(EmitSqrt);
void EMITTER_DECL(EmitConvolution);
void EMITTER_DECL(EmitNot);
void EMITTER_DECL(EmitMaxPool);
void EMITTER_DECL(EmitReverse);
private:
void generate_call(const std::vector<GPU_TensorViewWrapper>& args,
const std::vector<GPU_TensorViewWrapper>& out,
std::shared_ptr<Function> function);
std::string emit_vector(const GPU_TensorViewWrapper&, const std::string& name = "");
std::string emit_array1d(const GPU_TensorViewWrapper&,
const std::string& name = "");
std::string emit_matrix(const GPU_TensorViewWrapper&, const std::string& name = "");
};
}
}
}
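For reference, each declaration such as void EMITTER_DECL(EmitAdd); above expands to:

void EmitAdd(const ngraph::Node* n,
             const std::vector<ngraph::runtime::gpu::GPU_TensorViewWrapper>& args,
             const std::vector<ngraph::runtime::gpu::GPU_TensorViewWrapper>& out);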
@@ -12,35 +12,1012 @@
// See the License for the specific language governing permissions and
// ----------------------------------------------------------------------------
#include <cstdlib>
#include <fstream>
#include <memory>
#include <string>
#include <tuple>
#include <typeindex>
#include <typeinfo>
#include <unordered_map>
#include "ngraph/codegen/code_writer.hpp"
#include "ngraph/codegen/compiler.hpp"
#include "ngraph/codegen/execution_engine.hpp"
#include "ngraph/descriptor/input.hpp"
#include "ngraph/descriptor/layout/dense_tensor_view_layout.hpp"
#include "ngraph/descriptor/output.hpp"
#include "ngraph/descriptor/primary_tensor_view.hpp"
#include "ngraph/file_util.hpp"
#include "ngraph/function.hpp" #include "ngraph/function.hpp"
#include "ngraph/graph_util.hpp"
#include "ngraph/node.hpp"
#include "ngraph/ops/abs.hpp"
#include "ngraph/ops/acos.hpp"
#include "ngraph/ops/add.hpp"
#include "ngraph/ops/asin.hpp"
#include "ngraph/ops/atan.hpp"
#include "ngraph/ops/broadcast.hpp"
#include "ngraph/ops/ceiling.hpp"
#include "ngraph/ops/concatenate.hpp"
#include "ngraph/ops/constant.hpp"
#include "ngraph/ops/convert.hpp"
#include "ngraph/ops/convolution.hpp"
#include "ngraph/ops/cos.hpp"
#include "ngraph/ops/cosh.hpp"
#include "ngraph/ops/divide.hpp"
#include "ngraph/ops/dot.hpp"
#include "ngraph/ops/equal.hpp"
#include "ngraph/ops/exp.hpp"
#include "ngraph/ops/floor.hpp"
#include "ngraph/ops/function_call.hpp"
#include "ngraph/ops/greater.hpp"
#include "ngraph/ops/greater_eq.hpp"
#include "ngraph/ops/less.hpp"
#include "ngraph/ops/less_eq.hpp"
#include "ngraph/ops/log.hpp"
#include "ngraph/ops/max_pool.hpp"
#include "ngraph/ops/maximum.hpp"
#include "ngraph/ops/minimum.hpp"
#include "ngraph/ops/multiply.hpp"
#include "ngraph/ops/negative.hpp"
#include "ngraph/ops/not.hpp"
#include "ngraph/ops/not_equal.hpp"
#include "ngraph/ops/one_hot.hpp"
#include "ngraph/ops/power.hpp"
#include "ngraph/ops/reduce.hpp"
#include "ngraph/ops/replace_slice.hpp"
#include "ngraph/ops/reshape.hpp"
#include "ngraph/ops/reverse.hpp"
#include "ngraph/ops/select.hpp"
#include "ngraph/ops/sign.hpp"
#include "ngraph/ops/sin.hpp"
#include "ngraph/ops/sinh.hpp"
#include "ngraph/ops/slice.hpp"
#include "ngraph/ops/sqrt.hpp"
#include "ngraph/ops/subtract.hpp"
#include "ngraph/ops/sum.hpp"
#include "ngraph/ops/tan.hpp"
#include "ngraph/ops/tanh.hpp"
#include "ngraph/pass/assign_layout.hpp"
#include "ngraph/pass/dump_sorted.hpp"
#include "ngraph/pass/liveness.hpp"
#include "ngraph/pass/manager.hpp"
#include "ngraph/pass/memory_layout.hpp"
#include "ngraph/runtime/gpu/gpu_backend.hpp"
#include "ngraph/runtime/gpu/gpu_call_frame.hpp" #include "ngraph/runtime/gpu/gpu_call_frame.hpp"
#include "ngraph/runtime/gpu/gpu_emitter.hpp"
#include "ngraph/runtime/gpu/gpu_external_function.hpp" #include "ngraph/runtime/gpu/gpu_external_function.hpp"
#include "ngraph/runtime/gpu/gpu_kernel_emitters.hpp"
using namespace std;
-using namespace ngraph::runtime::gpu;
using namespace ngraph;
-ngraph::runtime::gpu::GPU_ExternalFunction::GPU_ExternalFunction(
-    const std::shared_ptr<ngraph::Function>& function, bool release_function)
-    : runtime::ExternalFunction(function, release_function)
-    , m_function(function)
+static const string s_output_dir = "gpu_codegen";
+class StaticInitializers
+{
+public:
+    StaticInitializers() { ngraph::file_util::remove_directory(s_output_dir); }
+};
static string emit_string_array(const vector<string>& s, size_t max_line_length)
{
stringstream ss;
stringstream line;
for (size_t i = 0; i < s.size(); i++)
{
if (i != 0)
{
line << ",";
}
stringstream value;
value << s[i];
string value_string = value.str();
if (static_cast<size_t>(line.tellp()) + value_string.size() + 1 <= max_line_length)
{
if (i > 0)
{
line << " ";
}
line << value_string;
}
else
{
ss << line.str() << "\n";
line.str("");
line << value_string;
}
}
ss << line.str();
return ss.str();
}
static StaticInitializers s_static_initializers;
#define TI(x) type_index(typeid(x))
static const runtime::gpu::OpMap dispatcher{
{TI(ngraph::op::Add), &runtime::gpu::GPU_Emitter::EmitAdd},
{TI(ngraph::op::Dot), &runtime::gpu::GPU_Emitter::EmitDot},
{TI(ngraph::op::Multiply), &runtime::gpu::GPU_Emitter::EmitMultiply},
{TI(ngraph::op::Parameter), &runtime::gpu::GPU_Emitter::EmitNop},
{TI(ngraph::op::Abs), &runtime::gpu::GPU_Emitter::EmitAbs},
{TI(ngraph::op::Concat), &runtime::gpu::GPU_Emitter::EmitConcat},
{TI(ngraph::op::Divide), &runtime::gpu::GPU_Emitter::EmitDivide},
{TI(ngraph::op::Equal), &runtime::gpu::GPU_Emitter::EmitEqual},
{TI(ngraph::op::Greater), &runtime::gpu::GPU_Emitter::EmitGreater},
{TI(ngraph::op::GreaterEq), &runtime::gpu::GPU_Emitter::EmitGreaterEq},
{TI(ngraph::op::Less), &runtime::gpu::GPU_Emitter::EmitLess},
{TI(ngraph::op::LessEq), &runtime::gpu::GPU_Emitter::EmitLessEq},
{TI(ngraph::op::Log), &runtime::gpu::GPU_Emitter::EmitLog},
{TI(ngraph::op::Maximum), &runtime::gpu::GPU_Emitter::EmitMaximum},
{TI(ngraph::op::Minimum), &runtime::gpu::GPU_Emitter::EmitMinimum},
{TI(ngraph::op::Negative), &runtime::gpu::GPU_Emitter::EmitNegative},
{TI(ngraph::op::NotEqual), &runtime::gpu::GPU_Emitter::EmitNotEqual},
{TI(ngraph::op::Power), &runtime::gpu::GPU_Emitter::EmitPower},
{TI(ngraph::op::Select), &runtime::gpu::GPU_Emitter::EmitSelect},
{TI(ngraph::op::Subtract), &runtime::gpu::GPU_Emitter::EmitSubtract},
{TI(ngraph::op::Broadcast), &runtime::gpu::GPU_Emitter::EmitBroadcast},
{TI(ngraph::op::Convert), &runtime::gpu::GPU_Emitter::EmitConvert},
{TI(ngraph::op::Constant), &runtime::gpu::GPU_Emitter::EmitConstant},
{TI(ngraph::op::Reshape), &runtime::gpu::GPU_Emitter::EmitReshape},
{TI(ngraph::op::FunctionCall), &runtime::gpu::GPU_Emitter::EmitFunctionCall},
{TI(ngraph::op::Reduce), &runtime::gpu::GPU_Emitter::EmitReduce},
{TI(ngraph::op::Sign), &runtime::gpu::GPU_Emitter::EmitSign},
{TI(ngraph::op::Slice), &runtime::gpu::GPU_Emitter::EmitSlice},
{TI(ngraph::op::Sum), &runtime::gpu::GPU_Emitter::EmitSum},
{TI(ngraph::op::Exp), &runtime::gpu::GPU_Emitter::EmitExp},
{TI(ngraph::op::Sin), &runtime::gpu::GPU_Emitter::EmitSin},
{TI(ngraph::op::Sinh), &runtime::gpu::GPU_Emitter::EmitSinh},
{TI(ngraph::op::Cos), &runtime::gpu::GPU_Emitter::EmitCos},
{TI(ngraph::op::Cosh), &runtime::gpu::GPU_Emitter::EmitCosh},
{TI(ngraph::op::Tan), &runtime::gpu::GPU_Emitter::EmitTan},
{TI(ngraph::op::Tanh), &runtime::gpu::GPU_Emitter::EmitTanh},
{TI(ngraph::op::Asin), &runtime::gpu::GPU_Emitter::EmitAsin},
{TI(ngraph::op::Acos), &runtime::gpu::GPU_Emitter::EmitAcos},
{TI(ngraph::op::Atan), &runtime::gpu::GPU_Emitter::EmitAtan},
{TI(ngraph::op::ReplaceSlice), &runtime::gpu::GPU_Emitter::EmitReplaceSlice},
{TI(ngraph::op::OneHot), &runtime::gpu::GPU_Emitter::EmitOneHot},
{TI(ngraph::op::Floor), &runtime::gpu::GPU_Emitter::EmitFloor},
{TI(ngraph::op::Ceiling), &runtime::gpu::GPU_Emitter::EmitCeiling},
{TI(ngraph::op::Sqrt), &runtime::gpu::GPU_Emitter::EmitSqrt},
{TI(ngraph::op::Convolution), &runtime::gpu::GPU_Emitter::EmitConvolution},
{TI(ngraph::op::Not), &runtime::gpu::GPU_Emitter::EmitNot},
{TI(ngraph::op::MaxPool), &runtime::gpu::GPU_Emitter::EmitMaxPool},
{TI(ngraph::op::Reverse), &runtime::gpu::GPU_Emitter::EmitReverse},
};
runtime::gpu::GPU_ExternalFunction::GPU_ExternalFunction(
const shared_ptr<ngraph::Function>& function, bool release_function)
: ngraph::runtime::ExternalFunction(function, release_function)
, m_compiled_function(nullptr)
, m_emit_timing(std::getenv("NGRAPH_GPU_EMIT_TIMING") != nullptr)
, m_use_tbb(std::getenv("NGRAPH_GPU_USE_TBB") != nullptr)
{
}
void runtime::gpu::GPU_ExternalFunction::compile()
{
if (m_is_compiled)
{
return;
}
string function_name = m_function->get_name();
string dump_filename = file_util::path_join(s_output_dir, function_name + "_ops.txt");
pass::Manager pass_manager;
// pass_manager.register_pass<pass::TopologicalSort>();
// For now, just make everyone row-major.
pass_manager.register_pass<pass::AssignLayout<descriptor::layout::DenseTensorViewLayout>>();
pass_manager.register_pass<pass::Liveness>();
pass_manager.register_pass<pass::MemoryLayout>(64);
pass_manager.register_pass<pass::DumpSorted>(dump_filename);
pass_manager.run_passes(m_function);
GPU_Emitter emitter;
codegen::CodeWriter& writer = emitter.get_code_writer();
writer +=
R"(// Generated by the NGraph GPU backend
#include <cassert>
#include <cmath>
#include <cstdlib>
#include <fstream>
#include <fstream>
#include <iostream>
#include <memory>
#include <string>
#include <tuple>
#include <typeindex>
#include <typeinfo>
#include <unordered_map>
#include "cuda.h"
#include "ngraph/codegen/code_writer.hpp"
#include "ngraph/codegen/compiler.hpp"
#include "ngraph/codegen/execution_engine.hpp"
#include "ngraph/descriptor/input.hpp"
#include "ngraph/descriptor/layout/dense_tensor_view_layout.hpp"
#include "ngraph/descriptor/output.hpp"
#include "ngraph/descriptor/primary_tensor_view.hpp"
#include "ngraph/file_util.hpp"
#include "ngraph/function.hpp"
#include "ngraph/graph_util.hpp"
#include "ngraph/node.hpp"
#include "ngraph/ops/abs.hpp"
#include "ngraph/ops/acos.hpp"
#include "ngraph/ops/add.hpp"
#include "ngraph/ops/asin.hpp"
#include "ngraph/ops/atan.hpp"
#include "ngraph/ops/broadcast.hpp"
#include "ngraph/ops/ceiling.hpp"
#include "ngraph/ops/concatenate.hpp"
#include "ngraph/ops/constant.hpp"
#include "ngraph/ops/convert.hpp"
#include "ngraph/ops/convolution.hpp"
#include "ngraph/ops/cos.hpp"
#include "ngraph/ops/cosh.hpp"
#include "ngraph/ops/divide.hpp"
#include "ngraph/ops/dot.hpp"
#include "ngraph/ops/equal.hpp"
#include "ngraph/ops/exp.hpp"
#include "ngraph/ops/floor.hpp"
#include "ngraph/ops/function_call.hpp"
#include "ngraph/ops/greater.hpp"
#include "ngraph/ops/greater_eq.hpp"
#include "ngraph/ops/less.hpp"
#include "ngraph/ops/less_eq.hpp"
#include "ngraph/ops/log.hpp"
#include "ngraph/ops/max_pool.hpp"
#include "ngraph/ops/maximum.hpp"
#include "ngraph/ops/minimum.hpp"
#include "ngraph/ops/multiply.hpp"
#include "ngraph/ops/negative.hpp"
#include "ngraph/ops/not.hpp"
#include "ngraph/ops/not_equal.hpp"
#include "ngraph/ops/one_hot.hpp"
#include "ngraph/ops/power.hpp"
#include "ngraph/ops/reduce.hpp"
#include "ngraph/ops/replace_slice.hpp"
#include "ngraph/ops/reshape.hpp"
#include "ngraph/ops/reverse.hpp"
#include "ngraph/ops/select.hpp"
#include "ngraph/ops/sign.hpp"
#include "ngraph/ops/sin.hpp"
#include "ngraph/ops/sinh.hpp"
#include "ngraph/ops/slice.hpp"
#include "ngraph/ops/sqrt.hpp"
#include "ngraph/ops/subtract.hpp"
#include "ngraph/ops/sum.hpp"
#include "ngraph/ops/tan.hpp"
#include "ngraph/ops/tanh.hpp"
#include "ngraph/pass/assign_layout.hpp"
#include "ngraph/pass/dump_sorted.hpp"
#include "ngraph/pass/liveness.hpp"
#include "ngraph/pass/manager.hpp"
#include "ngraph/pass/memory_layout.hpp"
#include "ngraph/runtime/aligned_buffer.hpp"
#include "ngraph/util.hpp"
)";
string pch_header_source = writer.get_code();
writer += R"(
using namespace ngraph::runtime;
using namespace std;
void check_cuda_errors(CUresult err) {
assert(err == CUDA_SUCCESS);
// assert(err == err);
}
)";
// // The "dso_handle" symbol is required by __cxa_atexit()
// // which is enabled because the JIT uses it as the default mechanism
// // to register cleanup handlers. We use it, and not atexit(), because
// // atexit() happens too late, when the JIT is no longer alive
writer << "void *__dso_handle = 0;\n\n";
writer << "// Declare all constants\n";
for (shared_ptr<Function> current_function : pass_manager.get_state().get_functions())
{
for (shared_ptr<Node> node : current_function->get_ordered_ops())
{
const op::Constant* c = dynamic_cast<op::Constant*>(node.get());
if (c)
{
shared_ptr<descriptor::TensorView> tv = node->get_outputs()[0].get_tensor_view();
auto c_value_strings = c->get_value_strings();
writer << "static " << tv->get_tensor().get_element_type().c_type_string() << " "
<< tv->get_tensor().get_name() << "[" << c_value_strings.size() << "] =\n";
writer << "{\n";
writer.indent++;
writer << emit_string_array(c_value_strings, 100 - writer.indent * 4);
writer.indent--;
writer << "\n};\n\n";
m_variable_name_map[tv->get_tensor().get_name()] = tv->get_tensor().get_name();
}
}
}
writer << "// Declare all functions\n";
for (shared_ptr<Function> f : pass_manager.get_state().get_functions())
{
writer << "extern \"C\" void " << f->get_name() << "(void** inputs, void** outputs);\n";
}
writer << "\n";
writer << "extern \"C\" void " << pass_manager.get_state().get_functions()[0]->get_name()
<< "(void** inputs, void** outputs){\n";
writer += R"(
CUdevice device;
CUmodule cuda_module;
CUcontext context;
CUfunction add_function;
CUfunction mult_function;
CUlinkState linker;
int dev_count;
check_cuda_errors(cuInit(0));
check_cuda_errors(cuDeviceGetCount(&dev_count));
check_cuda_errors(cuDeviceGet(&device, 0));
// char name[128];
// check_cuda_errors(cuDeviceGetName(name, 128, device));
// std::cout << "Using CUDA Device [0]: " << name << "\n";
// int dev_major, dev_minor;
// check_cuda_errors(cuDeviceComputeCapability(&dev_major, &dev_minor, device));
// std::cout << "Device Compute Capability: "
// << dev_major << "." << dev_minor << "\n";
// if (dev_major < 2) {
// std::cerr << "ERROR: Device 0 is not SM 2.0 or greater\n";
// }
const auto kernels = R"#(
.version 5.0
.target sm_60
.address_size 64
// .globl _Z7ew_multPfS_S_ // -- Begin function _Z7ew_multPfS_S_
.global .align 1 .b8 threadIdx[1];
// @_Z7ew_multPfS_S_
.visible .entry _Z7ew_multPfS_S_(
.param .u64 _Z7ew_multPfS_S__param_0,
.param .u64 _Z7ew_multPfS_S__param_1,
.param .u64 _Z7ew_multPfS_S__param_2
)
{
.local .align 8 .b8 __local_depot0[24];
.reg .b64 %SP;
.reg .b64 %SPL;
.reg .f32 %f<4>;
.reg .b32 %r<2>;
.reg .b64 %rd<17>;
// BB#0:
mov.u64 %SPL, __local_depot0;
cvta.local.u64 %SP, %SPL;
ld.param.u64 %rd3, [_Z7ew_multPfS_S__param_2];
ld.param.u64 %rd2, [_Z7ew_multPfS_S__param_1];
ld.param.u64 %rd1, [_Z7ew_multPfS_S__param_0];
cvta.to.global.u64 %rd4, %rd3;
cvta.global.u64 %rd5, %rd4;
cvta.to.global.u64 %rd6, %rd2;
cvta.global.u64 %rd7, %rd6;
cvta.to.global.u64 %rd8, %rd1;
cvta.global.u64 %rd9, %rd8;
st.u64 [%SP+0], %rd9;
st.u64 [%SP+8], %rd7;
st.u64 [%SP+16], %rd5;
ld.u64 %rd10, [%SP+0];
mov.u32 %r1, %tid.x;
mul.wide.u32 %rd11, %r1, 4;
add.s64 %rd12, %rd10, %rd11;
ld.f32 %f1, [%rd12];
ld.u64 %rd13, [%SP+8];
add.s64 %rd14, %rd13, %rd11;
ld.f32 %f2, [%rd14];
mul.rn.f32 %f3, %f1, %f2;
ld.u64 %rd15, [%SP+16];
add.s64 %rd16, %rd15, %rd11;
st.f32 [%rd16], %f3;
ret;
}
// -- End function
// .globl _Z6ew_addPfS_S_ // -- Begin function _Z6ew_addPfS_S_
.visible .entry _Z6ew_addPfS_S_(
.param .u64 _Z6ew_addPfS_S__param_0,
.param .u64 _Z6ew_addPfS_S__param_1,
.param .u64 _Z6ew_addPfS_S__param_2
) // @_Z6ew_addPfS_S_
{
.local .align 8 .b8 __local_depot1[24];
.reg .b64 %SP;
.reg .b64 %SPL;
.reg .f32 %f<4>;
.reg .b32 %r<2>;
.reg .b64 %rd<17>;
// BB#0:
mov.u64 %SPL, __local_depot1;
cvta.local.u64 %SP, %SPL;
ld.param.u64 %rd3, [_Z6ew_addPfS_S__param_2];
ld.param.u64 %rd2, [_Z6ew_addPfS_S__param_1];
ld.param.u64 %rd1, [_Z6ew_addPfS_S__param_0];
cvta.to.global.u64 %rd4, %rd3;
cvta.global.u64 %rd5, %rd4;
cvta.to.global.u64 %rd6, %rd2;
cvta.global.u64 %rd7, %rd6;
cvta.to.global.u64 %rd8, %rd1;
cvta.global.u64 %rd9, %rd8;
st.u64 [%SP+0], %rd9;
st.u64 [%SP+8], %rd7;
st.u64 [%SP+16], %rd5;
ld.u64 %rd10, [%SP+0];
mov.u32 %r1, %tid.x;
mul.wide.u32 %rd11, %r1, 4;
add.s64 %rd12, %rd10, %rd11;
ld.f32 %f1, [%rd12];
ld.u64 %rd13, [%SP+8];
add.s64 %rd14, %rd13, %rd11;
ld.f32 %f2, [%rd14];
add.rn.f32 %f3, %f1, %f2;
ld.u64 %rd15, [%SP+16];
add.s64 %rd16, %rd15, %rd11;
st.f32 [%rd16], %f3;
ret;
}
// -- End function
)#";
// Create driver context
check_cuda_errors(cuCtxCreate(&context, 0, device));
// Create module for object
check_cuda_errors(cuModuleLoadDataEx(&cuda_module, kernels, 0, 0, 0));
// Get kernel function
check_cuda_errors(cuModuleGetFunction(&add_function, cuda_module, "_Z6ew_addPfS_S_"));
check_cuda_errors(cuModuleGetFunction(&mult_function, cuda_module, "_Z7ew_multPfS_S_"));
// Device data
CUdeviceptr dev_bufferA;
CUdeviceptr dev_bufferB;
CUdeviceptr dev_bufferC;
check_cuda_errors(cuMemAlloc(&dev_bufferA, sizeof(float) * 4));
check_cuda_errors(cuMemAlloc(&dev_bufferB, sizeof(float) * 4));
check_cuda_errors(cuMemAlloc(&dev_bufferC, sizeof(float) * 4));
float* host_A = new float[4];
float* host_B = new float[4];
float* host_C = new float[4];
// Populate input
memcpy(host_A, (float*)(inputs[0]), sizeof(float) * 4);
memcpy(host_B, (float*)(inputs[1]), sizeof(float) * 4);
memcpy(host_C, (float*)(inputs[2]), sizeof(float) * 4);
check_cuda_errors(cuMemcpyHtoD(dev_bufferA, &host_A[0], sizeof(float) * 4));
check_cuda_errors(cuMemcpyHtoD(dev_bufferB, &host_B[0], sizeof(float) * 4));
// check_cuda_errors(cuMemcpyHtoD(dev_bufferC, &host_C[0], sizeof(float) * 4));
unsigned block_size_X = 4;
unsigned block_size_Y = 1;
unsigned block_size_Z = 1;
unsigned grid_size_X = 1;
unsigned grid_size_Y = 1;
unsigned grid_size_Z = 1;
// Kernel parameters
void* kernel_params[] = {&dev_bufferA, &dev_bufferB, &dev_bufferC};
// Add Kernel launch
check_cuda_errors(cuLaunchKernel(add_function,
grid_size_X,
grid_size_Y,
grid_size_Z,
block_size_X,
block_size_Y,
block_size_Z,
0,
NULL,
kernel_params,
NULL));
check_cuda_errors(cuMemcpyDtoH(&host_A[0], dev_bufferC, sizeof(float) * 4));
host_B = &host_C[0];
check_cuda_errors(cuMemcpyHtoD(dev_bufferA, &host_A[0], sizeof(float) * 4));
check_cuda_errors(cuMemcpyHtoD(dev_bufferB, &host_B[0], sizeof(float) * 4));
// Mult Kernel launch
check_cuda_errors(cuLaunchKernel(mult_function,
grid_size_X,
grid_size_Y,
grid_size_Z,
block_size_X,
block_size_Y,
block_size_Z,
0,
NULL,
kernel_params,
NULL));
// Write final output
check_cuda_errors(cuMemcpyDtoH(&((float*)(outputs[0]))[0], dev_bufferC, sizeof(float) * 4));
// Clean up after ourselves
// // Clean-up must do this in tensor view!!!
check_cuda_errors(cuMemFree(dev_bufferA));
check_cuda_errors(cuMemFree(dev_bufferB));
check_cuda_errors(cuMemFree(dev_bufferC));
check_cuda_errors(cuModuleUnload(cuda_module));
check_cuda_errors(cuCtxDestroy(context));})";
if (m_emit_timing)
{
writer << "// Declare debug timers\n";
vector<string> names;
for (shared_ptr<Function> current_function : pass_manager.get_state().get_functions())
{
for (shared_ptr<Node> node : current_function->get_ordered_ops())
{
if (!node->is_parameter() && !node->is_constant())
{
names.push_back(node->get_name());
}
}
}
for (const string& s : names)
{
writer << "ngraph::stopwatch timer_" << s << ";\n";
}
writer << "extern \"C\" size_t get_debug_timer_count() { return " << names.size()
<< "; }\n";
writer << "extern \"C\" const char* get_debug_timer_name(size_t index)\n";
writer << "{\n";
writer.indent++;
writer << "const char* rc;\n";
writer << "switch(index)\n";
writer << "{\n";
for (size_t i = 0; i < names.size(); i++)
{
writer << "case " << i << ": rc = \"" << names[i] << "\"; break;\n";
}
writer << "default: rc = \"\";\n";
writer << "}\n";
writer << "return rc;\n";
writer.indent--;
writer << "}\n";
writer << "extern \"C\" const size_t get_debug_timer_microseconds(size_t index)\n";
writer << "{\n";
writer.indent++;
writer << "size_t rc;\n";
writer << "switch(index)\n";
writer << "{\n";
for (size_t i = 0; i < names.size(); i++)
{
writer << "case " << i << ": rc = timer_" << names[i]
<< ".get_total_microseconds(); break;\n";
}
writer << "default: rc = 0;\n";
writer << "}\n";
writer << "return rc;\n";
writer.indent--;
writer << "}\n";
writer << "extern \"C\" const size_t get_debug_timer_call_count(size_t index)\n";
writer << "{\n";
writer.indent++;
writer << "size_t rc;\n";
writer << "switch(index)\n";
writer << "{\n";
for (size_t i = 0; i < names.size(); i++)
{
writer << "case " << i << ": rc = timer_" << names[i] << ".get_call_count(); break;\n";
}
writer << "default: rc = 0;\n";
writer << "}\n";
writer << "return rc;\n";
writer.indent--;
writer << "}\n";
writer << "\n";
}
// // This for loop creates a collection of functions that are called more than once
// // and emitting them as globally callable functions.
// // ops implement the is_functionally_identical method
// unordered_map<Node*, string> match_functions;
// for (shared_ptr<Function> current_function : pass_manager.get_state().get_functions())
// {
// const list<shared_ptr<Node>>& tmp = current_function->get_ordered_ops();
// vector<shared_ptr<Node>> op_list{tmp.begin(), tmp.end()};
// for (size_t i = 0; i < op_list.size() - 1; i++)
// {
// if (op_list[i]->is_constant() || op_list[i]->is_parameter())
// {
// continue;
// }
// if (contains_key(match_functions, op_list[i].get()))
// {
// continue;
// }
// string match_function_name;
// for (size_t j = i + 1; j < op_list.size(); j++)
// {
// if (op_list[i]->is_functionally_identical(*op_list[j]))
// {
// if (match_function_name.empty())
// {
// match_function_name = "func_" + op_list[i]->get_name();
// match_functions.insert({op_list[i].get(), match_function_name});
// }
// match_functions.insert({op_list[j].get(), match_function_name});
// }
// }
// if (!match_function_name.empty())
// {
// writer << "static void " << match_function_name << "(";
// writer.indent++;
// // Work around a compiler warning (*node inside typeid may have effects
// // with shared pointers, which is fine here but clang doesn't like it.)
// auto& n = *op_list[i];
// auto handler = dispatcher.find(type_index(typeid(n)));
// vector<GPU_TensorViewWrapper> in;
// size_t arg_index = 0;
// set<string> arg_names;
// for (const descriptor::Input& input : n.get_inputs())
// {
// const descriptor::Output& output = input.get_output();
// shared_ptr<descriptor::TensorView> tv = output.get_tensor_view();
// GPU_TensorViewWrapper tvw{tv, "_arg" + to_string(arg_index)};
// if (!contains(arg_names, tvw.get_name()))
// {
// arg_names.insert(tvw.get_name());
// if (arg_index++ > 0)
// {
// writer << ",";
// }
// writer << "\n";
// writer << tvw.get_type() << "* " << tvw.get_name();
// }
// in.push_back(tvw);
// }
// vector<GPU_TensorViewWrapper> out;
// for (const descriptor::Output& output : n.get_outputs())
// {
// shared_ptr<descriptor::TensorView> tv = output.get_tensor_view();
// GPU_TensorViewWrapper tvw{tv, "_out" + to_string(arg_index)};
// if (arg_index++ > 0)
// {
// writer << ",";
// }
// writer << "\n";
// writer << tvw.get_type() << "* " << tvw.get_name();
// out.push_back(tvw);
// }
// writer.indent--;
// if (node->liveness_new_list.size() > 0)
// {
// temporaries_used = true;
// for (descriptor::Tensor* tensor : node->liveness_new_list)
// {
// worst_case_tmp_size += tensor->size();
// }
// }
// }
// if (temporaries_used)
// {
// size_t temp_pool_size = current_function->get_temporary_pool_size();
// writer << "// Allocate the memory pool\n";
// writer << "// Memory pool size is " << temp_pool_size << " bytes\n";
// writer << "// Worst case size is " << worst_case_tmp_size << " bytes\n";
// writer << "ngraph::runtime::AlignedBuffer memory_handler(" << temp_pool_size << ", "
// << ngraph::runtime::gpu::alignment << ");\n";
// writer << "size_t pool_gpu_ptr = (size_t)memory_handler.get_ptr();\n";
// writer << "\n";
// // Add temporaries to the variable name map
// for (shared_ptr<Node> node : current_function->get_ordered_ops())
// {
// for (descriptor::Tensor* tensor : node->liveness_new_list)
// {
// stringstream ss;
// ss << "((" << tensor->get_element_type().c_type_string() << "*)(pool_gpu_ptr + "
// << tensor->get_pool_offset() << "))";
// m_variable_name_map[tensor->get_name()] = ss.str();
// }
// }
// }
// // Add inputs to the variable name map
// size_t arg_index = 0;
// for (shared_ptr<op::Parameter> param : current_function->get_parameters())
// {
// for (size_t i = 0; i < param->get_output_size(); ++i)
// {
// shared_ptr<descriptor::TensorView> tv = param->get_output_tensor_view(i);
// const element::Type& et = tv->get_tensor_view_type()->get_element_type();
// string type = et.c_type_string();
// stringstream ss;
// ss << "((" << type << "*)(inputs[" << arg_index << "]))";
// m_variable_name_map[tv->get_tensor().get_name()] = ss.str();
// arg_index++;
// }
// }
// // create output alias map
// size_t output_index = 0;
// unordered_map<descriptor::TensorView*, vector<size_t>> output_alias_map;
// vector<size_t> aliases;
// for (size_t i = 0; i < current_function->get_output_size(); ++i)
// {
// shared_ptr<Node> op = current_function->get_output_op(i);
// shared_ptr<descriptor::TensorView> otv = op->get_output_tensor_view();
// vector<size_t>& al = output_alias_map[otv.get()];
// al.push_back(output_index);
// if (al.size() > 1)
// {
// aliases.push_back(output_index);
// }
// output_index++;
// }
// // Add outputs to the variable name map
// output_index = 0;
// for (size_t i = 0; i < current_function->get_output_size(); ++i)
// {
// shared_ptr<Node> op = current_function->get_output_op(i);
// shared_ptr<descriptor::TensorView> tv = op->get_output_tensor_view();
// const element::Type& et = tv->get_tensor_view_type()->get_element_type();
// bool parameter_as_output = false;
// for (shared_ptr<op::Parameter> param : current_function->get_parameters())
// {
// for (const descriptor::Output& pout : param->get_outputs())
// {
// shared_ptr<descriptor::TensorView> ptv = pout.get_tensor_view();
// if (tv == ptv)
// {
// parameter_as_output = true;
// writer << "memcpy(static_cast<" << et.c_type_string() << "*>(outputs["
// << output_index << "]), "
// << m_variable_name_map[ptv->get_tensor().get_name()] << ", "
// << ptv->get_tensor().size() << ");\n";
// break;
// }
// }
// }
// if (!parameter_as_output && !contains(aliases, output_index))
// {
// if (contains(constants, tv.get()))
// {
// writer << "memcpy(outputs[" << output_index << "], " << tv->get_tensor().get_name()
// << ", " << tv->get_tensor().size() << ");\n";
// }
// else
// {
// string type = et.c_type_string();
// stringstream ss;
// ss << "((" << type << "*)(outputs[" << output_index << "]))";
// m_variable_name_map[tv->get_tensor().get_name()] = ss.str();
// }
// }
// output_index++;
// }
// for (shared_ptr<Node> node : current_function->get_ordered_ops())
// {
// auto& n = *node; // Work around a compiler warning (*node inside typeid may have effects
// // with shared pointers, which is fine here but clang doesn't like it.)
// auto handler = dispatcher.find(type_index(typeid(n)));
// if (handler == dispatcher.end())
// {
// throw ngraph_error("Unhandled op during code generation : " + node->description());
// }
// vector<GPU_TensorViewWrapper> in;
// for (const descriptor::Input& input : node->get_inputs())
// {
// const descriptor::Output& output = input.get_output();
// shared_ptr<descriptor::TensorView> tv = output.get_tensor_view();
// in.push_back(GPU_TensorViewWrapper(tv, m_variable_name_map[tv->get_tensor().get_name()]));
// }
// vector<GPU_TensorViewWrapper> out;
// for (const descriptor::Output& output : node->get_outputs())
// {
// shared_ptr<descriptor::TensorView> tv = output.get_tensor_view();
// out.push_back(GPU_TensorViewWrapper(tv, m_variable_name_map[tv->get_tensor().get_name()]));
// }
// // Emit operation prologue
// if (!node->is_parameter() && !node->is_constant())
// {
// if (m_use_tbb)
// {
// writer << "tbb::flow::continue_node<tbb::flow::continue_msg> "
// "flowgraph_node_"
// << node->get_name() << "(G, [&](const tbb::flow::continue_msg &msg)\n{\n";
// writer.indent++;
// }
// if (m_emit_timing)
// {
// emit_debug_function_entry(writer, node.get(), in, out);
// }
// }
// // Emit operation body
// string func_name;
// auto it = match_functions.find(node.get());
// if (it != match_functions.end())
// {
// func_name = it->second;
// }
// if (func_name.empty())
// {
// handler->second(&emitter, node.get(), in, out);
// }
// else
// {
// vector<string> names;
// for (const GPU_TensorViewWrapper& tv : in)
// {
// names.push_back(tv.get_name());
// }
// for (const GPU_TensorViewWrapper& tv : out)
// {
// names.push_back(tv.get_name());
// }
// writer << func_name << "(" << join(names) << ");\n";
// }
// // Emit operation epilogue
// if (!node->is_parameter() && !node->is_constant())
// {
// handle_output_alias(writer, *node, output_alias_map);
// if (m_emit_timing)
// {
// emit_debug_function_exit(writer, node.get(), in, out);
// }
// if (m_use_tbb)
// {
// writer.indent--;
// writer << "});\n";
// }
// }
// }
// if (m_use_tbb)
// {
// writer << "\n";
// // Build the flow graph
// vector<Node*> dependence_graph_heads;
// traverse_nodes(current_function, [&writer, &dependence_graph_heads](shared_ptr<Node> n) {
// if (!n->is_parameter() && !n->is_constant())
// {
// bool is_head = true;
// for (auto arg : n->get_input_ops())
// {
// if (!arg->is_parameter() && !arg->is_constant())
// {
// is_head = false;
// writer << "tbb::flow::make_edge(flowgraph_node_" << arg->get_name()
// << ", flowgraph_node_" << n->get_name() << ");\n";
// }
// }
// if (is_head)
// {
// dependence_graph_heads.emplace_back(n.get());
// }
// }
// });
// writer << "\n";
// // Execute the flow graph
// if (!dependence_graph_heads.empty())
// {
// for (Node* n : dependence_graph_heads)
// {
// writer << "flowgraph_node_" << n->get_name()
// << ".try_put(tbb::flow::continue_msg());\n";
// }
// writer << "try { G.wait_for_all(); } catch(...) { throw; }\n";
// }
// }
// writer.indent--;
// // End generated function
// writer += "}\n\n";
// }
// TODO: Cleanup and make this a utility function
file_util::make_directory(s_output_dir);
string filename = file_util::path_join(s_output_dir, function_name + "_codegen.cpp");
ofstream out(filename);
string code = writer.get_code();
out << code;
out.close();
m_compiler.reset(new codegen::Compiler());
m_execution_engine.reset(new codegen::ExecutionEngine());
m_compiler->set_precompiled_header_source(pch_header_source);
auto codegen_module = m_compiler->compile(code);
if (codegen_module == nullptr)
{
throw runtime_error("function failed to compile");
}
m_execution_engine->add_module(codegen_module);
m_execution_engine->finalize();
m_compiled_function = m_execution_engine->find_function<EntryPoint_t>(function_name);
assert(m_compiled_function);
m_is_compiled = true;
if (m_release_function)
{
release_function();
}
}
-shared_ptr<runtime::CallFrame> runtime::gpu::GPU_ExternalFunction::make_call_frame()
+void runtime::gpu::GPU_ExternalFunction::handle_output_alias(
codegen::CodeWriter& writer,
const Node& node,
const unordered_map<descriptor::TensorView*, vector<size_t>>& output_alias_map)
{
for (const descriptor::Output& output : node.get_outputs())
{
shared_ptr<descriptor::TensorView> otv = output.get_tensor_view();
auto it = output_alias_map.find(otv.get());
if (it != output_alias_map.end())
{
const vector<size_t>& outputs = it->second;
if (outputs.size() > 1)
{
writer << "{ // handle output alias for previous op\n";
writer.indent++;
for (size_t i = 1; i < outputs.size(); i++)
{
writer << "memcpy(static_cast<void*>(outputs[" << outputs[i]
<< "]), static_cast<void*>(outputs[" << outputs[0] << "]), "
<< otv->get_tensor().size() << ");\n";
}
writer.indent--;
writer << "}\n";
}
}
}
}
shared_ptr<ngraph::runtime::CallFrame> runtime::gpu::GPU_ExternalFunction::make_call_frame()
{
if (!m_is_compiled)
{
compile();
}
-return make_shared<runtime::gpu::GPU_CallFrame>(shared_from_this(), m_function);
+return make_shared<ngraph::runtime::gpu::GPU_CallFrame>(shared_from_this(),
+                                                         m_compiled_function);
}
void runtime::gpu::GPU_ExternalFunction::emit_debug_function_entry(
codegen::CodeWriter& writer,
Node* node,
const std::vector<GPU_TensorViewWrapper>& in,
const std::vector<GPU_TensorViewWrapper>& out)
{
writer << "timer_" << node->get_name() << ".start();\n";
}
void runtime::gpu::GPU_ExternalFunction::emit_debug_function_exit(
codegen::CodeWriter& writer,
Node* node,
const std::vector<GPU_TensorViewWrapper>& in,
const std::vector<GPU_TensorViewWrapper>& out)
{
writer << "timer_" << node->get_name() << ".stop();\n";
}
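The PTX embedded in compile() above provides two element-wise kernels, mangled as _Z6ew_addPfS_S_ and _Z7ew_multPfS_S_, i.e. ew_add(float*, float*, float*) and ew_mult(float*, float*, float*). The CUDA source they were compiled from is not part of this change; a hypothetical equivalent, matching the single-block, threadIdx.x-indexed launch used above, would be:

// Hypothetical CUDA source corresponding to the embedded PTX; one element per
// thread, indexed by threadIdx.x only (the launch uses a single 4-thread block).
__global__ void ew_add(float* in0, float* in1, float* out)
{
    unsigned int i = threadIdx.x;
    out[i] = in0[i] + in1[i];
}

__global__ void ew_mult(float* in0, float* in1, float* out)
{
    unsigned int i = threadIdx.x;
    out[i] = in0[i] * in1[i];
}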
@@ -20,9 +20,13 @@
#include <typeinfo>
#include <unordered_map>
#include "ngraph/codegen/code_writer.hpp"
#include "ngraph/codegen/compiler.hpp"
#include "ngraph/codegen/execution_engine.hpp"
#include "ngraph/function.hpp" #include "ngraph/function.hpp"
#include "ngraph/runtime/external_function.hpp" #include "ngraph/runtime/external_function.hpp"
#include "ngraph/runtime/gpu/gpu_call_frame.hpp" #include "ngraph/runtime/gpu/gpu_call_frame.hpp"
#include "ngraph/runtime/gpu/gpu_tensor_view_wrapper.hpp"
namespace ngraph
{
@@ -30,9 +34,23 @@ namespace ngraph
{
namespace gpu
{
class GPU_ExternalFunction;
class GPU_Emitter;
class GPU_CallFrame;
using OpFunction =
std::function<void(GPU_Emitter*,
const ngraph::Node*,
const std::vector<GPU_TensorViewWrapper>& inputs,
const std::vector<GPU_TensorViewWrapper>& outputs)>;
using OpMap = std::unordered_map<std::type_index, OpFunction>;
class GPU_ExternalFunction : public ngraph::runtime::ExternalFunction,
                             public std::enable_shared_from_this<GPU_ExternalFunction>
{
friend class GPU_CallFrame;
public:
GPU_ExternalFunction(const std::shared_ptr<ngraph::Function>& function,
                     bool release_function = true);
@@ -41,7 +59,27 @@
protected:
void compile();
-std::shared_ptr<ngraph::Function> m_function;
+EntryPoint m_compiled_function;
private:
void emit_debug_function_entry(codegen::CodeWriter& writer,
Node* node,
const std::vector<GPU_TensorViewWrapper>& in,
const std::vector<GPU_TensorViewWrapper>& out);
void emit_debug_function_exit(codegen::CodeWriter& writer,
Node* node,
const std::vector<GPU_TensorViewWrapper>& in,
const std::vector<GPU_TensorViewWrapper>& out);
void handle_output_alias(
codegen::CodeWriter& writer,
const Node&,
const std::unordered_map<descriptor::TensorView*, std::vector<size_t>>&);
std::unique_ptr<codegen::Compiler> m_compiler;
std::unique_ptr<codegen::ExecutionEngine> m_execution_engine;
bool m_emit_timing;
bool m_use_tbb;
std::unordered_map<std::string, std::string> m_variable_name_map;
}; };
} }
} }
......
// ----------------------------------------------------------------------------
// Copyright 2017 Nervana Systems Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// ----------------------------------------------------------------------------
#include <algorithm>
#include <map>
#include "ngraph/codegen/code_writer.hpp"
#include "ngraph/runtime/gpu/gpu_kernel_emitters.hpp"
using namespace ngraph;
using namespace ngraph::runtime::gpu::kernel;
void ngraph::runtime::gpu::kernel::emit_broadcast(codegen::CodeWriter& writer,
const std::string& element_type,
const std::string& arg0, // replacement context
const std::string& out,
const Shape& arg0_shape,
const Shape& out_shape,
const AxisSet& broadcast_axes)
{
}
//
// For the reference kernel this is based on, see ngraph/runtime/kernel/concat.hpp.
//
void ngraph::runtime::gpu::kernel::emit_concat(codegen::CodeWriter& writer,
const std::string& element_type,
const std::vector<std::string>& args,
const std::string& out,
const std::vector<Shape>& in_shapes,
const Shape& out_shape,
size_t concatenation_axis)
{
}
void ngraph::runtime::gpu::kernel::emit_replace_slice(
codegen::CodeWriter& writer,
const std::string& element_type,
const std::string& arg0, // replacement context
const std::string& arg1, // replacement value
const std::string& out,
const Shape& arg1_shape,
const Shape& out_shape,
const Coordinate& lower_bounds,
const Coordinate& upper_bounds,
const Strides& strides)
{
}
void ngraph::runtime::gpu::kernel::emit_slice(codegen::CodeWriter& writer,
const std::string& element_type,
const std::string& arg0, // replacement context
const std::string& out,
const Shape& arg0_shape,
const Shape& out_shape,
const Coordinate& lower_bounds,
const Coordinate& upper_bounds,
const Strides& strides)
{
}
void ngraph::runtime::gpu::kernel::emit_reshape(codegen::CodeWriter& writer,
const std::string& element_type,
const std::string& arg0, // replacement context
const std::string& out,
const Shape& arg0_shape,
const Shape& out_shape,
const AxisVector& arg0_axis_order)
{
}
void ngraph::runtime::gpu::kernel::emit_sum(codegen::CodeWriter& writer,
const std::string& element_type,
const std::string& arg0, // replacement context
const std::string& out,
const Shape& arg0_shape,
const Shape& out_shape,
const AxisSet& reduction_axes)
{
}
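
The emitter bodies above are left empty in this commit. As a rough sketch of the direction one of them could take (a hypothetical helper, not part of this change), a broadcast emitter might simply write a loop into the generated source, reusing only the CodeWriter operator<< and indent facilities already used elsewhere in this commit; it covers just the scalar-to-tensor case and assumes the generated code indexes arg0/out as plain C arrays of the element type.

// Hypothetical sketch; names and structure are illustrative only.
#include <string>

#include "ngraph/codegen/code_writer.hpp"
#include "ngraph/common.hpp"

namespace sketch
{
    void emit_scalar_broadcast(ngraph::codegen::CodeWriter& writer,
                               const std::string& element_type,
                               const std::string& arg0,
                               const std::string& out,
                               const ngraph::Shape& out_shape)
    {
        // Total number of output elements.
        size_t count = 1;
        for (size_t d : out_shape)
        {
            count *= d;
        }

        writer << "{ // broadcast scalar " << arg0 << " into " << out << "\n";
        writer.indent++;
        writer << "for (size_t i = 0; i < " << count << "; i++)\n";
        writer << "{\n";
        writer.indent++;
        writer << out << "[i] = " << arg0 << "[0];\n";
        writer.indent--;
        writer << "}\n";
        writer.indent--;
        writer << "}\n";
    }
}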
// ----------------------------------------------------------------------------
// Copyright 2017 Nervana Systems Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// ----------------------------------------------------------------------------
#pragma once
#include "ngraph/codegen/code_writer.hpp"
#include "ngraph/common.hpp"
namespace ngraph
{
namespace runtime
{
namespace gpu
{
namespace kernel
{
void emit_broadcast(codegen::CodeWriter& writer,
const std::string& element_type,
const std::string& arg0, // replacement context
const std::string& out,
const Shape& arg0_shape,
const Shape& out_shape,
const AxisSet& broadcast_axes);
void emit_concat(codegen::CodeWriter& writer,
const std::string& element_type,
const std::vector<std::string>& args,
const std::string& out,
const std::vector<Shape>& in_shapes,
const Shape& out_shape,
const size_t concatenation_axis);
void emit_replace_slice(codegen::CodeWriter& writer,
const std::string& element_type,
const std::string& arg0, // replacement context
const std::string& arg1, // replacement value
const std::string& out,
const Shape& arg1_shape,
const Shape& out_shape,
const Coordinate& lower_bounds,
const Coordinate& upper_bounds,
const Strides& strides);
void emit_slice(codegen::CodeWriter& writer,
const std::string& element_type,
const std::string& arg0, // replacement context
const std::string& out,
const Shape& arg0_shape,
const Shape& out_shape,
const Coordinate& lower_bounds,
const Coordinate& upper_bounds,
const Strides& strides);
void emit_reshape(codegen::CodeWriter& writer,
const std::string& element_type,
const std::string& arg0, // replacement context
const std::string& out,
const Shape& arg0_shape,
const Shape& out_shape,
const AxisVector& arg0_axis_order);
void emit_sum(codegen::CodeWriter& writer,
const std::string& element_type,
const std::string& arg0, // replacement context
const std::string& out,
const Shape& arg0_shape,
const Shape& out_shape,
const AxisSet& reduction_axes);
}
}
}
}
...@@ -14,6 +14,8 @@
#include <memory>

#include <cuda.h>

#include "ngraph/descriptor/layout/dense_tensor_view_layout.hpp"
#include "ngraph/descriptor/primary_tensor_view.hpp"
#include "ngraph/runtime/gpu/gpu_backend.hpp"
...@@ -30,59 +32,26 @@ runtime::gpu::GPU_TensorView::GPU_TensorView(const ngraph::element::Type& elemen
                                             true,
                                             true,
                                             false))
    , m_allocated_buffer_pool(nullptr)
    , m_aligned_buffer_pool(nullptr)
{
    // Need to check type and have host/device tensors
    m_descriptor->set_tensor_view_layout(
        std::make_shared<ngraph::descriptor::layout::DenseTensorViewLayout>(*m_descriptor));
    m_buffer_size = m_descriptor->get_tensor_view_layout()->get_size() * element_type.size();
    if (m_buffer_size > 0)
    {
        size_t allocation_size = m_buffer_size + runtime::gpu::alignment;
        m_allocated_buffer_pool = static_cast<char*>(malloc(allocation_size));
        m_aligned_buffer_pool = m_allocated_buffer_pool;
        size_t mod = size_t(m_aligned_buffer_pool) % alignment;
        if (mod != 0)
        {
            m_aligned_buffer_pool += (alignment - mod);
        }
    }
    // cuMemAlloc(&dev_buffer, m_buffer_size);
}

runtime::gpu::GPU_TensorView::~GPU_TensorView()
{
    if (m_allocated_buffer_pool != nullptr)
    {
        free(m_allocated_buffer_pool);
    }
    // cuMemFree(dev_buffer);
}

char* runtime::gpu::GPU_TensorView::get_data_ptr()
{
    return m_aligned_buffer_pool;
}

const char* runtime::gpu::GPU_TensorView::get_data_ptr() const
{
    return m_aligned_buffer_pool;
}

void runtime::gpu::GPU_TensorView::write(const void* source, size_t tensor_offset, size_t n)
{
    if (tensor_offset + n > m_buffer_size)
    {
        throw out_of_range("write access past end of tensor");
    }
    char* target = get_data_ptr();
    // cuMemcpyHtoD(dev_buffer, source, n);
}

void runtime::gpu::GPU_TensorView::read(void* target, size_t tensor_offset, size_t n) const
{
    if (tensor_offset + n > m_buffer_size)
    {
        throw out_of_range("read access past end of tensor");
    }
    const char* source = get_data_ptr();
    // cuMemcpyDtoH(target, dev_buffer, n);
}
...@@ -14,6 +14,7 @@
#pragma once

#include <cuda.h>
#include <memory>

#include "ngraph/runtime/tensor_view.hpp"
...@@ -36,9 +37,6 @@ public:
    GPU_TensorView(const ngraph::element::Type& element_type, const Shape& shape);
    virtual ~GPU_TensorView();

    char* get_data_ptr();
    const char* get_data_ptr() const;

    /// @brief Write bytes directly into the tensor
    /// @param p Pointer to source of data
    /// @param tensor_offset Offset into tensor storage to begin writing. Must be element-aligned.
...@@ -51,8 +49,12 @@
    /// @param n Number of bytes to read, must be integral number of elements.
    void read(void* p, size_t tensor_offset, size_t n) const override;

    // const char* get_data_ptr();
    // const char* get_data_ptr() const;

private:
    char* m_allocated_buffer_pool;
    char* m_aligned_buffer_pool;
    CUdeviceptr dev_buffer;
    // At some point need to deal with alignment
    size_t m_buffer_size;
};
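
The CUDA calls in GPU_TensorView are still commented out in this commit. Below is a minimal standalone sketch of how that allocation and transfer path could be wired up with the CUDA driver API, assuming the usual cuInit/cuCtxCreate context setup; buffer names and sizes are illustrative, and this is not the tensor view implementation itself.

// Standalone driver-API sketch: allocate, write, read back, free.
#include <cassert>
#include <cstring>
#include <vector>

#include <cuda.h>

static void check_cuda_errors(CUresult err)
{
    assert(err == CUDA_SUCCESS);
}

int main()
{
    check_cuda_errors(cuInit(0));
    CUdevice device;
    check_cuda_errors(cuDeviceGet(&device, 0));
    CUcontext context;
    check_cuda_errors(cuCtxCreate(&context, 0, device));

    std::vector<float> host_in = {1, 2, 3, 4};
    std::vector<float> host_out(4, 0);
    size_t buffer_size = host_in.size() * sizeof(float);

    CUdeviceptr dev_buffer;
    check_cuda_errors(cuMemAlloc(&dev_buffer, buffer_size));                   // constructor
    check_cuda_errors(cuMemcpyHtoD(dev_buffer, host_in.data(), buffer_size));  // write()
    check_cuda_errors(cuMemcpyDtoH(host_out.data(), dev_buffer, buffer_size)); // read()
    check_cuda_errors(cuMemFree(dev_buffer));                                  // destructor

    assert(std::memcmp(host_in.data(), host_out.data(), buffer_size) == 0);

    check_cuda_errors(cuCtxDestroy(context));
    return 0;
}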
// ----------------------------------------------------------------------------
// Copyright 2017 Nervana Systems Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// ----------------------------------------------------------------------------
#include "ngraph/runtime/gpu/gpu_tensor_view_wrapper.hpp"
#include "ngraph/descriptor/layout/tensor_view_layout.hpp"
#include "ngraph/descriptor/tensor.hpp"
using namespace std;
using namespace ngraph;
runtime::gpu::GPU_TensorViewWrapper::GPU_TensorViewWrapper(
const shared_ptr<descriptor::TensorView>& tv, const string& alias)
: m_tensor_view(tv)
, m_alias(alias)
{
}
size_t runtime::gpu::GPU_TensorViewWrapper::get_size() const
{
return m_tensor_view->get_tensor_view_layout()->get_size();
}
const vector<size_t>& runtime::gpu::GPU_TensorViewWrapper::get_shape() const
{
return m_tensor_view->get_tensor_view_layout()->get_shape();
}
const vector<size_t>& runtime::gpu::GPU_TensorViewWrapper::get_strides() const
{
return m_tensor_view->get_tensor_view_layout()->get_strides();
}
const element::Type& runtime::gpu::GPU_TensorViewWrapper::get_element_type() const
{
return m_tensor_view->get_tensor_view_layout()->get_element_type();
}
const std::string& runtime::gpu::GPU_TensorViewWrapper::get_name() const
{
if (m_alias.empty())
{
return m_tensor_view->get_tensor().get_name();
}
else
{
return m_alias;
}
}
const std::string& runtime::gpu::GPU_TensorViewWrapper::get_type() const
{
return get_element_type().c_type_string();
}
bool runtime::gpu::GPU_TensorViewWrapper::is_output() const
{
return m_tensor_view->get_tensor().is_output();
}
// ----------------------------------------------------------------------------
// Copyright 2017 Nervana Systems Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// ----------------------------------------------------------------------------
#pragma once
#include <memory>
#include "ngraph/descriptor/tensor_view.hpp"
#include "ngraph/types/element_type.hpp"
namespace ngraph
{
namespace runtime
{
namespace gpu
{
class GPU_TensorViewWrapper;
}
}
}
class ngraph::runtime::gpu::GPU_TensorViewWrapper
{
public:
GPU_TensorViewWrapper(const std::shared_ptr<descriptor::TensorView>&,
const std::string& alias = "");
size_t get_size() const;
const std::vector<size_t>& get_shape() const;
const std::vector<size_t>& get_strides() const;
const element::Type& get_element_type() const;
const std::string& get_name() const;
const std::string& get_type() const;
bool is_output() const;
private:
std::shared_ptr<descriptor::TensorView> m_tensor_view;
std::string m_alias;
};
...@@ -22,7 +22,11 @@
#include <cudnn.h>

#include "ngraph/codegen/compiler.hpp"
#include "ngraph/runtime/gpu/gpu_external_function.hpp"
#include "ngraph/ngraph.hpp"
#include "util/ndarray.hpp"
#include "util/test_tools.hpp"

using namespace ngraph;
using namespace std;
...@@ -41,7 +45,6 @@ TEST(cudnn, compileTest)
#include <iostream>
#include "cuda.h"

void check_cuda_errors(CUresult err) {
    assert(err == CUDA_SUCCESS);
}
...@@ -202,7 +205,6 @@ const auto str = R"(
    check_cuda_errors(cuMemcpyHtoD(dev_bufferA, &host_A[0], sizeof(float)*16));
    check_cuda_errors(cuMemcpyHtoD(dev_bufferB, &host_B[0], sizeof(float)*16));

    unsigned block_size_X = 16;
    unsigned block_size_Y = 1;
    unsigned block_size_Z = 1;
...@@ -223,13 +225,11 @@ const auto str = R"(
    // Retrieve device data
    check_cuda_errors(cuMemcpyDtoH(&host_C[0], dev_bufferC, sizeof(float)*16));

    std::cout << "Results:\n";
    for (unsigned i = 0; i != 16; ++i) {
        std::cout << host_A[i] << " + " << host_B[i] << " = " << host_C[i] << "\n";
    }

    // Clean up after ourselves
    delete [] host_A;
    delete [] host_B;
...@@ -261,4 +261,50 @@ TEST(cudnn, abc)
    auto external = manager->compile(f);
    auto backend = manager->allocate_backend();
    auto cf = backend->make_call_frame(external);

    // Create some tensors for input/output
    shared_ptr<runtime::TensorView> a = backend->make_primary_tensor_view(element::f32, shape);
    shared_ptr<runtime::TensorView> b = backend->make_primary_tensor_view(element::f32, shape);
    shared_ptr<runtime::TensorView> c = backend->make_primary_tensor_view(element::f32, shape);
    shared_ptr<runtime::TensorView> result = backend->make_primary_tensor_view(element::f32, shape);

    copy_data(a, test::NDArray<float, 2>({{1, 2}, {3, 4}}).get_vector());
    copy_data(b, test::NDArray<float, 2>({{5, 6}, {7, 8}}).get_vector());
    copy_data(c, test::NDArray<float, 2>({{9, 10}, {11, 12}}).get_vector());

    cf->call({a, b, c}, {result});
    EXPECT_EQ(result->get_vector<float>(),
              (test::NDArray<float, 2>({{54, 80}, {110, 144}})).get_vector());

    cf->call({b, a, c}, {result});
    EXPECT_EQ(result->get_vector<float>(),
              (test::NDArray<float, 2>({{54, 80}, {110, 144}})).get_vector());

    cf->call({a, c, b}, {result});
    EXPECT_EQ(result->get_vector<float>(),
              (test::NDArray<float, 2>({{50, 72}, {98, 128}})).get_vector());
}

TEST(cudnn, dot1d)
{
    auto shape = Shape{4};
    auto A = make_shared<op::Parameter>(element::f32, shape);
    auto B = make_shared<op::Parameter>(element::f32, shape);
    auto shape_r = Shape{};
    auto f = make_shared<Function>(make_shared<op::Dot>(A, B), op::Parameters{A, B});

    auto manager = runtime::Manager::get("GPU");
    auto external = manager->compile(f);
    auto backend = manager->allocate_backend();
    auto cf = backend->make_call_frame(external);

    // Create some tensors for input/output
    auto a = backend->make_primary_tensor_view(element::f32, shape);
    copy_data(a, vector<float>{2, 4, 8, 16});
    auto b = backend->make_primary_tensor_view(element::f32, shape);
    copy_data(b, vector<float>{1, 2, 4, 8});
    auto result = backend->make_primary_tensor_view(element::f32, shape_r);

    cf->call({a, b}, {result});
    EXPECT_EQ((vector<float>{170}), result->get_vector<float>());
}