Unverified commit c5549682, authored by Tristan Webb and committed by GitHub

Drwebb/gpu external function (#367)

* Initial GPU_ExternalFunction implementation

Other changes:

Add GPU runtime to the same CMake block as GPU, and include CUDA headers if GPU is enabled

Initial passing (a+b)*c test

Properly link CUDA libraries

Simple GPUTensorView implementation

Initial GPU emitter

Initial GPU codegen function generation; no kernels yet

Rename GPU emitter and tensor_view_wrapper to match naming convention

* GPU external function based on BASE

* Fix stray base -> gpu

* TensorViewWrapper -> GPU_TensorViewWrapper

* Copy over emitter from base transformer

* Fix for naming dense layout

* Copy kernel emitters from base -> gpu and strip out kernel_utils

* Add aliases to GPU_TensorViewWrappers

* More fixes for naming descriptor::TensorViews

* Move in call_frame implementation from base -> gpu

* apply code format

* GPU codegen running A+B*C

GPU emitters
GPU context setup and cuda_module kernels
Remove GPU_CF perf counters
Use GPU kernels in the external function
Add a GPU 1d Dot test

Review changes:
* Remove CPU-specific kernel-emitting method bodies

* Use copy_data from test/util.cpp, uncomment compileTest

* Use test_utils copy_data function

* Grab the function name from the pass manager for the definition; clean up indentation
parent e433e55a
......@@ -190,9 +190,12 @@ endif()
set(SRC ${SRC}
runtime/gpu/gpu_call_frame.cpp
runtime/gpu/gpu_backend.cpp
runtime/gpu/gpu_manager.cpp
runtime/gpu/gpu_emitter.cpp
runtime/gpu/gpu_external_function.cpp
runtime/gpu/gpu_kernel_emitters.cpp
runtime/gpu/gpu_manager.cpp
runtime/gpu/gpu_tensor_view.cpp
runtime/gpu/gpu_tensor_view_wrapper.cpp
)
set_property(SOURCE codegen/compiler.cpp APPEND_STRING PROPERTY COMPILE_DEFINITIONS
"CUDA_HEADER_PATHS=\"${CUDA_INCLUDE_DIRS}\";")
......@@ -265,7 +268,7 @@ if(NGRAPH_CPU_ENABLE)
endif()
if(NGRAPH_GPU_ENABLE AND CUDA_LIBRARIES)
target_link_libraries(ngraph PRIVATE ${CUDA_LIBRARIES} ${CUDNN_LIBRARIES})
target_link_libraries(ngraph PRIVATE cuda)
endif()
# Argon
......
......@@ -13,6 +13,7 @@
// ----------------------------------------------------------------------------
#include "ngraph/runtime/gpu/gpu_backend.hpp"
#include "ngraph/runtime/cpu/cpu_tensor_view.hpp"
#include "ngraph/runtime/external_function.hpp"
#include "ngraph/runtime/gpu/gpu_tensor_view.hpp"
......@@ -29,6 +30,6 @@ std::shared_ptr<ngraph::runtime::TensorView>
runtime::gpu::GPU_Backend::make_primary_tensor_view(const ngraph::element::Type& element_type,
const Shape& shape)
{
auto rc = make_shared<runtime::gpu::GPU_TensorView>(element_type, shape);
auto rc = make_shared<runtime::cpu::CPU_TensorView>(element_type, shape);
return dynamic_pointer_cast<runtime::TensorView>(rc);
}
......@@ -12,25 +12,66 @@
// see the license for the specific language governing permissions and
// ----------------------------------------------------------------------------
#include <cstdlib>
#include <fstream>
#include "ngraph/runtime/cpu/cpu_tensor_view.hpp"
#include "ngraph/runtime/gpu/gpu_call_frame.hpp"
#include "ngraph/runtime/gpu/gpu_external_function.hpp"
#include "ngraph/runtime/gpu/gpu_tensor_view.hpp"
using namespace std;
using namespace ngraph::runtime::gpu;
using namespace ngraph;
GPU_CallFrame::GPU_CallFrame(shared_ptr<GPU_ExternalFunction> external_function,
shared_ptr<Function> func)
runtime::gpu::GPU_CallFrame::GPU_CallFrame(std::shared_ptr<GPU_ExternalFunction> external_function,
EntryPoint compiled_function)
: m_external_function(external_function)
, m_function(func)
, m_compiled_function(compiled_function)
{
}
void GPU_CallFrame::call(const vector<shared_ptr<Value>>& input_tvs,
const vector<shared_ptr<Value>>& output_tvs)
void runtime::gpu::GPU_CallFrame::tensor_call(
const std::vector<std::shared_ptr<ngraph::runtime::TensorView>>& input_tvs,
const std::vector<std::shared_ptr<ngraph::runtime::TensorView>>& output_tvs)
{
// Host tensors
vector<void*> inputs;
vector<void*> outputs;
for (size_t i = 0; i < input_tvs.size(); i++)
{
shared_ptr<runtime::cpu::CPU_TensorView> tv =
static_pointer_cast<runtime::cpu::CPU_TensorView>(input_tvs[i]);
inputs.push_back(tv->get_data_ptr());
}
for (size_t i = 0; i < output_tvs.size(); i++)
{
shared_ptr<runtime::cpu::CPU_TensorView> tv =
static_pointer_cast<runtime::cpu::CPU_TensorView>(output_tvs[i]);
outputs.push_back(tv->get_data_ptr());
}
// Invoke compiled computation
m_compiled_function(inputs.data(), outputs.data());
}
void GPU_CallFrame::tensor_call(const std::vector<std::shared_ptr<TensorView>>& inputs,
const std::vector<std::shared_ptr<TensorView>>& outputs)
void runtime::gpu::GPU_CallFrame::call(
const std::vector<std::shared_ptr<runtime::TensorView>>& arguments,
const std::vector<std::shared_ptr<runtime::TensorView>>& results)
{
// TODO: Check types of args and result
vector<shared_ptr<runtime::TensorView>> inputs;
for (shared_ptr<runtime::TensorView> argument : arguments)
{
argument->collect_tensor_views(inputs, argument);
}
vector<shared_ptr<runtime::TensorView>> outputs;
for (shared_ptr<runtime::TensorView> result : results)
{
result->collect_tensor_views(outputs, result);
}
tensor_call(inputs, outputs);
}
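Here call() expands each argument into its constituent tensor views, and tensor_call() then hands raw host data pointers to the compiled entry point. A minimal standalone sketch of that calling convention, assuming EntryPoint is effectively void(void**, void**) as in the CPU backend; fake_compiled_abc merely stands in for the generated code:

#include <cstddef>
#include <functional>
#include <iostream>
#include <vector>

// Assumed shape of the compiled entry point: arrays of raw input/output pointers.
using EntryPoint = std::function<void(void**, void**)>;

// Stand-in for the generated code: result = (a + b) * c over four floats.
void fake_compiled_abc(void** inputs, void** outputs)
{
    const float* a = static_cast<const float*>(inputs[0]);
    const float* b = static_cast<const float*>(inputs[1]);
    const float* c = static_cast<const float*>(inputs[2]);
    float* r = static_cast<float*>(outputs[0]);
    for (size_t i = 0; i < 4; i++)
    {
        r[i] = (a[i] + b[i]) * c[i];
    }
}

int main()
{
    std::vector<float> a{1, 2, 3, 4}, b{5, 6, 7, 8}, c{9, 10, 11, 12}, r(4);

    // The call frame gathers data pointers like this before invoking
    // m_compiled_function(inputs.data(), outputs.data()).
    std::vector<void*> inputs{a.data(), b.data(), c.data()};
    std::vector<void*> outputs{r.data()};

    EntryPoint compiled = fake_compiled_abc;
    compiled(inputs.data(), outputs.data());

    for (float v : r)
    {
        std::cout << v << " "; // 54 80 110 144
    }
    std::cout << "\n";
}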
......@@ -26,6 +26,8 @@ namespace ngraph
{
namespace runtime
{
class PrimaryTensorView;
namespace gpu
{
class GPU_CallFrame;
......@@ -40,22 +42,23 @@ namespace ngraph
{
public:
GPU_CallFrame(std::shared_ptr<GPU_ExternalFunction> external_function,
std::shared_ptr<Function> func);
EntryPoint compiled_function);
/// @brief Invoke the function with values matching the signature of the function.
///
/// Tuples will be expanded into their tensor views to build the call frame.
void call(const std::vector<std::shared_ptr<ngraph::runtime::Value>>& inputs,
const std::vector<std::shared_ptr<ngraph::runtime::Value>>& outputs);
void
call(const std::vector<std::shared_ptr<runtime::TensorView>>& inputs,
const std::vector<std::shared_ptr<runtime::TensorView>>& outputs) override;
/// @brief Invoke the function with tuples pre-expanded to their underlying
/// tensor views.
void tensor_call(const std::vector<std::shared_ptr<TensorView>>& inputs,
const std::vector<std::shared_ptr<TensorView>>& outputs);
const std::vector<std::shared_ptr<TensorView>>& outputs) override;
protected:
std::shared_ptr<GPU_ExternalFunction> m_external_function;
std::shared_ptr<Function> m_function;
EntryPoint m_compiled_function;
};
}
}
......
This diff is collapsed.
// ----------------------------------------------------------------------------
// Copyright 2017 Nervana Systems Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// ----------------------------------------------------------------------------
#pragma once
#include <string>
#include <vector>
#include "ngraph/codegen/code_writer.hpp"
#include "ngraph/node.hpp"
#include "ngraph/runtime/gpu/gpu_external_function.hpp"
#include "ngraph/runtime/gpu/gpu_tensor_view_wrapper.hpp"
#define EMITTER_DECL(E) \
E(const ngraph::Node* n, \
const std::vector<ngraph::runtime::gpu::GPU_TensorViewWrapper>& args, \
const std::vector<ngraph::runtime::gpu::GPU_TensorViewWrapper>& out)
namespace ngraph
{
namespace runtime
{
namespace gpu
{
class GPU_Emitter
{
protected:
codegen::CodeWriter m_out;
bool m_use_ref_kernels;
public:
GPU_Emitter()
: m_out()
, m_use_ref_kernels(std::getenv("NGRAPH_GPU_USE_REF_KERNELS") != nullptr)
{
}
std::string get_code() { return m_out.get_code(); }
codegen::CodeWriter& get_code_writer() { return m_out; }
void EMITTER_DECL(EmitNop);
void EMITTER_DECL(EmitAdd);
void EMITTER_DECL(EmitDot);
void EMITTER_DECL(EmitMultiply);
void EMITTER_DECL(EmitGetOutputElement);
void EMITTER_DECL(EmitXLAGetTupleElement);
void EMITTER_DECL(EmitTuple);
void EMITTER_DECL(EmitAbs);
void EMITTER_DECL(EmitConcat);
void EMITTER_DECL(EmitDivide);
void EMITTER_DECL(EmitEqual);
void EMITTER_DECL(EmitGreater);
void EMITTER_DECL(EmitGreaterEq);
void EMITTER_DECL(EmitLess);
void EMITTER_DECL(EmitLessEq);
void EMITTER_DECL(EmitLog);
void EMITTER_DECL(EmitMaximum);
void EMITTER_DECL(EmitMinimum);
void EMITTER_DECL(EmitNegative);
void EMITTER_DECL(EmitNotEqual);
void EMITTER_DECL(EmitSelect);
void EMITTER_DECL(EmitSubtract);
void EMITTER_DECL(EmitBroadcast);
void EMITTER_DECL(EmitConvert);
void EMITTER_DECL(EmitConstant);
void EMITTER_DECL(EmitReshape);
void EMITTER_DECL(EmitFunctionCall);
void EMITTER_DECL(EmitReduce);
void EMITTER_DECL(EmitSign);
void EMITTER_DECL(EmitSlice);
void EMITTER_DECL(EmitSum);
void EMITTER_DECL(EmitExp);
void EMITTER_DECL(EmitSin);
void EMITTER_DECL(EmitSinh);
void EMITTER_DECL(EmitCos);
void EMITTER_DECL(EmitCosh);
void EMITTER_DECL(EmitTan);
void EMITTER_DECL(EmitTanh);
void EMITTER_DECL(EmitAsin);
void EMITTER_DECL(EmitAcos);
void EMITTER_DECL(EmitAtan);
void EMITTER_DECL(EmitPower);
void EMITTER_DECL(EmitReplaceSlice);
void EMITTER_DECL(EmitOneHot);
void EMITTER_DECL(EmitFloor);
void EMITTER_DECL(EmitCeiling);
void EMITTER_DECL(EmitSqrt);
void EMITTER_DECL(EmitConvolution);
void EMITTER_DECL(EmitNot);
void EMITTER_DECL(EmitMaxPool);
void EMITTER_DECL(EmitReverse);
private:
void generate_call(const std::vector<GPU_TensorViewWrapper>& args,
const std::vector<GPU_TensorViewWrapper>& out,
std::shared_ptr<Function> function);
std::string emit_vector(const GPU_TensorViewWrapper&, const std::string& name = "");
std::string emit_array1d(const GPU_TensorViewWrapper&,
const std::string& name = "");
std::string emit_matrix(const GPU_TensorViewWrapper&, const std::string& name = "");
};
}
}
}
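Each EMITTER_DECL(EmitX) line declares a member with the same three-parameter signature, and an emitter's job is to append generated source text to the CodeWriter rather than compute anything. The self-contained analogue below shows the same macro trick and emission style; MiniWriter and MiniEmitter are invented names for illustration only:

#include <iostream>
#include <sstream>
#include <string>
#include <vector>

// Toy stand-in for codegen::CodeWriter: collects emitted source as text.
struct MiniWriter
{
    std::ostringstream out;
    std::string get_code() const { return out.str(); }
};

// Same trick as EMITTER_DECL: keep the shared parameter list in one macro.
#define MINI_EMITTER_DECL(E)                                                                       \
    E(const std::string& op_name, const std::vector<std::string>& args,                            \
      const std::vector<std::string>& out)

struct MiniEmitter
{
    MiniWriter writer;
    void MINI_EMITTER_DECL(EmitAdd);
    void MINI_EMITTER_DECL(EmitMultiply);
};

void MiniEmitter::EmitAdd(const std::string&,
                          const std::vector<std::string>& args,
                          const std::vector<std::string>& out)
{
    // Emits source text; nothing is computed here.
    writer.out << out[0] << " = " << args[0] << " + " << args[1] << ";\n";
}

void MiniEmitter::EmitMultiply(const std::string&,
                               const std::vector<std::string>& args,
                               const std::vector<std::string>& out)
{
    writer.out << out[0] << " = " << args[0] << " * " << args[1] << ";\n";
}

int main()
{
    MiniEmitter e;
    e.EmitAdd("Add", {"a", "b"}, {"t0"});
    e.EmitMultiply("Multiply", {"t0", "c"}, {"result"});
    std::cout << e.writer.get_code(); // t0 = a + b;  result = t0 * c;
}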
......@@ -20,9 +20,13 @@
#include <typeinfo>
#include <unordered_map>
#include "ngraph/codegen/code_writer.hpp"
#include "ngraph/codegen/compiler.hpp"
#include "ngraph/codegen/execution_engine.hpp"
#include "ngraph/function.hpp"
#include "ngraph/runtime/external_function.hpp"
#include "ngraph/runtime/gpu/gpu_call_frame.hpp"
#include "ngraph/runtime/gpu/gpu_tensor_view_wrapper.hpp"
namespace ngraph
{
......@@ -30,9 +34,23 @@ namespace ngraph
{
namespace gpu
{
class GPU_ExternalFunction;
class GPU_Emitter;
class GPU_CallFrame;
using OpFunction =
std::function<void(GPU_Emitter*,
const ngraph::Node*,
const std::vector<GPU_TensorViewWrapper>& inputs,
const std::vector<GPU_TensorViewWrapper>& outputs)>;
using OpMap = std::unordered_map<std::type_index, OpFunction>;
class GPU_ExternalFunction : public ngraph::runtime::ExternalFunction,
public std::enable_shared_from_this<GPU_ExternalFunction>
{
friend class GPU_CallFrame;
public:
GPU_ExternalFunction(const std::shared_ptr<ngraph::Function>& function,
bool release_function = true);
......@@ -41,7 +59,27 @@ namespace ngraph
protected:
void compile();
std::shared_ptr<ngraph::Function> m_function;
EntryPoint m_compiled_function;
private:
void emit_debug_function_entry(codegen::CodeWriter& writer,
Node* node,
const std::vector<GPU_TensorViewWrapper>& in,
const std::vector<GPU_TensorViewWrapper>& out);
void emit_debug_function_exit(codegen::CodeWriter& writer,
Node* node,
const std::vector<GPU_TensorViewWrapper>& in,
const std::vector<GPU_TensorViewWrapper>& out);
void handle_output_alias(
codegen::CodeWriter& writer,
const Node&,
const std::unordered_map<descriptor::TensorView*, std::vector<size_t>>&);
std::unique_ptr<codegen::Compiler> m_compiler;
std::unique_ptr<codegen::ExecutionEngine> m_execution_engine;
bool m_emit_timing;
bool m_use_tbb;
std::unordered_map<std::string, std::string> m_variable_name_map;
};
}
}
......
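OpMap keys each op's emitter function by std::type_index, so compile() can look up the right GPU_Emitter member for every node it visits. The standalone sketch below mirrors that dispatch pattern with dummy node and emitter types (all names are invented for the example):

#include <functional>
#include <iostream>
#include <memory>
#include <stdexcept>
#include <typeindex>
#include <unordered_map>
#include <vector>

// Dummy node hierarchy standing in for ngraph::Node and its op subclasses.
struct Node { virtual ~Node() = default; };
struct Add : Node {};
struct Multiply : Node {};

struct Emitter
{
    void emit_add(const Node*) { std::cout << "emit Add kernel call\n"; }
    void emit_multiply(const Node*) { std::cout << "emit Multiply kernel call\n"; }
};

using OpFunction = std::function<void(Emitter*, const Node*)>;
using OpMap = std::unordered_map<std::type_index, OpFunction>;

int main()
{
    // Analogous to the GPU_ExternalFunction dispatcher: typeid -> emitter member.
    OpMap dispatcher{
        {std::type_index(typeid(Add)), &Emitter::emit_add},
        {std::type_index(typeid(Multiply)), &Emitter::emit_multiply},
    };

    std::vector<std::unique_ptr<Node>> graph;
    graph.emplace_back(new Add);
    graph.emplace_back(new Multiply);

    Emitter emitter;
    for (const auto& node : graph)
    {
        auto it = dispatcher.find(std::type_index(typeid(*node)));
        if (it == dispatcher.end())
        {
            throw std::runtime_error("unhandled op");
        }
        it->second(&emitter, node.get());
    }
}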
// ----------------------------------------------------------------------------
// Copyright 2017 Nervana Systems Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// ----------------------------------------------------------------------------
#include <algorithm>
#include <map>
#include "ngraph/codegen/code_writer.hpp"
#include "ngraph/runtime/gpu/gpu_kernel_emitters.hpp"
using namespace ngraph;
using namespace ngraph::runtime::gpu::kernel;
void ngraph::runtime::gpu::kernel::emit_broadcast(codegen::CodeWriter& writer,
const std::string& element_type,
const std::string& arg0, // replacement context
const std::string& out,
const Shape& arg0_shape,
const Shape& out_shape,
const AxisSet& broadcast_axes)
{
}
//
// For the reference kernel this is based on, see ngraph/runtime/kernel/concat.hpp.
//
void ngraph::runtime::gpu::kernel::emit_concat(codegen::CodeWriter& writer,
const std::string& element_type,
const std::vector<std::string>& args,
const std::string& out,
const std::vector<Shape>& in_shapes,
const Shape& out_shape,
size_t concatenation_axis)
{
}
void ngraph::runtime::gpu::kernel::emit_replace_slice(
codegen::CodeWriter& writer,
const std::string& element_type,
const std::string& arg0, // replacement context
const std::string& arg1, // replacement value
const std::string& out,
const Shape& arg1_shape,
const Shape& out_shape,
const Coordinate& lower_bounds,
const Coordinate& upper_bounds,
const Strides& strides)
{
}
void ngraph::runtime::gpu::kernel::emit_slice(codegen::CodeWriter& writer,
const std::string& element_type,
const std::string& arg0, // replacement context
const std::string& out,
const Shape& arg0_shape,
const Shape& out_shape,
const Coordinate& lower_bounds,
const Coordinate& upper_bounds,
const Strides& strides)
{
}
void ngraph::runtime::gpu::kernel::emit_reshape(codegen::CodeWriter& writer,
const std::string& element_type,
const std::string& arg0, // replacement context
const std::string& out,
const Shape& arg0_shape,
const Shape& out_shape,
const AxisVector& arg0_axis_order)
{
}
void ngraph::runtime::gpu::kernel::emit_sum(codegen::CodeWriter& writer,
const std::string& element_type,
const std::string& arg0, // replacement context
const std::string& out,
const Shape& arg0_shape,
const Shape& out_shape,
const AxisSet& reduction_axes)
{
}
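The kernel emitters above are still empty stubs. To show the kind of text one of them might eventually write through the CodeWriter, here is a standalone sketch that emits a naive scalar-broadcast loop as a string; emit_broadcast_sketch and the emitted code are hypothetical, not what the real GPU emitter will produce:

#include <cstddef>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

// Hypothetical sketch: write a flat loop that broadcasts a scalar argument
// across the output, the way a kernel emitter would write into CodeWriter.
std::string emit_broadcast_sketch(const std::string& element_type,
                                  const std::string& arg0,
                                  const std::string& out,
                                  const std::vector<size_t>& out_shape)
{
    size_t count = 1;
    for (size_t d : out_shape)
    {
        count *= d;
    }
    std::ostringstream writer;
    writer << "// broadcast a " << element_type << " scalar across the output\n";
    writer << "for (size_t i = 0; i < " << count << "; i++)\n";
    writer << "{\n";
    writer << "    " << out << "[i] = " << arg0 << "[0];\n";
    writer << "}\n";
    return writer.str();
}

int main()
{
    // Broadcast a scalar argument across a 2x2 output tensor.
    std::cout << emit_broadcast_sketch("float", "arg0", "out0", {2, 2});
}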
// ----------------------------------------------------------------------------
// Copyright 2017 Nervana Systems Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// ----------------------------------------------------------------------------
#pragma once
#include "ngraph/codegen/code_writer.hpp"
#include "ngraph/common.hpp"
namespace ngraph
{
namespace runtime
{
namespace gpu
{
namespace kernel
{
void emit_broadcast(codegen::CodeWriter& writer,
const std::string& element_type,
const std::string& arg0, // replacement context
const std::string& out,
const Shape& arg0_shape,
const Shape& out_shape,
const AxisSet& broadcast_axes);
void emit_concat(codegen::CodeWriter& writer,
const std::string& element_type,
const std::vector<std::string>& args,
const std::string& out,
const std::vector<Shape>& in_shapes,
const Shape& out_shape,
const size_t concatenation_axis);
void emit_replace_slice(codegen::CodeWriter& writer,
const std::string& element_type,
const std::string& arg0, // replacement context
const std::string& arg1, // replacement value
const std::string& out,
const Shape& arg1_shape,
const Shape& out_shape,
const Coordinate& lower_bounds,
const Coordinate& upper_bounds,
const Strides& strides);
void emit_slice(codegen::CodeWriter& writer,
const std::string& element_type,
const std::string& arg0, // replacement context
const std::string& out,
const Shape& arg0_shape,
const Shape& out_shape,
const Coordinate& lower_bounds,
const Coordinate& upper_bounds,
const Strides& strides);
void emit_reshape(codegen::CodeWriter& writer,
const std::string& element_type,
const std::string& arg0, // replacement context
const std::string& out,
const Shape& arg0_shape,
const Shape& out_shape,
const AxisVector& arg0_axis_order);
void emit_sum(codegen::CodeWriter& writer,
const std::string& element_type,
const std::string& arg0, // replacement context
const std::string& out,
const Shape& arg0_shape,
const Shape& out_shape,
const AxisSet& reduction_axes);
}
}
}
}
......@@ -14,6 +14,8 @@
#include <memory>
#include <cuda.h>
#include "ngraph/descriptor/layout/dense_tensor_view_layout.hpp"
#include "ngraph/descriptor/primary_tensor_view.hpp"
#include "ngraph/runtime/gpu/gpu_backend.hpp"
......@@ -30,59 +32,26 @@ runtime::gpu::GPU_TensorView::GPU_TensorView(const ngraph::element::Type& elemen
true,
true,
false))
, m_allocated_buffer_pool(nullptr)
, m_aligned_buffer_pool(nullptr)
{
// Need to check type and have host/device tensors
m_descriptor->set_tensor_view_layout(
std::make_shared<ngraph::descriptor::layout::DenseTensorViewLayout>(*m_descriptor));
m_buffer_size = m_descriptor->get_tensor_view_layout()->get_size() * element_type.size();
if (m_buffer_size > 0)
{
size_t allocation_size = m_buffer_size + runtime::gpu::alignment;
m_allocated_buffer_pool = static_cast<char*>(malloc(allocation_size));
m_aligned_buffer_pool = m_allocated_buffer_pool;
size_t mod = size_t(m_aligned_buffer_pool) % alignment;
if (mod != 0)
{
m_aligned_buffer_pool += (alignment - mod);
}
}
}
runtime::gpu::GPU_TensorView::~GPU_TensorView()
{
if (m_allocated_buffer_pool != nullptr)
{
free(m_allocated_buffer_pool);
}
}
char* runtime::gpu::GPU_TensorView::get_data_ptr()
{
return m_aligned_buffer_pool;
// cuMemAlloc(&dev_buffer, m_buffer_size);
}
const char* runtime::gpu::GPU_TensorView::get_data_ptr() const
runtime::gpu::GPU_TensorView::~GPU_TensorView()
{
return m_aligned_buffer_pool;
// cuMemFree(dev_buffer);
}
void runtime::gpu::GPU_TensorView::write(const void* source, size_t tensor_offset, size_t n)
{
if (tensor_offset + n > m_buffer_size)
{
throw out_of_range("write access past end of tensor");
}
char* target = get_data_ptr();
// cuMemcpyHtoD(dev_buffer, source, n);
}
void runtime::gpu::GPU_TensorView::read(void* target, size_t tensor_offset, size_t n) const
{
if (tensor_offset + n > m_buffer_size)
{
throw out_of_range("read access past end of tensor");
}
const char* source = get_data_ptr();
// cuMemcpyDtoH(target, dev_buffer, n);
}
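With the cuMemAlloc/cuMemcpy calls still commented out, read() and write() do not yet touch the device. A minimal standalone sketch of the intended CUDA driver API flow, allocating a device buffer and copying host memory to the device and back, with error handling reduced to an assert:

#include <cassert>
#include <cstring>
#include <iostream>
#include <vector>

#include <cuda.h>

static void check(CUresult err)
{
    assert(err == CUDA_SUCCESS);
}

int main()
{
    check(cuInit(0));
    CUdevice device;
    check(cuDeviceGet(&device, 0));
    CUcontext context;
    check(cuCtxCreate(&context, 0, device));

    // What GPU_TensorView's constructor is expected to do: allocate
    // m_buffer_size bytes of device memory and keep the CUdeviceptr handle.
    const size_t buffer_size = 4 * sizeof(float);
    CUdeviceptr dev_buffer;
    check(cuMemAlloc(&dev_buffer, buffer_size));

    // write(): host -> device, read(): device -> host.
    std::vector<float> host_in{1.0f, 2.0f, 3.0f, 4.0f};
    std::vector<float> host_out(4, 0.0f);
    check(cuMemcpyHtoD(dev_buffer, host_in.data(), buffer_size));
    check(cuMemcpyDtoH(host_out.data(), dev_buffer, buffer_size));

    std::cout << "round trip ok: " << std::boolalpha
              << (std::memcmp(host_in.data(), host_out.data(), buffer_size) == 0) << "\n";

    // ~GPU_TensorView() counterpart.
    check(cuMemFree(dev_buffer));
    check(cuCtxDestroy(context));
}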
......@@ -14,6 +14,7 @@
#pragma once
#include <cuda.h>
#include <memory>
#include "ngraph/runtime/tensor_view.hpp"
......@@ -36,9 +37,6 @@ public:
GPU_TensorView(const ngraph::element::Type& element_type, const Shape& shape);
virtual ~GPU_TensorView();
char* get_data_ptr();
const char* get_data_ptr() const;
/// @brief Write bytes directly into the tensor
/// @param p Pointer to source of data
/// @param tensor_offset Offset into tensor storage to begin writing. Must be element-aligned.
......@@ -51,8 +49,12 @@ public:
/// @param n Number of bytes to read, must be integral number of elements.
void read(void* p, size_t tensor_offset, size_t n) const override;
// const char* get_data_ptr();
// const char* get_data_ptr() const;
private:
char* m_allocated_buffer_pool;
char* m_aligned_buffer_pool;
CUdeviceptr dev_buffer;
// At some point need to deal with alignment
size_t m_buffer_size;
};
// ----------------------------------------------------------------------------
// Copyright 2017 Nervana Systems Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// ----------------------------------------------------------------------------
#include "ngraph/runtime/gpu/gpu_tensor_view_wrapper.hpp"
#include "ngraph/descriptor/layout/tensor_view_layout.hpp"
#include "ngraph/descriptor/tensor.hpp"
using namespace std;
using namespace ngraph;
runtime::gpu::GPU_TensorViewWrapper::GPU_TensorViewWrapper(
const shared_ptr<descriptor::TensorView>& tv, const string& alias)
: m_tensor_view(tv)
, m_alias(alias)
{
}
size_t runtime::gpu::GPU_TensorViewWrapper::get_size() const
{
return m_tensor_view->get_tensor_view_layout()->get_size();
}
const vector<size_t>& runtime::gpu::GPU_TensorViewWrapper::get_shape() const
{
return m_tensor_view->get_tensor_view_layout()->get_shape();
}
const vector<size_t>& runtime::gpu::GPU_TensorViewWrapper::get_strides() const
{
return m_tensor_view->get_tensor_view_layout()->get_strides();
}
const element::Type& runtime::gpu::GPU_TensorViewWrapper::get_element_type() const
{
return m_tensor_view->get_tensor_view_layout()->get_element_type();
}
const std::string& runtime::gpu::GPU_TensorViewWrapper::get_name() const
{
if (m_alias.empty())
{
return m_tensor_view->get_tensor().get_name();
}
else
{
return m_alias;
}
}
const std::string& runtime::gpu::GPU_TensorViewWrapper::get_type() const
{
return get_element_type().c_type_string();
}
bool runtime::gpu::GPU_TensorViewWrapper::is_output() const
{
return m_tensor_view->get_tensor().is_output();
}
// ----------------------------------------------------------------------------
// Copyright 2017 Nervana Systems Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// ----------------------------------------------------------------------------
#pragma once
#include <memory>
#include "ngraph/descriptor/tensor_view.hpp"
#include "ngraph/types/element_type.hpp"
namespace ngraph
{
namespace runtime
{
namespace gpu
{
class GPU_TensorViewWrapper;
}
}
}
class ngraph::runtime::gpu::GPU_TensorViewWrapper
{
public:
GPU_TensorViewWrapper(const std::shared_ptr<descriptor::TensorView>&,
const std::string& alias = "");
size_t get_size() const;
const std::vector<size_t>& get_shape() const;
const std::vector<size_t>& get_strides() const;
const element::Type& get_element_type() const;
const std::string& get_name() const;
const std::string& get_type() const;
bool is_output() const;
private:
std::shared_ptr<descriptor::TensorView> m_tensor_view;
std::string m_alias;
};
......@@ -22,7 +22,11 @@
#include <cudnn.h>
#include "ngraph/codegen/compiler.hpp"
#include "ngraph/runtime/gpu/gpu_external_function.hpp"
#include "ngraph/ngraph.hpp"
#include "util/ndarray.hpp"
#include "util/test_tools.hpp"
using namespace ngraph;
using namespace std;
......@@ -41,7 +45,6 @@ TEST(cudnn, compileTest)
#include <iostream>
#include "cuda.h"
void check_cuda_errors(CUresult err) {
assert(err == CUDA_SUCCESS);
}
......@@ -202,7 +205,6 @@ const auto str = R"(
check_cuda_errors(cuMemcpyHtoD(dev_bufferA, &host_A[0], sizeof(float)*16));
check_cuda_errors(cuMemcpyHtoD(dev_bufferB, &host_B[0], sizeof(float)*16));
unsigned block_size_X = 16;
unsigned block_size_Y = 1;
unsigned block_size_Z = 1;
......@@ -223,13 +225,11 @@ const auto str = R"(
// Retrieve device data
check_cuda_errors(cuMemcpyDtoH(&host_C[0], dev_bufferC, sizeof(float)*16));
std::cout << "Results:\n";
for (unsigned i = 0; i != 16; ++i) {
std::cout << host_A[i] << " + " << host_B[i] << " = " << host_C[i] << "\n";
}
// Clean up after ourselves
delete [] host_A;
delete [] host_B;
......@@ -261,4 +261,50 @@ TEST(cudnn, abc)
auto external = manager->compile(f);
auto backend = manager->allocate_backend();
auto cf = backend->make_call_frame(external);
// Create some tensors for input/output
shared_ptr<runtime::TensorView> a = backend->make_primary_tensor_view(element::f32, shape);
shared_ptr<runtime::TensorView> b = backend->make_primary_tensor_view(element::f32, shape);
shared_ptr<runtime::TensorView> c = backend->make_primary_tensor_view(element::f32, shape);
shared_ptr<runtime::TensorView> result = backend->make_primary_tensor_view(element::f32, shape);
copy_data(a, test::NDArray<float, 2>({{1, 2}, {3, 4}}).get_vector());
copy_data(b, test::NDArray<float, 2>({{5, 6}, {7, 8}}).get_vector());
copy_data(c, test::NDArray<float, 2>({{9, 10}, {11, 12}}).get_vector());
cf->call({a, b, c}, {result});
EXPECT_EQ(result->get_vector<float>(),
(test::NDArray<float, 2>({{54, 80}, {110, 144}})).get_vector());
cf->call({b, a, c}, {result});
EXPECT_EQ(result->get_vector<float>(),
(test::NDArray<float, 2>({{54, 80}, {110, 144}})).get_vector());
cf->call({a, c, b}, {result});
EXPECT_EQ(result->get_vector<float>(),
(test::NDArray<float, 2>({{50, 72}, {98, 128}})).get_vector());
}
TEST(cudnn, dot1d)
{
auto shape = Shape{4};
auto A = make_shared<op::Parameter>(element::f32, shape);
auto B = make_shared<op::Parameter>(element::f32, shape);
auto shape_r = Shape{};
auto f = make_shared<Function>(make_shared<op::Dot>(A, B), op::Parameters{A, B});
auto manager = runtime::Manager::get("GPU");
auto external = manager->compile(f);
auto backend = manager->allocate_backend();
auto cf = backend->make_call_frame(external);
// Create some tensors for input/output
auto a = backend->make_primary_tensor_view(element::f32, shape);
copy_data(a, vector<float>{2, 4, 8, 16});
auto b = backend->make_primary_tensor_view(element::f32, shape);
copy_data(b, vector<float>{1, 2, 4, 8});
auto result = backend->make_primary_tensor_view(element::f32, shape_r);
cf->call({a, b}, {result});
EXPECT_EQ((vector<float>{170}), result->get_vector<float>());
}