Commit e189f9c6 authored by Ashok Emani, committed by Nick Korovaiko

enable TensorView to use pre-allocated memory (#795)

* enable TensorView to use pre-allocated memory

* proper check for nullptr

* add unit test for custom memory with TensorView; address feedback

* minor fix from feedback

* support custom memory for GPU TensorView

* feedback fixes and code formatting
parent 913855ec
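The overload added below lets a caller hand an existing buffer to make_primary_tensor_view so the resulting TensorView aliases that memory instead of allocating its own. A minimal usage sketch, modelled on the unit test at the end of this diff; the umbrella include and the "INTERPRETER" backend name are assumptions about the surrounding project setup, not part of this commit:

#include <memory>
#include <vector>
#include "ngraph/ngraph.hpp" // assumed umbrella header

using namespace ngraph;

void divide_with_preallocated_buffers()
{
    Shape shape{2, 2};
    auto A = std::make_shared<op::Parameter>(element::f32, shape);
    auto B = std::make_shared<op::Parameter>(element::f32, shape);
    auto f = std::make_shared<Function>(std::make_shared<op::Divide>(A, B),
                                        op::ParameterVector{A, B});

    auto manager = runtime::Manager::get("INTERPRETER"); // backend name assumed
    auto backend = manager->allocate_backend();
    auto cf = backend->make_call_frame(manager->compile(f));

    // Caller-owned buffers; the tensor views alias them directly, so nothing
    // needs to be copied in before the call or read back out afterwards.
    std::vector<float> av{2, 4, 8, 16};
    std::vector<float> bv{1, 2, 4, 8};
    std::vector<float> rv(4, 0);

    auto a = backend->make_primary_tensor_view(element::f32, shape, av.data());
    auto b = backend->make_primary_tensor_view(element::f32, shape, bv.data());
    auto r = backend->make_primary_tensor_view(element::f32, shape, rv.data());

    cf->call({r}, {a, b}); // rv now holds {2, 2, 2, 2}
}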
@@ -52,6 +52,12 @@ namespace ngraph
make_primary_tensor_view(const ngraph::element::Type& element_type,
const Shape& shape) = 0;
/// @brief Return a handle for a tensor view backed by the given pre-allocated memory on the backend device
virtual std::shared_ptr<ngraph::runtime::TensorView>
make_primary_tensor_view(const ngraph::element::Type& element_type,
const Shape& shape,
void* memory_pointer) = 0;
template <typename T>
std::shared_ptr<ngraph::runtime::TensorView>
make_primary_tensor_view(const Shape& shape)
......
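Since the new overload is pure virtual, every backend has to provide it. One property the header does not state, but which the destructor changes further down imply (the CPU and host views only free buffers they allocated themselves, and the GPU view guards cudaFree with m_custom_memory): the view does not take ownership of caller-supplied memory, so the buffer must outlive the view and be released by the caller. A small sketch of that contract, with the same assumed include as above:

#include <memory>
#include <vector>
#include "ngraph/ngraph.hpp" // assumed umbrella header

void ownership_sketch(const std::shared_ptr<ngraph::runtime::Backend>& backend)
{
    ngraph::Shape shape{2, 2};
    std::vector<float> data(ngraph::shape_size(shape), 0.0f); // caller-owned storage
    {
        auto tv = backend->make_primary_tensor_view(ngraph::element::f32, shape, data.data());
        // ... use tv while `data` is alive ...
    } // destroying tv leaves `data` untouched
} // `data` is released here by the vector, never by the TensorView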
@@ -35,3 +35,10 @@ std::shared_ptr<ngraph::runtime::TensorView>
auto rc = make_shared<runtime::cpu::CPUTensorView>(element_type, shape);
return dynamic_pointer_cast<runtime::TensorView>(rc);
}
std::shared_ptr<ngraph::runtime::TensorView> runtime::cpu::CPU_Backend::make_primary_tensor_view(
const ngraph::element::Type& element_type, const Shape& shape, void* memory_pointer)
{
auto rc = make_shared<runtime::cpu::CPUTensorView>(element_type, shape, memory_pointer);
return dynamic_pointer_cast<runtime::TensorView>(rc);
}
@@ -34,6 +34,11 @@ namespace ngraph
std::shared_ptr<ngraph::runtime::TensorView>
make_primary_tensor_view(const ngraph::element::Type& element_type,
const Shape& shape) override;
std::shared_ptr<ngraph::runtime::TensorView>
make_primary_tensor_view(const ngraph::element::Type& element_type,
const Shape& shape,
void* memory_pointer) override;
};
}
}
......
@@ -36,6 +36,7 @@ const size_t runtime::cpu::CPUTensorView::BufferAlignment = 64;
runtime::cpu::CPUTensorView::CPUTensorView(const ngraph::element::Type& element_type,
const Shape& shape,
void* memory_pointer,
const string& name)
: runtime::TensorView(std::make_shared<ngraph::descriptor::PrimaryTensorView>(
std::make_shared<ngraph::TensorViewType>(element_type, shape), name, true, true, false))
@@ -49,7 +50,12 @@ runtime::cpu::CPUTensorView::CPUTensorView(const ngraph::element::Type& element_
*m_descriptor, runtime::cpu::LayoutDescriptor::create_native_axis_order(shape.size())));
buffer_size = shape_size(shape) * element_type.size();
if (buffer_size)
if (memory_pointer != nullptr)
{
aligned_buffer = static_cast<char*>(memory_pointer);
}
else if (buffer_size > 0)
{
size_t allocation_size = buffer_size + BufferAlignment;
auto ptr = malloc(allocation_size);
@@ -72,6 +78,13 @@ runtime::cpu::CPUTensorView::CPUTensorView(const ngraph::element::Type& element_
}
}
runtime::cpu::CPUTensorView::CPUTensorView(const ngraph::element::Type& element_type,
const Shape& shape,
const string& name)
: CPUTensorView(element_type, shape, nullptr, name)
{
}
runtime::cpu::CPUTensorView::~CPUTensorView()
{
free(buffer);
......
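The default CPU path over-allocates and aligns its buffer to BufferAlignment (64 bytes), while a caller-supplied pointer is used exactly as passed in. The new unit test passes plain std::vector storage, so no particular alignment appears to be required; the sketch below is only for callers who want to match the 64-byte alignment of the default path, and posix_memalign is an assumption about the caller's (POSIX) environment, not something this commit uses:

#include <stdlib.h>

// Allocate `count` floats aligned to the same 64-byte boundary that
// CPUTensorView uses for its own allocations. The caller frees with free().
float* allocate_aligned_floats(size_t count)
{
    void* p = nullptr;
    if (posix_memalign(&p, 64, count * sizeof(float)) != 0)
    {
        return nullptr;
    }
    return static_cast<float*>(p);
}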
@@ -33,6 +33,10 @@ namespace ngraph
CPUTensorView(const ngraph::element::Type& element_type,
const Shape& shape,
const std::string& name = "external");
CPUTensorView(const ngraph::element::Type& element_type,
const Shape& shape,
void* memory_pointer,
const std::string& name = "external");
virtual ~CPUTensorView() override;
char* get_data_ptr();
......
@@ -34,3 +34,10 @@ std::shared_ptr<ngraph::runtime::TensorView>
auto rc = make_shared<runtime::gpu::GPU_TensorView>(element_type, shape);
return dynamic_pointer_cast<runtime::TensorView>(rc);
}
std::shared_ptr<ngraph::runtime::TensorView> runtime::gpu::GPU_Backend::make_primary_tensor_view(
const ngraph::element::Type& element_type, const Shape& shape, void* memory_pointer)
{
auto rc = make_shared<runtime::gpu::GPU_TensorView>(element_type, shape, memory_pointer);
return dynamic_pointer_cast<runtime::TensorView>(rc);
}
@@ -36,6 +36,11 @@ namespace ngraph
std::shared_ptr<ngraph::runtime::TensorView>
make_primary_tensor_view(const ngraph::element::Type& element_type,
const Shape& shape) override;
std::shared_ptr<ngraph::runtime::TensorView>
make_primary_tensor_view(const ngraph::element::Type& element_type,
const Shape& shape,
void* memory_pointer) override;
};
}
}
......
@@ -27,27 +27,43 @@ using namespace ngraph;
using namespace std;
runtime::gpu::GPU_TensorView::GPU_TensorView(const ngraph::element::Type& element_type,
const Shape& shape)
const Shape& shape,
void* memory_pointer)
: runtime::TensorView(std::make_shared<ngraph::descriptor::PrimaryTensorView>(
std::make_shared<ngraph::TensorViewType>(element_type, shape),
"external",
true,
true,
false))
, m_custom_memory(false)
{
m_descriptor->set_tensor_view_layout(
std::make_shared<ngraph::descriptor::layout::DenseTensorViewLayout>(*m_descriptor));
m_buffer_size = shape_size(shape) * element_type.size();
if (m_buffer_size > 0)
if (memory_pointer != nullptr)
{
m_allocated_buffer_pool = memory_pointer;
m_custom_memory = true;
}
else if (m_buffer_size > 0)
{
cudaMalloc(static_cast<void**>(&m_allocated_buffer_pool), m_buffer_size);
}
}
runtime::gpu::GPU_TensorView::GPU_TensorView(const ngraph::element::Type& element_type,
const Shape& shape)
: GPU_TensorView(element_type, shape, nullptr)
{
}
runtime::gpu::GPU_TensorView::~GPU_TensorView()
{
cudaFree(m_allocated_buffer_pool);
if (!m_custom_memory)
{
cudaFree(m_allocated_buffer_pool);
}
}
void runtime::gpu::GPU_TensorView::write(const void* source, size_t tensor_offset, size_t n)
......
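On the GPU the caller's pointer is stored straight into m_allocated_buffer_pool, the same member the default path fills via cudaMalloc, so a custom pointer is presumably expected to be device memory; the m_custom_memory flag then keeps the destructor from cudaFree-ing a buffer the backend does not own. The new test skips the GPU backend, so the following is a hedged sketch rather than something this commit exercises:

#include <memory>
#include <cuda_runtime.h>
#include "ngraph/ngraph.hpp" // assumed umbrella header

void gpu_custom_memory_sketch(const std::shared_ptr<ngraph::runtime::Backend>& backend)
{
    ngraph::Shape shape{2, 2};
    void* device_buf = nullptr;
    cudaMalloc(&device_buf, 4 * sizeof(float)); // caller-owned device memory

    {
        auto tv = backend->make_primary_tensor_view(ngraph::element::f32, shape, device_buf);
        // ... run a call frame that reads or writes tv ...
    } // ~GPU_TensorView skips cudaFree because m_custom_memory is true

    cudaFree(device_buf); // the caller releases the device buffer
}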
@@ -37,6 +37,9 @@ class ngraph::runtime::gpu::GPU_TensorView : public ngraph::runtime::TensorView
{
public:
GPU_TensorView(const ngraph::element::Type& element_type, const Shape& shape);
GPU_TensorView(const ngraph::element::Type& element_type,
const Shape& shape,
void* memory_pointer);
virtual ~GPU_TensorView();
/// @brief Write bytes directly into the tensor
@@ -53,4 +56,5 @@ public:
void* m_allocated_buffer_pool;
size_t m_buffer_size;
bool m_custom_memory;
};
@@ -26,6 +26,7 @@ using namespace std;
runtime::HostTensorView::HostTensorView(const ngraph::element::Type& element_type,
const Shape& shape,
void* memory_pointer,
const string& name)
: runtime::TensorView(std::make_shared<ngraph::descriptor::PrimaryTensorView>(
std::make_shared<ngraph::TensorViewType>(element_type, shape), name, true, true, false))
@@ -37,7 +38,12 @@ runtime::HostTensorView::HostTensorView(const ngraph::element::Type& element_typ
std::make_shared<ngraph::descriptor::layout::DenseTensorViewLayout>(*m_descriptor));
m_buffer_size = m_descriptor->get_tensor_view_layout()->get_size() * element_type.size();
if (m_buffer_size > 0)
if (memory_pointer != nullptr)
{
m_aligned_buffer_pool = static_cast<char*>(memory_pointer);
}
else if (m_buffer_size > 0)
{
size_t allocation_size = m_buffer_size + runtime::alignment;
m_allocated_buffer_pool = static_cast<char*>(malloc(allocation_size));
@@ -50,6 +56,13 @@ runtime::HostTensorView::HostTensorView(const ngraph::element::Type& element_typ
}
}
runtime::HostTensorView::HostTensorView(const ngraph::element::Type& element_type,
const Shape& shape,
const string& name)
: HostTensorView(element_type, shape, nullptr, name)
{
}
runtime::HostTensorView::~HostTensorView()
{
if (m_allocated_buffer_pool != nullptr)
......
@@ -37,6 +37,10 @@ public:
HostTensorView(const ngraph::element::Type& element_type,
const Shape& shape,
const std::string& name = "external");
HostTensorView(const ngraph::element::Type& element_type,
const Shape& shape,
void* memory_pointer,
const std::string& name = "external");
virtual ~HostTensorView() override;
char* get_data_ptr();
......
@@ -35,3 +35,10 @@ shared_ptr<runtime::TensorView>
auto rc = make_shared<runtime::HostTensorView>(element_type, shape, "external");
return static_pointer_cast<runtime::TensorView>(rc);
}
shared_ptr<runtime::TensorView> runtime::interpreter::INT_Backend::make_primary_tensor_view(
const element::Type& element_type, const Shape& shape, void* memory_pointer)
{
auto rc = make_shared<runtime::HostTensorView>(element_type, shape, memory_pointer, "external");
return static_pointer_cast<runtime::TensorView>(rc);
}
@@ -34,6 +34,11 @@ namespace ngraph
std::shared_ptr<ngraph::runtime::TensorView>
make_primary_tensor_view(const ngraph::element::Type& element_type,
const Shape& shape) override;
std::shared_ptr<ngraph::runtime::TensorView>
make_primary_tensor_view(const ngraph::element::Type& element_type,
const Shape& shape,
void* memory_pointer) override;
};
}
}
......
@@ -8582,3 +8582,37 @@ TEST(${BACKEND_NAME}, softmax_underflow)
expf(low) / d0, expf(1) / d1, expf(2) / d2, expf(3) / d0, expf(4) / d1, expf(5) / d2};
EXPECT_TRUE(test::all_close(expected, read_vector<float>(result)));
}
TEST(${BACKEND_NAME}, tensorview_custom_mem)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
auto manager = runtime::Manager::get("${BACKEND_NAME}");
auto backend = manager->allocate_backend();
Shape shape{2, 2};
auto make_external = [&]() {
auto A = make_shared<op::Parameter>(element::f32, shape);
auto B = make_shared<op::Parameter>(element::f32, shape);
auto f = make_shared<Function>(make_shared<op::Divide>(A, B), op::ParameterVector{A, B});
auto external = manager->compile(f);
return external;
};
auto cf = backend->make_call_frame(make_external());
vector<float> av{2, 4, 8, 16};
vector<float> bv{1, 2, 4, 8};
// use custom mem with tensorview, no need to copy data
auto a = backend->make_primary_tensor_view(element::f32, shape, av.data());
auto b = backend->make_primary_tensor_view(element::f32, shape, bv.data());
// use custom mem with result tensorview
vector<float> rv{0, 0, 0, 0};
auto result = backend->make_primary_tensor_view(element::f32, shape, rv.data());
// result should be in memory without needing explicit read
cf->call({result}, {a, b});
EXPECT_EQ((vector<float>{2, 2, 2, 2}), rv);
}
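For contrast, the pre-existing flow copies data into and out of backend-owned tensors with write() and read() (write's signature appears in the GPU hunk above; read is assumed to mirror it). The custom-memory overload removes both copies. A hedged sketch of the older pattern, using the same setup as the sketch near the top of this commit:

#include <memory>
#include <vector>
#include "ngraph/ngraph.hpp" // assumed umbrella header

using namespace ngraph;

void divide_with_backend_owned_tensors()
{
    Shape shape{2, 2};
    auto A = std::make_shared<op::Parameter>(element::f32, shape);
    auto B = std::make_shared<op::Parameter>(element::f32, shape);
    auto f = std::make_shared<Function>(std::make_shared<op::Divide>(A, B),
                                        op::ParameterVector{A, B});

    auto manager = runtime::Manager::get("INTERPRETER"); // backend name assumed
    auto backend = manager->allocate_backend();
    auto cf = backend->make_call_frame(manager->compile(f));

    std::vector<float> av{2, 4, 8, 16};
    std::vector<float> bv{1, 2, 4, 8};
    std::vector<float> rv(4, 0);

    // Backend-owned tensors: data has to be copied in before the call...
    auto a = backend->make_primary_tensor_view(element::f32, shape);
    auto b = backend->make_primary_tensor_view(element::f32, shape);
    auto r = backend->make_primary_tensor_view(element::f32, shape);
    a->write(av.data(), 0, av.size() * sizeof(float));
    b->write(bv.data(), 0, bv.size() * sizeof(float));

    cf->call({r}, {a, b});

    // ...and copied back out afterwards (read assumed to mirror write).
    r->read(rv.data(), 0, rv.size() * sizeof(float));
}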