Unverified commit 943b167f, authored by Robert Kimball and committed by GitHub

GPU External Function cleanup (#1698)

* cleanup

* cleanup header includes

* cleanup

* cleanup TensorMemoryReservation pass

* include cleanup

* more cleanup

* more header cleanup

* style

* Remove obsolete comments
parent d38aba91
......@@ -16,17 +16,16 @@
#pragma once
#include <iostream>
#include <sstream>
#include <stdexcept>
#include <stdint.h>
#include <string>
#include <cublas_v2.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cudnn.h>
#include <iostream>
#include <nvrtc.h>
#include <sstream>
#include <stdexcept>
#include <stdint.h>
#include <string>
// Why use "do...while" (and "if...else") statements in these macros?
// https://stackoverflow.com/questions/154136/why-use-apparently-meaningless-do-while-and-if-else-statements-in-macros
......
......@@ -20,6 +20,7 @@
#include <cudnn.h>
#include "ngraph/runtime/gpu/cuda_error_check.hpp"
#include "ngraph/runtime/gpu/gpu_util.hpp"
namespace ngraph
......
......@@ -17,6 +17,7 @@
#include <memory>
#include <string>
#include "ngraph/runtime/gpu/cuda_error_check.hpp"
#include "ngraph/runtime/gpu/gpu_cuda_context_manager.hpp"
using namespace ngraph;
......
......@@ -16,11 +16,10 @@
#pragma once
#include <cuda.h>
#include <memory>
#include <string>
#include "ngraph/runtime/gpu/gpu_util.hpp"
namespace ngraph
{
namespace runtime
......
......@@ -18,6 +18,7 @@
#include <iostream>
#include <string>
#include "ngraph/runtime/gpu/cuda_error_check.hpp"
#include "ngraph/runtime/gpu/gpu_cuda_context_manager.hpp"
#include "ngraph/runtime/gpu/gpu_cuda_function_builder.hpp"
#include "ngraph/runtime/gpu/gpu_util.hpp"
......
......@@ -16,11 +16,10 @@
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include "ngraph/runtime/gpu/gpu_util.hpp"
namespace ngraph
{
namespace runtime
......
......@@ -178,15 +178,11 @@ const size_t runtime::gpu::GPU_ExternalFunction::GPU_ExternalFunction::s_memory_
// NOTE(review): unified-diff fragment — removed (old) and added (new) lines are
// interleaved with no +/- markers, so this span is not compilable as written.
// The change drops the `release_function` parameter and the members that
// supported it, and turns m_tensor_memory_buffers into a by-value map.
runtime::gpu::GPU_ExternalFunction::GPU_ExternalFunction(
const shared_ptr<ngraph::Function>& function,
// old signature tail (removed): took a trailing `bool release_function`.
std::shared_ptr<GPU_Backend::BackendContext>& shared_context,
bool release_function)
// new signature tail (added): shared_context is now the last parameter.
std::shared_ptr<GPU_Backend::BackendContext>& shared_context)
: m_compiled_function(nullptr)
, m_function(function)
, m_emit_timing(false)
, m_is_compiled(false)
// old initializers (removed): release-function bookkeeping, the
// temporaries-used flag, and a heap-allocated name->offset buffer map.
, m_release_function(release_function)
, m_temporaries_used(false)
, m_tensor_memory_buffers(new std::unordered_map<std::string, size_t>)
, m_shared_context(shared_context)
{
}
......@@ -195,51 +191,44 @@ runtime::gpu::GPU_ExternalFunction::~GPU_ExternalFunction()
{
}
// NOTE(review): unified-diff fragment — removed (old) and added (new) lines are
// interleaved with no +/- markers, so this span is not compilable as written.
// Old design: emit_header() appended the precompiled-header text and the JIT
// prologue straight into m_writer (stashing a copy in m_pch_header_source).
// New design: two static accessors cache those strings in function-local
// statics, and emit_header() merely streams the cached header source.
// No comments are inserted inside the raw-string literals below, since that
// would alter their runtime contents.
void runtime::gpu::GPU_ExternalFunction::emit_header()
const string& runtime::gpu::GPU_ExternalFunction::get_pch_header_source()
{
m_writer += R"(
static string s_pch_header_source = R"(
// Generated by the nGraph GPU backend
#include <cublas_v2.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cudnn.h>
#include "ngraph/descriptor/input.hpp"
#include "ngraph/descriptor/layout/dense_tensor_layout.hpp"
#include "ngraph/descriptor/output.hpp"
#include "ngraph/file_util.hpp"
#include "ngraph/function.hpp"
#include "ngraph/graph_util.hpp"
#include "ngraph/node.hpp"
#include "ngraph/pass/assign_layout.hpp"
#include "ngraph/pass/dump_sorted.hpp"
#include "ngraph/pass/like_replacement.hpp"
#include "ngraph/pass/liveness.hpp"
#include "ngraph/pass/manager.hpp"
#include "ngraph/pass/memory_layout.hpp"
#include "ngraph/runtime/aligned_buffer.hpp"
#include "ngraph/runtime/gpu/cudnn_descriptors.hpp"
#include "ngraph/runtime/gpu/gpu_cuda_kernel_ops.hpp"
#include "ngraph/runtime/gpu/cuda_error_check.hpp"
#include "ngraph/runtime/gpu/gpu_invoke.hpp"
#include "ngraph/runtime/gpu/gpu_runtime_context.hpp"
#include "ngraph/runtime/gpu/gpu_util.hpp"
#include "ngraph/util.hpp"
)";
return s_pch_header_source;
}
// old (removed): copied the accumulated writer text into the
// m_pch_header_source member; the new design has no such member.
m_pch_header_source = m_writer.get_code();
m_writer += R"(
const string& runtime::gpu::GPU_ExternalFunction::get_header_source()
{
static string s_header_source =
get_pch_header_source() + R"(
using namespace ngraph;
using namespace ngraph::runtime;
using namespace std;
)";
)"
// The "dso_handle" symbol is required by __cxa_atexit()
// which is enabled because the JIT uses it as the default mechanism
// to register cleanup handlers. We use it, and not atexit(), because
// atexit() happens too late, when the JIT is no longer alive
+ "void *__dso_handle = 0;\n\n" +
"static gpu::GPURuntimeContext* m_runtime_context = nullptr;\n";
return s_header_source;
}
// old (removed): the same prologue lines were emitted directly into m_writer.
// The "dso_handle" symbol is required by __cxa_atexit()
// which is enabled because the JIT uses it as the default mechanism
// to register cleanup handlers. We use it, and not atexit(), because
// atexit() happens too late, when the JIT is no longer alive
m_writer << "void *__dso_handle = 0;\n\n";
m_writer << "static gpu::GPURuntimeContext* m_runtime_context = nullptr;\n";
// new (added): emit_header() now just streams the cached header source.
void runtime::gpu::GPU_ExternalFunction::emit_header()
{
m_writer << get_header_source();
}
void runtime::gpu::GPU_ExternalFunction::emit_timer_functions()
......@@ -368,26 +357,26 @@ void runtime::gpu::GPU_ExternalFunction::emit_function_declarations()
void runtime::gpu::GPU_ExternalFunction::emit_temp_mem_pool_allocation(
shared_ptr<Function> current_function)
{
m_temporaries_used = false;
bool temporaries_used = false;
size_t worst_case_tmp_size = 0;
for (shared_ptr<Node> node : m_function_ordered_ops.at(current_function))
{
if (node->liveness_new_list.size() > 0)
{
m_temporaries_used = true;
temporaries_used = true;
for (descriptor::Tensor* tensor : node->liveness_new_list)
{
worst_case_tmp_size += tensor->size();
}
}
}
if (m_temporaries_used)
if (temporaries_used)
{
m_writer << "// Allocate the memory pool\n";
// TODO memory pool malloc.
m_writer
<< "char* pool_base_ptr = (char*)ngraph::runtime::gpu::invoke_memory_primitive(ctx, "
<< m_tensor_memory_buffers->at(current_function->get_name()) << ");\n";
<< m_tensor_memory_buffers.at(current_function->get_name()) << ");\n";
// Add temporaries to the variable name map
for (shared_ptr<Node> node : m_function_ordered_ops.at(current_function))
......@@ -562,9 +551,6 @@ void runtime::gpu::GPU_ExternalFunction::compile()
m_function_name = m_function->get_name();
auto allocator = std::make_shared<runtime::gpu::GPUAllocator>(
m_shared_context->m_primitive_emitter->get_memory_allocator());
m_pass_manager.register_pass<ngraph::pass::LikeReplacement>();
m_pass_manager
.register_pass<ngraph::pass::AssignLayout<descriptor::layout::DenseTensorLayout>>();
......@@ -574,6 +560,7 @@ void runtime::gpu::GPU_ExternalFunction::compile()
m_pass_manager.register_pass<ngraph::pass::MemoryLayout>(s_memory_pool_alignment);
GPUAllocator allocator = m_shared_context->m_primitive_emitter->get_memory_allocator();
m_pass_manager.register_pass<runtime::gpu::pass::TensorMemoryReservation>(
allocator, m_tensor_memory_buffers);
......@@ -603,7 +590,7 @@ void runtime::gpu::GPU_ExternalFunction::compile()
emit_functions();
// allocate device buffers for primitive arguments and workspace
allocator->close();
allocator.close();
m_shared_context->m_primitive_emitter->allocate_primitive_memory();
string code = m_writer.get_code();
......@@ -611,7 +598,7 @@ void runtime::gpu::GPU_ExternalFunction::compile()
m_compiler.reset(new codegen::Compiler());
m_execution_engine.reset(new codegen::ExecutionEngine());
m_compiler->set_precompiled_header_source(m_pch_header_source);
m_compiler->set_precompiled_header_source(get_pch_header_source());
auto codegen_module = m_compiler->compile(code);
if (codegen_module == nullptr)
......@@ -629,10 +616,6 @@ void runtime::gpu::GPU_ExternalFunction::compile()
}
m_is_compiled = true;
if (m_release_function)
{
release_function();
}
}
void runtime::gpu::GPU_ExternalFunction::emit_debug_function_entry(Node* node)
......
......@@ -55,8 +55,7 @@ namespace ngraph
public:
GPU_ExternalFunction(const std::shared_ptr<ngraph::Function>& function,
std::shared_ptr<GPU_Backend::BackendContext>& shared_context,
bool release_function = true);
std::shared_ptr<GPU_Backend::BackendContext>& shared_context);
~GPU_ExternalFunction();
std::unique_ptr<runtime::gpu::GPURuntimeContext>& ctx();
......@@ -90,11 +89,13 @@ namespace ngraph
void emit_debug_function_exit(Node* node);
void emit_temp_mem_pool_allocation(std::shared_ptr<Function> current_function);
void emit_op(EMIT_ARGS);
void release_function() { m_function = nullptr; }
void store_emitted_functions(const std::string& code);
std::string emit_op_as_function(const Node& node, const std::string& function_name);
std::string strip_comments(const std::string& s) const;
static const std::string& get_pch_header_source();
static const std::string& get_header_source();
codegen::CodeWriter m_writer;
ngraph::pass::Manager m_pass_manager;
......@@ -110,14 +111,11 @@ namespace ngraph
bool m_emit_timing;
bool m_is_compiled;
bool m_release_function;
bool m_temporaries_used;
size_t m_offset;
std::string m_function_name;
std::string m_pch_header_source;
std::shared_ptr<std::unordered_map<std::string, size_t>> m_tensor_memory_buffers;
std::unordered_map<std::string, size_t> m_tensor_memory_buffers;
std::shared_ptr<GPU_Backend::BackendContext> m_shared_context;
};
}
......
......@@ -15,24 +15,24 @@
//*****************************************************************************
#include "ngraph/runtime/gpu/gpu_runtime_context.hpp"
#include "ngraph/runtime/gpu/gpu_util.hpp"
using namespace ngraph;
using namespace ngraph::runtime::gpu;
// Diff pair: old fully-qualified signature (removed) followed by the new one
// (added), which drops the leading `ngraph::` — this file now has a
// `using namespace ngraph;` above.
extern "C" void ngraph::runtime::gpu::start_stopwatch(GPURuntimeContext* ctx, size_t idx)
extern "C" void runtime::gpu::start_stopwatch(GPURuntimeContext* ctx, size_t idx)
{
// Start stopwatch #idx from the runtime context's stopwatch pool.
ctx->stopwatch_pool->get(idx).start();
}
// Diff pair: old fully-qualified signature (removed) followed by the new one
// (added) with the shortened namespace qualifier.
extern "C" void ngraph::runtime::gpu::stop_stopwatch(GPURuntimeContext* ctx, size_t idx)
extern "C" void runtime::gpu::stop_stopwatch(GPURuntimeContext* ctx, size_t idx)
{
// Stop stopwatch #idx from the runtime context's stopwatch pool.
ctx->stopwatch_pool->get(idx).stop();
}
// Diff pair: old fully-qualified signature (removed) followed by the new one
// (added) with the shortened namespace qualifier.
extern "C" size_t ngraph::runtime::gpu::count_stopwatch(GPURuntimeContext* ctx, size_t idx)
extern "C" size_t runtime::gpu::count_stopwatch(GPURuntimeContext* ctx, size_t idx)
{
// Number of times stopwatch #idx has been started/stopped.
return ctx->stopwatch_pool->get(idx).get_call_count();
}
// Diff pair: old fully-qualified signature (removed) followed by the new one
// (added) with the shortened namespace qualifier.
extern "C" size_t ngraph::runtime::gpu::us_stopwatch(GPURuntimeContext* ctx, size_t idx)
extern "C" size_t runtime::gpu::us_stopwatch(GPURuntimeContext* ctx, size_t idx)
{
// Total accumulated time of stopwatch #idx, in microseconds.
return ctx->stopwatch_pool->get(idx).get_total_microseconds();
}
......@@ -16,12 +16,13 @@
#pragma once
#include <cublas_v2.h>
#include <cudnn.h>
#include <string>
#include <unordered_map>
#include "ngraph/runtime/gpu/gpu_cuda_context_manager.hpp"
#include "ngraph/runtime/gpu/gpu_cuda_function_pool.hpp"
#include "ngraph/runtime/gpu/gpu_util.hpp"
namespace ngraph
{
......@@ -29,8 +30,10 @@ namespace ngraph
{
namespace gpu
{
typedef std::function<void(void**, void**)> primitive;
typedef std::function<void*(void)> memory_primitive;
class StopWatchPool;
using primitive = std::function<void(void**, void**)>;
using memory_primitive = std::function<void*(void)>;
extern "C" {
struct GPURuntimeContext
......
......@@ -19,6 +19,7 @@
#include <cuda_runtime.h>
#include "ngraph/descriptor/layout/dense_tensor_layout.hpp"
#include "ngraph/runtime/gpu/cuda_error_check.hpp"
#include "ngraph/runtime/gpu/gpu_backend.hpp"
#include "ngraph/runtime/gpu/gpu_tensor_view.hpp"
#include "ngraph/runtime/gpu/gpu_util.hpp"
......
......@@ -16,14 +16,14 @@
#include <cassert>
#include <cstdlib>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <stddef.h>
#include <stdio.h>
#include <string>
#include <cuda.h>
#include <cuda_runtime.h>
#include "ngraph/runtime/gpu/cuda_error_check.hpp"
#include "ngraph/runtime/gpu/gpu_util.hpp"
#include "ngraph/util.hpp"
......
......@@ -16,15 +16,9 @@
#pragma once
#include <iostream>
#include <memory>
#include <string>
#include <tuple>
#include <cudnn.h>
#include <vector>
#include "ngraph/runtime/gpu/cuda_error_check.hpp"
#include "ngraph/util.hpp"
namespace ngraph
{
namespace runtime
......@@ -43,14 +37,16 @@ namespace ngraph
std::pair<uint64_t, uint64_t> idiv_magic_u64(uint64_t divisor);
uint32_t idiv_ceil(int n, int d);
template <typename T>
void print_gpu_tensor(const void* p, size_t element_count)
{
std::vector<T> local(element_count);
size_t size_in_bytes = sizeof(T) * element_count;
cuda_memcpyDtH(local.data(), p, size_in_bytes);
std::cout << "{" << ngraph::join(local) << "}" << std::endl;
}
// This is commented out because it increases the compile time.
// It should be moved to a debug header.
// template <typename T>
// void print_gpu_tensor(const void* p, size_t element_count)
// {
// std::vector<T> local(element_count);
// size_t size_in_bytes = sizeof(T) * element_count;
// cuda_memcpyDtH(local.data(), p, size_in_bytes);
// std::cout << "{" << ngraph::join(local) << "}" << std::endl;
// }
class StopWatch
{
......
......@@ -17,30 +17,24 @@
#include <memory>
#include "ngraph/function.hpp"
#include "ngraph/graph_util.hpp"
#include "ngraph/node.hpp"
#include "ngraph/pass/manager_state.hpp"
#include "ngraph/graph_util.hpp"
#include "ngraph/runtime/gpu/gpu_memory_manager.hpp"
#include "ngraph/runtime/gpu/pass/tensor_memory_reservation.hpp"
using namespace ngraph;
using namespace std;
// NOTE(review): unified-diff fragment — removed (old) and added (new) lines are
// interleaved with no +/- markers. The old implementation held the allocator
// and buffer map via weak_ptr and had to lock both before use; the new one
// holds plain references, so the liveness check and one nesting level go away.
bool ngraph::runtime::gpu::pass::TensorMemoryReservation::run_on_function(
std::shared_ptr<Function> f)
bool runtime::gpu::pass::TensorMemoryReservation::run_on_function(shared_ptr<Function> f)
{
// old (removed): promote the weak_ptrs; only proceed if both still live.
auto allocator = m_allocator.lock();
auto buffers = m_memory_buffers.lock();
if (allocator && buffers)
// new (added): reserve workspace only when the function needs a temp pool.
size_t mem_pool_size = f->get_temporary_pool_size();
if (mem_pool_size)
{
size_t mem_pool_size = f->get_temporary_pool_size();
if (mem_pool_size)
{
// old path (removed): reserve via the locked shared_ptrs.
size_t pool_idx = allocator->reserve_workspace(mem_pool_size, false);
buffers->insert({f->get_name(), pool_idx});
// new path (added): reserve via the member references directly.
size_t pool_idx = m_allocator.reserve_workspace(mem_pool_size, false);
m_memory_buffers.insert({f->get_name(), pool_idx});
return true;
}
return true;
}
return false;
}
......@@ -37,8 +37,8 @@ namespace ngraph
class ngraph::runtime::gpu::pass::TensorMemoryReservation : public ngraph::pass::FunctionPass
{
public:
TensorMemoryReservation(std::weak_ptr<ngraph::runtime::gpu::GPUAllocator> allocator,
std::weak_ptr<std::unordered_map<std::string, size_t>> buffers)
TensorMemoryReservation(GPUAllocator& allocator,
std::unordered_map<std::string, size_t>& buffers)
: ngraph::pass::FunctionPass()
, m_allocator(allocator)
, m_memory_buffers(buffers)
......@@ -48,6 +48,6 @@ public:
virtual bool run_on_function(std::shared_ptr<ngraph::Function> f);
private:
std::weak_ptr<ngraph::runtime::gpu::GPUAllocator> m_allocator;
std::weak_ptr<std::unordered_map<std::string, size_t>> m_memory_buffers;
GPUAllocator& m_allocator;
std::unordered_map<std::string, size_t>& m_memory_buffers;
};
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment.