Unverified commit 943b167f, authored by Robert Kimball and committed by GitHub

GPU External Function cleanup (#1698)

* cleanup

* cleanup header includes

* cleanup

* cleanup TensorMemoryReservation pass

* include cleanup

* more cleanup

* more header cleanup

* style

* Remove obsolete comments
parent d38aba91
......@@ -16,17 +16,16 @@
#pragma once
#include <iostream>
#include <sstream>
#include <stdexcept>
#include <stdint.h>
#include <string>
#include <cublas_v2.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cudnn.h>
#include <iostream>
#include <nvrtc.h>
#include <sstream>
#include <stdexcept>
#include <stdint.h>
#include <string>
// Why use "do...while" (and "if...else") statements in these macros?
// https://stackoverflow.com/questions/154136/why-use-apparently-meaningless-do-while-and-if-else-statements-in-macros
......
......@@ -20,6 +20,7 @@
#include <cudnn.h>
#include "ngraph/runtime/gpu/cuda_error_check.hpp"
#include "ngraph/runtime/gpu/gpu_util.hpp"
namespace ngraph
......
......@@ -17,6 +17,7 @@
#include <memory>
#include <string>
#include "ngraph/runtime/gpu/cuda_error_check.hpp"
#include "ngraph/runtime/gpu/gpu_cuda_context_manager.hpp"
using namespace ngraph;
......
......@@ -16,11 +16,10 @@
#pragma once
#include <cuda.h>
#include <memory>
#include <string>
#include "ngraph/runtime/gpu/gpu_util.hpp"
namespace ngraph
{
namespace runtime
......
......@@ -18,6 +18,7 @@
#include <iostream>
#include <string>
#include "ngraph/runtime/gpu/cuda_error_check.hpp"
#include "ngraph/runtime/gpu/gpu_cuda_context_manager.hpp"
#include "ngraph/runtime/gpu/gpu_cuda_function_builder.hpp"
#include "ngraph/runtime/gpu/gpu_util.hpp"
......
......@@ -16,11 +16,10 @@
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include "ngraph/runtime/gpu/gpu_util.hpp"
namespace ngraph
{
namespace runtime
......
......@@ -178,15 +178,11 @@ const size_t runtime::gpu::GPU_ExternalFunction::GPU_ExternalFunction::s_memory_
// NOTE(review): unified-diff fragment — removed (old) and added (new) lines are
// interleaved with no +/- markers, so this span is not compilable as written.
// The change drops the `release_function` parameter and the members that
// supported it, and turns m_tensor_memory_buffers into a by-value map.
runtime::gpu::GPU_ExternalFunction::GPU_ExternalFunction(
const shared_ptr<ngraph::Function>& function,
// old signature tail (removed): took a trailing `bool release_function`.
std::shared_ptr<GPU_Backend::BackendContext>& shared_context,
bool release_function)
// new signature tail (added): shared_context is now the last parameter.
std::shared_ptr<GPU_Backend::BackendContext>& shared_context)
: m_compiled_function(nullptr)
, m_function(function)
, m_emit_timing(false)
, m_is_compiled(false)
// old initializers (removed): release-function bookkeeping, the
// temporaries-used flag, and a heap-allocated name->offset buffer map.
, m_release_function(release_function)
, m_temporaries_used(false)
, m_tensor_memory_buffers(new std::unordered_map<std::string, size_t>)
, m_shared_context(shared_context)
{
}
......@@ -195,51 +191,44 @@ runtime::gpu::GPU_ExternalFunction::~GPU_ExternalFunction()
{
}
// NOTE(review): unified-diff fragment — removed (old) and added (new) lines are
// interleaved with no +/- markers, so this span is not compilable as written.
// Old design: emit_header() appended the precompiled-header text and the JIT
// prologue straight into m_writer (stashing a copy in m_pch_header_source).
// New design: two static accessors cache those strings in function-local
// statics, and emit_header() merely streams the cached header source.
// No comments are inserted inside the raw-string literals below, since that
// would alter their runtime contents.
void runtime::gpu::GPU_ExternalFunction::emit_header()
const string& runtime::gpu::GPU_ExternalFunction::get_pch_header_source()
{
m_writer += R"(
static string s_pch_header_source = R"(
// Generated by the nGraph GPU backend
#include <cublas_v2.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cudnn.h>
#include "ngraph/descriptor/input.hpp"
#include "ngraph/descriptor/layout/dense_tensor_layout.hpp"
#include "ngraph/descriptor/output.hpp"
#include "ngraph/file_util.hpp"
#include "ngraph/function.hpp"
#include "ngraph/graph_util.hpp"
#include "ngraph/node.hpp"
#include "ngraph/pass/assign_layout.hpp"
#include "ngraph/pass/dump_sorted.hpp"
#include "ngraph/pass/like_replacement.hpp"
#include "ngraph/pass/liveness.hpp"
#include "ngraph/pass/manager.hpp"
#include "ngraph/pass/memory_layout.hpp"
#include "ngraph/runtime/aligned_buffer.hpp"
#include "ngraph/runtime/gpu/cudnn_descriptors.hpp"
#include "ngraph/runtime/gpu/gpu_cuda_kernel_ops.hpp"
#include "ngraph/runtime/gpu/cuda_error_check.hpp"
#include "ngraph/runtime/gpu/gpu_invoke.hpp"
#include "ngraph/runtime/gpu/gpu_runtime_context.hpp"
#include "ngraph/runtime/gpu/gpu_util.hpp"
#include "ngraph/util.hpp"
)";
return s_pch_header_source;
}
// old (removed): copied the accumulated writer text into the
// m_pch_header_source member; the new design has no such member.
m_pch_header_source = m_writer.get_code();
m_writer += R"(
const string& runtime::gpu::GPU_ExternalFunction::get_header_source()
{
static string s_header_source =
get_pch_header_source() + R"(
using namespace ngraph;
using namespace ngraph::runtime;
using namespace std;
)";
)"
// The "dso_handle" symbol is required by __cxa_atexit()
// which is enabled because the JIT uses it as the default mechanism
// to register cleanup handlers. We use it, and not atexit(), because
// atexit() happens too late, when the JIT is no longer alive
+ "void *__dso_handle = 0;\n\n" +
"static gpu::GPURuntimeContext* m_runtime_context = nullptr;\n";
return s_header_source;
}
// old (removed): the same prologue lines were emitted directly into m_writer.
// The "dso_handle" symbol is required by __cxa_atexit()
// which is enabled because the JIT uses it as the default mechanism
// to register cleanup handlers. We use it, and not atexit(), because
// atexit() happens too late, when the JIT is no longer alive
m_writer << "void *__dso_handle = 0;\n\n";
m_writer << "static gpu::GPURuntimeContext* m_runtime_context = nullptr;\n";
// new (added): emit_header() now just streams the cached header source.
void runtime::gpu::GPU_ExternalFunction::emit_header()
{
m_writer << get_header_source();
}
void runtime::gpu::GPU_ExternalFunction::emit_timer_functions()
......@@ -368,26 +357,26 @@ void runtime::gpu::GPU_ExternalFunction::emit_function_declarations()
void runtime::gpu::GPU_ExternalFunction::emit_temp_mem_pool_allocation(
shared_ptr<Function> current_function)
{
m_temporaries_used = false;
bool temporaries_used = false;
size_t worst_case_tmp_size = 0;
for (shared_ptr<Node> node : m_function_ordered_ops.at(current_function))
{
if (node->liveness_new_list.size() > 0)
{
m_temporaries_used = true;
temporaries_used = true;
for (descriptor::Tensor* tensor : node->liveness_new_list)
{
worst_case_tmp_size += tensor->size();
}
}
}
if (m_temporaries_used)
if (temporaries_used)
{
m_writer << "// Allocate the memory pool\n";
// TODO memory pool malloc.
m_writer
<< "char* pool_base_ptr = (char*)ngraph::runtime::gpu::invoke_memory_primitive(ctx, "
<< m_tensor_memory_buffers->at(current_function->get_name()) << ");\n";
<< m_tensor_memory_buffers.at(current_function->get_name()) << ");\n";
// Add temporaries to the variable name map
for (shared_ptr<Node> node : m_function_ordered_ops.at(current_function))
......@@ -562,9 +551,6 @@ void runtime::gpu::GPU_ExternalFunction::compile()
m_function_name = m_function->get_name();
auto allocator = std::make_shared<runtime::gpu::GPUAllocator>(
m_shared_context->m_primitive_emitter->get_memory_allocator());
m_pass_manager.register_pass<ngraph::pass::LikeReplacement>();
m_pass_manager
.register_pass<ngraph::pass::AssignLayout<descriptor::layout::DenseTensorLayout>>();
......@@ -574,6 +560,7 @@ void runtime::gpu::GPU_ExternalFunction::compile()
m_pass_manager.register_pass<ngraph::pass::MemoryLayout>(s_memory_pool_alignment);
GPUAllocator allocator = m_shared_context->m_primitive_emitter->get_memory_allocator();
m_pass_manager.register_pass<runtime::gpu::pass::TensorMemoryReservation>(
allocator, m_tensor_memory_buffers);
......@@ -603,7 +590,7 @@ void runtime::gpu::GPU_ExternalFunction::compile()
emit_functions();
// allocate device buffers for primitive arguments and workspace
allocator->close();
allocator.close();
m_shared_context->m_primitive_emitter->allocate_primitive_memory();
string code = m_writer.get_code();
......@@ -611,7 +598,7 @@ void runtime::gpu::GPU_ExternalFunction::compile()
m_compiler.reset(new codegen::Compiler());
m_execution_engine.reset(new codegen::ExecutionEngine());
m_compiler->set_precompiled_header_source(m_pch_header_source);
m_compiler->set_precompiled_header_source(get_pch_header_source());
auto codegen_module = m_compiler->compile(code);
if (codegen_module == nullptr)
......@@ -629,10 +616,6 @@ void runtime::gpu::GPU_ExternalFunction::compile()
}
m_is_compiled = true;
if (m_release_function)
{
release_function();
}
}
void runtime::gpu::GPU_ExternalFunction::emit_debug_function_entry(Node* node)
......
......@@ -55,8 +55,7 @@ namespace ngraph
public:
GPU_ExternalFunction(const std::shared_ptr<ngraph::Function>& function,
std::shared_ptr<GPU_Backend::BackendContext>& shared_context,
bool release_function = true);
std::shared_ptr<GPU_Backend::BackendContext>& shared_context);
~GPU_ExternalFunction();
std::unique_ptr<runtime::gpu::GPURuntimeContext>& ctx();
......@@ -90,11 +89,13 @@ namespace ngraph
void emit_debug_function_exit(Node* node);
void emit_temp_mem_pool_allocation(std::shared_ptr<Function> current_function);
void emit_op(EMIT_ARGS);
void release_function() { m_function = nullptr; }
void store_emitted_functions(const std::string& code);
std::string emit_op_as_function(const Node& node, const std::string& function_name);
std::string strip_comments(const std::string& s) const;
static const std::string& get_pch_header_source();
static const std::string& get_header_source();
codegen::CodeWriter m_writer;
ngraph::pass::Manager m_pass_manager;
......@@ -110,14 +111,11 @@ namespace ngraph
bool m_emit_timing;
bool m_is_compiled;
bool m_release_function;
bool m_temporaries_used;
size_t m_offset;
std::string m_function_name;
std::string m_pch_header_source;
std::shared_ptr<std::unordered_map<std::string, size_t>> m_tensor_memory_buffers;
std::unordered_map<std::string, size_t> m_tensor_memory_buffers;
std::shared_ptr<GPU_Backend::BackendContext> m_shared_context;
};
}
......
......@@ -15,24 +15,24 @@
//*****************************************************************************
#include "ngraph/runtime/gpu/gpu_runtime_context.hpp"
#include "ngraph/runtime/gpu/gpu_util.hpp"
using namespace ngraph;
using namespace ngraph::runtime::gpu;
// Diff pair: old fully-qualified signature (removed) followed by the new one
// (added), which drops the leading `ngraph::` — this file now has a
// `using namespace ngraph;` above.
extern "C" void ngraph::runtime::gpu::start_stopwatch(GPURuntimeContext* ctx, size_t idx)
extern "C" void runtime::gpu::start_stopwatch(GPURuntimeContext* ctx, size_t idx)
{
// Start stopwatch #idx from the runtime context's stopwatch pool.
ctx->stopwatch_pool->get(idx).start();
}
// Diff pair: old fully-qualified signature (removed) followed by the new one
// (added) with the shortened namespace qualifier.
extern "C" void ngraph::runtime::gpu::stop_stopwatch(GPURuntimeContext* ctx, size_t idx)
extern "C" void runtime::gpu::stop_stopwatch(GPURuntimeContext* ctx, size_t idx)
{
// Stop stopwatch #idx from the runtime context's stopwatch pool.
ctx->stopwatch_pool->get(idx).stop();
}
// Diff pair: old fully-qualified signature (removed) followed by the new one
// (added) with the shortened namespace qualifier.
extern "C" size_t ngraph::runtime::gpu::count_stopwatch(GPURuntimeContext* ctx, size_t idx)
extern "C" size_t runtime::gpu::count_stopwatch(GPURuntimeContext* ctx, size_t idx)
{
// Number of times stopwatch #idx has been started/stopped.
return ctx->stopwatch_pool->get(idx).get_call_count();
}
// Diff pair: old fully-qualified signature (removed) followed by the new one
// (added) with the shortened namespace qualifier.
extern "C" size_t ngraph::runtime::gpu::us_stopwatch(GPURuntimeContext* ctx, size_t idx)
extern "C" size_t runtime::gpu::us_stopwatch(GPURuntimeContext* ctx, size_t idx)
{
// Total accumulated time of stopwatch #idx, in microseconds.
return ctx->stopwatch_pool->get(idx).get_total_microseconds();
}
......@@ -16,12 +16,13 @@
#pragma once
#include <cublas_v2.h>
#include <cudnn.h>
#include <string>
#include <unordered_map>
#include "ngraph/runtime/gpu/gpu_cuda_context_manager.hpp"
#include "ngraph/runtime/gpu/gpu_cuda_function_pool.hpp"
#include "ngraph/runtime/gpu/gpu_util.hpp"
namespace ngraph
{
......@@ -29,8 +30,10 @@ namespace ngraph
{
namespace gpu
{
typedef std::function<void(void**, void**)> primitive;
typedef std::function<void*(void)> memory_primitive;
class StopWatchPool;
using primitive = std::function<void(void**, void**)>;
using memory_primitive = std::function<void*(void)>;
extern "C" {
struct GPURuntimeContext
......
......@@ -19,6 +19,7 @@
#include <cuda_runtime.h>
#include "ngraph/descriptor/layout/dense_tensor_layout.hpp"
#include "ngraph/runtime/gpu/cuda_error_check.hpp"
#include "ngraph/runtime/gpu/gpu_backend.hpp"
#include "ngraph/runtime/gpu/gpu_tensor_view.hpp"
#include "ngraph/runtime/gpu/gpu_util.hpp"
......
......@@ -16,14 +16,14 @@
#include <cassert>
#include <cstdlib>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <stddef.h>
#include <stdio.h>
#include <string>
#include <cuda.h>
#include <cuda_runtime.h>
#include "ngraph/runtime/gpu/cuda_error_check.hpp"
#include "ngraph/runtime/gpu/gpu_util.hpp"
#include "ngraph/util.hpp"
......
......@@ -16,15 +16,9 @@
#pragma once
#include <iostream>
#include <memory>
#include <string>
#include <tuple>
#include <cudnn.h>
#include <vector>
#include "ngraph/runtime/gpu/cuda_error_check.hpp"
#include "ngraph/util.hpp"
namespace ngraph
{
namespace runtime
......@@ -43,14 +37,16 @@ namespace ngraph
std::pair<uint64_t, uint64_t> idiv_magic_u64(uint64_t divisor);
uint32_t idiv_ceil(int n, int d);
template <typename T>
void print_gpu_tensor(const void* p, size_t element_count)
{
std::vector<T> local(element_count);
size_t size_in_bytes = sizeof(T) * element_count;
cuda_memcpyDtH(local.data(), p, size_in_bytes);
std::cout << "{" << ngraph::join(local) << "}" << std::endl;
}
// This is commented out because it increases the compile time.
// It should be moved to a debug header.
// template <typename T>
// void print_gpu_tensor(const void* p, size_t element_count)
// {
// std::vector<T> local(element_count);
// size_t size_in_bytes = sizeof(T) * element_count;
// cuda_memcpyDtH(local.data(), p, size_in_bytes);
// std::cout << "{" << ngraph::join(local) << "}" << std::endl;
// }
class StopWatch
{
......
......@@ -17,30 +17,24 @@
#include <memory>
#include "ngraph/function.hpp"
#include "ngraph/graph_util.hpp"
#include "ngraph/node.hpp"
#include "ngraph/pass/manager_state.hpp"
#include "ngraph/graph_util.hpp"
#include "ngraph/runtime/gpu/gpu_memory_manager.hpp"
#include "ngraph/runtime/gpu/pass/tensor_memory_reservation.hpp"
using namespace ngraph;
using namespace std;
// NOTE(review): unified-diff fragment — removed (old) and added (new) lines are
// interleaved with no +/- markers. The old implementation held the allocator
// and buffer map via weak_ptr and had to lock both before use; the new one
// holds plain references, so the liveness check and one nesting level go away.
bool ngraph::runtime::gpu::pass::TensorMemoryReservation::run_on_function(
std::shared_ptr<Function> f)
bool runtime::gpu::pass::TensorMemoryReservation::run_on_function(shared_ptr<Function> f)
{
// old (removed): promote the weak_ptrs; only proceed if both still live.
auto allocator = m_allocator.lock();
auto buffers = m_memory_buffers.lock();
if (allocator && buffers)
// new (added): reserve workspace only when the function needs a temp pool.
size_t mem_pool_size = f->get_temporary_pool_size();
if (mem_pool_size)
{
size_t mem_pool_size = f->get_temporary_pool_size();
if (mem_pool_size)
{
// old path (removed): reserve via the locked shared_ptrs.
size_t pool_idx = allocator->reserve_workspace(mem_pool_size, false);
buffers->insert({f->get_name(), pool_idx});
// new path (added): reserve via the member references directly.
size_t pool_idx = m_allocator.reserve_workspace(mem_pool_size, false);
m_memory_buffers.insert({f->get_name(), pool_idx});
return true;
}
return true;
}
return false;
}
......@@ -37,8 +37,8 @@ namespace ngraph
class ngraph::runtime::gpu::pass::TensorMemoryReservation : public ngraph::pass::FunctionPass
{
public:
TensorMemoryReservation(std::weak_ptr<ngraph::runtime::gpu::GPUAllocator> allocator,
std::weak_ptr<std::unordered_map<std::string, size_t>> buffers)
TensorMemoryReservation(GPUAllocator& allocator,
std::unordered_map<std::string, size_t>& buffers)
: ngraph::pass::FunctionPass()
, m_allocator(allocator)
, m_memory_buffers(buffers)
......@@ -48,6 +48,6 @@ public:
virtual bool run_on_function(std::shared_ptr<ngraph::Function> f);
private:
std::weak_ptr<ngraph::runtime::gpu::GPUAllocator> m_allocator;
std::weak_ptr<std::unordered_map<std::string, size_t>> m_memory_buffers;
GPUAllocator& m_allocator;
std::unordered_map<std::string, size_t>& m_memory_buffers;
};
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment.