Unverified commit b5467550, authored by Chris Sullivan, committed by GitHub

Updated gpu cpp files with consistent use of namespaces (cosmetic) (#629)

* Updated namespace use in cpp files.

parent a32fdab5
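Every hunk below applies the same mechanical rewrite: definitions that were nested inside namespace ngraph { namespace runtime { namespace gpu { ... } } } blocks move to file scope, with a single using-directive and fully qualified names. A minimal, self-contained sketch of the pattern, using a hypothetical Widget class not taken from the diff:

#include <cstdio>

// Hypothetical stand-in for CudaContextManager et al.; the declaration
// would normally live in a header.
namespace ngraph { namespace runtime { namespace gpu {
    class Widget
    {
    public:
        static Widget& instance();
    };
}}}

#if 0 // Before: every definition nested three namespace levels deep.
namespace ngraph { namespace runtime { namespace gpu {
    Widget& Widget::instance()
    {
        static Widget w;
        return w;
    }
}}}
#else // After: one using-directive and fully qualified definitions.
using namespace ngraph;

runtime::gpu::Widget& runtime::gpu::Widget::instance()
{
    static Widget w; // unqualified names still resolve inside the member body
    return w;
}
#endif

int main()
{
    runtime::gpu::Widget::instance();
    std::puts("ok");
}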
src/ngraph/runtime/gpu/gpu_cuda_context_manager.cpp
@@ -19,25 +19,18 @@

The left column wrapped these definitions in the nested namespace blocks with unqualified names; the right column flattens them:

#include "ngraph/runtime/gpu/gpu_cuda_context_manager.hpp"

using namespace ngraph;

runtime::gpu::CudaContextManager& runtime::gpu::CudaContextManager::instance()
{
    static CudaContextManager manager;
    return manager;
}

runtime::gpu::CudaContextManager::CudaContextManager()
{
    CUDA_SAFE_CALL(cuInit(0));
    CUDA_SAFE_CALL(cuDeviceGet(&m_device, 0));
    CUDA_SAFE_CALL(cuCtxCreate(&m_context, 0, m_device));
    m_context_ptr = std::make_shared<CUcontext>(m_context);
}
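A hypothetical call site: the first call to instance() runs cuInit and creates the context; later calls return the same Meyers-singleton object (function-local static, thread-safe in C++11):

#include "ngraph/runtime/gpu/gpu_cuda_context_manager.hpp"

int main()
{
    auto& manager = ngraph::runtime::gpu::CudaContextManager::instance();
    (void)manager; // context for device 0 now exists for driver-API calls
    return 0;
}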
src/ngraph/runtime/gpu/gpu_cuda_function_builder.cpp
@@ -20,46 +20,39 @@

Same rewrite; the new version:

#include "ngraph/runtime/gpu/gpu_cuda_function_builder.hpp"
#include "ngraph/runtime/gpu/gpu_util.hpp"

using namespace ngraph;

std::shared_ptr<CUfunction> runtime::gpu::CudaFunctionBuilder::get(const std::string& name,
                                                                   const std::string& kernel,
                                                                   int number_of_options,
                                                                   const char** options)
{
    nvrtcProgram prog;
    NVRTC_SAFE_CALL(nvrtcCreateProgram(&prog,
                                       kernel.c_str(),
                                       "op.cu",
                                       0,      // numHeaders
                                       NULL,   // headers
                                       NULL)); // includeNames

    nvrtcResult compile_result = nvrtcCompileProgram(prog, number_of_options, options);

    if (compile_result != NVRTC_SUCCESS)
    {
        throw std::runtime_error("compile error: \n" + kernel + "\n options");
    }

    size_t ptx_size;
    NVRTC_SAFE_CALL(nvrtcGetPTXSize(prog, &ptx_size));
    char* ptx = new char[ptx_size];
    NVRTC_SAFE_CALL(
        nvrtcGetPTX(prog,
                    ptx)); // Load the generated PTX and get a handle to the parent kernel.
    NVRTC_SAFE_CALL(nvrtcDestroyProgram(&prog)); // Destroy the program.

    CUmodule module;
    CUfunction function;
    CUDA_SAFE_CALL(cuModuleLoadDataEx(&module, ptx, 0, 0, 0));
    CUDA_SAFE_CALL(cuModuleGetFunction(&function, module, name.c_str()));
    return std::make_shared<CUfunction>(function);
}
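A hypothetical call site, mirroring the two-option invocation CudaFunctionPool::set makes in the next file; a current CUDA driver context (e.g., from CudaContextManager::instance()) is assumed for cuModuleLoadDataEx, and the kernel body and name are invented:

#include <memory>
#include <string>
#include "ngraph/runtime/gpu/gpu_cuda_function_builder.hpp"

std::shared_ptr<CUfunction> build_noop()
{
    const char* opts[] = {"--gpu-architecture=compute_35",
                          "--relocatable-device-code=true"};
    std::string kernel =
        R"(extern "C" __global__ void cuda_noop(float* in, float* out, size_t n) { })";
    // NVRTC compiles the string to PTX; the driver API then loads it and
    // returns a handle to the named __global__ entry point.
    return ngraph::runtime::gpu::CudaFunctionBuilder::get("cuda_noop", kernel, 2, opts);
}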
src/ngraph/runtime/gpu/gpu_cuda_function_pool.cpp
@@ -26,40 +26,31 @@

Same rewrite:

static const std::string s_output_dir = "gpu_codegen";

using namespace ngraph;

runtime::gpu::CudaFunctionPool& runtime::gpu::CudaFunctionPool::instance()
{
    static CudaFunctionPool pool;
    return pool;
}

void runtime::gpu::CudaFunctionPool::set(const std::string& name, const std::string& kernel)
{
    const char* opts[] = {"--gpu-architecture=compute_35", "--relocatable-device-code=true"};
    std::string filename =
        file_util::path_join(s_output_dir, "cuda_kernel_" + name + "_codegen.cu");
    std::ofstream out(filename);
    out << kernel;
    out.close();
    m_function_map.insert({name, CudaFunctionBuilder::get("cuda_" + name, kernel, 2, opts)});
}

std::shared_ptr<CUfunction> runtime::gpu::CudaFunctionPool::get(const std::string& name)
{
    auto it = m_function_map.find(name);
    if (it != m_function_map.end())
    {
        return (*it).second;
    }
    return nullptr;
}
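A sketch of the check-then-insert round trip the pool supports, the same pattern emit_broadcast uses later in this commit (header path is an assumption):

#include <memory>
#include <string>
#include "ngraph/runtime/gpu/gpu_cuda_function_pool.hpp"

std::shared_ptr<CUfunction> cached_kernel(const std::string& name, const std::string& kernel)
{
    using ngraph::runtime::gpu::CudaFunctionPool;
    // get() returns nullptr on a miss, so each kernel is compiled and
    // cached exactly once; set() also dumps the source to gpu_codegen/.
    if (CudaFunctionPool::instance().get(name) == nullptr)
    {
        CudaFunctionPool::instance().set(name, kernel);
    }
    return CudaFunctionPool::instance().get(name);
}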
src/ngraph/runtime/gpu/gpu_cuda_kernel_builder.cpp
@@ -16,74 +16,67 @@

Same rewrite:

#include "ngraph/runtime/gpu/gpu_cuda_kernel_builder.hpp"
#include "ngraph/codegen/code_writer.hpp"

using namespace ngraph;

void runtime::gpu::CudaKernelBuilder::get_elementwise_op(codegen::CodeWriter& writer,
                                                         const std::string& name,
                                                         const std::string& data_type,
                                                         const std::string& op,
                                                         const size_t& num_inputs)
{
    writer << "extern \"C\" __global__ void cuda_" << name << "(";
    for (size_t i = 0; i < num_inputs; i++)
    {
        writer << data_type << "* in" << i << ", ";
    }
    writer << data_type << "* out,"
           << "size_t n)\n";
    writer << "{\n";
    writer.indent++;
    {
        writer << "size_t tid = blockIdx.x * blockDim.x + threadIdx.x; \n";
        writer << "if (tid < n)\n";
        writer << "{\n";
        writer.indent++;
        {
            writer << "out[tid] = " << op << "(";
            for (size_t i = 0; i < num_inputs - 1; i++)
            {
                writer << "in" << i << "[tid], ";
            }
            writer << "in" << num_inputs - 1 << "[tid]);\n";
        }
        writer.indent--;
        writer << "}\n";
    }
    writer.indent--;
    writer << "}\n";

    return;
}

void runtime::gpu::CudaKernelBuilder::get_device_helper(codegen::CodeWriter& writer,
                                                        const std::string& name,
                                                        const std::string& data_type,
                                                        const std::string& math_kernel,
                                                        const size_t& num_inputs)
{
    if (math_kernel.size())
    {
        writer << "__device__ " << data_type << " " << name << "(";
        for (size_t i = 0; i < num_inputs - 1; i++)
        {
            writer << data_type << " x" << i << ", ";
        }
        writer << data_type << " x" << num_inputs - 1;
        writer << ")\n";
        writer << "{\n";
        writer.indent++;
        {
            writer << "return " + math_kernel << ";\n";
        }
        writer.indent--;
        writer << "}\n";
    }
    return;
}
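For concreteness, here is the text these two emitters produce for name = "add", data_type = "float", op = "add", num_inputs = 2 (note the space the emitter omits after "out,"), paired with the device helper get_device_helper would emit for a hypothetical math_kernel of "x0 + x1"; the actual math_kernel strings are not part of this diff:

__device__ float add(float x0, float x1)
{
    return x0 + x1;
}
extern "C" __global__ void cuda_add(float* in0, float* in1, float* out,size_t n)
{
    size_t tid = blockIdx.x * blockDim.x + threadIdx.x; 
    if (tid < n)
    {
        out[tid] = add(in0[tid], in1[tid]);
    }
}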
src/ngraph/runtime/gpu/gpu_cuda_kernel_emitters.cpp
@@ -20,26 +20,22 @@

Same rewrite (the second hunk header below marks lines of the generated kernel string elided between hunks):

#include "ngraph/runtime/gpu/gpu_cuda_kernel_emitters.hpp"
#include "ngraph/runtime/gpu/gpu_cuda_kernel_ops.hpp"

using namespace ngraph;

void runtime::gpu::emit_broadcast(
    void* in, void* out, size_t repeat_size, size_t repeat_times, size_t count)
{
    std::string name = "broadcast";
    // Create an instance of nvrtcProgram with the code string.
    if (CudaFunctionPool::instance().get(name) == nullptr)
    {
        std::string kernel;
        std::string data_type("float");

        kernel = R"(
extern "C" __global__
void cuda_)" + name +
                 "(" + data_type + "* in, " + data_type + "* out, size_t m, size_t k, size_t n)\n" +
                 R"(
{
    size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
    if(tid < n)

@@ -48,28 +44,25 @@ void cuda_)" + name + "(" + data_type +

        out[tid] = in[idx];
    }
})";
        CudaFunctionPool::instance().set(name, kernel);
    }

    //convert runtime ptr to driver api ptr
    CUdeviceptr d_ptr_in, d_ptr_out;
    d_ptr_in = CUdeviceptr(in);
    d_ptr_out = CUdeviceptr(out);

    void* args_list[] = {&d_ptr_in, &d_ptr_out, &repeat_size, &repeat_times, &count};

    CUDA_SAFE_CALL(cuLaunchKernel(*CudaFunctionPool::instance().get(name).get(),
                                  static_cast<unsigned int>(count),
                                  1,
                                  1, // grid dim
                                  1,
                                  1,
                                  1, // block dim
                                  0,
                                  NULL, // shared mem and stream
                                  args_list,
                                  0)); // arguments
    CUDA_SAFE_CALL(cuCtxSynchronize()); // Retrieve and print output.
}
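Two details worth noting in the launch above: the grid is (count, 1, 1) and the block is (1, 1, 1), i.e., one thread per block, and args_list binds (in, out, repeat_size, repeat_times, count) to the generated kernel's (in, out, m, k, n) parameters in order. A minimal, hypothetical driver (buffer sizes are invented, and a current driver-API context, e.g., from CudaContextManager::instance(), is assumed):

#include <cuda_runtime.h>
#include "ngraph/runtime/gpu/gpu_cuda_kernel_emitters.hpp"

int main()
{
    const size_t repeat_size = 4, repeat_times = 3, count = 12;
    float *in = nullptr, *out = nullptr;
    cudaMalloc(&in, repeat_size * sizeof(float));
    cudaMalloc(&out, count * sizeof(float));
    // Compiles and caches the broadcast kernel on first use, then launches it.
    ngraph::runtime::gpu::emit_broadcast(in, out, repeat_size, repeat_times, count);
    cudaFree(in);
    cudaFree(out);
    return 0;
}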
src/ngraph/runtime/gpu/gpu_external_function.cpp
@@ -114,6 +114,7 @@
 #include "ngraph/runtime/gpu/gpu_kernel_emitters.hpp"

 using namespace std;
+using namespace ngraph;

 static const string s_output_dir = "gpu_codegen";
@@ -159,119 +160,113 @@ static StaticInitializers s_static_initializers;

#define TI(x) type_index(typeid(x))

The dispatcher table and all member definitions likewise move out of the nested namespace blocks and gain explicit runtime::gpu:: qualification:

static const runtime::gpu::OpMap dispatcher{
    {TI(ngraph::op::Add), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Add>},
    {TI(ngraph::op::Dot), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Dot>},
    {TI(ngraph::op::Multiply), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Multiply>},
    {TI(ngraph::op::Parameter), &runtime::gpu::GPU_Emitter::nop},
    {TI(ngraph::op::Abs), &runtime::gpu::GPU_Emitter::EmitElementwise},
    {TI(ngraph::op::Concat), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Concat>},
    {TI(ngraph::op::Divide), &runtime::gpu::GPU_Emitter::EmitElementwise},
    {TI(ngraph::op::Equal), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Equal>},
    {TI(ngraph::op::GetOutputElement),
     &runtime::gpu::GPU_Emitter::emit<ngraph::op::GetOutputElement>},
    {TI(ngraph::op::Greater), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Greater>},
    {TI(ngraph::op::GreaterEq), &runtime::gpu::GPU_Emitter::emit<ngraph::op::GreaterEq>},
    {TI(ngraph::op::Less), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Less>},
    {TI(ngraph::op::LessEq), &runtime::gpu::GPU_Emitter::emit<ngraph::op::LessEq>},
    {TI(ngraph::op::Log), &runtime::gpu::GPU_Emitter::EmitElementwise},
    {TI(ngraph::op::Maximum), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Maximum>},
    {TI(ngraph::op::Minimum), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Minimum>},
    {TI(ngraph::op::Negative), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Negative>},
    {TI(ngraph::op::NotEqual), &runtime::gpu::GPU_Emitter::emit<ngraph::op::NotEqual>},
    {TI(ngraph::op::Power), &runtime::gpu::GPU_Emitter::EmitElementwise},
    {TI(ngraph::op::Select), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Select>},
    {TI(ngraph::op::Subtract), &runtime::gpu::GPU_Emitter::EmitElementwise},
    {TI(ngraph::op::Broadcast), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Broadcast>},
    {TI(ngraph::op::Convert), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Convert>},
    {TI(ngraph::op::Constant), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Constant>},
    {TI(ngraph::op::Reshape), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Reshape>},
    {TI(ngraph::op::FunctionCall), &runtime::gpu::GPU_Emitter::emit<ngraph::op::FunctionCall>},
    {TI(ngraph::op::Reduce), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Reduce>},
    {TI(ngraph::op::Sign), &runtime::gpu::GPU_Emitter::EmitElementwise},
    {TI(ngraph::op::Slice), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Slice>},
    {TI(ngraph::op::Sum), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Sum>},
    {TI(ngraph::op::Exp), &runtime::gpu::GPU_Emitter::EmitElementwise},
    {TI(ngraph::op::Sin), &runtime::gpu::GPU_Emitter::EmitElementwise},
    {TI(ngraph::op::Sinh), &runtime::gpu::GPU_Emitter::EmitElementwise},
    {TI(ngraph::op::Cos), &runtime::gpu::GPU_Emitter::EmitElementwise},
    {TI(ngraph::op::Cosh), &runtime::gpu::GPU_Emitter::EmitElementwise},
    {TI(ngraph::op::Tan), &runtime::gpu::GPU_Emitter::EmitElementwise},
    {TI(ngraph::op::Tanh), &runtime::gpu::GPU_Emitter::EmitElementwise},
    {TI(ngraph::op::Asin), &runtime::gpu::GPU_Emitter::EmitElementwise},
    {TI(ngraph::op::Acos), &runtime::gpu::GPU_Emitter::EmitElementwise},
    {TI(ngraph::op::Atan), &runtime::gpu::GPU_Emitter::EmitElementwise},
    {TI(ngraph::op::ReplaceSlice), &runtime::gpu::GPU_Emitter::emit<ngraph::op::ReplaceSlice>},
    {TI(ngraph::op::OneHot), &runtime::gpu::GPU_Emitter::emit<ngraph::op::OneHot>},
    {TI(ngraph::op::Floor), &runtime::gpu::GPU_Emitter::EmitElementwise},
    {TI(ngraph::op::Ceiling), &runtime::gpu::GPU_Emitter::EmitElementwise},
    {TI(ngraph::op::Sqrt), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Sqrt>},
    {TI(ngraph::op::Convolution), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Convolution>},
    {TI(ngraph::op::ConvolutionBackpropFilters),
     &runtime::gpu::GPU_Emitter::emit<ngraph::op::ConvolutionBackpropFilters>},
    {TI(ngraph::op::ConvolutionBackpropData),
     &runtime::gpu::GPU_Emitter::emit<ngraph::op::ConvolutionBackpropData>},
    {TI(ngraph::op::Not), &runtime::gpu::GPU_Emitter::EmitElementwise},
    {TI(ngraph::op::MaxPool), &runtime::gpu::GPU_Emitter::emit<ngraph::op::MaxPool>},
    {TI(ngraph::op::Reverse), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Reverse>},
    {TI(ngraph::op::Result), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Result>},
    {TI(ngraph::op::ReduceWindow), &runtime::gpu::GPU_Emitter::emit<ngraph::op::ReduceWindow>},
    {TI(ngraph::op::SelectAndScatter),
     &runtime::gpu::GPU_Emitter::emit<ngraph::op::SelectAndScatter>},
    {TI(ngraph::op::AvgPool), &runtime::gpu::GPU_Emitter::emit<ngraph::op::AvgPool>},
    {TI(ngraph::op::AvgPoolBackprop),
     &runtime::gpu::GPU_Emitter::emit<ngraph::op::AvgPoolBackprop>},
    {TI(ngraph::op::Pad), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Pad>},
    {TI(ngraph::op::BatchNorm), &runtime::gpu::GPU_Emitter::emit<ngraph::op::BatchNorm>},
    {TI(ngraph::op::BatchNormBackprop),
     &runtime::gpu::GPU_Emitter::emit<ngraph::op::BatchNormBackprop>},
    {TI(ngraph::op::MaxPoolBackprop),
     &runtime::gpu::GPU_Emitter::emit<ngraph::op::MaxPoolBackprop>},
    {TI(ngraph::op::Product), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Product>},
    {TI(ngraph::op::Max), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Max>},
    {TI(ngraph::op::Min), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Min>},
    {TI(ngraph::op::Relu), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Relu>},
    {TI(ngraph::op::ReluBackprop), &runtime::gpu::GPU_Emitter::emit<ngraph::op::ReluBackprop>},
    {TI(ngraph::op::Softmax), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Softmax>},
};

runtime::gpu::GPU_ExternalFunction::GPU_ExternalFunction(
    const shared_ptr<ngraph::Function>& function, bool release_function)
    : ngraph::runtime::ExternalFunction(function, release_function)
    , m_compiled_function(nullptr)
    , m_emit_timing(std::getenv("NGRAPH_GPU_EMIT_TIMING") != nullptr)
{
}
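The table above is the classic type_index dispatch idiom: map a node's dynamic type to an emitter function. A reduced, self-contained sketch with hypothetical node types:

#include <functional>
#include <iostream>
#include <stdexcept>
#include <typeindex>
#include <unordered_map>

struct Node { virtual ~Node() = default; };
struct Add : Node {};
struct Multiply : Node {};

using Handler = std::function<void(const Node&)>;
#define TI(x) std::type_index(typeid(x))

static const std::unordered_map<std::type_index, Handler> dispatcher{
    {TI(Add), [](const Node&) { std::cout << "emit Add\n"; }},
    {TI(Multiply), [](const Node&) { std::cout << "emit Multiply\n"; }},
};

void emit(const Node& n)
{
    // typeid on a reference to a polymorphic type yields the dynamic type,
    // so this selects the handler registered for the node's concrete class.
    auto handler = dispatcher.find(TI(n));
    if (handler == dispatcher.end())
    {
        throw std::runtime_error("Unhandled op during code generation");
    }
    handler->second(n);
}

int main()
{
    Add add;
    emit(add); // prints "emit Add"
}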
void runtime::gpu::GPU_ExternalFunction::compile()
{
    if (m_is_compiled)
    {
        return;
    }

    string function_name = m_function->get_name();
    string dump_filename = file_util::path_join(s_output_dir, function_name + "_ops.txt");

    pass::Manager pass_manager;
    //    pass_manager.register_pass<pass::TopologicalSort>();
    // For now, just make everyone row-major.
    pass_manager.register_pass<pass::AssignLayout<descriptor::layout::DenseTensorViewLayout>>();
    pass_manager.register_pass<pass::Liveness>();
    pass_manager.register_pass<pass::MemoryLayout>(64);
    pass_manager.register_pass<pass::DumpSorted>(dump_filename);
    pass_manager.run_passes(m_function);

    codegen::CodeWriter writer;

    writer +=
        R"(// Generated by the NGraph GPU backend
#include <cublas_v2.h>
#include <cuda.h>
#include <cuda_runtime.h>
@@ -297,541 +292,512 @@ namespace ngraph

#include "ngraph/util.hpp"
)";

    string pch_header_source = writer.get_code();

    writer += R"(
using namespace ngraph;
using namespace std;
)";

    if (m_emit_timing)
    {
        writer << "// Declare debug timers\n";
        vector<string> names;
        for (shared_ptr<Function> current_function : pass_manager.get_state().get_functions())
        {
            for (shared_ptr<Node> node : current_function->get_ordered_ops())
            {
                if (!node->is_parameter() && !node->is_constant())
                {
                    names.push_back(node->get_name());
                }
            }
        }
        for (const string& s : names)
        {
            writer << "ngraph::stopwatch timer_" << s << ";\n";
        }
        writer << "extern \"C\" size_t get_debug_timer_count() { return " << names.size()
               << "; }\n";
        writer << "extern \"C\" const char* get_debug_timer_name(size_t index)\n";
        writer << "{\n";
        writer.indent++;
        writer << "const char* rc;\n";
        writer << "switch(index)\n";
        writer << "{\n";
        for (size_t i = 0; i < names.size(); i++)
        {
            writer << "case " << i << ": rc = \"" << names[i] << "\"; break;\n";
        }
        writer << "default: rc = \"\";\n";
        writer << "}\n";
        writer << "return rc;\n";
        writer.indent--;
        writer << "}\n";
        writer << "extern \"C\" const size_t get_debug_timer_microseconds(size_t index)\n";
        writer << "{\n";
        writer.indent++;
        writer << "size_t rc;\n";
        writer << "switch(index)\n";
        writer << "{\n";
        for (size_t i = 0; i < names.size(); i++)
        {
            writer << "case " << i << ": rc = timer_" << names[i]
                   << ".get_total_microseconds(); break;\n";
        }
        writer << "default: rc = 0;\n";
        writer << "}\n";
        writer << "return rc;\n";
        writer.indent--;
        writer << "}\n";
        writer << "extern \"C\" const size_t get_debug_timer_call_count(size_t index)\n";
        writer << "{\n";
        writer.indent++;
        writer << "size_t rc;\n";
        writer << "switch(index)\n";
        writer << "{\n";
        for (size_t i = 0; i < names.size(); i++)
        {
            writer << "case " << i << ": rc = timer_" << names[i] << ".get_call_count(); break;\n";
        }
        writer << "default: rc = 0;\n";
        writer << "}\n";
        writer << "return rc;\n";
        writer.indent--;
        writer << "}\n";
        writer << "\n";
    }

    //    // The "dso_handle" symbol is required by __cxa_atexit()
    //    // which is enabled because the JIT uses it as the default mechanism
    //    // to register cleanup handlers. We use it, and not atexit(), because
    //    // atexit() happens too late, when the JIT is no longer alive
    writer << "void *__dso_handle = 0;\n\n";

    writer << "// Declare all constants\n";
    for (shared_ptr<Function> current_function : pass_manager.get_state().get_functions())
    {
        for (shared_ptr<Node> node : current_function->get_ordered_ops())
        {
            const op::Constant* c = dynamic_cast<ngraph::op::Constant*>(node.get());
            if (c)
            {
                shared_ptr<descriptor::TensorView> tv = node->get_outputs()[0].get_tensor_view();
                auto c_value_strings = c->get_value_strings();
                writer << "static " << tv->get_tensor().get_element_type().c_type_string() << " "
                       << tv->get_tensor().get_name() << "_cpu[" << c_value_strings.size()
                       << "] =\n";
                writer << "{\n";
                writer.indent++;
                writer << emit_string_array(c_value_strings, 100 - writer.indent * 4);
                writer.indent--;
                writer << "\n};\n\n";
                writer << "static " << tv->get_tensor().get_element_type().c_type_string() << " *"
                       << tv->get_tensor().get_name() << ";\n";
                m_variable_name_map[tv->get_tensor().get_name()] = tv->get_tensor().get_name();
            }
        }
    }

    writer << "// Declare all functions\n";
    for (shared_ptr<Function> f : pass_manager.get_state().get_functions())
    {
        writer << "extern \"C\" void " << f->get_name() << "(void** inputs, void** outputs, "
                  "cublasHandle_t& cublas_handle, "
                  "cudnnHandle_t& cudnn_handle);\n";
    }

    writer << "\n";

    unordered_map<Node*, string> match_functions;
    for (shared_ptr<Function> current_function : pass_manager.get_state().get_functions())
    {
        set<string> output_names;
        for (shared_ptr<Node> op : current_function->get_results())
        {
            shared_ptr<descriptor::TensorView> tv = op->get_output_tensor_view();
            output_names.insert(tv->get_tensor().get_name());
        }
        const list<shared_ptr<Node>>& tmp = current_function->get_ordered_ops();
        if (tmp.size() < 2)
        {
            // Since we are comparing ops there must be at least two ops to proceed.
            continue;
        }
        vector<shared_ptr<Node>> op_list{tmp.begin(), tmp.end()};
        for (size_t i = 0; i < op_list.size() - 1; i++)
        {
            if (op_list[i]->is_constant() || op_list[i]->is_parameter())
            {
                continue;
            }
            if (contains_key(match_functions, op_list[i].get()))
            {
                continue;
            }
            string match_function_name;
            if (!match_function_name.empty())
            {
                writer << "static void " << match_function_name << "(";
                writer.indent++;
                // Work around a compiler warning (*node inside typeid may have effects
                // with shared pointers, which is fine here but clang doesn't like it.)
                auto& n = *op_list[i];
                auto handler = dispatcher.find(type_index(typeid(n)));
                vector<GPU_TensorViewWrapper> in;
                size_t arg_index = 0;
                set<string> arg_names;
                for (const descriptor::Input& input : n.get_inputs())
                {
                    const descriptor::Output& output = input.get_output();
                    shared_ptr<descriptor::TensorView> tv = output.get_tensor_view();
                    GPU_TensorViewWrapper tvw{tv, "_arg" + to_string(arg_index)};
                    if (!contains(arg_names, tvw.get_name()))
                    {
                        arg_names.insert(tvw.get_name());
                        if (arg_index++ > 0)
                        {
                            writer << ",";
                        }
                        writer << "\n";
                        writer << tvw.get_type() << "* " << tvw.get_name();
                    }
                    in.push_back(tvw);
                }
                vector<GPU_TensorViewWrapper> out;
                for (const descriptor::Output& output : n.get_outputs())
                {
                    shared_ptr<descriptor::TensorView> tv = output.get_tensor_view();
                    GPU_TensorViewWrapper tvw{tv, "_out" + to_string(arg_index)};
                    if (arg_index++ > 0)
                    {
                        writer << ",";
                    }
                    writer << "\n";
                    writer << tvw.get_type() << "* " << tvw.get_name();
                    out.push_back(tvw);
                }
                writer.indent--;
                writer << "\n)\n";
                writer << "{\n";
                writer.indent++;
                handler->second(this, writer, &n, in, out);
                writer.indent--;
                writer << "}\n";
            }
        }
    }

    for (shared_ptr<Function> current_function : pass_manager.get_state().get_functions())
    {
        set<string> output_names;
        for (shared_ptr<Node> op : current_function->get_results())
        {
            shared_ptr<descriptor::TensorView> tv = op->get_output_tensor_view();
            output_names.insert(tv->get_tensor().get_name());
        }
        set<descriptor::TensorView*> constants;
        for (shared_ptr<Node> node : current_function->get_ordered_ops())
        {
            if (dynamic_cast<ngraph::op::Constant*>(node.get()))
            {
                shared_ptr<descriptor::TensorView> tv = node->get_outputs()[0].get_tensor_view();
                constants.insert(tv.get());
            }
        }

        writer << "extern \"C\" void " << current_function->get_name();
        writer << "(void** inputs, void** outputs, cublasHandle_t& cublas_handle, "
                  "cudnnHandle_t& "
                  "cudnn_handle)\n";
        writer << "{\n";
        writer.indent++;

        for (shared_ptr<Node> node : current_function->get_ordered_ops())
        {
            const op::Constant* c = dynamic_cast<op::Constant*>(node.get());
            if (c)
            {
                shared_ptr<descriptor::TensorView> tv = node->get_outputs()[0].get_tensor_view();
                writer << "if(" << tv->get_tensor().get_name() << " == NULL)\n";
                writer << "{\n";
                writer.indent++;
                writer << "runtime::gpu::cuda_memcpyHtD(" << tv->get_tensor().get_name() << ", "
                       << tv->get_tensor().get_name() << "_cpu, " << tv->get_tensor().size()
                       << ");\n";
                writer.indent--;
                writer << "}\n";
            }
        }

        bool temporaries_used = false;
        size_t worst_case_tmp_size = 0;
        for (shared_ptr<Node> node : current_function->get_ordered_ops())
        {
            if (node->liveness_new_list.size() > 0)
            {
                temporaries_used = true;
                for (descriptor::Tensor* tensor : node->liveness_new_list)
                {
                    worst_case_tmp_size += tensor->size();
                }
            }
        }
        if (temporaries_used)
        {
            size_t temp_pool_size = current_function->get_temporary_pool_size();
            writer << "// Allocate the memory pool\n";
            // TODO memory pool malloc.
            writer << "void* pool_base_ptr = ngraph::runtime::gpu::create_gpu_buffer("
                   << temp_pool_size << ");\n";

            // Add temporaries to the variable name map
            for (shared_ptr<Node> node : current_function->get_ordered_ops())
            {
                for (descriptor::Tensor* tensor : node->liveness_new_list)
                {
                    stringstream ss;
                    ss << "((" << tensor->get_element_type().c_type_string()
                       << "*)((char *)pool_base_ptr + " << tensor->get_pool_offset() << "))";
                    m_variable_name_map[tensor->get_name()] = ss.str();
                }
            }
        }

        // Add inputs to the variable name map
        size_t arg_index = 0;
        for (shared_ptr<ngraph::op::Parameter> param : current_function->get_parameters())
        {
            for (size_t i = 0; i < param->get_output_size(); ++i)
            {
                shared_ptr<descriptor::TensorView> tv = param->get_output_tensor_view(i);
                const element::Type& et = tv->get_tensor_view_type()->get_element_type();
                string type = et.c_type_string();
                stringstream ss;
                ss << "((" << type << "*)(inputs[" << arg_index << "]))";
                m_variable_name_map[tv->get_tensor().get_name()] = ss.str();
                arg_index++;
            }
        }

        // create output alias map
        size_t output_index = 0;
        unordered_map<descriptor::TensorView*, vector<size_t>> output_alias_map;
        vector<size_t> aliases;
        for (size_t i = 0; i < current_function->get_output_size(); ++i)
        {
            shared_ptr<Node> op = current_function->get_output_op(i);
            shared_ptr<descriptor::TensorView> otv = op->get_output_tensor_view();
            vector<size_t>& al = output_alias_map[otv.get()];
            al.push_back(output_index);
            if (al.size() > 1)
            {
                aliases.push_back(output_index);
            }
            output_index++;
        }

        // Add outputs to the variable name map
        output_index = 0;
        for (size_t i = 0; i < current_function->get_output_size(); ++i)
        {
            shared_ptr<Node> op = current_function->get_output_op(i);
            shared_ptr<descriptor::TensorView> tv = op->get_output_tensor_view();
            const element::Type& et = tv->get_tensor_view_type()->get_element_type();
            bool parameter_as_output = false;
            for (shared_ptr<ngraph::op::Parameter> param : current_function->get_parameters())
            {
                for (const descriptor::Output& pout : param->get_outputs())
                {
                    shared_ptr<descriptor::TensorView> ptv = pout.get_tensor_view();
                    if (tv == ptv)
                    {
                        parameter_as_output = true;
                        writer << "ngraph::runtime::gpu::cuda_memcpyDtD(reinterpret_cast<"
                               << et.c_type_string() << "*>(outputs[" << output_index << "]), "
                               << m_variable_name_map[ptv->get_tensor().get_name()] << ", "
                               << ptv->get_tensor().size() << ");\n";
                        break;
                    }
                }
            }
            if (!parameter_as_output && !contains(aliases, output_index))
            {
                if (contains(constants, tv.get()))
                {
                    writer << "ngraph::runtime::gpu::cuda_memcpyHtD(outputs[" << output_index
                           << "], " << tv->get_tensor().get_name() << ", "
                           << tv->get_tensor().size() << ");\n";
                }
                else
                {
                    string type = et.c_type_string();
                    stringstream ss;
                    ss << "((" << type << "*)(outputs[" << output_index << "]))";
                    m_variable_name_map[tv->get_tensor().get_name()] = ss.str();
                }
            }
            output_index++;
        }

        for (shared_ptr<Node> node : current_function->get_ordered_ops())
        {
            auto& n = *node; // Work around a compiler warning (*node inside typeid may have effects
                             // with shared pointers, which is fine here but clang doesn't like it.)
            auto handler = dispatcher.find(type_index(typeid(n)));
            if (handler == dispatcher.end())
            {
                throw ngraph_error("Unhandled op during code generation : " + node->description());
            }
            vector<GPU_TensorViewWrapper> in;
            for (const descriptor::Input& input : node->get_inputs())
            {
                const descriptor::Output& output = input.get_output();
                shared_ptr<descriptor::TensorView> tv = output.get_tensor_view();
                in.push_back(
                    GPU_TensorViewWrapper(tv, m_variable_name_map[tv->get_tensor().get_name()]));
            }
            vector<GPU_TensorViewWrapper> out;
            for (const descriptor::Output& output : node->get_outputs())
            {
                shared_ptr<descriptor::TensorView> tv = output.get_tensor_view();
                out.push_back(
                    GPU_TensorViewWrapper(tv, m_variable_name_map[tv->get_tensor().get_name()]));
            }

            // Emit operation prologue
            if (!node->is_parameter() && !node->is_constant())
            {
                if (m_emit_timing)
                {
                    emit_debug_function_entry(writer, node.get(), in, out);
                }
            }

            // Emit operation body
            string func_name;
            auto it = match_functions.find(node.get());
            if (it != match_functions.end())
            {
                func_name = it->second;
            }
            if (func_name.empty())
            {
                handler->second(this, writer, node.get(), in, out);
            }
            else
            {
                vector<string> names;
                for (const GPU_TensorViewWrapper& tv : in)
                {
                    names.push_back(tv.get_name());
                }
                for (const GPU_TensorViewWrapper& tv : out)
                {
                    names.push_back(tv.get_name());
                }
                writer << func_name << "(" << join(names) << ");\n";
            }

            // Emit operation epilogue
            if (!node->is_parameter() && !node->is_constant())
            {
                if (m_emit_timing)
                {
                    emit_debug_function_exit(writer, node.get(), in, out);
                }
            }
        }

        writer.indent--;
        // End generated function
        writer += "}\n\n";
    }

    // TODO: Cleanup and make this a utility function

    file_util::make_directory(s_output_dir);
    string filename = file_util::path_join(s_output_dir, function_name + "_codegen.cpp");
    ofstream out(filename);
    string code = writer.get_code();
    out << code;
    out.close();

    m_compiler.reset(new codegen::Compiler());
    m_execution_engine.reset(new codegen::ExecutionEngine());

    m_compiler->set_precompiled_header_source(pch_header_source);

    auto codegen_module = m_compiler->compile(code);

    if (codegen_module == nullptr)
    {
        throw runtime_error("function failed to compile");
    }
    m_execution_engine->add_module(codegen_module);
    m_execution_engine->finalize();
    m_compiled_function = m_execution_engine->find_function<EntryPoint_t>(function_name);
    assert(m_compiled_function);

    m_is_compiled = true;
    if (m_release_function)
    {
        release_function();
    }
}
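Assembled end to end, the generated translation unit for a single hypothetical graph function "function_0" has roughly the following shape (illustrative only; the real file depends on the graph, the registered emitters, and whether NGRAPH_GPU_EMIT_TIMING adds the timer scaffolding):

// Generated by the NGraph GPU backend
#include <cublas_v2.h>
#include <cuda.h>
#include <cuda_runtime.h>
// ... remaining headers elided in the hunk above ...

using namespace ngraph;
using namespace std;

void *__dso_handle = 0;

// Declare all functions
extern "C" void function_0(void** inputs, void** outputs,
                           cublasHandle_t& cublas_handle, cudnnHandle_t& cudnn_handle);

extern "C" void function_0(void** inputs, void** outputs,
                           cublasHandle_t& cublas_handle, cudnnHandle_t& cudnn_handle)
{
    // One emitted block per op, reading ((float*)(inputs[i])) and writing
    // ((float*)(outputs[i])) as recorded in m_variable_name_map.
}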
void runtime::gpu::GPU_ExternalFunction::handle_output_alias(
    codegen::CodeWriter& writer,
    const Node& node,
    const unordered_map<descriptor::TensorView*, vector<size_t>>& output_alias_map)
{
    for (const descriptor::Output& output : node.get_outputs())
    {
        shared_ptr<descriptor::TensorView> otv = output.get_tensor_view();
        auto it = output_alias_map.find(otv.get());
        if (it != output_alias_map.end())
        {
            const vector<size_t>& outputs = it->second;
            if (outputs.size() > 1)
            {
                writer << "{ // handle output alias for previous op\n";
                writer.indent++;
                for (size_t i = 1; i < outputs.size(); i++)
                {
                    writer << "ngraph::runtime::gpu::cuda_memcpyDtD(static_cast<void*>("
                              "outputs["
                           << outputs[i] << "]), static_cast<void*>(outputs[" << outputs[0]
                           << "]), " << otv->get_tensor().size() << ");\n";
                }
                writer.indent--;
                writer << "}\n";
            }
        }
    }
}
shared_ptr<ngraph::runtime::CallFrame> runtime::gpu::GPU_ExternalFunction::make_call_frame()
{
    if (!m_is_compiled)
    {
        compile();
    }

    return make_shared<GPU_CallFrame>(shared_from_this(), m_compiled_function);
}
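A hypothetical driver for this lazy-compilation entry point; the include path is an assumption, and the object must be owned by a shared_ptr because make_call_frame calls shared_from_this():

#include <memory>
#include "ngraph/runtime/gpu/gpu_external_function.hpp"

std::shared_ptr<ngraph::runtime::CallFrame>
    compile_for_gpu(const std::shared_ptr<ngraph::Function>& f)
{
    // release_function = true, matching the constructor parameter shown above.
    auto external = std::make_shared<ngraph::runtime::gpu::GPU_ExternalFunction>(f, true);
    return external->make_call_frame(); // runs compile() on first use
}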
void runtime::gpu::GPU_ExternalFunction::emit_debug_function_entry(
    codegen::CodeWriter& writer,
    Node* node,
    const std::vector<GPU_TensorViewWrapper>& in,
    const std::vector<GPU_TensorViewWrapper>& out)
{
    writer << "timer_" << node->get_name() << ".start();\n";
}

void runtime::gpu::GPU_ExternalFunction::emit_debug_function_exit(
    codegen::CodeWriter& writer,
    Node* node,
    const std::vector<GPU_TensorViewWrapper>& in,
    const std::vector<GPU_TensorViewWrapper>& out)
{
    writer << "timer_" << node->get_name() << ".stop();\n";
}