Unverified Commit 95594d93 authored by Jayaram Bobba, committed by GitHub

Merge branch 'master' into jmenon/dex2

parents c829a9c7 3d66cba4
......@@ -68,16 +68,16 @@ public:
std::string generate_temporary_name(std::string prefix = "tempvar");
void block_begin(std::string block_prefix = "")
void block_begin()
{
*this << "{" << block_prefix << "\n";
*this << "{\n";
indent++;
}
void block_end(std::string block_suffix = "")
void block_end()
{
indent--;
*this << "}" << block_suffix << "\n";
*this << "}\n";
}
private:
......
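Editor's sketch (not part of this commit): the block_begin/block_end helpers above only emit a brace and adjust an indent counter, and this hunk drops their optional prefix/suffix arguments. A minimal, self-contained version of the pattern, using a hypothetical SimpleCodeWriter rather than the real ngraph::codegen::CodeWriter, could look like:

```cpp
#include <iostream>
#include <sstream>
#include <string>

// Hypothetical writer mirroring block_begin()/block_end(): emit a brace,
// then raise or lower the indent applied to subsequent lines.
class SimpleCodeWriter
{
public:
    void block_begin()
    {
        line("{");
        ++m_indent;
    }
    void block_end()
    {
        --m_indent;
        line("}");
    }
    void line(const std::string& text)
    {
        for (int i = 0; i < m_indent; ++i)
        {
            m_out << "    ";
        }
        m_out << text << '\n';
    }
    std::string get_code() const { return m_out.str(); }

private:
    int m_indent = 0;
    std::ostringstream m_out;
};

int main()
{
    SimpleCodeWriter writer;
    writer.line("void f()");
    writer.block_begin();
    writer.line("return;");
    writer.block_end();
    std::cout << writer.get_code();
}
```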
......@@ -265,7 +265,6 @@ void codegen::StaticCompiler::add_header_search_path(const string& p)
vector<string> paths = split(p, ';');
for (const string& path : paths)
{
NGRAPH_INFO << path;
if (!contains(m_extra_search_path_list, path))
{
m_extra_search_path_list.push_back(path);
......
......@@ -268,8 +268,8 @@ size_t runtime::gpu::CUDAEmitter::build_pad_dynamic(const runtime::gpu::GPURunti
compiled_kernel = ctx->compiled_kernel_pool->set(kernel_name.str(), writer.get_code());
}
unsigned int rank = static_cast<unsigned int>(input_shape.size());
unsigned int nthreads = static_cast<unsigned int>(shape_size(input_shape));
uint32_t rank = static_cast<uint32_t>(input_shape.size());
uint32_t nthreads = static_cast<uint32_t>(shape_size(input_shape));
GPUShape pad_below(input_shape.size(), 0);
GPUShape pad_interior(input_shape.size(), 1);
......@@ -286,14 +286,14 @@ size_t runtime::gpu::CUDAEmitter::build_pad_dynamic(const runtime::gpu::GPURunti
// get an allocator for transient per kernel gpu memory
GPUAllocator allocator = this->m_primitive_emitter->get_memory_allocator();
size_t idx_input_strides = allocator.reserve_argspace(
input_strides.data(), input_strides.size() * sizeof(unsigned int));
size_t idx_output_strides = allocator.reserve_argspace(
output_strides.data(), output_strides.size() * sizeof(unsigned int));
size_t idx_input_strides =
allocator.reserve_argspace(input_strides.data(), input_strides.size() * sizeof(uint32_t));
size_t idx_output_strides =
allocator.reserve_argspace(output_strides.data(), output_strides.size() * sizeof(uint32_t));
size_t idx_padding_below =
allocator.reserve_argspace(pad_below.data(), pad_below.size() * sizeof(unsigned int));
allocator.reserve_argspace(pad_below.data(), pad_below.size() * sizeof(uint32_t));
size_t idx_padding_interior =
allocator.reserve_argspace(pad_interior.data(), pad_interior.size() * sizeof(unsigned int));
allocator.reserve_argspace(pad_interior.data(), pad_interior.size() * sizeof(uint32_t));
// create the launch primitive
std::unique_ptr<gpu::primitive> pad_dynamic(new gpu::primitive{[=](void** inputs,
......@@ -1015,7 +1015,7 @@ size_t runtime::gpu::CUDAEmitter::build_reduce_window(const GPURuntimeContext* c
args_list[6] = &nthreads;
CUDA_SAFE_CALL(cuLaunchKernel(*compiled_kernel.get(),
static_cast<unsigned int>(nthreads),
static_cast<uint32_t>(nthreads),
1,
1, // grid dim
1,
......
......@@ -285,19 +285,19 @@ void runtime::gpu::CudaKernelBuilder::get_pad_dynamic_op(
const std::array<std::string, 2>& data_types)
{
writer << "extern \"C\" __global__ void cuda_" << name << "(" << data_types[0] << "* in, "
<< data_types[1] << "* out, unsigned int* input_strides, unsigned int* output_strides, "
"unsigned int* padding_below, unsigned int* "
"padding_interior, unsigned int rank, unsigned int n)\n";
<< data_types[1] << "* out, uint32_t* input_strides, uint32_t* output_strides, "
"uint32_t* padding_below, uint32_t* "
"padding_interior, uint32_t rank, uint32_t n)\n";
writer.block_begin();
{
writer << "unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;\n";
writer << "uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n";
writer << "if (tid < n)\n";
writer.block_begin();
{
writer << "unsigned int output_idx = 0;\n";
writer << "unsigned int input_idx = tid;\n";
writer << "uint32_t output_idx = 0;\n";
writer << "uint32_t input_idx = tid;\n";
writer << "for(unsigned int i = 0; i < rank; i++)\n";
writer << "for(uint32_t i = 0; i < rank; i++)\n";
writer.block_begin();
{
writer << "output_idx += (input_idx / input_strides[i] * padding_interior[i] + "
......
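Editor's sketch (not part of this commit): the generated pad kernel maps each flat input index to its padded output offset using the input strides, interior padding, and below padding. The same arithmetic can be checked on the host; the helper below is an illustration of that index formula, not the emitted CUDA source:

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

// Host-side illustration of the pad-dynamic index computation: one "thread"
// per input element, mapping its flat index to the flat output index from
// row-major strides and per-axis padding (interior == 1 means no dilation).
uint32_t pad_output_index(uint32_t input_idx,
                          const std::vector<uint32_t>& input_strides,
                          const std::vector<uint32_t>& output_strides,
                          const std::vector<uint32_t>& padding_below,
                          const std::vector<uint32_t>& padding_interior)
{
    uint32_t output_idx = 0;
    for (size_t i = 0; i < input_strides.size(); i++)
    {
        uint32_t coord = input_idx / input_strides[i];
        output_idx += (coord * padding_interior[i] + padding_below[i]) * output_strides[i];
        input_idx %= input_strides[i];
    }
    return output_idx;
}

int main()
{
    // A 2x2 input written at offset (1, 1) inside a 4x4 output.
    std::vector<uint32_t> in_strides{2, 1};
    std::vector<uint32_t> out_strides{4, 1};
    std::vector<uint32_t> below{1, 1};
    std::vector<uint32_t> interior{1, 1};
    for (uint32_t idx = 0; idx < 4; idx++)
    {
        std::printf("input %u -> output %u\n",
                    idx,
                    pad_output_index(idx, in_strides, out_strides, below, interior));
    }
}
```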
......@@ -47,7 +47,7 @@ void runtime::gpu::emit_onehot(const std::string& name,
void* args_list[] = {&in, &out, &repeat_size, &repeat_times, &count};
CUDA_SAFE_CALL(cuLaunchKernel(*compiled_kernel.get(),
static_cast<unsigned int>(count),
static_cast<uint32_t>(count),
1,
1, // grid dim
1,
......@@ -84,7 +84,7 @@ void runtime::gpu::emit_reshape(const std::string& name,
void* args_list[] = {&in, &out, &input_strides, &trans_strides, &rank, &count};
CUDA_SAFE_CALL(cuLaunchKernel(*compiled_kernel.get(),
static_cast<unsigned int>(count),
static_cast<uint32_t>(count),
1,
1, // grid dim
1,
......@@ -124,7 +124,7 @@ void runtime::gpu::emit_slice(const std::string& name,
void* args_list[] = {
&in, &out, &input_strides, &lower_bounds, &slice_strides, &output_strides, &rank, &count};
CUDA_SAFE_CALL(cuLaunchKernel(*compiled_kernel.get(),
static_cast<unsigned int>(count),
static_cast<uint32_t>(count),
1,
1, // grid dim
1,
......@@ -161,7 +161,7 @@ void runtime::gpu::emit_reverse(const std::string& name,
void* args_list[] = {&in, &out, &input_shapes, &reverse_axes, &rank, &count};
CUDA_SAFE_CALL(cuLaunchKernel(*compiled_kernel.get(),
static_cast<unsigned int>(count),
static_cast<uint32_t>(count),
1,
1, // grid dim
1,
......
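Editor's sketch (not part of this commit): these hunks only change the grid-dimension cast from unsigned int to uint32_t; cuLaunchKernel itself still takes unsigned int dimensions, so the fixed-width type is a host-side consistency choice. A simplified launch helper in the same style, with the compiled CUfunction, device pointers, and stream assumed to be set up elsewhere, might read:

```cpp
#include <cstdint>
#include <cuda.h>

// Illustrative wrapper around the launch pattern above: a 1-D grid with one
// block per element and a single thread per block, matching the grid/block
// arguments shown in the hunks.
CUresult launch_elementwise(CUfunction kernel,
                            CUdeviceptr in,
                            CUdeviceptr out,
                            uint32_t count,
                            CUstream stream)
{
    void* args[] = {&in, &out, &count};
    return cuLaunchKernel(kernel,
                          count, 1, 1, // grid dimensions
                          1, 1, 1,     // block dimensions
                          0,           // dynamic shared memory bytes
                          stream,
                          args,
                          nullptr);    // no extra launch options
}
```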
......@@ -118,7 +118,7 @@ namespace ngraph
{
return;
}
writer.block_begin(" // " + node->get_name());
writer.block_begin();
writer << "int count = " << out[0].get_size() << ";\n";
writer += R"(
float alpha1 = 1.0, alpha2 = 1.0, beta = 0;
......@@ -136,7 +136,7 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
CUDNN_OP_TENSOR_ADD,
CUDNN_DATA_FLOAT,
CUDNN_NOT_PROPAGATE_NAN));
)";
)";
writer << "CUDNN_SAFE_CALL(cudnnOpTensor(*ctx->cudnn_handle,"
<< "opTensorDesc,"
......@@ -193,7 +193,7 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
auto input_shape = args[0].get_shape();
Shape input_shape_padded = input_shape;
Shape padding_interior(data_dilation_strides);
writer.block_begin(" // " + node->get_name());
writer.block_begin();
if (pad_required || is_deconvolution)
{
input_shape_padded = get_padded_shape(
......@@ -314,7 +314,7 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
}
Shape padding_interior(data_dilation_strides);
writer.block_begin(" // " + node->get_name());
writer.block_begin();
if (pad_required || is_deconvolution)
{
output_shape_padded = get_padded_shape(
......@@ -467,7 +467,7 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
auto input_shape = args[0].get_shape();
auto input_shape_padded = input_shape;
Shape padding_interior(data_dilation_strides);
writer.block_begin(" // " + node->get_name());
writer.block_begin();
if (pad_required || is_deconvolution)
{
input_shape_padded = get_padded_shape(
......@@ -549,7 +549,7 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
auto& first = (arg0_shape.empty() ? args[0] : args[1]);
auto& second = (arg0_shape.empty() ? args[1] : args[0]);
writer.block_begin(" // " + node->get_name());
writer.block_begin();
writer << "int count = " << second.get_size() << ";\n";
writer << "CUBLAS_SAFE_CALL(cublasScopy("
<< "*ctx->cublas_handle,"
......@@ -566,7 +566,7 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
// set output to 0 if input size is 0
if (args[0].get_size() == 0 || args[1].get_size() == 0)
{
writer.block_begin(" // " + node->get_name());
writer.block_begin();
writer << "runtime::gpu::cuda_memset(" << out[0].get_name() << ", 0, "
<< out[0].get_size() << " * sizeof(float));\n";
writer.block_end();
......@@ -586,7 +586,7 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
"arg0 and arg1 shape does not match for dot.");
}
}
writer.block_begin(" // " + node->get_name());
writer.block_begin();
writer << "CUBLAS_SAFE_CALL(cublasSdot("
<< "*ctx->cublas_handle," << args[0].get_size() << ","
<< args[0].get_name() << ","
......@@ -598,7 +598,7 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
else if ((arg0_shape.size() == 2) && (arg1_shape.size() == 1) &&
(dot->get_reduction_axes_count() == 1))
{
writer.block_begin(" // " + node->get_name());
writer.block_begin();
writer << "const float alpha = 1.0;\n";
writer << "const float beta = 0;\n";
writer << "CUBLAS_SAFE_CALL(cublasSetPointerMode(*ctx->cublas_handle, "
......@@ -668,7 +668,7 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
}
// GEMM Call
writer.block_begin(" // " + node->get_name());
writer.block_begin();
writer << "const float alpha = 1.0;\n";
writer << "const float beta = 0.0;\n";
writer << "int m = " << m << ";\n";
......@@ -703,7 +703,7 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
{
return;
}
writer.block_begin(" // " + node->get_name());
writer.block_begin();
writer << "int count = " << out[0].get_size() << ";\n";
writer += R"(
float alpha1 = 1.0, alpha2 = 1.0, beta = 0;
......@@ -721,7 +721,7 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
CUDNN_OP_TENSOR_MAX,
CUDNN_DATA_FLOAT,
CUDNN_NOT_PROPAGATE_NAN));
)";
)";
writer << "CUDNN_SAFE_CALL(cudnnOpTensor(*ctx->cudnn_handle,"
<< "opTensorDesc,"
......@@ -741,7 +741,7 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
{
return;
}
writer.block_begin(" // " + node->get_name());
writer.block_begin();
writer << "int count = " << out[0].get_size() << ";\n";
writer += R"(
float alpha1 = 1.0, alpha2 = 1.0, beta = 0;
......@@ -759,7 +759,7 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
CUDNN_OP_TENSOR_MIN,
CUDNN_DATA_FLOAT,
CUDNN_NOT_PROPAGATE_NAN));
)";
)";
writer << "CUDNN_SAFE_CALL(cudnnOpTensor(*ctx->cudnn_handle,"
<< "opTensorDesc,"
......@@ -779,7 +779,7 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
{
return;
}
writer.block_begin(" // " + node->get_name());
writer.block_begin();
writer << "int count = " << out[0].get_size() << ";\n";
writer += R"(
float alpha1 = -1.0, alpha2 = 0, beta = 0;
......@@ -797,7 +797,7 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
CUDNN_OP_TENSOR_ADD,
CUDNN_DATA_FLOAT,
CUDNN_NOT_PROPAGATE_NAN));
)";
)";
writer << "CUDNN_SAFE_CALL(cudnnOpTensor(*ctx->cudnn_handle,"
<< "opTensorDesc,"
......@@ -825,7 +825,7 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
// broadcast axes is empty, do a copy
if (axes.empty())
{
writer.block_begin(" // " + node->get_name());
writer.block_begin();
kernel::emit_memcpyDtD(writer, out[0], args[0]);
writer.block_end();
return;
......@@ -867,7 +867,7 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
block_size += block_strides[i];
}
writer.block_begin(" // " + node->get_name());
writer.block_begin();
writer << "int count = " << out[0].get_size() << ";\n";
writer << "int num_inputs = " << args.size() << ";\n";
......@@ -910,7 +910,7 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
return;
}
auto reshape = static_cast<const op::Reshape*>(node);
writer.block_begin(" // " + node->get_name());
writer.block_begin();
auto arg_shape = args[0].get_shape();
auto arg_rank = arg_shape.size();
auto result_shape = out[0].get_shape();
......@@ -1013,7 +1013,7 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
const auto input_strides = row_major_strides(arg_shape);
const auto output_strides = row_major_strides(result_shape);
writer.block_begin(" // " + node->get_name());
writer.block_begin();
if (args[0].get_size() == out[0].get_size())
{
kernel::emit_memcpyDtD(writer, out[0], args[0]);
......@@ -1077,7 +1077,7 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
{
reverse_axes_flag[a] = 1;
}
writer.block_begin(" // " + node->get_name());
writer.block_begin();
if (out[0].get_size() == 1)
{
kernel::emit_memcpyDtD(writer, out[0], args[0]);
......@@ -1124,7 +1124,7 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
{
return;
}
writer.block_begin(" // " + node->get_name());
writer.block_begin();
writer << "int count = " << out[0].get_size() << ";\n";
writer += R"(
float alpha1 = 1.0, alpha2 = 1.0, beta = 0;
......@@ -1142,7 +1142,7 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
CUDNN_OP_TENSOR_MUL,
CUDNN_DATA_FLOAT,
CUDNN_NOT_PROPAGATE_NAN));
)";
)";
writer << "CUDNN_SAFE_CALL(cudnnOpTensor(*ctx->cudnn_handle,"
<< "opTensorDesc,"
......@@ -1173,7 +1173,7 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
repeat_size *= result_shape[i];
}
writer.block_begin(" // " + node->get_name());
writer.block_begin();
writer << "runtime::gpu::cuda_memset(" << out[0].get_name() << ", 0, "
<< out[0].get_size() << " * " << out[0].get_element_type().size() << ");\n";
writer << "runtime::gpu::emit_onehot(\"" << node->description() << "\", {\""
......@@ -1193,7 +1193,7 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
{
return;
}
writer.block_begin(" // " + node->get_name());
writer.block_begin();
writer << "int count = " << out[0].get_size() << ";\n";
writer += R"(
float alpha1 = 1.0, alpha2 = 0, beta = 0;
......@@ -1211,7 +1211,7 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
CUDNN_OP_TENSOR_SQRT,
CUDNN_DATA_FLOAT,
CUDNN_NOT_PROPAGATE_NAN));
)";
)";
writer << "CUDNN_SAFE_CALL(cudnnOpTensor(*ctx->cudnn_handle,"
<< "opTensorDesc,"
......@@ -1227,7 +1227,7 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
template <>
void GPU_Emitter::EMITTER_DECL(ngraph::op::Result)
{
writer.block_begin(" // " + node->get_name());
writer.block_begin();
kernel::emit_memcpyDtD(writer, out[0], args[0]);
writer.block_end();
return;
......@@ -1237,7 +1237,7 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
void GPU_Emitter::EMITTER_DECL(ngraph::op::Max)
{
const ngraph::op::Max* max_op = static_cast<const ngraph::op::Max*>(node);
writer.block_begin(" // " + node->get_name());
writer.block_begin();
{
if (out[0].get_size() != 0)
{
......@@ -1285,7 +1285,7 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
void GPU_Emitter::EMITTER_DECL(ngraph::op::Min)
{
const ngraph::op::Min* min_op = static_cast<const ngraph::op::Min*>(node);
writer.block_begin(" // " + node->get_name());
writer.block_begin();
{
if (out[0].get_size() != 0)
{
......@@ -1333,7 +1333,7 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
void GPU_Emitter::EMITTER_DECL(ngraph::op::Sum)
{
const ngraph::op::Sum* sum = static_cast<const ngraph::op::Sum*>(node);
writer.block_begin(" // " + node->get_name());
writer.block_begin();
{
if (out[0].get_size() != 0)
{
......@@ -1372,7 +1372,7 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
void GPU_Emitter::EMITTER_DECL(ngraph::op::Product)
{
const ngraph::op::Product* product = static_cast<const ngraph::op::Product*>(node);
writer.block_begin(" // " + node->get_name());
writer.block_begin();
{
if (out[0].get_size() != 0)
{
......@@ -1432,7 +1432,7 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
{TI(ngraph::op::Maximum), CUDNN_REDUCE_TENSOR_MAX},
{TI(ngraph::op::Minimum), CUDNN_REDUCE_TENSOR_MIN}};
const ngraph::op::Reduce* reduce_op = static_cast<const ngraph::op::Reduce*>(node);
writer.block_begin(" // " + node->get_name());
writer.block_begin();
{
if (out[0].get_size() != 0)
{
......@@ -1521,7 +1521,7 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
const ngraph::op::ReduceWindow* reduce_window_op =
static_cast<const ngraph::op::ReduceWindow*>(node);
writer.block_begin(" // " + node->get_name());
writer.block_begin();
{
if (out[0].get_size() != 0)
{
......@@ -1620,7 +1620,7 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
void GPU_Emitter::EMITTER_DECL(ngraph::op::Pad)
{
auto pad = static_cast<const ngraph::op::Pad*>(node);
writer.block_begin(" // " + node->get_name());
writer.block_begin();
{
auto input_shape = args[0].get_shape();
auto output_shape = out[0].get_shape();
......@@ -1653,7 +1653,7 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
{
// assumes NC{d1,d2,...} format
auto max_pool = static_cast<const ngraph::op::MaxPool*>(node);
writer.block_begin(" // " + node->get_name());
writer.block_begin();
{
auto& input_shape = args[0].get_shape();
auto& result_shape = out[0].get_shape();
......@@ -1785,7 +1785,7 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
template <>
void GPU_Emitter::EMITTER_DECL(ngraph::op::MaxPoolBackprop)
{
writer.block_begin(" // " + node->get_name());
writer.block_begin();
{
auto mpb = static_cast<const ngraph::op::MaxPoolBackprop*>(node);
auto fp_input_shape = out[0].get_shape();
......@@ -1843,7 +1843,7 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
args[0].get_shape(),
batchnorm->get_eps_value());
writer.block_begin(" // " + node->get_name());
writer.block_begin();
{
writer << "gpu::invoke_primitive(ctx, " << bn_index << ", ";
writer << "std::vector<void*>{" << args.front().get_name();
......@@ -1879,7 +1879,7 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
args[0].get_shape(),
batchnorm->get_eps_value());
writer.block_begin(" // " + node->get_name());
writer.block_begin();
{
writer << "gpu::invoke_primitive(ctx, " << bn_index << ", ";
writer << "std::vector<void*>{" << args.front().get_name();
......@@ -1904,7 +1904,7 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
{
auto get_tuple_element = static_cast<const ngraph::op::GetOutputElement*>(node);
writer.block_begin(" // " + node->get_name());
writer.block_begin();
writer << "runtime::gpu::cuda_memcpyDtH(" << out[0].get_name() << ", "
<< args[get_tuple_element->get_n()].get_name() << ", "
<< out[0].get_size() * out[0].get_element_type().size() << ");\n";
......@@ -1959,7 +1959,7 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
{
// assumes NC{d1,d2,...} format
auto avg_pool = static_cast<const ngraph::op::AvgPool*>(node);
writer.block_begin(" // " + node->get_name());
writer.block_begin();
{
auto& input_shape = args[0].get_shape();
auto& result_shape = out[0].get_shape();
......@@ -2034,7 +2034,7 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
template <>
void GPU_Emitter::EMITTER_DECL(ngraph::op::AvgPoolBackprop)
{
writer.block_begin(" // " + node->get_name());
writer.block_begin();
{
auto apb = static_cast<const ngraph::op::AvgPoolBackprop*>(node);
auto output_shape = out[0].get_shape();
......@@ -2079,7 +2079,7 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
{
// assumes NC{d1,d2,...} format
auto rep_slice = static_cast<const ngraph::op::ReplaceSlice*>(node);
writer.block_begin(" // " + node->get_name());
writer.block_begin();
{
auto& input_shape = args[0].get_shape();
auto& source_shape = args[1].get_shape();
......@@ -2129,7 +2129,7 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
template <>
void GPU_Emitter::EMITTER_DECL(ngraph::op::Softmax)
{
writer.block_begin(" // " + node->get_name());
writer.block_begin();
{
auto softmax = static_cast<const ngraph::op::Softmax*>(node);
auto tensor_shape = args[0].get_shape();
......
......@@ -77,7 +77,7 @@ namespace ngraph
auto& cuda_emitter =
external_function->get_primitive_emitter()->get_cuda_emitter();
writer.block_begin(" // " + node->get_name());
writer.block_begin();
{
std::vector<std::string> dtypes;
for (auto& arg : args)
......
......@@ -312,32 +312,32 @@ void runtime::gpu::GPU_ExternalFunction::compile()
writer +=
R"(// Generated by the NGraph GPU backend
#include <cublas_v2.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cudnn.h>
#include "ngraph/descriptor/input.hpp"
#include "ngraph/descriptor/layout/dense_tensor_view_layout.hpp"
#include "ngraph/descriptor/output.hpp"
#include "ngraph/descriptor/primary_tensor_view.hpp"
#include "ngraph/file_util.hpp"
#include "ngraph/function.hpp"
#include "ngraph/graph_util.hpp"
#include "ngraph/node.hpp"
#include "ngraph/pass/assign_layout.hpp"
#include "ngraph/pass/dump_sorted.hpp"
#include "ngraph/pass/liveness.hpp"
#include "ngraph/pass/manager.hpp"
#include "ngraph/pass/memory_layout.hpp"
#include "ngraph/runtime/aligned_buffer.hpp"
#include "ngraph/runtime/gpu/cudnn_descriptors.hpp"
#include "ngraph/runtime/gpu/gpu_cuda_kernel_emitters.hpp"
#include "ngraph/runtime/gpu/gpu_cuda_kernel_ops.hpp"
#include "ngraph/runtime/gpu/gpu_invoke.hpp"
#include "ngraph/runtime/gpu/gpu_runtime_context.hpp"
#include "ngraph/runtime/gpu/gpu_util.hpp"
#include "ngraph/util.hpp"
#include <cublas_v2.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cudnn.h>
#include "ngraph/descriptor/input.hpp"
#include "ngraph/descriptor/layout/dense_tensor_view_layout.hpp"
#include "ngraph/descriptor/output.hpp"
#include "ngraph/descriptor/primary_tensor_view.hpp"
#include "ngraph/file_util.hpp"
#include "ngraph/function.hpp"
#include "ngraph/graph_util.hpp"
#include "ngraph/node.hpp"
#include "ngraph/pass/assign_layout.hpp"
#include "ngraph/pass/dump_sorted.hpp"
#include "ngraph/pass/liveness.hpp"
#include "ngraph/pass/manager.hpp"
#include "ngraph/pass/memory_layout.hpp"
#include "ngraph/runtime/aligned_buffer.hpp"
#include "ngraph/runtime/gpu/cudnn_descriptors.hpp"
#include "ngraph/runtime/gpu/gpu_cuda_kernel_emitters.hpp"
#include "ngraph/runtime/gpu/gpu_cuda_kernel_ops.hpp"
#include "ngraph/runtime/gpu/gpu_invoke.hpp"
#include "ngraph/runtime/gpu/gpu_runtime_context.hpp"
#include "ngraph/runtime/gpu/gpu_util.hpp"
#include "ngraph/util.hpp"
)";
string pch_header_source = writer.get_code();
......@@ -346,81 +346,12 @@ void runtime::gpu::GPU_ExternalFunction::compile()
using namespace ngraph;
using namespace ngraph::runtime;
using namespace std;
)";
if (m_emit_timing)
{
writer << "// Declare debug timers\n";
vector<string> names;
for (shared_ptr<Function> current_function : pass_manager.get_state().get_functions())
{
for (shared_ptr<Node> node : current_function->get_ordered_ops())
{
if (!node->is_parameter() && !node->is_constant())
{
names.push_back(node->get_name());
}
}
}
for (const string& s : names)
{
writer << "ngraph::stopwatch timer_" << s << ";\n";
}
writer << "extern \"C\" size_t get_debug_timer_count() { return " << names.size()
<< "; }\n";
writer << "extern \"C\" const char* get_debug_timer_name(size_t index)\n";
writer << "{\n";
writer.indent++;
writer << "const char* rc;\n";
writer << "switch(index)\n";
writer << "{\n";
for (size_t i = 0; i < names.size(); i++)
{
writer << "case " << i << ": rc = \"" << names[i] << "\"; break;\n";
}
writer << "default: rc = \"\";\n";
writer << "}\n";
writer << "return rc;\n";
writer.indent--;
writer << "}\n";
writer << "extern \"C\" const size_t get_debug_timer_microseconds(size_t index)\n";
writer << "{\n";
writer.indent++;
writer << "size_t rc;\n";
writer << "switch(index)\n";
writer << "{\n";
for (size_t i = 0; i < names.size(); i++)
{
writer << "case " << i << ": rc = timer_" << names[i]
<< ".get_total_microseconds(); break;\n";
}
writer << "default: rc = 0;\n";
writer << "}\n";
writer << "return rc;\n";
writer.indent--;
writer << "}\n";
writer << "extern \"C\" const size_t get_debug_timer_call_count(size_t index)\n";
writer << "{\n";
writer.indent++;
writer << "size_t rc;\n";
writer << "switch(index)\n";
writer << "{\n";
for (size_t i = 0; i < names.size(); i++)
{
writer << "case " << i << ": rc = timer_" << names[i] << ".get_call_count(); break;\n";
}
writer << "default: rc = 0;\n";
writer << "}\n";
writer << "return rc;\n";
writer.indent--;
writer << "}\n";
writer << "\n";
}
// // The "dso_handle" symbol is required by __cxa_atexit()
// // which is enabled because the JIT uses it as the default mechanism
// // to register cleanup handlers. We use it, and not atexit(), because
// // atexit() happens too late, when the JIT is no longer alive
)";
// The "dso_handle" symbol is required by __cxa_atexit()
// which is enabled because the JIT uses it as the default mechanism
// to register cleanup handlers. We use it, and not atexit(), because
// atexit() happens too late, when the JIT is no longer alive
writer << "void *__dso_handle = 0;\n\n";
writer << "// Declare all constants\n";
for (shared_ptr<Function> current_function : pass_manager.get_state().get_functions())
......@@ -432,6 +363,8 @@ using namespace std;
{
shared_ptr<descriptor::TensorView> tv = node->get_outputs()[0].get_tensor_view();
auto c_value_strings = c->get_value_strings();
writer << "static " << tv->get_tensor().get_element_type().c_type_string() << " *"
<< tv->get_tensor().get_name() << ";\n";
writer << "static " << tv->get_tensor().get_element_type().c_type_string() << " "
<< tv->get_tensor().get_name() << "_cpu[" << c_value_strings.size()
<< "] =\n";
......@@ -440,8 +373,6 @@ using namespace std;
writer << emit_string_array(c_value_strings, 100 - writer.indent * 4);
writer.indent--;
writer << "\n};\n\n";
writer << "static " << tv->get_tensor().get_element_type().c_type_string() << " *"
<< tv->get_tensor().get_name() << ";\n";
m_variable_name_map[tv->get_tensor().get_name()] = tv->get_tensor().get_name();
}
}
......@@ -449,7 +380,7 @@ using namespace std;
// Add cudnn descriptor factory for descriptor management.
// After the cuDNN code emitted in gpu_emitter.cc is refactored
// into the CUDNNEmitter class, this can be removed.
writer << "static runtime::gpu::CUDNNDescriptors descriptors;\n";
writer << "static runtime::gpu::CUDNNDescriptors descriptors;\n\n";
writer << "// Declare all functions\n";
for (shared_ptr<Function> f : pass_manager.get_state().get_functions())
......@@ -457,85 +388,52 @@ using namespace std;
writer << "extern \"C\" void " << f->get_name() << "(void** inputs, void** outputs, "
<< "gpu::GPURuntimeContext* ctx);\n";
}
writer << "\n";
unordered_map<Node*, string> match_functions;
// This for loop collects functions that are called more than once
// and emits them as globally callable functions.
// ops implement the is_functionally_identical method
unordered_map<string, string> match_function_map;
unordered_map<const Node*, string> node_function_map;
for (shared_ptr<Function> current_function : pass_manager.get_state().get_functions())
{
set<string> output_names;
for (shared_ptr<Node> op : current_function->get_results())
{
shared_ptr<descriptor::TensorView> tv = op->get_output_tensor_view();
output_names.insert(tv->get_tensor().get_name());
}
const list<shared_ptr<Node>>& tmp = current_function->get_ordered_ops();
list<shared_ptr<Node>> tmp = current_function->get_ordered_ops();
if (tmp.size() < 2)
{
// Since we are comparing ops there must be at least two ops to proceed.
continue;
}
vector<shared_ptr<Node>> op_list{tmp.begin(), tmp.end()};
for (size_t i = 0; i < op_list.size() - 1; i++)
for (size_t i = 0; i < op_list.size(); i++)
{
if (op_list[i]->is_constant() || op_list[i]->is_parameter())
{
continue;
}
if (contains_key(match_functions, op_list[i].get()))
Node& node = *op_list[i];
auto handler = dispatcher.find(type_index(typeid(node)));
if (handler == dispatcher.end())
{
continue;
throw ngraph_error("Unhandled op during code generation : " + node.description());
}
string match_function = emit_op_as_function(node, "__f__");
string match_function_name;
if (!match_function_name.empty())
if (contains_key(match_function_map, match_function))
{
writer << "static void " << match_function_name << "(";
writer.indent++;
// Work around a compiler warning (*node inside typeid may have effects
// with shared pointers, which is fine here but clang doesn't like it.)
auto& n = *op_list[i];
auto handler = dispatcher.find(type_index(typeid(n)));
vector<GPU_TensorViewWrapper> in;
size_t arg_index = 0;
set<string> arg_names;
for (const descriptor::Input& input : n.get_inputs())
{
const descriptor::Output& output = input.get_output();
shared_ptr<descriptor::TensorView> tv = output.get_tensor_view();
GPU_TensorViewWrapper tvw{tv, "_arg" + to_string(arg_index)};
if (!contains(arg_names, tvw.get_name()))
{
arg_names.insert(tvw.get_name());
if (arg_index++ > 0)
{
writer << ",";
}
writer << "\n";
writer << tvw.get_type() << "* " << tvw.get_name();
}
in.push_back(tvw);
}
vector<GPU_TensorViewWrapper> out;
for (const descriptor::Output& output : n.get_outputs())
{
shared_ptr<descriptor::TensorView> tv = output.get_tensor_view();
GPU_TensorViewWrapper tvw{tv, "_out" + to_string(arg_index)};
if (arg_index++ > 0)
{
writer << ",";
}
writer << "\n";
writer << tvw.get_type() << "* " << tvw.get_name();
out.push_back(tvw);
}
writer.indent--;
writer << "\n)\n";
writer << "{\n";
writer.indent++;
handler->second(this, writer, &n, in, out);
writer.indent--;
writer << "}\n";
match_function_name = match_function_map[match_function];
}
else
{
auto offset = match_function.find("__f__");
string emitted_function = match_function;
match_function_name = "func_" + node.get_name();
emitted_function.replace(offset, 5, match_function_name);
match_function_map.insert({match_function, match_function_name});
writer << emitted_function << "\n";
}
node_function_map.insert({&node, match_function_name});
}
}
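Editor's sketch (not part of this commit): the rewritten loop deduplicates op implementations by the text of the emitted function. Each op body is emitted under the placeholder name __f__, that source string is used as a map key, and any later op with an identical body reuses the first generated function name. The standalone toy program below illustrates the idea with plain strings instead of Node pointers:

```cpp
#include <iostream>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

// Illustrative reduction of the dedup scheme: identical emitted bodies
// share one generated function name, keyed by the placeholder source.
int main()
{
    std::vector<std::pair<std::string, std::string>> ops = {
        {"Add_0", "static void __f__(float* a, float* b, float* c) { /* add */ }"},
        {"Add_1", "static void __f__(float* a, float* b, float* c) { /* add */ }"},
        {"Mul_0", "static void __f__(float* a, float* b, float* c) { /* mul */ }"},
    };

    std::unordered_map<std::string, std::string> match_function_map; // body -> name
    std::unordered_map<std::string, std::string> node_function_map;  // node -> name

    for (const auto& op : ops)
    {
        const std::string& node_name = op.first;
        const std::string& body = op.second;
        std::string function_name;
        auto it = match_function_map.find(body);
        if (it != match_function_map.end())
        {
            function_name = it->second; // reuse the already-emitted function
        }
        else
        {
            function_name = "func_" + node_name;
            std::string emitted = body;
            emitted.replace(emitted.find("__f__"), 5, function_name);
            match_function_map.emplace(body, function_name);
            std::cout << emitted << "\n"; // emit the shared definition once
        }
        node_function_map.emplace(node_name, function_name);
    }

    for (const auto& entry : node_function_map)
    {
        std::cout << entry.first << " -> " << entry.second << "\n";
    }
}
```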
......@@ -704,12 +602,15 @@ using namespace std;
throw ngraph_error("Unhandled op during code generation : " + node->description());
}
vector<GPU_TensorViewWrapper> in;
vector<string> node_input_names;
vector<string> node_output_names;
for (const descriptor::Input& input : node->get_inputs())
{
const descriptor::Output& output = input.get_output();
shared_ptr<descriptor::TensorView> tv = output.get_tensor_view();
in.push_back(
GPU_TensorViewWrapper(tv, m_variable_name_map[tv->get_tensor().get_name()]));
node_input_names.emplace_back(tv->get_tensor().get_name());
}
vector<GPU_TensorViewWrapper> out;
for (const descriptor::Output& output : node->get_outputs())
......@@ -717,6 +618,18 @@ using namespace std;
shared_ptr<descriptor::TensorView> tv = output.get_tensor_view();
out.push_back(
GPU_TensorViewWrapper(tv, m_variable_name_map[tv->get_tensor().get_name()]));
node_output_names.emplace_back(tv->get_tensor().get_name());
}
// Emit function description comment
if (!node->is_parameter() && !node->is_constant())
{
writer << "\n// " << node->get_name() << "(";
vector<string> parameter_nodes = node_input_names;
parameter_nodes.insert(
parameter_nodes.end(), node_output_names.begin(), node_output_names.end());
writer << join(parameter_nodes);
writer << ")\n";
}
// Emit operation prologue
......@@ -730,13 +643,10 @@ using namespace std;
// Emit operation body
string func_name;
auto it = match_functions.find(node.get());
if (it != match_functions.end())
{
func_name = it->second;
}
func_name = node_function_map[node.get()];
if (func_name.empty())
{
//throw runtime_error("No matching function found for '" + node->get_name() + "'");
handler->second(this, writer, node.get(), in, out);
}
else
......@@ -750,6 +660,7 @@ using namespace std;
{
names.push_back(tv.get_name());
}
names.push_back("ctx");
writer << func_name << "(" << join(names) << ");\n";
}
......@@ -875,3 +786,117 @@ std::unique_ptr<runtime::gpu::GPURuntimeContext>& runtime::gpu::GPU_ExternalFunc
{
return m_ctx;
}
bool runtime::gpu::GPU_ExternalFunction::is_functionally_identical(
const Node& n1, const Node& n2, const unordered_map<const Node*, string>& node_cache) const
{
return node_cache.at(&n1) == node_cache.at(&n2);
}
string runtime::gpu::GPU_ExternalFunction::emit_op_as_function(const Node& node,
const string& function_name)
{
codegen::CodeWriter writer;
writer << "static void " << function_name << "(";
writer.indent++;
// Work around a compiler warning (*node inside typeid may have effects
// with shared pointers, which is fine here but clang doesn't like it.)
auto handler = dispatcher.find(type_index(typeid(node)));
vector<GPU_TensorViewWrapper> in;
size_t arg_index = 0;
set<string> arg_names;
for (const descriptor::Input& input : node.get_inputs())
{
const descriptor::Output& output = input.get_output();
shared_ptr<descriptor::TensorView> tv = output.get_tensor_view();
GPU_TensorViewWrapper tvw{tv, "_arg" + to_string(arg_index)};
if (!contains(arg_names, tvw.get_name()))
{
arg_names.insert(tvw.get_name());
if (arg_index++ > 0)
{
writer << ",";
}
writer << "\n";
writer << tvw.get_type() << "* " << tvw.get_name();
}
in.push_back(tvw);
}
vector<GPU_TensorViewWrapper> out;
for (const descriptor::Output& output : node.get_outputs())
{
shared_ptr<descriptor::TensorView> tv = output.get_tensor_view();
GPU_TensorViewWrapper tvw{tv, "_out" + to_string(arg_index)};
if (arg_index++ > 0)
{
writer << ",";
}
writer << "\n";
writer << tvw.get_type() << "* " << tvw.get_name();
out.push_back(tvw);
}
writer << ",\ngpu::GPURuntimeContext* ctx";
writer.indent--;
writer << "\n)\n";
codegen::CodeWriter tmp_writer;
handler->second(this, tmp_writer, &node, in, out);
string body = tmp_writer.get_code();
if (body.size() > 0 && body[0] == '{')
{
// Body already surrounded by curly braces so don't add more
writer << body;
}
else
{
writer.block_begin();
writer << body;
writer.block_end();
}
string rc = writer.get_code();
if (function_name == "f")
{
rc = strip_comments(rc);
}
return rc;
}
string runtime::gpu::GPU_ExternalFunction::strip_comments(const string& s) const
{
stringstream out;
for (size_t i = 0; i < s.size(); i++)
{
if (i < s.size() - 2)
{
if (s[i] == '/' && s[i + 1] == '/')
{
// line comment
i += 2;
while (s[i] != '\n')
{
i++;
}
out << '\n';
}
else if (s[i] == '/' && s[i + 1] == '*')
{
// multi-line comment
i += 2;
while (!(s[i] == '*' && s[i + 1] == '/'))
{
i++;
}
i++;
}
else
{
out << s[i];
}
}
else
{
out << s[i];
}
}
return out.str();
}
......@@ -83,6 +83,13 @@ namespace ngraph
const Node&,
const std::unordered_map<descriptor::TensorView*, std::vector<size_t>>&);
void release_function() { m_function = nullptr; }
std::string emit_op_as_function(const Node& node, const std::string& function_name);
std::string strip_comments(const std::string& s) const;
bool is_functionally_identical(
const Node& n1,
const Node& n2,
const std::unordered_map<const Node*, std::string>& node_cache) const;
std::unique_ptr<codegen::Compiler> m_compiler;
std::unique_ptr<codegen::ExecutionEngine> m_execution_engine;
bool m_emit_timing;
......