Commit 40069d27 authored by Fenglei, committed by Robert Kimball

gpu deconvolution (#1099)

* add pad_dilation function

* add dilation to gpu_emitter

* add CoordinateDiff constructor to GPUShape

* remove unnecessary cast

* working version for forward

* forward working

* forward test all pass

* deconvolution forward

* backward data dilation

* forward test passed

* initial to 0

* fix bug for get_padded_shape and clang format

* code style, change variable names

* refactor convolution conditions

* fix bug padding_below_diff

* change pad_dilation to pad_dynamic, compare to pad

* remove passed convolution test from skip list, clang format

* change pad to use GPUShape
parent 83e6aa5f
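The heart of the change is a pad primitive that also understands interior (dilation) strides, which is what lets the backend express data-dilated convolutions and the deconvolution forward and backward-data paths. A minimal sketch of the shape arithmetic involved follows; the commit's actual get_padded_shape helper is not visible in the hunks below, so the name and signature here are illustrative, with padding_interior acting as a per-axis stride where 1 means no dilation.

#include <cstddef>
#include <vector>

// Hedged sketch: output extent of pad_dynamic per axis. Assumes all four
// vectors have the same rank and every input extent is at least 1.
std::vector<size_t> padded_shape_sketch(const std::vector<size_t>& input_shape,
                                        const std::vector<size_t>& padding_below,
                                        const std::vector<size_t>& padding_above,
                                        const std::vector<size_t>& padding_interior)
{
    std::vector<size_t> padded(input_shape.size());
    for (size_t i = 0; i < input_shape.size(); i++)
    {
        // (n - 1) gaps, each stretched to padding_interior[i] positions, plus the
        // last element itself, plus edge padding on both sides.
        padded[i] = (input_shape[i] - 1) * padding_interior[i] + 1 + padding_below[i] +
                    padding_above[i];
    }
    return padded;
}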
@@ -231,6 +231,106 @@ size_t runtime::gpu::CUDAEmitter::build_pad(const runtime::gpu::GPURuntimeContex
    return primitive_index;
}
size_t runtime::gpu::CUDAEmitter::build_pad_dynamic(const runtime::gpu::GPURuntimeContext* ctx,
const std::array<std::string, 2>& dtypes,
GPUShape input_shape,
GPUShape output_shape,
GPUShape padding_below,
GPUShape padding_interior)
{
std::stringstream kernel_name;
kernel_name << "pad_dynamic_" << join(dtypes, "_");
std::string hash = kernel_name.str() + "pad_i" + join(input_shape, "_") + "pad_o" +
join(output_shape) + "_pb" + join(padding_below, "_") + "_pi" +
join(padding_interior, "_");
// For backwards compatibility we currently use two unordered maps
// 1. one looks up the compiled cuda kernel (CudaFunctionPool)
// 2. the other looks to see if this kernel is already in the primitive list
// check if the requested kernel is already an inserted primitive
size_t primitive_index = m_primitive_emitter->lookup(hash);
if (primitive_index != std::numeric_limits<size_t>::max())
{
return primitive_index;
}
// check if the kernel has already been compiled. if so, create
// a launch primitive for it based on the input tensor shape
// but do not recompile the kernel. otherwise, do it all:
// recompile the kernel and then create the primitive
auto compiled_kernel = ctx->compiled_kernel_pool->get(kernel_name.str());
if (compiled_kernel == nullptr)
{
codegen::CodeWriter writer;
CudaKernelBuilder::add_pod_typedefs(writer);
CudaKernelBuilder::get_pad_dynamic_op(writer, kernel_name.str(), dtypes);
compiled_kernel = ctx->compiled_kernel_pool->set(kernel_name.str(), writer.get_code());
}
unsigned int rank = static_cast<unsigned int>(input_shape.size());
unsigned int nthreads = static_cast<unsigned int>(shape_size(input_shape));
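    // Default every axis to zero edge padding and a unit interior stride (no
    // dilation), then right-align the caller-supplied padding vectors against
    // the input rank in the loop below.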
GPUShape pad_below(input_shape.size(), 0);
GPUShape pad_interior(input_shape.size(), 1);
int64_t i = padding_below.size() - 1;
int64_t j = input_shape.size() - 1;
for (; i >= 0; i--, j--)
{
pad_below[j] = padding_below[i];
pad_interior[j] = padding_interior[i];
}
GPUShape input_strides = row_major_strides(input_shape);
GPUShape output_strides = row_major_strides(output_shape);
// get an allocator for transient per kernel gpu memory
GPUAllocator allocator = this->m_primitive_emitter->get_memory_allocator();
size_t idx_input_strides = allocator.reserve_argspace(
input_strides.data(), input_strides.size() * sizeof(unsigned int));
size_t idx_output_strides = allocator.reserve_argspace(
output_strides.data(), output_strides.size() * sizeof(unsigned int));
size_t idx_padding_below =
allocator.reserve_argspace(pad_below.data(), pad_below.size() * sizeof(unsigned int));
size_t idx_padding_interior =
allocator.reserve_argspace(pad_interior.data(), pad_interior.size() * sizeof(unsigned int));
// create the launch primitive
std::unique_ptr<gpu::primitive> pad_dynamic(new gpu::primitive{[=](void** inputs,
void** outputs) mutable {
void* param_input_strides = runtime::gpu::invoke_memory_primitive(ctx, idx_input_strides);
void* param_output_strides = runtime::gpu::invoke_memory_primitive(ctx, idx_output_strides);
void* param_padding_below = runtime::gpu::invoke_memory_primitive(ctx, idx_padding_below);
void* param_padding_interior =
runtime::gpu::invoke_memory_primitive(ctx, idx_padding_interior);
std::vector<void*> args_list{&inputs[0],
&outputs[0],
&param_input_strides,
&param_output_strides,
&param_padding_below,
&param_padding_interior,
&rank,
&nthreads};
CUDA_SAFE_CALL(cuLaunchKernel(*compiled_kernel.get(),
static_cast<uint32_t>(nthreads),
1,
1, // grid dim
1,
1,
1, // block dim
0,
NULL, // shared mem and stream
args_list.data(),
0)); // arguments
            CUDA_SAFE_CALL(cuCtxSynchronize()); // wait for the kernel to finish
}});
primitive_index = this->m_primitive_emitter->insert(std::move(pad_dynamic));
m_primitive_emitter->cache(hash, primitive_index);
return primitive_index;
}
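A hedged sketch of how an emitter could drive this primitive to realize data dilation follows; everything outside the build_pad_dynamic call (the wrapper function, its parameter names, and the invoke_primitive note) is an assumption for illustration, not the committed emitter changes.

#include <array>
#include <string>

// Illustrative only: build the pad_dynamic primitive that dilates/pads the data
// tensor so a plain forward convolution can then run on the padded buffer.
size_t emit_data_dilation_sketch(runtime::gpu::CUDAEmitter& cuda_emitter,
                                 const runtime::gpu::GPURuntimeContext* ctx,
                                 GPUShape input_shape,
                                 GPUShape padded_shape,   // computed as in the sketch above
                                 GPUShape padding_below,  // non-negative edge padding
                                 GPUShape data_dilation)  // interior stride per axis
{
    std::array<std::string, 2> dtypes{{"float", "float"}};
    // The returned index is later invoked with (inputs, outputs) like any other
    // cached primitive (e.g. through the backend's invoke_primitive helper).
    return cuda_emitter.build_pad_dynamic(
        ctx, dtypes, input_shape, padded_shape, padding_below, data_dilation);
}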
size_t runtime::gpu::CUDAEmitter::build_1d_max_pool(const GPURuntimeContext* ctx,
                                                    const std::array<std::string, 2>& dtypes,
                                                    GPUShape input_shape,
...
@@ -47,6 +47,13 @@ namespace ngraph
                                 GPUShape pad_interior,
                                 const std::string& pad_value = "");
size_t build_pad_dynamic(const runtime::gpu::GPURuntimeContext* ctx,
const std::array<std::string, 2>& dtypes,
GPUShape input_shape,
GPUShape output_shape,
GPUShape padding_below,
GPUShape padding_interior);
                size_t build_1d_max_pool(const GPURuntimeContext* ctx,
                                         const std::array<std::string, 2>& dtypes,
                                         GPUShape input_shape,
...
@@ -279,6 +279,40 @@ void runtime::gpu::CudaKernelBuilder::get_concat_op(codegen::CodeWriter& writer,
    writer.block_end();
}
void runtime::gpu::CudaKernelBuilder::get_pad_dynamic_op(
codegen::CodeWriter& writer,
const std::string& name,
const std::array<std::string, 2>& data_types)
{
writer << "extern \"C\" __global__ void cuda_" << name << "(" << data_types[0] << "* in, "
<< data_types[1] << "* out, unsigned int* input_strides, unsigned int* output_strides, "
"unsigned int* padding_below, unsigned int* "
"padding_interior, unsigned int rank, unsigned int n)\n";
writer.block_begin();
{
writer << "unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;\n";
writer << "if (tid < n)\n";
writer.block_begin();
{
writer << "unsigned int output_idx = 0;\n";
writer << "unsigned int input_idx = tid;\n";
writer << "for(unsigned int i = 0; i < rank; i++)\n";
writer.block_begin();
{
writer << "output_idx += (input_idx / input_strides[i] * padding_interior[i] + "
"padding_below[i]) "
"* output_strides[i];\n";
writer << "input_idx %= input_strides[i];\n";
}
writer.block_end();
writer << "out[output_idx] = in[tid];\n";
}
writer.block_end();
}
writer.block_end();
}
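For reference, with dtypes {"float", "float"} the builder above produces CUDA source along these lines (reconstructed from the writer calls; only the comments are added):

extern "C" __global__ void cuda_pad_dynamic_float_float(float* in, float* out,
    unsigned int* input_strides, unsigned int* output_strides,
    unsigned int* padding_below, unsigned int* padding_interior,
    unsigned int rank, unsigned int n)
{
    unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < n)
    {
        // Walk the row-major strides: recover each input coordinate, scale it by
        // the interior stride, shift it by the edge padding, and accumulate the
        // resulting output offset.
        unsigned int output_idx = 0;
        unsigned int input_idx = tid;
        for (unsigned int i = 0; i < rank; i++)
        {
            output_idx += (input_idx / input_strides[i] * padding_interior[i]
                           + padding_below[i]) * output_strides[i];
            input_idx %= input_strides[i];
        }
        out[output_idx] = in[tid];
    }
}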
void runtime::gpu::CudaKernelBuilder::get_slice_op(codegen::CodeWriter& writer,
                                                   const std::string& name,
                                                   const std::array<std::string, 2>& data_types)
...
@@ -81,6 +81,10 @@ namespace ngraph
                                           const std::string& math_kernel,
                                           const std::vector<std::string>& data_types);
static void get_pad_dynamic_op(codegen::CodeWriter& writer,
const std::string& name,
const std::array<std::string, 2>& data_types);
            static void get_ew_collective_op(codegen::CodeWriter& writer,
                                             const std::string& name,
                                             const std::string& op,
...
(One file's diff is collapsed and not shown here.)
@@ -22,6 +22,7 @@
#include "ngraph/axis_set.hpp"
#include "ngraph/coordinate.hpp"
#include "ngraph/coordinate_diff.hpp"
#include "ngraph/shape.hpp" #include "ngraph/shape.hpp"
#include "ngraph/strides.hpp" #include "ngraph/strides.hpp"
...@@ -125,5 +126,19 @@ namespace ngraph ...@@ -125,5 +126,19 @@ namespace ngraph
this->push_back(static_cast<uint32_t>(size)); this->push_back(static_cast<uint32_t>(size));
} }
} }
GPUShape(const CoordinateDiff& coord)
{
for (auto const& size : coord)
{
if (size >> 32 != 0)
{
throw std::runtime_error(
"Request for Coordinate which exceed the bitwidth available for GPUShapes "
"(32)");
}
this->push_back(static_cast<uint32_t>(size));
}
}
    };
}
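A hedged usage sketch of the new converting constructor (variable names are illustrative): convolution ops carry signed CoordinateDiff padding, and once any negative, cropping-style components have been handled separately, the remaining values convert directly.

// Illustrative: convert signed nGraph padding into the unsigned GPUShape the
// kernels consume. Negative values must be dealt with before this point; their
// sign bits extend past bit 31 and trigger the throw above.
CoordinateDiff padding_below_diff{1, 1};
GPUShape padding_below_shape = padding_below_diff; // via the new converting constructor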
@@ -94,15 +94,6 @@ namespace ngraph
        namespace gpu
        {
            void print_gpu_f32_tensor(const void* p, size_t element_count, size_t element_size);
template <typename T>
void print_gpu_tensor(const void* p, size_t element_count)
{
std::vector<T> local(element_count);
size_t size_in_bytes = sizeof(T) * element_count;
cudaMemcpy(local.data(), p, size_in_bytes, cudaMemcpyDeviceToHost);
std::cout << "{" << ngraph::join(local) << "}" << std::endl;
}
            void check_cuda_errors(CUresult err);
            void* create_gpu_buffer(size_t buffer_size, const void* data = NULL);
            void free_gpu_buffer(void* buffer);
@@ -112,6 +103,15 @@ namespace ngraph
            void cuda_memset(void* dst, int value, size_t buffer_size);
            std::pair<uint64_t, uint64_t> idiv_magic_u32(uint64_t max_numerator, uint64_t divisor);
            std::pair<uint64_t, uint64_t> idiv_magic_u64(uint64_t divisor);
template <typename T>
void print_gpu_tensor(const void* p, size_t element_count)
{
std::vector<T> local(element_count);
size_t size_in_bytes = sizeof(T) * element_count;
cuda_memcpyDtH(local.data(), p, size_in_bytes);
std::cout << "{" << ngraph::join(local) << "}" << std::endl;
}
        }
    }
}
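The print_gpu_tensor helper simply moved below the declarations it depends on and now goes through cuda_memcpyDtH rather than calling cudaMemcpy directly. A minimal usage sketch for debugging (the printed values are whatever currently sits in the device buffer):

// Illustrative: dump the first few elements of a device-resident float buffer.
void* dev = runtime::gpu::create_gpu_buffer(4 * sizeof(float));
runtime::gpu::print_gpu_tensor<float>(dev, 4); // prints "{a, b, c, d}"
runtime::gpu::free_gpu_buffer(dev);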
@@ -7,20 +7,6 @@ batch_norm_three_outputs
computation_reuse
concat_matrix_int64
constant_equality_bool
convolution_2d_1item_1o1i_data_dilated
convolution_2d_1item_2o1i_data_dilated
convolution_2d_1item_2o2i_data_dilated
convolution_2d_1item_5o3i_data_dilated
convolution_2d_2item_5o3i_data_dilated
convolution_2d_2items_dilated_padded
convolution_2d_2items_strided_padded
convolution_2d_8item_large_5o3i_data_dilated
convolution_2d_8item_large_5o3i_uneven_filter_data_dilated
convolution_2d_8item_large_5o3i_uneven_filter_uneven_data_dilation_data_dilated
convolution_3d_1item_large_5o3i_padded_uneven_filter_uneven_data_dilation_data_dilated
convolution_3d_2item_large_5o3i_padded_strided_uneven_filter_uneven_data_dilation_data_dilated
convolution_3d_2item_large_5o3i_padded_strided_uneven_filter_uneven_data_dilation_filter_dilated_data_dilated
convolution_3d_2item_large_5o3i_uneven_filter_uneven_data_dilation_data_dilated
convolution_4d_2items
convolution_4d_4items
convolution_4d_4items_dilated
...
@@ -45,7 +45,7 @@ namespace ngraph
                {
                    if (std::abs(a[i] - b[i]) > atol + rtol * std::abs(b[i]))
                    {
                        NGRAPH_INFO << a[i] << " is not close to " << b[i];
                        NGRAPH_INFO << a[i] << " is not close to " << b[i] << " at index " << i;
                        rc = false;
                    }
                }
...