Commit 63a233b6 authored by Chris Sullivan, committed by Scott Cyphers

Utilize GPUKernelArgs parameter for ew-collective, nd-conv, replace_slice. (#1346)

* Support GPUKernelArgs in Elementwise-collective and Nd-Convolution.

* Update op::ReplaceSlice to use GPUKernelArgs and unroll coordinate transform loop.

* Formatting.

* Moved function signature for global kernels back to emitter body.

* Formatting.
parent 14019ab9
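
Context for the hunks below: GPUKernelArgs replaces the hand-rolled void* args_list[] arrays and per-argument reserve_argspace calls with a single builder that is filled when the primitive is constructed, emits the kernel's parameter list at codegen time, and is resolved to live device pointers inside the launch lambda. A minimal sketch of that flow, assuming only the calls that appear in this diff (add_kernel_args, add_placeholder, add, get_input_signature, resolve_placeholder, get_argument_list); it is illustrative and not compilable outside the nGraph GPU runtime:

// Hedged sketch of the GPUKernelArgs pattern this commit adopts; all names are
// taken from the hunks below, and the surrounding emitter state (dtypes, strides,
// nthreads, nblocks, nthreads_per_block, compiled_kernel) is assumed to exist as there.
auto args = m_primitive_emitter->add_kernel_args();
args.add_placeholder(dtypes[0], "in0")   // tensor pointers bound at launch time
    .add_placeholder(dtypes[1], "out0")
    .add("strides", strides)             // POD and array values captured up front
    .add("nthreads", nthreads);

// codegen: the same object emits the kernel's parameter list
writer << "extern \"C\" __global__ void cuda_" << kernel_name << args.get_input_signature();

// launch: bind the real pointers and pass the packed list to the driver API
std::unique_ptr<gpu::primitive> prim(
    new gpu::primitive{[=](void** inputs, void** outputs) mutable {
        void** args_list = args.resolve_placeholder(0, &inputs[0])
                               .resolve_placeholder(1, &outputs[0])
                               .get_argument_list();
        CUDA_SAFE_CALL(cuLaunchKernel(*compiled_kernel.get(),
                                      nblocks, 1, 1,
                                      nthreads_per_block, 1, 1,
                                      0, NULL, args_list, 0));
    }});
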
......@@ -1436,31 +1436,6 @@ size_t
return primitive_index;
}
// check if the kernel has already been compiled. if so, create
// a launch primitive for it based on the input tensor shape
// but do not recompile the kernel. otherwise, do it all:
// recompile the kernel and then create the primitive
auto compiled_kernel = m_ctx->compiled_kernel_pool->get(kernel_name.str());
if (compiled_kernel == nullptr)
{
codegen::CodeWriter writer;
CudaKernelBuilder::add_pod_typedefs(writer);
writer << include_helpers();
if (kernel)
{
CudaKernelBuilder::get_device_helper(writer, op, kernel, dtypes);
}
CudaKernelBuilder::get_ew_collective_op(writer,
kernel_name.str(),
op,
reduce_op,
dtypes,
reduced_tensors,
save_elementwise,
tensor_shape.size());
compiled_kernel = m_ctx->compiled_kernel_pool->set(kernel_name.str(), writer.get_code());
}
// calculate strides
NVShape strides = row_major_strides(tensor_shape);
// precalculate invariants for integer division via multiplication
......@@ -1486,56 +1461,76 @@ size_t
reduced_strides[axis] = 0;
}
GPUAllocator allocator = this->m_primitive_emitter->get_memory_allocator();
size_t idx_strides = allocator.reserve_argspace(strides.data(), strides.size() * sizeof(int));
size_t idx_stride_magic =
allocator.reserve_argspace(stride_magic.data(), stride_magic.size() * sizeof(int));
size_t idx_stride_shift =
allocator.reserve_argspace(stride_shift.data(), stride_shift.size() * sizeof(int));
size_t idx_reduced_strides =
allocator.reserve_argspace(reduced_strides.data(), reduced_strides.size() * sizeof(int));
size_t nthreads = shape_size(tensor_shape);
constexpr const int nthreads_per_block = 32;
int nblocks = 1 + ((static_cast<int>(nthreads) - 1) / nthreads_per_block);
// TODO: check if mutable is necessary
std::unique_ptr<gpu::primitive> ew_collective(new gpu::primitive{[=](void** inputs,
void** outputs) mutable {
void* strides_d = runtime::gpu::invoke_memory_primitive(m_ctx, idx_strides);
void* stride_magic_d = runtime::gpu::invoke_memory_primitive(m_ctx, idx_stride_magic);
void* stride_shift_d = runtime::gpu::invoke_memory_primitive(m_ctx, idx_stride_shift);
void* reduced_strides_d = runtime::gpu::invoke_memory_primitive(m_ctx, idx_reduced_strides);
auto args = this->m_primitive_emitter->add_kernel_args();
for (auto i = 0u; i < dtypes.size() - 1; i++)
{
args.add_placeholder(dtypes[i], "in" + std::to_string(i));
}
args.add_placeholder(dtypes.back(), "out0");
if (save_elementwise)
{
args.add_placeholder(dtypes.back(), "out1");
}
std::vector<void*> args_list;
for (auto i = 0u; i < dtypes.size() - 1; i++)
{
args_list.push_back(&inputs[i]);
}
args_list.push_back(&outputs[0]);
if (save_elementwise)
args.add("strides", strides)
.add("stride_magic", stride_magic)
.add("stride_shift", stride_shift)
.add("reduced_strides", reduced_strides)
.add("nthreads", nthreads);
auto compiled_kernel = m_ctx->compiled_kernel_pool->get(kernel_name.str());
if (compiled_kernel == nullptr)
{
codegen::CodeWriter writer;
CudaKernelBuilder::add_pod_typedefs(writer);
writer << include_helpers();
if (kernel)
{
args_list.push_back(&outputs[1]);
CudaKernelBuilder::get_device_helper(writer, op, kernel, dtypes);
}
args_list.push_back(&strides_d);
args_list.push_back(&stride_magic_d);
args_list.push_back(&stride_shift_d);
args_list.push_back(&reduced_strides_d);
args_list.push_back(&nthreads);
CudaKernelBuilder::get_ew_collective_op(writer,
kernel_name.str(),
args,
op,
reduce_op,
dtypes,
reduced_tensors,
save_elementwise,
tensor_shape.size());
compiled_kernel = m_ctx->compiled_kernel_pool->set(kernel_name.str(), writer.get_code());
}
CUDA_SAFE_CALL(cuLaunchKernel(*compiled_kernel.get(),
nblocks,
1,
1,
nthreads_per_block,
1,
1,
0,
NULL,
args_list.data(),
0));
debug_sync();
}});
// TODO: check if mutable is necessary
std::unique_ptr<gpu::primitive> ew_collective(
new gpu::primitive{[=](void** inputs, void** outputs) mutable {
for (auto i = 0u; i < dtypes.size() - 1; i++)
{
args.resolve_placeholder(i, &inputs[i]);
}
args.resolve_placeholder(dtypes.size() - 1, &outputs[0]);
if (save_elementwise)
{
args.resolve_placeholder(dtypes.size(), &outputs[1]);
}
void** args_list = args.get_argument_list();
CUDA_SAFE_CALL(cuLaunchKernel(*compiled_kernel.get(),
nblocks,
1,
1,
nthreads_per_block,
1,
1,
0,
NULL,
args_list,
0));
debug_sync();
}});
primitive_index = this->m_primitive_emitter->insert(std::move(ew_collective));
m_primitive_emitter->cache(hash, primitive_index);
......@@ -1665,7 +1660,8 @@ size_t runtime::gpu::CUDAEmitter::build_replace_slice(const std::array<std::stri
NVShape slice_strides)
{
// assumes NC{d1,...,dn} format
std::string kernel_name = "repslices_" + join(dtypes, "_");
std::string kernel_name =
"repslices_" + join(dtypes, "_") + "_r" + std::to_string(tensor_shape.size());
std::replace(kernel_name.begin(), kernel_name.end(), ' ', '_');
std::stringstream ss;
......@@ -1681,19 +1677,6 @@ size_t runtime::gpu::CUDAEmitter::build_replace_slice(const std::array<std::stri
return primitive_index;
}
constexpr const int nthreads_per_block = 32;
// if the kernel has not been compiled, build it
auto compiled_kernel = m_ctx->compiled_kernel_pool->get(kernel_name);
if (compiled_kernel == nullptr)
{
codegen::CodeWriter writer;
writer << include_helpers();
runtime::gpu::CudaKernelBuilder::get_replace_slice_op(
writer, kernel_name, dtypes, nthreads_per_block);
compiled_kernel = m_ctx->compiled_kernel_pool->set(kernel_name, writer.get_code());
}
// calculate strides
NVShape input_strides = row_major_strides(tensor_shape);
NVShape source_strides = row_major_strides(source_shape);
......@@ -1714,81 +1697,62 @@ size_t runtime::gpu::CUDAEmitter::build_replace_slice(const std::array<std::stri
sshifts.push_back(shift);
}
// get an allocator for transient per kernel gpu memory
GPUAllocator allocator = this->m_primitive_emitter->get_memory_allocator();
// TODO: factor into a range-based for loop over the arguments
// (lazy) allocation for kernel arguments
size_t idx_input_strides =
allocator.reserve_argspace(input_strides.data(), (input_strides.size() - 1) * sizeof(int));
size_t idx_dmagics = allocator.reserve_argspace(dmagics.data(), dmagics.size() * sizeof(int));
size_t idx_dshifts = allocator.reserve_argspace(dshifts.data(), dshifts.size() * sizeof(int));
size_t idx_lower_bounds =
allocator.reserve_argspace(lower_bounds.data(), lower_bounds.size() * sizeof(int));
size_t idx_upper_bounds =
allocator.reserve_argspace(upper_bounds.data(), upper_bounds.size() * sizeof(int));
size_t idx_slice_strides =
allocator.reserve_argspace(slice_strides.data(), slice_strides.size() * sizeof(int));
size_t idx_smagics = allocator.reserve_argspace(smagics.data(), smagics.size() * sizeof(int));
size_t idx_sshifts = allocator.reserve_argspace(sshifts.data(), sshifts.size() * sizeof(int));
size_t idx_source_shape =
allocator.reserve_argspace(source_shape.data(), source_shape.size() * sizeof(int));
size_t idx_source_strides =
allocator.reserve_argspace(source_strides.data(), source_strides.size() * sizeof(int));
int rank = static_cast<int>(tensor_shape.size());
size_t rank = tensor_shape.size();
size_t nthreads = shape_size(tensor_shape);
constexpr const int nthreads_per_block = 32;
int nblocks = 1 + ((static_cast<int>(nthreads) - 1) / nthreads_per_block); // ceil_div(nthreads)
// TODO: blending factors are not currently implemented
float alpha = 1.0f;
float beta = 0.0f;
std::unique_ptr<gpu::primitive> replace_slice(new gpu::primitive{[=](void** inputs,
void** outputs) mutable {
void* param_dstr = runtime::gpu::invoke_memory_primitive(m_ctx, idx_input_strides);
void* param_dmagic = runtime::gpu::invoke_memory_primitive(m_ctx, idx_dmagics);
void* param_dshift = runtime::gpu::invoke_memory_primitive(m_ctx, idx_dshifts);
void* param_lbound = runtime::gpu::invoke_memory_primitive(m_ctx, idx_lower_bounds);
void* param_ubound = runtime::gpu::invoke_memory_primitive(m_ctx, idx_upper_bounds);
void* param_slice_str = runtime::gpu::invoke_memory_primitive(m_ctx, idx_slice_strides);
void* param_slice_magic = runtime::gpu::invoke_memory_primitive(m_ctx, idx_smagics);
void* param_slice_shift = runtime::gpu::invoke_memory_primitive(m_ctx, idx_sshifts);
void* param_dsource = runtime::gpu::invoke_memory_primitive(m_ctx, idx_source_shape);
void* param_sourcestr = runtime::gpu::invoke_memory_primitive(m_ctx, idx_source_strides);
void* args_list[] = {&inputs[0],
&inputs[1],
&outputs[0],
&alpha,
&beta,
&param_dstr,
&param_dmagic,
&param_dshift,
&param_lbound,
&param_ubound,
&param_slice_str,
&param_slice_magic,
&param_slice_shift,
&param_dsource,
&param_sourcestr,
&rank,
&nthreads};
auto args = m_primitive_emitter->add_kernel_args();
args.add_placeholder(dtypes[0], "in")
.add_placeholder(dtypes[1], "source")
.add_placeholder(dtypes[2], "out")
.add("alpha", alpha)
.add("beta", beta)
.add("dim_strides", input_strides)
.add("dim_magic", dmagics)
.add("dim_shift", dshifts)
.add("lower_bounds", lower_bounds)
.add("upper_bounds", upper_bounds)
.add("slice_str", slice_strides)
.add("slice_magic", smagics)
.add("slice_shift", sshifts)
.add("src_strides", source_strides)
.add("nthreads", nthreads);
CUDA_SAFE_CALL(cuLaunchKernel(*compiled_kernel.get(),
nblocks,
1,
1,
nthreads_per_block,
1,
1,
rank * nthreads_per_block * sizeof(int),
NULL,
args_list,
0));
debug_sync();
}});
// if the kernel has not been compiled, build it
auto compiled_kernel = m_ctx->compiled_kernel_pool->get(kernel_name);
if (compiled_kernel == nullptr)
{
codegen::CodeWriter writer;
writer << include_helpers();
runtime::gpu::CudaKernelBuilder::get_replace_slice_op(writer, kernel_name, args, rank);
compiled_kernel = m_ctx->compiled_kernel_pool->set(kernel_name, writer.get_code());
}
std::unique_ptr<gpu::primitive> replace_slice(
new gpu::primitive{[=](void** inputs, void** outputs) mutable {
void** args_list = args.resolve_placeholder(0, &inputs[0])
.resolve_placeholder(1, &inputs[1])
.resolve_placeholder(2, &outputs[0])
.get_argument_list();
CUDA_SAFE_CALL(cuLaunchKernel(*compiled_kernel.get(),
nblocks,
1,
1,
nthreads_per_block,
1,
1,
0,
NULL,
args_list,
0));
debug_sync();
}});
primitive_index = this->m_primitive_emitter->insert(std::move(replace_slice));
m_primitive_emitter->cache(hash, primitive_index);
......@@ -1867,9 +1831,8 @@ size_t runtime::gpu::CUDAEmitter::build_broadcast(const std::array<std::string,
{
codegen::CodeWriter writer;
writer << include_helpers();
runtime::gpu::CudaKernelBuilder::get_kernel_signature(
writer, kernel_name, args.get_input_signature());
runtime::gpu::CudaKernelBuilder::get_broadcast_op(writer, result_shape.size());
runtime::gpu::CudaKernelBuilder::get_broadcast_op(
writer, kernel_name, args, result_shape.size());
compiled_kernel = m_ctx->compiled_kernel_pool->set(kernel_name, writer.get_code());
}
......@@ -2047,9 +2010,6 @@ size_t runtime::gpu::CUDAEmitter::build_convolution(const std::array<std::string
// coalescing and vectorization are maintained regardless of coordinate access
// (e.g. data and filter dilation).
std::string kernel_name = "convolution_fprop_c_nd_n" + join(dtypes, "_");
std::replace(kernel_name.begin(), kernel_name.end(), ' ', '_');
// prerequisites for kernel caching and building
int N = input_shape.back();
int K = filter_shape.back();
......@@ -2061,10 +2021,6 @@ size_t runtime::gpu::CUDAEmitter::build_convolution(const std::array<std::string
rank++;
}
// additional kernel cache parameters
kernel_name = kernel_name + "_n" + std::to_string(N) + "_k" + std::to_string(K) + "_fsz" +
std::to_string(filter_size) + "_r" + std::to_string(rank);
// tiling options are determined by
// batch size (N) and number of filters (K)
int reg_tile_size = 1;
......@@ -2075,19 +2031,6 @@ size_t runtime::gpu::CUDAEmitter::build_convolution(const std::array<std::string
reg_tile_size = 4;
}
// if the kernel has not been compiled, build it
auto compiled_kernel = m_ctx->compiled_kernel_pool->get(kernel_name);
if (compiled_kernel == nullptr)
{
codegen::CodeWriter writer;
writer << include_helpers();
CudaKernelBuilder::get_convolution_forward(
writer, kernel_name, dtypes, N, K, filter_size, rank, sm_tile_size, reg_tile_size);
compiled_kernel = m_ctx->compiled_kernel_pool->set(kernel_name, writer.get_code());
}
// ----- build primitive arguments -----
// TODO: as each cuda_emitter has a regular structure
// it would be beneficial to factor these into classes
// with separate methods for compiling the kernel, building
......@@ -2159,23 +2102,59 @@ size_t runtime::gpu::CUDAEmitter::build_convolution(const std::array<std::string
float alpha = 1.0f;
float beta = 0.0f;
// ----- register primitive arguments with device -----
GPUAllocator allocator = this->m_primitive_emitter->get_memory_allocator();
auto args = m_primitive_emitter->add_kernel_args();
args.add_placeholder(dtypes[0], "in")
.add_placeholder(dtypes[1], "filter")
.add_placeholder(dtypes[2], "out")
.add("alpha", alpha)
.add("beta", beta)
.add("N", N)
.add("C", C)
.add("K", K)
.add("input_channel_size", input_channel_size)
.add("filter_channel_size", filter_channel_size)
.add("output_filter_size", output_filter_size)
.add("output_pixels", output_pixels)
.add("output_pixels_magic", output_pixels_magic)
.add("output_pixels_shift", output_pixels_shift)
.add("pad", input_pad_below)
.add("data_dilation", input_dilation)
.add("data_dilation_magic", data_dilation_magic)
.add("data_dilation_shift", data_dilation_shift)
.add("filter_strides", filter_stride)
.add("filter_dilation", filter_dilation)
.add("in_shape", input_shape)
.add("in_shape_str", input_shape_str)
.add("out_dim_str", output_dim_strides)
.add("out_str_magic", output_str_magic)
.add("out_str_shift", output_str_shift)
.add("filter_dim_str", filter_dim_strides)
.add("filter_str_magic", filter_str_magic)
.add("filter_str_shift", filter_str_shift);
std::string kernel_name = "convolution_fprop_c_nd_n" + join(dtypes, "_") + "_n" +
std::to_string(N) + "_k" + std::to_string(K) + "_fsz" +
std::to_string(filter_size) + "_r" + std::to_string(rank);
std::replace(kernel_name.begin(), kernel_name.end(), ' ', '_');
size_t idx_pad = allocator.reserve_argspace(input_pad_below);
size_t idx_data_dilation = allocator.reserve_argspace(input_dilation);
size_t idx_data_dilation_magic = allocator.reserve_argspace(data_dilation_magic);
size_t idx_data_dilation_shift = allocator.reserve_argspace(data_dilation_shift);
size_t idx_filter_strides = allocator.reserve_argspace(filter_stride);
size_t idx_filter_dilation = allocator.reserve_argspace(filter_dilation);
size_t idx_input_shape = allocator.reserve_argspace(input_shape);
size_t idx_input_shape_str = allocator.reserve_argspace(input_shape_str);
size_t idx_output_dim_strides = allocator.reserve_argspace(output_dim_strides);
size_t idx_output_str_magic = allocator.reserve_argspace(output_str_magic);
size_t idx_output_str_shift = allocator.reserve_argspace(output_str_shift);
size_t idx_filter_dim_strides = allocator.reserve_argspace(filter_dim_strides);
size_t idx_filter_str_magic = allocator.reserve_argspace(filter_str_magic);
size_t idx_filter_str_shift = allocator.reserve_argspace(filter_str_shift);
// if the kernel has not been compiled, build it
auto compiled_kernel = m_ctx->compiled_kernel_pool->get(kernel_name);
if (compiled_kernel == nullptr)
{
codegen::CodeWriter writer;
writer << include_helpers();
runtime::gpu::CudaKernelBuilder::get_convolution_forward(writer,
kernel_name,
dtypes,
args,
N,
K,
rank,
filter_size,
sm_tile_size,
reg_tile_size);
compiled_kernel = m_ctx->compiled_kernel_pool->set(kernel_name, writer.get_code());
}
// launch arguments:
// each output pixel is its own block. if the batch size is greater than reg_tile_size * sm_tile_size, a single
......@@ -2193,74 +2172,26 @@ size_t runtime::gpu::CUDAEmitter::build_convolution(const std::array<std::string
// blocks = (PQ*N/8, K/8, 1)
// threads = (8, 8, 1)
std::unique_ptr<gpu::primitive> conv(new gpu::primitive{[=](void** inputs,
void** outputs) mutable {
void* pad_d = runtime::gpu::invoke_memory_primitive(m_ctx, idx_pad);
void* data_dilation_d = runtime::gpu::invoke_memory_primitive(m_ctx, idx_data_dilation);
void* data_dilation_magic_d =
runtime::gpu::invoke_memory_primitive(m_ctx, idx_data_dilation_magic);
void* data_dilation_shift_d =
runtime::gpu::invoke_memory_primitive(m_ctx, idx_data_dilation_shift);
void* filter_strides_d = runtime::gpu::invoke_memory_primitive(m_ctx, idx_filter_strides);
void* filter_dilation_d = runtime::gpu::invoke_memory_primitive(m_ctx, idx_filter_dilation);
void* input_shape_d = runtime::gpu::invoke_memory_primitive(m_ctx, idx_input_shape);
void* input_shape_str_d = runtime::gpu::invoke_memory_primitive(m_ctx, idx_input_shape_str);
void* output_dim_strides_d =
runtime::gpu::invoke_memory_primitive(m_ctx, idx_output_dim_strides);
void* output_str_magic_d =
runtime::gpu::invoke_memory_primitive(m_ctx, idx_output_str_magic);
void* output_str_shift_d =
runtime::gpu::invoke_memory_primitive(m_ctx, idx_output_str_shift);
void* filter_dim_strides_d =
runtime::gpu::invoke_memory_primitive(m_ctx, idx_filter_dim_strides);
void* filter_str_magic_d =
runtime::gpu::invoke_memory_primitive(m_ctx, idx_filter_str_magic);
void* filter_str_shift_d =
runtime::gpu::invoke_memory_primitive(m_ctx, idx_filter_str_shift);
void* args_list[] = {&inputs[0],
&inputs[1],
&outputs[0],
&alpha,
&beta,
&N,
&C,
&K,
&input_channel_size,
&filter_channel_size,
&output_filter_size,
&output_pixels,
&output_pixels_magic,
&output_pixels_shift,
&pad_d,
&data_dilation_d,
&data_dilation_magic_d,
&data_dilation_shift_d,
&filter_strides_d,
&filter_dilation_d,
&input_shape_d,
&input_shape_str_d,
&output_dim_strides_d,
&output_str_magic_d,
&output_str_shift_d,
&filter_dim_strides_d,
&filter_str_magic_d,
&filter_str_shift_d};
std::unique_ptr<gpu::primitive> conv(
new gpu::primitive{[=](void** inputs, void** outputs) mutable {
CUDA_SAFE_CALL(cuLaunchKernel(*compiled_kernel.get(),
blocks.x,
blocks.y,
blocks.z,
threads.x,
threads.y,
threads.z,
0,
NULL,
args_list,
0));
debug_sync();
}});
void** args_list = args.resolve_placeholder(0, &inputs[0])
.resolve_placeholder(1, &inputs[1])
.resolve_placeholder(2, &outputs[0])
.get_argument_list();
CUDA_SAFE_CALL(cuLaunchKernel(*compiled_kernel.get(),
blocks.x,
blocks.y,
blocks.z,
threads.x,
threads.y,
threads.z,
0,
NULL,
args_list,
0));
debug_sync();
}});
return this->m_primitive_emitter->insert(std::move(conv));
}
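
A side effect of unrolling the coordinate-transform loops (and of baking N, K, filter size, and rank into the generated convolution source) is that the emitted kernel text is now shape-specialized, which is why the hunks above fold "_r<rank>" into the replace-slice name and "_n<N>_k<K>_fsz<filter_size>_r<rank>" into the convolution name: the compiled-kernel pool is keyed by name, so the key must encode every parameter that changes the generated code. A small self-contained sketch of that compile-or-reuse pattern; the std::unordered_map here is only a stand-in for m_ctx->compiled_kernel_pool:

#include <iostream>
#include <sstream>
#include <string>
#include <unordered_map>

// Stand-in for the compiled-kernel pool; the real pool caches compiled CUDA
// functions, while this toy caches the generated source keyed by kernel name.
static std::unordered_map<std::string, std::string> pool;

const std::string& get_or_build(const std::string& base, size_t rank)
{
    // the cache key must encode every parameter baked into the emitted code
    std::string key = base + "_r" + std::to_string(rank);
    auto it = pool.find(key);
    if (it != pool.end())
    {
        return it->second; // already built for this rank: reuse, do not re-emit
    }
    std::stringstream writer;
    writer << "extern \"C\" __global__ void cuda_" << key << "(/*...*/)\n{\n";
    for (size_t i = 0; i < rank; i++)
    {
        writer << "    // unrolled coordinate transform for dimension " << i << "\n";
    }
    writer << "}\n";
    return pool.emplace(key, writer.str()).first->second;
}

int main()
{
    get_or_build("repslices_float_float_float", 4); // emits and caches
    get_or_build("repslices_float_float_float", 4); // cache hit, same key
    get_or_build("repslices_float_float_float", 2); // different rank, new kernel
    std::cout << pool.size() << " cached kernels\n"; // prints: 2 cached kernels
    return 0;
}
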
......
......@@ -17,6 +17,7 @@
#include "ngraph/codegen/code_writer.hpp"
#include "ngraph/runtime/gpu/gpu_cuda_kernel_builder.hpp"
#include "ngraph/runtime/gpu/gpu_kernel_args.hpp"
#include "ngraph/runtime/gpu/type_info.hpp"
using namespace ngraph;
......@@ -106,6 +107,7 @@ void runtime::gpu::CudaKernelBuilder::get_softmax_divide_op(
void runtime::gpu::CudaKernelBuilder::get_ew_collective_op(
codegen::CodeWriter& writer,
const std::string& name,
runtime::gpu::GPUKernelArgs& args,
const std::string& op,
const std::string& reduce_op,
const std::vector<std::string>& data_types,
......@@ -113,28 +115,11 @@ void runtime::gpu::CudaKernelBuilder::get_ew_collective_op(
bool save_elementwise,
size_t rank)
{
auto num_inputs = data_types.size() - 1;
writer << "extern \"C\" __global__ void cuda_" << name << "(";
for (size_t i = 0; i < num_inputs; i++)
{
writer << data_types[i] << "* in" << i << ", ";
}
writer << data_types[num_inputs] << "* out0, ";
// multi-output to save intermediate elementwise op if requested
if (save_elementwise)
{
writer << data_types[num_inputs] << "* out1, ";
}
writer << "int* strides, "
<< "int* stride_magic, "
<< "int* stride_shift, "
<< "int* reduced_strides, "
<< "size_t n)\n";
writer << "extern \"C\" __global__ void cuda_" << name << args.get_input_signature();
writer.block_begin();
{
writer << "size_t tid = blockIdx.x * blockDim.x + threadIdx.x; \n";
writer << "if (tid < n)\n";
writer << "if (tid < nthreads)\n";
writer.block_begin();
{
std::string reduced_idx = collective_coordinate_transform_helper(writer,
......@@ -144,8 +129,10 @@ void runtime::gpu::CudaKernelBuilder::get_ew_collective_op(
"stride_shift",
"reduced_strides",
"coordinate",
rank);
rank,
true);
// element-wise operation
auto num_inputs = data_types.size() - 1;
writer << data_types[num_inputs] << " output = " << op << "(";
for (size_t i = 0; i < num_inputs; i++)
{
......@@ -195,8 +182,11 @@ void runtime::gpu::CudaKernelBuilder::get_ew_collective_op(
}
void runtime::gpu::CudaKernelBuilder::get_broadcast_op(codegen::CodeWriter& writer,
const std::string& name,
runtime::gpu::GPUKernelArgs& args,
const size_t rank)
{
writer << "extern \"C\" __global__ void cuda_" << name << args.get_input_signature();
writer.block_begin();
{
writer << "const int tid = blockDim.x*blockIdx.x + threadIdx.x;\n";
......@@ -524,72 +514,42 @@ void runtime::gpu::CudaKernelBuilder::get_reduce_window_op(
writer.block_end();
}
void runtime::gpu::CudaKernelBuilder::get_replace_slice_op(
codegen::CodeWriter& writer,
const std::string& name,
const std::array<std::string, 3>& data_types,
int nthreads_per_block)
void runtime::gpu::CudaKernelBuilder::get_replace_slice_op(codegen::CodeWriter& writer,
const std::string& name,
runtime::gpu::GPUKernelArgs& args,
const size_t rank)
{
writer << "extern \"C\" __global__ void cuda_" << name << "(" << data_types[0] << "* in, "
<< data_types[1] << "* source, " << data_types[2] << "* out, "
<< "float alpha, float beta, "
<< "int* dim_strides, "
<< "int* dim_magic, "
<< "int* dim_shift, "
<< "int* lower_bounds, "
<< "int* upper_bounds, "
<< "int* slice_str, "
<< "int* slice_magic, "
<< "int* slice_shift, "
<< "int* dim_source, "
<< "int* src_strides, "
<< "int rank,"
<< "size_t nthreads"
<< ")\n";
writer << "extern \"C\" __global__ void cuda_" << name << args.get_input_signature();
writer.block_begin();
{
writer << "extern __shared__ int dimensions[];\n";
writer << "const int tid = blockDim.x*blockIdx.x + threadIdx.x;\n";
writer << "if (tid < nthreads)\n";
writer.block_begin();
{
writer << "int dim_product = tid;\n";
writer << "int data_idx = 0;\n";
writer << "for (int i = threadIdx.x; i < (rank - 1) * " << nthreads_per_block
<< "; i += " << nthreads_per_block << ")\n";
writer.block_begin();
{
writer << "dimensions[i] = division_by_invariant_multiplication(dim_product, "
"dim_magic[data_idx], "
"dim_shift[data_idx]);\n";
writer << "dim_product -= (dimensions[i] * dim_strides[data_idx]);\n";
writer << "data_idx++;\n";
}
writer.block_end();
writer << "dimensions[threadIdx.x + (rank-1) * " << nthreads_per_block
<< "] = dim_product;\n";
writer << "data_idx = 0;\n";
coordinate_transform_to_multi_d(
writer, "dim_strides", "dim_magic", "dim_shift", "tid", "dimension", rank, true);
writer << "int source_di;\n";
writer << "bool on_stride;\n";
writer << "bool in_slice_di;\n";
writer << "bool in_bounds = true;\n";
writer << "int source_idx = 0;\n";
writer << "for (int i = threadIdx.x; i < rank * " << nthreads_per_block
<< "; i += " << nthreads_per_block << ")\n";
writer.block_begin();
for (int i = 0; i < rank; i++)
{
writer << "int source_di = division_by_invariant_multiplication(dimensions[i], "
"slice_magic[data_idx], "
"slice_shift[data_idx]);\n";
writer << "bool on_stride = (mod16(dimensions[i], source_di, "
"slice_str[data_idx]) == 0);\n";
// within slice of input tensor and a multiple of the slice stride
writer << "bool in_slice_di = (dimensions[i] >= lower_bounds[data_idx]) && "
"(dimensions[i] < upper_bounds[data_idx]) && on_stride;\n";
// determine coordinate in slice
writer << "source_di = division_by_invariant_multiplication(dimension" << i
<< ", slice_magic" << i << ", slice_shift" << i << ");\n";
writer << "on_stride = (mod16(dimension" << i << ", source_di, slice_str" << i
<< ") == 0);\n";
writer << "in_slice_di = "
<< "(dimension" << i << " >= lower_bounds" << i << ") && "
<< "(dimension" << i << " < upper_bounds" << i << ") && on_stride;\n";
writer << "in_bounds = in_bounds && in_slice_di;\n";
// subtract off lower bound to convert to source index
writer << "source_di -= lower_bounds[data_idx];\n";
writer << "source_idx += source_di * src_strides[data_idx];\n";
writer << "data_idx++;\n";
writer << "source_di -= lower_bounds" << i << ";\n";
writer << "source_idx += source_di * src_strides" << i << ";\n";
}
writer.block_end();
writer << "out[tid] = in_bounds ? source[source_idx] : in[tid];\n";
}
writer.block_end();
......@@ -754,10 +714,11 @@ void runtime::gpu::CudaKernelBuilder::get_convolution_forward(
codegen::CodeWriter& writer,
const std::string& name,
const std::array<std::string, 3>& data_types,
runtime::gpu::GPUKernelArgs& args,
int N,
int K,
int filter_size,
int rank,
int filter_size,
int sm_tile_size,
int reg_tile_size)
{
......@@ -775,36 +736,7 @@ void runtime::gpu::CudaKernelBuilder::get_convolution_forward(
writer.block_end();
writer << "Matrix;\n\n";
writer << "extern \"C\" __global__ void cuda_" << name << "(";
writer << data_types[0] << "* in, ";
writer << data_types[1] << "* filter, ";
writer << data_types[2] << "* out, ";
// TODO: add alpha/beta support
writer << "float alpha, float beta, "
<< "int N, "
<< "int C, "
<< "int K, "
<< "int input_channel_size, "
<< "int filter_channel_size, "
<< "int output_filter_size, "
<< "int output_pixels, "
<< "int output_pixels_magic, "
<< "int output_pixels_shift, "
<< "int* pad, "
<< "int* data_dilation, "
<< "int* data_dilation_magic, "
<< "int* data_dilation_shift, "
<< "int* filter_strides, "
<< "int* filter_dilation, "
<< "int* in_shape, "
<< "int* in_shape_str, "
<< "int* out_dim_str, "
<< "int* out_str_magic, "
<< "int* out_str_shift, "
<< "int* filter_dim_str, "
<< "int* filter_str_magic, "
<< "int* filter_str_shift"
<< ")\n";
writer << "extern \"C\" __global__ void cuda_" << name << args.get_input_signature();
writer.block_begin();
{
writer << "Matrix* I = reinterpret_cast<Matrix*>(in);\n";
......@@ -836,7 +768,8 @@ void runtime::gpu::CudaKernelBuilder::get_convolution_forward(
"out_str_shift",
"output_pixel_idx",
"out_d",
rank);
rank,
true);
// offset tensors by image and filter indices
// each thread is responsible for its own image and filter
......@@ -879,8 +812,8 @@ void runtime::gpu::CudaKernelBuilder::get_convolution_forward(
for (int i = 0; i < rank; i++)
{
writer << "int input_base_d" << i << " = out_d" << i << " * filter_strides[" << i
<< "] - pad[" << i << "];\n";
writer << "int input_base_d" << i << " = out_d" << i << " * filter_strides" << i
<< " - pad" << i << ";\n";
}
// a mask marking all threads that have tid less than the current thread
......@@ -897,7 +830,8 @@ void runtime::gpu::CudaKernelBuilder::get_convolution_forward(
"filter_str_shift",
"filter_pixel",
"filter_d",
rank);
rank,
true);
// transform from filter coordinate to input coordinates
// and check that each coordinate maps to an input element in the undilated space
writer << "int off_dilation_stride = 0;\n";
......@@ -905,14 +839,14 @@ void runtime::gpu::CudaKernelBuilder::get_convolution_forward(
for (int i = 0; i < rank; i++)
{
writer << "int input_d" << i << " = input_base_d" << i << " + filter_d" << i
<< " * filter_dilation[" << i << "];\n";
<< " * filter_dilation" << i << ";\n";
// determine coordinate in undilated input space
writer << "undilated_coordinate = division_by_invariant_multiplication(input_d"
<< i << ", data_dilation_magic[" << i << "], data_dilation_shift[" << i
<< "]);\n";
<< i << ", data_dilation_magic" << i << ", data_dilation_shift" << i
<< ");\n";
// if division remainder is 0, then dilated coordinate is on an input element
writer << "off_dilation_stride += (input_d" << i
<< " - undilated_coordinate * data_dilation[" << i << "]);\n";
<< " - undilated_coordinate * data_dilation" << i << ");\n";
// reassign dilated coordinate to undilated input coordinate
writer << "input_d" << i << " = undilated_coordinate;\n";
}
......@@ -929,8 +863,7 @@ void runtime::gpu::CudaKernelBuilder::get_convolution_forward(
// in_shape contains the full shape of the input_tensor
// for 2D this is: (C, H, W, N) but rank = 2 and so only [H, W] are used
// condition (input_d0 >=0 && input_d0 < H && input_d1 >= 0 && input_d1 < W)
writer << "input_d" << i << ">= 0 && input_d" << i << " < in_shape[" << i + 1
<< "] ";
writer << "input_d" << i << ">= 0 && input_d" << i << " < in_shape" << i + 1;
}
writer << ");\n";
......@@ -953,7 +886,7 @@ void runtime::gpu::CudaKernelBuilder::get_convolution_forward(
}
// skips the first and last stride which correspond
// to the channel and batch coordinate, respectively
writer << "input_d" << i << " * in_shape_str[" << i + 1 << "] ";
writer << "input_d" << i << " * in_shape_str" << i + 1;
}
writer << ")";
// if using register tiling, down shift
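
For reference, the per-dimension statements that get_replace_slice_op and get_convolution_forward now emit in unrolled form all revolve around the same flat-index to multi-dimensional coordinate conversion over row-major strides; on the device the divisions are performed with the precomputed magic/shift pairs passed in through GPUKernelArgs (division_by_invariant_multiplication), but the arithmetic is equivalent to plain division. A host-side reference of that transform, included only as a reading aid (the magic/shift computation itself is not reproduced here):

#include <cstdio>
#include <vector>

// Host reference for the flat-index -> multi-d coordinate transform that the
// generated kernels unroll once per dimension. The device code uses precomputed
// magic/shift pairs (division_by_invariant_multiplication) instead of '/'.
std::vector<int> to_multi_d(int tid, const std::vector<int>& strides)
{
    std::vector<int> coord(strides.size());
    int remainder = tid;
    for (size_t i = 0; i < strides.size(); i++)
    {
        coord[i] = remainder / strides[i]; // kernel: division by invariant multiplication
        remainder -= coord[i] * strides[i];
    }
    return coord;
}

int main()
{
    // row-major strides for a shape of {2, 3, 4}: {12, 4, 1}
    std::vector<int> strides{12, 4, 1};
    std::vector<int> c = to_multi_d(17, strides); // 17 = 1*12 + 1*4 + 1*1
    std::printf("%d %d %d\n", c[0], c[1], c[2]);  // prints: 1 1 1
    return 0;
}
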
......
......@@ -31,23 +31,19 @@ namespace ngraph
{
namespace gpu
{
class GPUKernelArgs;
class CudaKernelBuilder
{
public:
static void get_kernel_signature(codegen::CodeWriter& writer,
const std::string& name,
const std::string& input_signature)
{
writer << "extern \"C\" __global__ void cuda_" << name;
writer << input_signature;
}
static void get_elementwise_op(codegen::CodeWriter& writer,
const std::string& name,
const std::string& op,
const std::vector<std::string>& data_types);
static void get_broadcast_op(codegen::CodeWriter& writer, const size_t rank);
static void get_broadcast_op(codegen::CodeWriter& writer,
const std::string& name,
GPUKernelArgs& args,
const size_t rank);
static void get_concat_op(codegen::CodeWriter& writer,
const std::string& name,
......@@ -74,8 +70,8 @@ namespace ngraph
static void get_replace_slice_op(codegen::CodeWriter& writer,
const std::string& name,
const std::array<std::string, 3>& data_types,
int nthreads_per_block);
GPUKernelArgs& args,
const size_t rank);
static void get_reduce_window_op(codegen::CodeWriter& writer,
const std::string& name,
......@@ -101,6 +97,7 @@ namespace ngraph
static void get_ew_collective_op(codegen::CodeWriter& writer,
const std::string& name,
GPUKernelArgs& args,
const std::string& op,
const std::string& reduce_op,
const std::vector<std::string>& data_types,
......@@ -124,10 +121,11 @@ namespace ngraph
static void get_convolution_forward(codegen::CodeWriter& writer,
const std::string& name,
const std::array<std::string, 3>& data_types,
GPUKernelArgs& args,
int N,
int K,
int filter_size,
int rank,
int filter_size,
int sm_tile_size = 8,
int reg_tile_size = 1);
......