Commit f6b84d67 authored by Fenglei, committed by Robert Kimball

add gpu reduce_window op (#1020)

* add reduce op

* hack solution to get reduction function in reduce op

* hack version working on all tests

* add reduce_window op

* fixed the reduction checking process

* add reduce window op, save progress, not compilable yet

* change push_back to pre-allocation for vector

* fixed datatype vector

* datatype and comments

* pass op instead of using template

* using new GPUShape and allocator

* using GPUShape

* add comment, change map inside function.

* change to a more meaningful name
parent aacbb305
@@ -682,6 +682,122 @@ size_t runtime::gpu::CUDAEmitter::build_elementwise_n_to_1(const GPURuntimeConte
return primitive_index;
}
size_t runtime::gpu::CUDAEmitter::build_reduce_window(const GPURuntimeContext* ctx,
const OpName op_name,
const std::vector<std::string>& dtypes,
GPUShape input_shape,
GPUShape output_shape,
GPUShape reduce_window_shape,
GPUShape reduce_window_strides)
{
const char* op = NULL;
const char* kernel = NULL;
switch (op_name)
{
case OpName::add:
op = CudaOpMap<ngraph::op::Add>::op;
kernel = CudaOpMap<ngraph::op::Add>::math_kernel;
break;
case OpName::multiply:
op = CudaOpMap<ngraph::op::Multiply>::op;
kernel = CudaOpMap<ngraph::op::Multiply>::math_kernel;
break;
case OpName::minimum:
op = CudaOpMap<ngraph::op::Minimum>::op;
kernel = CudaOpMap<ngraph::op::Minimum>::math_kernel;
break;
case OpName::maximum:
op = CudaOpMap<ngraph::op::Maximum>::op;
kernel = CudaOpMap<ngraph::op::Maximum>::math_kernel;
}
// kernel_name is used to check if the cuda kernel has been previously compiled
size_t rank = input_shape.size();
std::stringstream kernel_name;
kernel_name << "reduce_window"
<< "_" << op << "_" << join(dtypes, "_") << rank;
// hash is used to check if the emitted primitive already exists
std::stringstream ss;
ss << kernel_name.str() << "_s" << join(output_shape, "_");
auto hash = ss.str();
// if the primitive exists, we are done
size_t primitive_index = m_primitive_emitter->lookup(hash);
if (primitive_index != std::numeric_limits<size_t>::max())
{
return primitive_index;
}
// check if the kernel has already been compiled. if so, create
// a launch primitive for it based on the input tensor shape
// but do not recompile the kernel. otherwise, do it all:
// recompile the kernel and then create the primitive
auto compiled_kernel = ctx->compiled_kernel_pool->get(kernel_name.str());
if (compiled_kernel == nullptr)
{
codegen::CodeWriter writer;
CudaKernelBuilder::add_pod_typedefs(writer);
if (kernel)
{
CudaKernelBuilder::get_device_helper(writer, op, kernel, dtypes);
}
CudaKernelBuilder::get_reduce_window_op(writer, kernel_name.str(), op, dtypes, rank);
compiled_kernel = ctx->compiled_kernel_pool->set(kernel_name.str(), writer.get_code());
}
size_t nthreads = shape_size(output_shape);
GPUShape input_strides = row_major_strides(input_shape);
// get an allocator for transient per kernel gpu memory
GPUAllocator allocator = this->m_primitive_emitter->get_memory_allocator();
// (lazy) allocation for kernel arguments
size_t idx_input_strides = allocator.reserve_argspace(input_strides.data(), rank * sizeof(int));
size_t idx_output_shape = allocator.reserve_argspace(output_shape.data(), rank * sizeof(int));
size_t idx_reduce_window_shape =
allocator.reserve_argspace(reduce_window_shape.data(), rank * sizeof(int));
size_t idx_reduce_window_strides =
allocator.reserve_argspace(reduce_window_strides.data(), rank * sizeof(int));
// create the launch primitive
std::unique_ptr<gpu::primitive> f(new gpu::primitive{[=](void** inputs,
void** outputs) mutable {
void* param_input_strides = runtime::gpu::invoke_memory_primitive(ctx, idx_input_strides);
void* param_output_shape = runtime::gpu::invoke_memory_primitive(ctx, idx_output_shape);
void* param_reduce_window_shape =
runtime::gpu::invoke_memory_primitive(ctx, idx_reduce_window_shape);
void* param_reduce_window_strides =
runtime::gpu::invoke_memory_primitive(ctx, idx_reduce_window_strides);
std::vector<void*> args_list(7, NULL);
args_list[0] = &inputs[0];
args_list[1] = &outputs[0];
args_list[2] = &param_input_strides;
args_list[3] = &param_output_shape;
args_list[4] = &param_reduce_window_shape;
args_list[5] = &param_reduce_window_strides;
args_list[6] = &nthreads;
CUDA_SAFE_CALL(cuLaunchKernel(*compiled_kernel.get(),
static_cast<unsigned int>(nthreads),
1,
1, // grid dim
1,
1,
1, // block dim
0,
NULL, // shared mem and stream
args_list.data(),
0)); // arguments
CUDA_SAFE_CALL(cuCtxSynchronize()); // wait for the kernel to finish
}});
primitive_index = this->m_primitive_emitter->insert(std::move(f));
m_primitive_emitter->cache(hash, primitive_index);
return primitive_index;
}
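For orientation, a hypothetical calling sketch (not part of this commit; cuda_emitter and ctx are assumed to come from the surrounding GPU backend) that emulates a 1-D max-pool, window 2 and stride 1, over a 1x1x4 input:

    // hypothetical usage sketch
    GPUShape input_shape{1, 1, 4};
    GPUShape output_shape{1, 1, 3}; // (4 - 2) / 1 + 1 = 3 windows on the last axis
    GPUShape window_shape{1, 1, 2};
    GPUShape window_strides{1, 1, 1};
    size_t prim_idx = cuda_emitter->build_reduce_window(ctx,
                                                        OpName::maximum,
                                                        std::vector<std::string>{"float", "float"},
                                                        input_shape,
                                                        output_shape,
                                                        window_shape,
                                                        window_strides);
    // prim_idx identifies the cached primitive; the emitted code later invokes it
    // with device pointers for the input and output tensors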
size_t runtime::gpu::CUDAEmitter::build_replace_slice(const GPURuntimeContext* ctx,
const std::array<std::string, 3>& dtypes,
GPUShape tensor_shape,
......
@@ -20,6 +20,7 @@
#include "ngraph/codegen/code_writer.hpp"
#include "ngraph/runtime/gpu/gpu_cuda_kernel_ops.hpp"
#include "ngraph/runtime/gpu/gpu_shape.hpp"
#include "ngraph/strides.hpp"
namespace ngraph
{
@@ -62,6 +63,14 @@ namespace ngraph
GPUShape padding_below,
bool include_pad = false);
size_t build_reduce_window(const GPURuntimeContext* ctx,
const OpName op_name,
const std::vector<std::string>& dtypes,
GPUShape input_shape,
GPUShape output_shape,
GPUShape reduce_window_shape,
GPUShape reduce_window_strides);
template <typename T>
size_t build_elementwise(const GPURuntimeContext* ctx,
const std::vector<std::string>& dtypes,
......
@@ -118,17 +118,17 @@ void runtime::gpu::CudaKernelBuilder::get_reshape_op(codegen::CodeWriter& writer
writer << "if (tid < n)\n";
writer.block_begin();
{
-writer << "size_t idx_in = tid;\n";
-writer << "size_t idx_out = 0;\n";
+writer << "size_t input_idx = tid;\n";
+writer << "size_t output_idx = 0;\n";
writer << "for(size_t i = 0; i < rank; i++)\n";
writer.block_begin();
{
-writer << "idx_out += (idx_in / input_strides[i]) * trans_strides[i];\n";
-writer << "idx_in %= input_strides[i];\n";
+writer << "output_idx += (input_idx / input_strides[i]) * trans_strides[i];\n";
+writer << "input_idx %= input_strides[i];\n";
}
writer.block_end();
-writer << "out[idx_out] = in[tid];\n";
+writer << "out[output_idx] = in[tid];\n";
}
writer.block_end();
}
@@ -154,7 +154,7 @@ void runtime::gpu::CudaKernelBuilder::get_concat_op(codegen::CodeWriter& writer,
writer.block_begin();
{
writer << "out[tid] = 1;\n";
-writer << "size_t idx_out = tid;\n";
+writer << "size_t output_idx = tid;\n";
writer << "size_t block_id = tid / block_size;\n";
writer << "size_t block_idx = tid % block_size;\n";
writer << "bool processed = false;\n";
@@ -163,7 +163,7 @@ void runtime::gpu::CudaKernelBuilder::get_concat_op(codegen::CodeWriter& writer,
writer << "if(!processed && (block_idx < block_strides[" << i << "]))\n";
writer.block_begin();
{
-writer << "out[idx_out] = in" << i << "[block_id * block_strides[" << i
+writer << "out[output_idx] = in" << i << "[block_id * block_strides[" << i
<< "] + block_idx];";
writer << "processed = true;\n";
}
@@ -189,18 +189,18 @@ void runtime::gpu::CudaKernelBuilder::get_slice_op(codegen::CodeWriter& writer,
writer << "if (tid < n)\n";
writer.block_begin();
{
-writer << "size_t idx_in = 0;\n";
-writer << "size_t idx_out = tid;\n";
+writer << "size_t input_idx = 0;\n";
+writer << "size_t output_idx = tid;\n";
writer << "for(size_t i = 0; i < rank; i++)\n";
writer.block_begin();
{
-writer << "idx_in += (((idx_out / output_strides[i]) * slice_strides[i]) + "
+writer << "input_idx += (((output_idx / output_strides[i]) * slice_strides[i]) + "
"lower_bounds[i]) * input_strides[i];\n";
-writer << "idx_out %= output_strides[i];\n";
+writer << "output_idx %= output_strides[i];\n";
}
writer.block_end();
-writer << "out[tid] = in[idx_in];\n";
+writer << "out[tid] = in[input_idx];\n";
}
writer.block_end();
}
@@ -220,22 +220,79 @@ void runtime::gpu::CudaKernelBuilder::get_reverse_op(codegen::CodeWriter& writer
writer << "if (tid < n)\n";
writer.block_begin();
{
-writer << "size_t idx_in = tid;\n";
-writer << "size_t idx_out = 0;\n";
+writer << "size_t input_idx = tid;\n";
+writer << "size_t output_idx = 0;\n";
writer << "size_t stride = 1;\n";
writer << "for(size_t i = rank; i > 0; i--)\n";
writer.block_begin();
{
writer << "size_t idx = i - 1;\n";
-writer << "size_t axes_i_in = idx_in % input_shape[idx];\n";
-writer << "idx_in /= input_shape[idx];\n";
+writer << "size_t axes_i_in = input_idx % input_shape[idx];\n";
+writer << "input_idx /= input_shape[idx];\n";
writer << "size_t axes_i_out = reverse_axes[idx] ? input_shape[idx] - axes_i_in - "
"1 : axes_i_in;\n";
-writer << "idx_out += axes_i_out * stride;\n";
+writer << "output_idx += axes_i_out * stride;\n";
writer << "stride *= input_shape[idx];\n";
}
writer.block_end();
-writer << "out[idx_out] = in[tid];\n";
+writer << "out[output_idx] = in[tid];\n";
}
writer.block_end();
}
writer.block_end();
}
void runtime::gpu::CudaKernelBuilder::get_reduce_window_op(
codegen::CodeWriter& writer,
const std::string& name,
const std::string& op,
const std::vector<std::string>& data_types,
const size_t rank)
{
writer << "extern \"C\" __global__ void cuda_" << name << "(" << data_types[0] << "* in, "
<< data_types[1] << "* out, int* input_strides, int* output_shape, int* "
"reduce_window_shape, int* reduce_window_strides, size_t n)\n";
writer.block_begin();
{
writer << "const int tid = blockIdx.x * blockDim.x + threadIdx.x;\n";
writer << "if (tid < n)\n";
writer.block_begin();
{
writer << "int output_idx = tid;\n";
writer << "int idx_init = 0; //result will be initial to in[idx_init]\n";
for (int i = static_cast<int>(rank) - 1; i >= 0; i--)
{
writer << "int output_idx_" << i << " = output_idx % output_shape[" << i << "];\n";
writer << "int input_idx_start_for_axis_" << i << " = output_idx_" << i
<< " * reduce_window_strides[" << i << "];\n";
writer << "int input_idx_end_for_axis_" << i << " = input_idx_start_for_axis_" << i
<< " + reduce_window_shape[" << i << "];\n";
writer << "idx_init += input_idx_start_for_axis_" << i << " * input_strides[" << i
<< "];\n";
writer << "output_idx /= output_shape[" << i << "];\n";
}
writer << data_types[1] << " result = in[idx_init];\n";
for (int i = 0; i < rank; i++)
{
writer << "for(int i_" << i << " = input_idx_start_for_axis_" << i << "; i_" << i
<< " < input_idx_end_for_axis_" << i << "; i_" << i << "++)\n";
writer.block_begin();
}
writer << "int input_idx = 0;\n";
for (int i = 0; i < rank; i++)
{
writer << "input_idx += i_" << i << " * input_strides[" << i << "];\n";
}
writer << "result = (input_idx == idx_init) ? result : " << op
<< "(result, in[input_idx]); //skip in[idx_init] in loop\n";
for (int i = 0; i < rank; i++)
{
writer.block_end();
}
writer << "out[tid] = result;\n";
}
writer.block_end();
}
writer.block_end();
}
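To make the emitted loop nest concrete, here is the kernel this generator produces for rank = 2 with the max reducer, reconstructed by hand from the writer calls above (names follow the generator; treat formatting as approximate):

    extern "C" __global__ void cuda_reduce_window_max_float_float2(
        float* in, float* out, int* input_strides, int* output_shape,
        int* reduce_window_shape, int* reduce_window_strides, size_t n)
    {
        const int tid = blockIdx.x * blockDim.x + threadIdx.x;
        if (tid < n)
        {
            int output_idx = tid;
            int idx_init = 0; // result will be initialized to in[idx_init]
            int output_idx_1 = output_idx % output_shape[1];
            int input_idx_start_for_axis_1 = output_idx_1 * reduce_window_strides[1];
            int input_idx_end_for_axis_1 = input_idx_start_for_axis_1 + reduce_window_shape[1];
            idx_init += input_idx_start_for_axis_1 * input_strides[1];
            output_idx /= output_shape[1];
            int output_idx_0 = output_idx % output_shape[0];
            int input_idx_start_for_axis_0 = output_idx_0 * reduce_window_strides[0];
            int input_idx_end_for_axis_0 = input_idx_start_for_axis_0 + reduce_window_shape[0];
            idx_init += input_idx_start_for_axis_0 * input_strides[0];
            output_idx /= output_shape[0];
            float result = in[idx_init];
            for (int i_0 = input_idx_start_for_axis_0; i_0 < input_idx_end_for_axis_0; i_0++)
            {
                for (int i_1 = input_idx_start_for_axis_1; i_1 < input_idx_end_for_axis_1; i_1++)
                {
                    int input_idx = 0;
                    input_idx += i_0 * input_strides[0];
                    input_idx += i_1 * input_strides[1];
                    // skip in[idx_init], which already seeded result
                    result = (input_idx == idx_init) ? result : max(result, in[input_idx]);
                }
            }
            out[tid] = result;
        }
    }

Each thread owns one output element, seeds the reduction with the first element of its window, then walks the window with one loop per axis; max here is the __device__ helper emitted by get_device_helper.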
......
@@ -63,6 +63,12 @@ namespace ngraph
const std::string& name,
const std::array<std::string, 2>& data_types);
static void get_reduce_window_op(codegen::CodeWriter& writer,
const std::string& name,
const std::string& op,
const std::vector<std::string>& data_types,
const size_t rank);
static void get_replace_slice_op(codegen::CodeWriter& writer,
const std::string& name,
const std::array<std::string, 3>& data_types,
......
@@ -22,6 +22,7 @@ namespace ngraph
{
class Abs;
class Acos;
class Add;
class Asin;
class Atan;
class Ceiling;
@@ -38,6 +39,9 @@ namespace ngraph
class Subtract;
class Divide;
class Sign;
class Maximum;
class Minimum;
class Multiply;
class Convert;
class Equal;
class NotEqual;
@@ -61,6 +65,14 @@
{
namespace gpu
{
enum class OpName
{
add,
multiply,
minimum,
maximum
};
template <typename T>
struct CudaOpMap;
@@ -301,6 +313,34 @@ namespace ngraph
static constexpr const char* op = "logical_or";
static constexpr const char* math_kernel = "x0 | x1";
};
template <>
struct CudaOpMap<ngraph::op::Add>
{
static constexpr const char* op = "add";
static constexpr const char* math_kernel = "x0 + x1";
};
template <>
struct CudaOpMap<ngraph::op::Multiply>
{
static constexpr const char* op = "mul";
static constexpr const char* math_kernel = "x0 * x1";
};
template <>
struct CudaOpMap<ngraph::op::Minimum>
{
static constexpr const char* op = "min";
static constexpr const char* math_kernel = "x0 > x1 ? x1 : x0";
};
template <>
struct CudaOpMap<ngraph::op::Maximum>
{
static constexpr const char* op = "max";
static constexpr const char* math_kernel = "x0 > x1 ? x0 : x1";
};
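build_reduce_window pairs each enum value with these strings: op names the device helper that get_device_helper emits and math_kernel supplies its body. Sketching from that convention (the exact signature get_device_helper produces is an assumption), the Maximum entry expands to roughly:

    // assumed shape of the emitted helper; "max" and "x0 > x1 ? x0 : x1"
    // come straight from CudaOpMap<ngraph::op::Maximum>
    __device__ float max(float x0, float x1)
    {
        return x0 > x1 ? x0 : x1;
    }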
}
}
}
@@ -30,7 +30,7 @@
using namespace ngraph;
using namespace std;
-void runtime::gpu::print_gpu_f32_tensor(void* p, size_t element_count, size_t element_size)
+void runtime::gpu::print_gpu_f32_tensor(const void* p, size_t element_count, size_t element_size)
{
std::vector<float> local(element_count);
size_t size_in_bytes = element_size * element_count;
@@ -43,10 +43,14 @@ void runtime::gpu::check_cuda_errors(CUresult err)
assert(err == CUDA_SUCCESS);
}
-void* runtime::gpu::create_gpu_buffer(size_t buffer_size)
+void* runtime::gpu::create_gpu_buffer(size_t buffer_size, const void* data)
{
void* allocated_buffer_pool;
cudaMalloc(static_cast<void**>(&allocated_buffer_pool), buffer_size);
if (data)
{
runtime::gpu::cuda_memcpyHtD(allocated_buffer_pool, data, buffer_size);
}
return allocated_buffer_pool;
}
@@ -58,17 +62,17 @@ void runtime::gpu::free_gpu_buffer(void* buffer)
}
}
-void runtime::gpu::cuda_memcpyDtD(void* dst, void* src, size_t buffer_size)
+void runtime::gpu::cuda_memcpyDtD(void* dst, const void* src, size_t buffer_size)
{
cudaMemcpy(dst, src, buffer_size, cudaMemcpyDeviceToDevice);
}
-void runtime::gpu::cuda_memcpyHtD(void* dst, void* src, size_t buffer_size)
+void runtime::gpu::cuda_memcpyHtD(void* dst, const void* src, size_t buffer_size)
{
cudaMemcpy(dst, src, buffer_size, cudaMemcpyHostToDevice);
}
-void runtime::gpu::cuda_memcpyDtH(void* dst, void* src, size_t buffer_size)
+void runtime::gpu::cuda_memcpyDtH(void* dst, const void* src, size_t buffer_size)
{
cudaMemcpy(dst, src, buffer_size, cudaMemcpyDeviceToHost);
}
......
@@ -16,6 +16,7 @@
#pragma once
#include <iostream>
#include <memory>
#include <sstream>
#include <stdexcept>
@@ -29,6 +30,7 @@
#include <cuda_runtime.h>
#include <cudnn.h>
#include <nvrtc.h>
#include "ngraph/util.hpp"
// why use "do...while.."
// https://stackoverflow.com/questions/154136/why-use-apparently-meaningless-do-while-and-if-else-statements-in-macros
@@ -91,13 +93,22 @@
{
namespace gpu
{
-void print_gpu_f32_tensor(void* p, size_t element_count, size_t element_size);
+void print_gpu_f32_tensor(const void* p, size_t element_count, size_t element_size);
template <typename T>
void print_gpu_tensor(const void* p, size_t element_count)
{
std::vector<T> local(element_count);
size_t size_in_bytes = sizeof(T) * element_count;
cudaMemcpy(local.data(), p, size_in_bytes, cudaMemcpyDeviceToHost);
std::cout << "{" << ngraph::join(local) << "}" << std::endl;
}
void check_cuda_errors(CUresult err);
-void* create_gpu_buffer(size_t buffer_size);
+void* create_gpu_buffer(size_t buffer_size, const void* data = NULL);
void free_gpu_buffer(void* buffer);
-void cuda_memcpyDtD(void* dst, void* src, size_t buffer_size);
-void cuda_memcpyHtD(void* dst, void* src, size_t buffer_size);
-void cuda_memcpyDtH(void* dst, void* src, size_t buffer_size);
+void cuda_memcpyDtD(void* dst, const void* src, size_t buffer_size);
+void cuda_memcpyHtD(void* dst, const void* src, size_t buffer_size);
+void cuda_memcpyDtH(void* dst, const void* src, size_t buffer_size);
void cuda_memset(void* dst, int value, size_t buffer_size);
std::pair<uint64_t, uint64_t> idiv_magic_u32(uint64_t max_numerator, uint64_t divisor);
std::pair<uint64_t, uint64_t> idiv_magic_u64(uint64_t divisor);
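The new print_gpu_tensor template generalizes the float-only debug printer to any element type; a hypothetical debugging call:

    // dump the first 8 int32 elements of a device buffer while debugging
    runtime::gpu::print_gpu_tensor<int32_t>(device_ptr, 8);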
......
@@ -46,11 +46,6 @@ numeric_double_nan
numeric_float_inf
numeric_float_nan
parameter_as_output
-reduce_window_emulating_max_pool_1d_1channel_1image
-reduce_window_emulating_max_pool_1d_1channel_2image
-reduce_window_emulating_max_pool_1d_2channel_2image
-reduce_window_emulating_max_pool_2d_1channel_1image_strided
-reduce_window_emulating_max_pool_2d_2channel_2image
reverse_sequence_n4d2c3h2w2
reverse_sequence_n4c3h2w2
reverse_sequence_n2c3h4w2
......