Unverified Commit 392ef0e1 authored by Scott Cyphers, committed by GitHub

Merge branch 'master' into leona/doc_v0.25.1

parents e9101633 e83c2ffa
#
# OVERRIDE TO STYLE: Comments wrap.
#
BasedOnStyle: LLVM
IndentWidth: 4
UseTab: Never
Language: Cpp
Standard: Cpp11
AccessModifierOffset: -4
AlignConsecutiveDeclarations: false
AlignConsecutiveAssignments: false
AlignTrailingComments: true
AllowShortBlocksOnASingleLine: true
AllowShortCaseLabelsOnASingleLine: true
AllowShortFunctionsOnASingleLine: Inline
AlwaysBreakBeforeMultilineStrings: true
AlwaysBreakTemplateDeclarations: true
BinPackArguments: false
BinPackParameters: false
BreakBeforeBraces: Allman
BreakConstructorInitializersBeforeComma: true
ColumnLimit: 100
#CommentPragmas: '.*'
IndentCaseLabels: false
IndentWrappedFunctionNames: true
KeepEmptyLinesAtTheStartOfBlocks: false
NamespaceIndentation: All
PointerAlignment: Left
SpaceAfterCStyleCast: false
SpaceBeforeAssignmentOperators: true
SpaceBeforeParens: ControlStatements
SpaceInEmptyParentheses: false
SpacesInAngles: false
SpacesInCStyleCastParentheses: false
SpacesInParentheses: false
SpacesInSquareBrackets: false
SortIncludes: false
ReflowComments: true
IncludeCategories:
- Regex: '^".*'
Priority: 3
- Regex: '^<.*'
Priority: 2
SortIncludes: true
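For illustration only (not part of the diff), a small C++ fragment formatted under these settings (Allman braces, 4-space indents, namespace indentation, left pointer alignment, 100-column comment reflow) comes out roughly as follows; the Formatter class is invented for the example.

#include <string>
#include <utility>

namespace example
{
    // ReflowComments: true and ColumnLimit: 100 let clang-format re-wrap long comments such as
    // this one so they stay inside the configured column limit.
    class Formatter
    {
    public:
        explicit Formatter(std::string name)
            : m_name(std::move(name))
        {
        }

        // PointerAlignment: Left keeps the '*' next to the type rather than the variable name.
        const std::string* name() const { return &m_name; }

    private:
        std::string m_name;
    };
}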
#
# OVERRIDE TO STYLE: Comments wrap.
#
BasedOnStyle: LLVM
IndentWidth: 4
UseTab: Never
Language: Cpp
Standard: Cpp11
AccessModifierOffset: -4
AlignConsecutiveDeclarations: false
AlignConsecutiveAssignments: false
AlignTrailingComments: true
AllowShortBlocksOnASingleLine: true
AllowShortCaseLabelsOnASingleLine: true
AllowShortFunctionsOnASingleLine: Inline
AlwaysBreakBeforeMultilineStrings: true
AlwaysBreakTemplateDeclarations: true
BinPackArguments: false
BinPackParameters: false
BreakBeforeBraces: Allman
BreakConstructorInitializersBeforeComma: true
ColumnLimit: 100
#CommentPragmas: '.*'
IndentCaseLabels: false
IndentWrappedFunctionNames: true
KeepEmptyLinesAtTheStartOfBlocks: false
NamespaceIndentation: All
PointerAlignment: Left
SpaceAfterCStyleCast: false
SpaceBeforeAssignmentOperators: true
SpaceBeforeParens: ControlStatements
SpaceInEmptyParentheses: false
SpacesInAngles: false
SpacesInCStyleCastParentheses: false
SpacesInParentheses: false
SpacesInSquareBrackets: false
SortIncludes: false
ReflowComments: true
IncludeCategories:
- Regex: '^".*'
Priority: 3
- Regex: '^<.*'
Priority: 2
SortIncludes: true
......@@ -56,8 +56,8 @@ namespace ngraph
}
else
{
// Get the sizes of the dot axes. It's easiest to pull them from arg1 because they're
// right up front.
// Get the sizes of the dot axes. It's easiest to pull them from arg1
// because they're right up front.
Shape dot_axis_sizes(reduction_axes_count);
std::copy(arg1_shape.begin(),
arg1_shape.begin() + reduction_axes_count,
......@@ -67,7 +67,8 @@ namespace ngraph
CoordinateTransform arg1_transform(arg1_shape);
CoordinateTransform output_transform(out_shape);
// Create coordinate transforms for arg0 and arg1 that throw away the dotted axes.
// Create coordinate transforms for arg0 and arg1 that throw away the dotted
// axes.
size_t arg0_projected_rank = arg0_shape.size() - reduction_axes_count;
size_t arg1_projected_rank = arg1_shape.size() - reduction_axes_count;
......@@ -84,15 +85,16 @@ namespace ngraph
CoordinateTransform arg0_projected_transform(arg0_projected_shape);
CoordinateTransform arg1_projected_transform(arg1_projected_shape);
// Create a coordinate transform that allows us to iterate over all possible values
// for the dotted axes.
// Create a coordinate transform that allows us to iterate over all possible
// values for the dotted axes.
CoordinateTransform dot_axes_transform(dot_axis_sizes);
for (const Coordinate& arg0_projected_coord : arg0_projected_transform)
{
for (const Coordinate& arg1_projected_coord : arg1_projected_transform)
{
// The output coordinate is just the concatenation of the projected coordinates.
// The output coordinate is just the concatenation of the projected
// coordinates.
Coordinate out_coord(arg0_projected_coord.size() +
arg1_projected_coord.size());
......@@ -116,8 +118,9 @@ namespace ngraph
arg0_coord.begin());
for (const Coordinate& dot_axis_positions : dot_axes_transform)
{
// In order to find the points to multiply together, we need to inject our current
// positions along the dotted axes back into the projected arg0 and arg1 coordinates.
// In order to find the points to multiply together, we need to
// inject our current positions along the dotted axes back into
// the projected arg0 and arg1 coordinates.
std::copy(dot_axis_positions.begin(),
dot_axis_positions.end(),
arg0_it);
......
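As a standalone sketch (not the reference kernel above), the projected-coordinate scheme these comments describe, specialized to 2-D inputs with reduction_axes_count == 1, is just the familiar matrix product: the output coordinate (i, j) is the concatenation of arg0's projected coordinate (i) and arg1's projected coordinate (j), and the dotted-axis position p is injected back into both input coordinates.

#include <cstddef>
#include <vector>

// Minimal illustration of the dot kernel's coordinate scheme for the 2-D case
// (reduction_axes_count == 1): arg0 is m x k, arg1 is k x n, out is m x n, all row-major.
void dot_2d(const std::vector<float>& arg0,
            const std::vector<float>& arg1,
            std::vector<float>& out,
            std::size_t m,
            std::size_t k,
            std::size_t n)
{
    for (std::size_t i = 0; i < m; i++)         // projected arg0 coordinate
    {
        for (std::size_t j = 0; j < n; j++)     // projected arg1 coordinate
        {
            float sum = 0;
            for (std::size_t p = 0; p < k; p++) // position along the dotted axis
            {
                // Inject the dotted-axis position back into both input coordinates.
                sum += arg0[i * k + p] * arg1[p * n + j];
            }
            // The output coordinate (i, j) is the concatenation of the projected coordinates.
            out[i * n + j] = sum;
        }
    }
}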
#
# OVERRIDE TO STYLE: Comments wrap.
#
BasedOnStyle: LLVM
IndentWidth: 4
UseTab: Never
Language: Cpp
Standard: Cpp11
AccessModifierOffset: -4
AlignConsecutiveDeclarations: false
AlignConsecutiveAssignments: false
AlignTrailingComments: true
AllowShortBlocksOnASingleLine: true
AllowShortCaseLabelsOnASingleLine: true
AllowShortFunctionsOnASingleLine: Inline
AlwaysBreakBeforeMultilineStrings: true
AlwaysBreakTemplateDeclarations: true
BinPackArguments: false
BinPackParameters: false
BreakBeforeBraces: Allman
BreakConstructorInitializersBeforeComma: true
ColumnLimit: 100
#CommentPragmas: '.*'
IndentCaseLabels: false
IndentWrappedFunctionNames: true
KeepEmptyLinesAtTheStartOfBlocks: false
NamespaceIndentation: All
PointerAlignment: Left
SpaceAfterCStyleCast: false
SpaceBeforeAssignmentOperators: true
SpaceBeforeParens: ControlStatements
SpaceInEmptyParentheses: false
SpacesInAngles: false
SpacesInCStyleCastParentheses: false
SpacesInParentheses: false
SpacesInSquareBrackets: false
SortIncludes: false
ReflowComments: true
IncludeCategories:
- Regex: '^".*'
Priority: 3
- Regex: '^<.*'
Priority: 2
SortIncludes: true
......@@ -107,7 +107,7 @@ size_t runtime::gpu::CUDAEmitter::build_concat(const std::string& dtype,
// a launch primitive for it based on the input tensor shape
// but do not recompile the kernel. otherwise, do it all:
// recompile the kernel and then create the primitive
size_t split_input_size = 256; //max num of inputs fit 4KB parameter space: 256 * 8 + 7 * ?
size_t split_input_size = 256; // max num of inputs fit 4KB parameter space: 256 * 8 + 7 * ?
size_t residue = input_num % split_input_size;
std::stringstream kernel_name_1;
std::stringstream kernel_name_2;
......@@ -243,8 +243,9 @@ size_t runtime::gpu::CUDAEmitter::build_topk(const std::vector<element::Type>& d
// __device__ void set_value(float val){value = val;}
//
// };
// Based on the datatypes, the max size of the struct can be 16 bytes. Any arbitrary size of the struct can
// therefore be given by 'shared_struct_bytes' as calculated below, accounting for structure padding
// Based on the datatypes, the max size of the struct can be 16 bytes. Any arbitrary size of the
// struct can therefore be given by 'shared_struct_bytes' as calculated below, accounting for
// structure padding
size_t shared_struct_bytes = (((dtypes[0].size() + index_elem_type.size()) <= 8) ? 8 : 16);
size_t shared_data_bytes = num_cols * shared_struct_bytes;
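The rounding above reflects ordinary struct padding: a struct holding one value and one index is padded to the alignment of its largest member, so a 4-byte value paired with a 4-byte index costs 8 bytes, while any pairing that involves an 8-byte member costs 16. A small host-side check of that reasoning (the Entry template is a stand-in, not the actual kernel struct):

#include <cstdint>
#include <cstdio>

// Mirrors the kernel-side pair described in the comment above: one data value plus one index.
template <typename ValueT, typename IndexT>
struct Entry
{
    ValueT value;
    IndexT index;
};

int main()
{
    // 4-byte value + 4-byte index: 8 bytes, no padding needed.
    std::printf("float/int32 entry: %zu bytes\n", sizeof(Entry<float, int32_t>));
    // 8-byte value + 4-byte index: padded up to 16 bytes to keep 8-byte alignment.
    std::printf("double/int32 entry: %zu bytes\n", sizeof(Entry<double, int32_t>));
    return 0;
}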
......@@ -332,7 +333,7 @@ size_t runtime::gpu::CUDAEmitter::build_topk(const std::vector<element::Type>& d
1,
1,
shared_data_bytes, // shared mem
nullptr, //stream
nullptr, // stream
args_list,
nullptr)); // arguments
debug_sync();
......@@ -849,9 +850,9 @@ size_t runtime::gpu::CUDAEmitter::build_reshape_3d(const std::array<std::string,
std::vector<uint32_t> block_size(3, 0);
// TODO: currently we set it to 16, will add tuning method later
uint32_t block_size_x = 16;
block_size[0] = block_size_x; //x
block_size[2] = (input_order[2] == 0) ? block_size_x : 1; //z
block_size[1] = (block_size[2] == block_size_x) ? 1 : block_size_x; //y
block_size[0] = block_size_x; // x
block_size[2] = (input_order[2] == 0) ? block_size_x : 1; // z
block_size[1] = (block_size[2] == block_size_x) ? 1 : block_size_x; // y
uint32_t aligned_grid_size_x = align_to_block_size(input_shape[2], block_size[0]);
uint32_t aligned_grid_size_y = align_to_block_size(input_shape[1], block_size[1]);
uint32_t aligned_grid_size_z = align_to_block_size(input_shape[0], block_size[2]);
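A hedged sketch of what align_to_block_size is assumed to compute here: a ceiling division that turns an element count into the number of blocks needed along one grid dimension (the real helper may differ in details such as validation).

#include <cstdint>

// Assumed behaviour: number of block_size-wide blocks needed to cover total elements.
uint32_t align_to_block_size(uint32_t total, uint32_t block_size)
{
    return (total + block_size - 1) / block_size;
}

// Example: covering 100 elements with 16-wide blocks gives align_to_block_size(100, 16) == 7.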
......@@ -1571,7 +1572,7 @@ size_t runtime::gpu::CUDAEmitter::build_primitive(const op::MaxPool* node)
input_shape_padded =
runtime::gpu::get_padded_shape(input_shape, padding_below, padding_above, {});
padded_size = shape_size(input_shape_padded);
//currently we set this to floating point only; need to add other datatype support later
// currently we set this to floating point only; need to add other datatype support later
float pad_value = std::numeric_limits<float>::lowest();
std::vector<float> temp(padded_size, pad_value);
GPUAllocator allocator = m_primitive_emitter->get_memory_allocator();
......@@ -1609,7 +1610,8 @@ size_t runtime::gpu::CUDAEmitter::build_primitive(const op::MaxPool* node)
// std::vector<void*>{pad_buffer}.data());
// gpu::invoke_primitive(
// m_ctx, conv_index, std::vector<void*>{pad_buffer, inputs[1]}.data(), outputs);
// m_ctx, conv_index, std::vector<void*>{pad_buffer, inputs[1]}.data(),
// outputs);
void* pad_buffer = runtime::gpu::invoke_memory_primitive(m_ctx, idx_workspace);
gpu::invoke_primitive(m_ctx,
......@@ -1701,8 +1703,8 @@ size_t runtime::gpu::CUDAEmitter::build_softmax(const std::vector<element::Type>
}});
return this->m_primitive_emitter->register_primitive(memset, hash);
}
// if reduce does not include the last axis, this is a heuristic to choose by reduce axis for better cache
// a more accurate but slower way is to tune with the actual kernel
// if reduce does not include the last axis, this is a heuristic to choose by reduce axis for
// better cache. a more accurate but slower way is to tune with the actual kernel
else if (reduce_strides_in_input.back() != 1)
{
// TODO: currently we set it to 64, will add tuning method later
......@@ -1817,10 +1819,11 @@ size_t runtime::gpu::CUDAEmitter::build_reduce_to_nd(const std::vector<element::
const char* kernel)
{
std::vector<std::string> dtypes_str = get_string_vector(dtypes);
//if call from reduce, this is duplicated
// if call from reduce, this is duplicated
NVShape simplified_reduce_axis;
NVShape simplified_input_shape;
// simplified_reduce_axis will not be empty, since we checked if input size is same as output size in gpu_emitter
// simplified_reduce_axis will not be empty, since we checked if input size is same as output
// size in gpu_emitter
simplify_reduce_shape(input_shape, reduce_axis, simplified_input_shape, simplified_reduce_axis);
size_t rank = simplified_input_shape.size();
size_t reduce_rank = simplified_reduce_axis.size();
......@@ -2070,7 +2073,8 @@ size_t runtime::gpu::CUDAEmitter::build_reduce(const std::vector<element::Type>&
{
NVShape simplified_reduce_axis;
NVShape simplified_input_shape;
// simplified_reduce_axis will not be empty, since we checked if input size is same as output size in gpu_emitter
// simplified_reduce_axis will not be empty, since we checked if input size is same as output
// size in gpu_emitter
simplify_reduce_shape(input_shape, reduce_axis, simplified_input_shape, simplified_reduce_axis);
size_t rank = simplified_input_shape.size();
......@@ -2166,8 +2170,8 @@ size_t runtime::gpu::CUDAEmitter::build_reduce(const std::vector<element::Type>&
}
else
{
//if the data size is large, call reduce_to_scalar_acc first and then reduce_to_scalar.
//otherwise, call reduce to scalar directly.
// if the data size is large, call reduce_to_scalar_acc first and then reduce_to_scalar.
// otherwise, call reduce to scalar directly.
const uint32_t unroll_size = 8;
if (nthreads > nthreads_acc * (unroll_size + 1))
{
......@@ -2804,12 +2808,15 @@ size_t runtime::gpu::CUDAEmitter::build_convolution(const std::array<std::string
}
// launch arguments:
// each output pixel is its own block. if the batch size is greater than reg_tile_size * sm_tile_size, a single
// output pixel is spread over multiple blocks along the batch axis so that memory coordination is not required
// each block consists of 2 warps in an 8 x 8 array used for accessing the SM block of the GEMM
// each output pixel is its own block. if the batch size is greater than reg_tile_size *
// sm_tile_size, a single output pixel is spread over multiple blocks along the batch axis so
// that memory coordination is not required each block consists of 2 warps in an 8 x 8 array
// used for accessing the SM block of the GEMM
// do_i = output pixel coordinates
// grid = (do_1*do_2*...*do_N*ceil_div(N, REG_TILE_SIZE*SM_TILE_SIZE), ceil_div(K, REG_TILE_SIZE*SM_TILE_SIZE), 1)
// grid = (do_1*do_2*...*do_N*ceil_div(N, REG_TILE_SIZE*SM_TILE_SIZE),
// ceil_div(K, REG_TILE_SIZE*SM_TILE_SIZE),
// 1)
// block = (8, 8, 1)
dim3 blocks(output_pixels * idiv_ceil(N, reg_tile_size * sm_tile_size),
idiv_ceil(K, reg_tile_size * sm_tile_size),
......@@ -3060,7 +3067,7 @@ void* runtime::gpu::CUDAEmitter::get_init_reduce_val(std::string reduce_op, std:
}
else
{
//not defined.
// not defined.
throw std::runtime_error(data_type + "currently not supportted with init value.");
}
}
......
......@@ -228,9 +228,11 @@ namespace ngraph
NVShape input_shape,
const char* op,
const char* kernel);
/// \brief This is the preprocess for reduce to scalar if the data size is larger than a number.
/// \brief This is the preprocess for reduce to scalar if the data size is larger
/// than a number.
/// The number can be tuned based on hardware.
/// This cuda kernel will accumulate reduction to a certain number of bins depending on hardware.
/// This cuda kernel will accumulate reduction to a certain number of bins depending
/// on hardware.
size_t build_reduce_to_scalar_acc(const std::vector<element::Type>& dtypes,
NVShape input_shape,
NVShape output_shape,
......@@ -239,7 +241,8 @@ namespace ngraph
const char* kernel);
/// \brief Simplified reduce shape and reduce axis: remove dimensions of size 1,
/// combine two or more adjacent reduce/non-reduce axes.
/// The simplified reduce shape and reduce axis will make index calculation simpler in the cuda kernel.
/// The simplified reduce shape and reduce axis will make index calculation simpler
/// in the cuda kernel.
/// example:
/// {1 1 2 2} with reduce axis {3} simplifies to {2 2} with reduce_axis {1};
/// {2 3 4} with reduce axis {0 1} simplifies to {6 4} with reduce_axis {0};
......@@ -248,13 +251,15 @@ namespace ngraph
NVShape reduce_axis,
NVShape& simplified_shape,
NVShape& simplified_reduce_axis);
/// \brief Separate input_shape into reduced_shape and non_reduce_shape, and calculate strides for them
/// and strides in input. This helps calculate the input index and output index for the cuda kernel.
/// \brief Separate input_shape into reduced_shape and non_reduce_shape, and calculate
/// strides for them and strides in input. This helps calculate the input index and
/// output index for the cuda kernel.
/// example:
/// input_shape {2 3 4 5} with reduce_axis {0 2}:
/// input_strides: {60, 20, 5, 1}
/// reduce_shape {2 4}, reduce_strides {4 1}, reduce_strides_in_input {60 5}
/// non_reduce_shape {3 5}, non_reduce_strides {5 1}, non_reduce_strides_in_input {20 1}
/// non_reduce_shape {3 5}, non_reduce_strides {5 1}, non_reduce_strides_in_input
/// {20 1}
void get_reduce_strides(NVShape input_shape,
NVShape reduce_axis,
NVShape& non_reduce_shape,
......@@ -264,8 +269,8 @@ namespace ngraph
NVShape& reduce_strides,
NVShape& reduce_strides_in_input);
/// \brief Calculate magic and shift part of a shape vector (denominator), change divide to multiply
/// in cuda kernel.
/// \brief Calculate magic and shift part of a shape vector (denominator), change
/// divide to multiply in cuda kernel.
void div_to_mul(const NVShape& shape,
std::vector<int>& magic,
std::vector<int>& shift);
......
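A standalone sketch of the stride bookkeeping these declarations describe, using plain std::vector in place of NVShape and a std::set for the reduce axes; running it on the documented example, input_shape {2, 3, 4, 5} with reduce_axis {0, 2}, reproduces the strides listed above.

#include <cstddef>
#include <set>
#include <vector>

using Shape = std::vector<std::size_t>;

// Row-major strides: stride[i] is the number of elements spanned by one step along axis i.
static Shape row_major_strides(const Shape& shape)
{
    Shape strides(shape.size(), 1);
    for (std::size_t i = shape.size(); i-- > 1;)
    {
        strides[i - 1] = strides[i] * shape[i];
    }
    return strides;
}

// Split input_shape into reduce and non-reduce parts and compute strides for each, plus the
// corresponding strides measured in the original input.
static void get_reduce_strides_sketch(const Shape& input_shape,
                                      const std::set<std::size_t>& reduce_axis,
                                      Shape& non_reduce_shape,
                                      Shape& non_reduce_strides,
                                      Shape& non_reduce_strides_in_input,
                                      Shape& reduce_shape,
                                      Shape& reduce_strides,
                                      Shape& reduce_strides_in_input)
{
    Shape input_strides = row_major_strides(input_shape);
    for (std::size_t axis = 0; axis < input_shape.size(); axis++)
    {
        if (reduce_axis.count(axis))
        {
            reduce_shape.push_back(input_shape[axis]);
            reduce_strides_in_input.push_back(input_strides[axis]);
        }
        else
        {
            non_reduce_shape.push_back(input_shape[axis]);
            non_reduce_strides_in_input.push_back(input_strides[axis]);
        }
    }
    reduce_strides = row_major_strides(reduce_shape);
    non_reduce_strides = row_major_strides(non_reduce_shape);
}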
......@@ -27,8 +27,8 @@
#include <stdint.h>
#include <string>
//why use "do...while.."
//https://stackoverflow.com/questions/154136/why-use-apparently-meaningless-do-while-and-if-else-statements-in-macros
// why use "do...while.."
// https://stackoverflow.com/questions/154136/why-use-apparently-meaningless-do-while-and-if-else-statements-in-macros
#define NVRTC_SAFE_CALL_NO_THROW(x) \
do \
{ \
......
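The do { ... } while (0) idiom referenced above makes a multi-statement macro expand to exactly one statement, so it composes correctly with if/else. A minimal generic illustration (a made-up CHECK macro, not the actual NVRTC_SAFE_CALL body):

#include <cstdio>
#include <cstdlib>

// Wrapping the body in do { ... } while (0) makes CHECK(x); behave like a single statement.
// Without the wrapper, the semicolon after the expanded braces would end the if statement early
// and orphan the else below.
#define CHECK(cond)                                                                                \
    do                                                                                             \
    {                                                                                              \
        if (!(cond))                                                                               \
        {                                                                                          \
            std::fprintf(stderr, "check failed: %s\n", #cond);                                     \
            std::abort();                                                                          \
        }                                                                                          \
    } while (0)

int main()
{
    int device_count = 1;
    if (device_count > 0)
        CHECK(device_count > 0); // expands to one statement...
    else
        std::puts("no devices"); // ...so this else still binds to the if above
    return 0;
}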
......@@ -882,7 +882,7 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::MaxPool* node)
input_shape_padded =
runtime::gpu::get_padded_shape(input_shape, padding_below, padding_above, {});
padded_size = shape_size(input_shape_padded);
//currently we set this to floating point only; need to add other datatype support later
// currently we set this to floating point only; need to add other datatype support later
float pad_value = std::numeric_limits<float>::lowest();
std::vector<float> temp(padded_size, pad_value);
GPUAllocator allocator = m_primitive_emitter->get_memory_allocator();
......@@ -1125,7 +1125,8 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::gpu::Rnn* node)
sequence_lengths.data(),
pad_value));
// TO DO: with rnn projection layers the third dimension of the hidden_shape should be recProjSize
// TO DO: with rnn projection layers the third dimension of the hidden_shape should be
// recProjSize
cudnnTensorFormat_t format = CUDNN_TENSOR_NCHW;
uint32_t num_layers = node->get_num_fused_layers() * direction;
Shape hidden_shape{num_layers, batch_size, hidden_size};
......
......@@ -348,7 +348,7 @@ void runtime::gpu::CudaKernelBuilder::get_softmax_op(CodeWriter& writer,
writer << "uint32_t init_in_idx = in_idx;\n";
int64_t last_r_idx = static_cast<int64_t>(reduce_rank) - 1;
//find max
// find max
writer << data_types[1] << " r_max = in[init_in_idx];\n";
writer << data_types[1] << " input_i;\n";
......@@ -406,7 +406,7 @@ void runtime::gpu::CudaKernelBuilder::get_softmax_op(CodeWriter& writer,
}
writer.block_end();
//exp and sum , https://en.wikipedia.org/wiki/Kahan_summation_algorithm
// exp and sum , https://en.wikipedia.org/wiki/Kahan_summation_algorithm
writer << data_types[1] << " r_sum = 0;\n";
writer << data_types[1] << " c = 0;\n";
writer << data_types[1] << " y;\n";
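The Kahan (compensated) summation the comment links to is what the r_sum, c, and y variables above implement. A plain host-side C++ sketch of the same accumulation applied to the softmax exp-and-sum step (not the emitted CUDA):

#include <cmath>
#include <vector>

// Compensated (Kahan) summation of exp(x - x_max): c carries the low-order bits lost when y is
// added to the running sum, mirroring the r_sum / c / y variables emitted above.
float kahan_exp_sum(const std::vector<float>& values, float x_max)
{
    float r_sum = 0.0f;
    float c = 0.0f;
    for (float x : values)
    {
        float y = std::exp(x - x_max) - c;
        float t = r_sum + y;
        c = (t - r_sum) - y;
        r_sum = t;
    }
    return r_sum;
}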
......@@ -457,7 +457,7 @@ void runtime::gpu::CudaKernelBuilder::get_softmax_op(CodeWriter& writer,
}
writer.block_end();
//divide
// divide
writer.block_begin();
for (int64_t j = 0; j < last_r_idx; j++)
{
......@@ -638,7 +638,7 @@ void runtime::gpu::CudaKernelBuilder::get_softmax_block_reduce_op(
writer << "r_max = sdata[tid];\n";
}
writer.block_end();
//accumulate WARPSIZE threads
// accumulate WARPSIZE threads
for (int i = (WARPSIZE >> 1); i >= 1; i >>= 1)
{
if (num_of_warp > i)
......@@ -660,7 +660,7 @@ void runtime::gpu::CudaKernelBuilder::get_softmax_block_reduce_op(
writer << "__syncthreads();\n";
writer << "r_max = sdata[0];\n";
//exp and sum , https://en.wikipedia.org/wiki/Kahan_summation_algorithm
// exp and sum , https://en.wikipedia.org/wiki/Kahan_summation_algorithm
writer << data_types[1] << " r_sum = 0;\n";
writer << data_types[1] << " c = 0;\n";
writer << data_types[1] << " y;\n";
......@@ -718,7 +718,7 @@ void runtime::gpu::CudaKernelBuilder::get_softmax_block_reduce_op(
writer << "r_sum = sdata[tid];\n";
}
writer.block_end();
//accumulate WARPSIZE = 32 threads
// accumulate WARPSIZE = 32 threads
for (int i = (WARPSIZE >> 1); i >= 1; i >>= 1)
{
if (num_of_warp > i)
......@@ -771,7 +771,7 @@ void runtime::gpu::CudaKernelBuilder::get_softmax_block_reduce_op(
return;
}
//each thread calculate the whole reduction of one output
// each thread calculate the whole reduction of one output
void runtime::gpu::CudaKernelBuilder::get_reduce_to_nd_op(
CodeWriter& writer,
const std::string& name,
......@@ -920,7 +920,7 @@ void runtime::gpu::CudaKernelBuilder::get_reduce_to_scalar_op(
writer << "r = in[input_idx];\n";
writer << "input_idx += step;\n";
writer.block_end();
//accumulate reduction to blockDim.x threads
// accumulate reduction to blockDim.x threads
if (stable_sum)
{
writer << data_types[1] << " c = 0;\n";
......@@ -946,7 +946,7 @@ void runtime::gpu::CudaKernelBuilder::get_reduce_to_scalar_op(
}
writer.block_end();
//accumulate WARPSIZE threads for each warp
// accumulate WARPSIZE threads for each warp
for (int i = (WARPSIZE >> 1); i >= 1; i >>= 1)
{
if (block_size_x > i)
......@@ -976,7 +976,7 @@ void runtime::gpu::CudaKernelBuilder::get_reduce_to_scalar_op(
writer << "r = sdata[tid];\n";
}
writer.block_end();
//accumulate WARPSIZE threads
// accumulate WARPSIZE threads
for (int i = (WARPSIZE >> 1); i >= 1; i >>= 1)
{
if (num_of_warp > i)
......@@ -1034,7 +1034,7 @@ void runtime::gpu::CudaKernelBuilder::get_reduce_to_scalar_acc_op(
writer << "r = in[input_idx];\n";
writer << "input_idx += step;\n";
writer.block_end();
//accumulate reduction to step threads
// accumulate reduction to step threads
if (stable_sum)
{
writer << data_types[1] << " c = 0;\n";
......
......@@ -95,9 +95,11 @@ namespace ngraph
size_t non_reduce_rank,
size_t reduce_rank);
/// \brief This is the preprocess to reduce to scalar if the input data size is larger than a number.
/// \brief This is the preprocess to reduce to scalar if the input data size is
/// larger than a number.
/// The number can be tuned based on hardware.
/// This cuda kernel will accumulate reduction to a certain number of bins depending on hardware.
/// This cuda kernel will accumulate reduction to a certain number of bins depending
/// on hardware.
/// stable kahan sum is used for floating point sums.
/// no initial value needed since we load one input value as initial
/// does not support 0-sized input
......@@ -221,8 +223,9 @@ namespace ngraph
size_t rank,
bool register_arguments = false);
/// \brief Given kernel input variables i_* produce register variables o_coordinates{i}
/// of the non-reduced tensor and return the string name of integer index into reduced tensor
/// \brief Given kernel input variables i_* produce register variables
/// o_coordinates{i} of the non-reduced tensor and return the string name of
/// integer index into reduced tensor
static std::string
collective_coordinate_transform_helper(CodeWriter& writer,
std::string i_thread_index,
......
......@@ -1043,7 +1043,7 @@ std::string runtime::gpu::GPU_Emitter::emit_Reshape(EMIT_ARGS)
auto input_order = reshape->get_input_order();
size_t result_shape_product = shape_size(result_shape);
//for a zero-size tensor, or change from 1^m shape to 1^n shape, just do a copy
// for a zero-size tensor, or change from 1^m shape to 1^n shape, just do a copy
if (!reshape->get_is_transpose() || result_shape_product < 2)
{
auto& host_emitter = compiled_function->get_primitive_emitter()->get_host_emitter();
......@@ -1052,7 +1052,7 @@ std::string runtime::gpu::GPU_Emitter::emit_Reshape(EMIT_ARGS)
return compiled_function->add_to_runtime(index, function_name, args, out);
}
//combine in-order dimensions after reorder in shape, update output shape and input order
// combine in-order dimensions after reorder in shape, update output shape and input order
Shape in_order_map(arg_rank, 0);
for (int i = 0; i < arg_rank - 1; i++)
{
......@@ -1089,7 +1089,7 @@ std::string runtime::gpu::GPU_Emitter::emit_Reshape(EMIT_ARGS)
}
}
//eliminate dimensions of size 1, update input order and output shape
// eliminate dimensions of size 1, update input order and output shape
Shape new_arg_shape;
Shape new_result_shape;
Shape new_idx_map(combine_rank, 0);
......
......@@ -463,9 +463,9 @@ void runtime::gpu::GPUExternalFunction::emit_functions()
m_variable_name_map[tv->get_name()] = ss.str();
auto res = dynamic_pointer_cast<ngraph::op::Result>(op);
//keep assigning different outputs to a result descriptor
//op::Result emitter will check if in and out descriptors are the same
//and skip a copy
// keep assigning different outputs to a result descriptor
// op::Result emitter will check if in and out descriptors are the same
// and skip a copy
auto input_node = res->get_inputs().at(0).get_output().get_node();
if (!input_node->is_constant() && !input_node->is_parameter())
{
......
......@@ -155,7 +155,8 @@ std::string runtime::gpu::GPUInternalFunction::add_to_runtime(
{
primitive_invocation = [args, out, primitive_index](GPUCallFrame& call_frame,
GPURuntimeContext* ctx) mutable {
// here, these inputs and outputs could be any of [constant, input, output, intermediate]
// here, these inputs and outputs could be any of [constant, input, output,
// intermediate]
auto inputs = call_frame.get_tensor_io(args);
auto outputs = call_frame.get_tensor_io(out);
runtime::gpu::invoke_primitive(ctx, primitive_index, inputs.data(), outputs.data());
......@@ -165,7 +166,8 @@ std::string runtime::gpu::GPUInternalFunction::add_to_runtime(
{
primitive_invocation = [this, args, out, primitive_index](GPUCallFrame& call_frame,
GPURuntimeContext* ctx) mutable {
// here, these inputs and outputs could be any of [constant, input, output, intermediate]
// here, these inputs and outputs could be any of [constant, input, output,
// intermediate]
auto inputs = call_frame.get_tensor_io(args);
auto outputs = call_frame.get_tensor_io(out);
*m_trace << "(";
......@@ -277,9 +279,9 @@ void runtime::gpu::GPUInternalFunction::build_functions()
m_variable_name_map[tv->get_name()] = std::make_tuple(TensorRole::OUTPUT, i, ss.str());
auto res = dynamic_pointer_cast<ngraph::op::Result>(op);
//keep assigning different outputs to a result descriptor
//op::Result emitter will check if in and out descriptors are the same
//and skip a copy
// keep assigning different outputs to a result descriptor
// op::Result emitter will check if in and out descriptors are the same
// and skip a copy
auto input_node = res->get_inputs().at(0).get_output().get_node();
if (!input_node->is_constant() && !input_node->is_parameter())
{
......@@ -288,7 +290,7 @@ void runtime::gpu::GPUInternalFunction::build_functions()
auto output_name = ss.str();
m_variable_name_map[itv->get_name()] =
std::make_tuple(TensorRole::OUTPUT, i, ss.str());
//propagate_in_place_output(&(res->get_inputs().at(0).get_output()), output_name);
// propagate_in_place_output(&(res->get_inputs().at(0).get_output()), output_name);
}
}
......
......@@ -43,7 +43,8 @@ namespace ngraph
GPUKernelArgs(const GPUKernelArgs& args);
//
// Add a placeholder parameter for a tensor pointer which will be resolved at runtime.
// Add a placeholder parameter for a tensor pointer which will be resolved at
// runtime.
//
GPUKernelArgs& add_placeholder(const std::string& type, const std::string& name);
......
......@@ -25,11 +25,12 @@ namespace ngraph
{
namespace gpu
{
// This is the RNN op, which is formed by the fusion of multiple RNN cells (LSTM/GRU/vanilla RNN)
// across multiple time slices
// This is the RNN op, which is formed by the fusion of multiple RNN cells (LSTM/GRU/
// vanilla RNN) across multiple time slices
// INPUTS:
// [0] - {X0, X1...., Xt} input tensor of layout TNC, Shape{num_fused_layers*batch_size, feature_size}
// [0] - {X0, X1...., Xt} input tensor of layout TNC, Shape{num_fused_layers*batch_size,
// feature_size}
// [1] - recurrent input tensor ht_1 of Shape{sequence length*batch_size, feature_size}
// [2] - flat parameter tensor consisting of weights and biases for each layer
// {W_x^0 | W_h^0 | W_x^1 | W_h^1 | ... | B_x^0 | B_h^0 | B_x^1 | B_h^1 }
......@@ -39,12 +40,16 @@ namespace ngraph
// src_sequence_length - this will be same as number_of_timesteps
// src_layer_feature_size - feature size w.r.to input tensor
// src_iter_feature_size - feature size w.r.to hidden state
// num_cell_states - number of recurrent state tensor states , LSTM = 2, GRU = 1, vanilla RNN = 1
// num_cell_states - number of recurrent state tensor states , LSTM = 2, GRU = 1,
// vanilla RNN = 1
// OUTPUT VALUE: A tuple with the following structure:
// [0] - ht, sequence-wise output tensor with shape (sequence_length*batch_size, feature_size) .
// [1] - hf, layer-wise output tensor with shape (num_fused_layers*batch_size, feature_size) .
// [2] - ct output cell state tensor with the same shape as states i.e (sequence_length*batch_size, feature_size)
// [0] - ht, sequence-wise output tensor with shape (sequence_length*batch_size,
// feature_size) .
// [1] - hf, layer-wise output tensor with shape (num_fused_layers*batch_size,
// feature_size) .
// [2] - ct output cell state tensor with the same shape as states i.e
// (sequence_length*batch_size, feature_size)
class Rnn : public Op
{
......
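As a quick worked example of the output shapes documented above, with purely illustrative values (sequence_length = 10, batch_size = 32, num_fused_layers = 1, feature_size = 100):

#include <cstddef>
#include <cstdio>

int main()
{
    // Illustrative values only; not taken from any particular model.
    std::size_t sequence_length = 10;
    std::size_t batch_size = 32;
    std::size_t num_fused_layers = 1;
    std::size_t feature_size = 100;

    // [0] ht: sequence-wise output, {sequence_length * batch_size, feature_size} = {320, 100}
    std::printf("ht: {%zu, %zu}\n", sequence_length * batch_size, feature_size);
    // [1] hf: layer-wise output, {num_fused_layers * batch_size, feature_size} = {32, 100}
    std::printf("hf: {%zu, %zu}\n", num_fused_layers * batch_size, feature_size);
    // [2] ct: cell state, same shape as the sequence-wise output per the description above
    std::printf("ct: {%zu, %zu}\n", sequence_length * batch_size, feature_size);
    return 0;
}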
......@@ -34,7 +34,8 @@ bool ngraph::runtime::gpu::pass::BatchNormCache::run_on_function(
{
if (auto bnbp = std::dynamic_pointer_cast<op::BatchNormTrainingBackprop>(n))
{
// batch norm bprop annotations are used to indicate if variance is in inverse stddev format
// batch norm bprop annotations are used to indicate if variance is in inverse stddev
// format
auto op_annotations =
std::make_shared<ngraph::runtime::gpu::BatchNormBackpropAnnotations>();
......
......@@ -57,7 +57,7 @@
using namespace ngraph;
void ngraph::runtime::gpu::pass::LSTMFusion::construct_sigmoid()
{
//construct variance
// construct variance
auto input = std::make_shared<pattern::op::Label>(element::f32, Shape{3, 4});
auto neg_input = std::make_shared<op::Negative>(input);
auto exp_neg_input = std::make_shared<op::Exp>(neg_input);
......@@ -69,7 +69,7 @@ void ngraph::runtime::gpu::pass::LSTMFusion::construct_sigmoid()
auto add_exp = std::make_shared<op::Add>(exp_neg_input, broadcast_constant);
auto divide_1_over_exp = std::make_shared<op::Divide>(broadcast_constant, add_exp);
//Define a call back that needs to called once the DFG matches the pattern
// Define a call back that needs to called once the DFG matches the pattern
auto callback = [input](pattern::Matcher& m) {
NGRAPH_DEBUG << "In a callback for construct_fprop_sigmoid pattern against "
<< m.get_match_root()->get_name();
......@@ -153,7 +153,7 @@ void ngraph::runtime::gpu::pass::LSTMFusion::construct_lstm_fprop()
auto input_slice_0 = std::make_shared<op::Slice>(X, Coordinate{0, 0}, Coordinate{10, 100});
auto forget_gate = std::make_shared<op::Sigmoid>(input_slice_0);
//ct-1 -> cell state
// ct-1 -> cell state
auto ct_1 = std::make_shared<pattern::op::Label>(element::f32, Shape{10, 100});
auto multiply_forget_gate_ct_1 = std::make_shared<op::Multiply>(forget_gate, ct_1);
......@@ -176,7 +176,7 @@ void ngraph::runtime::gpu::pass::LSTMFusion::construct_lstm_fprop()
auto ht = std::make_shared<op::Multiply>(output_gate, tanh_2);
auto ht_label = std::make_shared<pattern::op::Label>(ht, nullptr, NodeVector{ht});
//Define a call back that needs to called once the DFG matches the pattern
// Define a call back that needs to called once the DFG matches the pattern
auto callback = [ct_label,
input_xt,
weights_i2h,
......@@ -211,8 +211,8 @@ void ngraph::runtime::gpu::pass::LSTMFusion::construct_lstm_fprop()
RETURN_IF_FALSE(bias_i2h->get_shape().size() == 1 && bias_h2h->get_shape().size() == 1,
"Bias should have rank of 1 for Rnn op");
// Determine which is ht_1 and which is xt. But if both xt and ht_1 have the same shape we need to capture this
// reliably in the RNN fusion.
// Determine which is ht_1 and which is xt. But if both xt and ht_1 have the same shape we need to
// capture this reliably in the RNN fusion.
std::shared_ptr<op::gpu::Rnn> lstm = nullptr;
bool intermediate_lstm = false;
if (std::dynamic_pointer_cast<op::GetOutputElement>(pattern_map[ct_1]))
......@@ -411,7 +411,8 @@ void ngraph::runtime::gpu::pass::RNNFusion::construct_rnn_lstm_fprop()
if (std::dynamic_pointer_cast<op::Broadcast>(xt_node_array[xt_node_array.size() - 1]) &&
std::dynamic_pointer_cast<op::Constant>(
xt_node_array[xt_node_array.size() - 1]->get_argument(0)))
// here xt is determined to be the hidden (recurrent) input data and so ht is the feedforward input
// here xt is determined to be the hidden (recurrent) input data and so ht is the
// feedforward input
{
// concatenate the sequence inputs for a given layer
std::vector<std::shared_ptr<pattern::op::Label>> src_layer_labels{ht_1};
......@@ -425,7 +426,8 @@ void ngraph::runtime::gpu::pass::RNNFusion::construct_rnn_lstm_fprop()
hidden_ht_array[hidden_ht_array.size() - 1]) &&
std::dynamic_pointer_cast<op::Constant>(
hidden_ht_array[hidden_ht_array.size() - 1]->get_argument(0)))
// here ht is determined to be the hidden (recurrent) input data and so xt is the feedforward input
// here ht is determined to be the hidden (recurrent) input data and so xt is the
// feedforward input
{
std::vector<std::shared_ptr<pattern::op::Label>> src_layer_labels{xt};
src_layer = compute_rnn_args(src_layer_labels, m, true);
......@@ -502,10 +504,11 @@ void ngraph::runtime::gpu::pass::RNNFusion::construct_rnn_lstm_fprop()
auto layer_rnn_ht = std::make_shared<op::GetOutputElement>(rnn, 1);
auto layer_rnn_ct = std::make_shared<op::GetOutputElement>(rnn, 2);
//slice the rnn ht's
// slice the rnn ht's
size_t start_index = 0;
size_t end_index = batch_size;
// capture the slices in the reverse order, so it corresponds to the lstm_goes order captured by the Pattern matcher
// capture the slices in the reverse order, so it corresponds to the lstm_goes order captured by
// the Pattern matcher
for (size_t i = 0; i < num_of_lstm_matched; i++)
{
ht_slice_per_timestep[i] = (std::make_shared<op::Slice>(
......@@ -574,7 +577,7 @@ void ngraph::runtime::gpu::pass::RNNFusion::construct_rnn_lstm_fprop()
}
}
//now go through the lstm goe_0 consumers and replace them with the slice
// now go through the lstm goe_0 consumers and replace them with the slice
for (auto& node : lstm_goe0_user)
{
for (size_t i = 0; i < node->get_input_size(); i++)
......
#
# OVERRIDE TO STYLE: Comments wrap.
#
BasedOnStyle: LLVM
IndentWidth: 4
UseTab: Never
Language: Cpp
Standard: Cpp11
AccessModifierOffset: -4
AlignConsecutiveDeclarations: false
AlignConsecutiveAssignments: false
AlignTrailingComments: true
AllowShortBlocksOnASingleLine: true
AllowShortCaseLabelsOnASingleLine: true
AllowShortFunctionsOnASingleLine: Inline
AlwaysBreakBeforeMultilineStrings: true
AlwaysBreakTemplateDeclarations: true
BinPackArguments: false
BinPackParameters: false
BreakBeforeBraces: Allman
BreakConstructorInitializersBeforeComma: true
ColumnLimit: 100
#CommentPragmas: '.*'
IndentCaseLabels: false
IndentWrappedFunctionNames: true
KeepEmptyLinesAtTheStartOfBlocks: false
NamespaceIndentation: All
PointerAlignment: Left
SpaceAfterCStyleCast: false
SpaceBeforeAssignmentOperators: true
SpaceBeforeParens: ControlStatements
SpaceInEmptyParentheses: false
SpacesInAngles: false
SpacesInCStyleCastParentheses: false
SpacesInParentheses: false
SpacesInSquareBrackets: false
SortIncludes: false
ReflowComments: true
IncludeCategories:
- Regex: '^".*'
Priority: 3
- Regex: '^<.*'
Priority: 2
SortIncludes: true
#
# OVERRIDE TO STYLE: Comments wrap.
#
BasedOnStyle: LLVM
IndentWidth: 4
UseTab: Never
Language: Cpp
Standard: Cpp11
AccessModifierOffset: -4
AlignConsecutiveDeclarations: false
AlignConsecutiveAssignments: false
AlignTrailingComments: true
AllowShortBlocksOnASingleLine: true
AllowShortCaseLabelsOnASingleLine: true
AllowShortFunctionsOnASingleLine: Inline
AlwaysBreakBeforeMultilineStrings: true
AlwaysBreakTemplateDeclarations: true
BinPackArguments: false
BinPackParameters: false
BreakBeforeBraces: Allman
BreakConstructorInitializersBeforeComma: true
ColumnLimit: 100
#CommentPragmas: '.*'
IndentCaseLabels: false
IndentWrappedFunctionNames: true
KeepEmptyLinesAtTheStartOfBlocks: false
NamespaceIndentation: All
PointerAlignment: Left
SpaceAfterCStyleCast: false
SpaceBeforeAssignmentOperators: true
SpaceBeforeParens: ControlStatements
SpaceInEmptyParentheses: false
SpacesInAngles: false
SpacesInCStyleCastParentheses: false
SpacesInParentheses: false
SpacesInSquareBrackets: false
SortIncludes: false
ReflowComments: true
IncludeCategories:
- Regex: '^".*'
Priority: 3
- Regex: '^<.*'
Priority: 2
SortIncludes: true
......@@ -1817,7 +1817,8 @@ shared_ptr<runtime::Executable>
if ((pad_below.at(0) == pad_above.at(0)) && (pad_below.at(1) == pad_above.at(1)))
{
// symmetric padding case temporarily excluded (custom kernel executed) due to stability issues
// symmetric padding case temporarily excluded (custom kernel executed) due to
// stability issues
const CoordinateDiff& pad_below_for = conv_op->get_padding_below_forward();
input_offset_xy = -pad_below_for.at(0);
}
......
......@@ -289,7 +289,7 @@ static CustomKernels::krnl_info do_create_variance_back(const string& output_nam
writer.block_begin();
{ // Main function body
gws.push_back(1); //input_shape.at(0));
gws.push_back(1); // input_shape.at(0));
// Channel axis loop
writer << "\nconst uint i" << channel_axis << " = get_global_id(" << channel_axis
<< "); /* channel_axis trip count " << input_shape.at(channel_axis) << "*/\n";
......
#
# OVERRIDE TO STYLE: Comments wrap.
#
BasedOnStyle: LLVM
IndentWidth: 4
UseTab: Never
Language: Cpp
Standard: Cpp11
AccessModifierOffset: -4
AlignConsecutiveDeclarations: false
AlignConsecutiveAssignments: false
AlignTrailingComments: true
AllowShortBlocksOnASingleLine: true
AllowShortCaseLabelsOnASingleLine: true
AllowShortFunctionsOnASingleLine: Inline
AlwaysBreakBeforeMultilineStrings: true
AlwaysBreakTemplateDeclarations: true
BinPackArguments: false
BinPackParameters: false
BreakBeforeBraces: Allman
BreakConstructorInitializersBeforeComma: true
ColumnLimit: 100
#CommentPragmas: '.*'
IndentCaseLabels: false
IndentWrappedFunctionNames: true
KeepEmptyLinesAtTheStartOfBlocks: false
NamespaceIndentation: All
PointerAlignment: Left
SpaceAfterCStyleCast: false
SpaceBeforeAssignmentOperators: true
SpaceBeforeParens: ControlStatements
SpaceInEmptyParentheses: false
SpacesInAngles: false
SpacesInCStyleCastParentheses: false
SpacesInParentheses: false
SpacesInSquareBrackets: false
SortIncludes: false
ReflowComments: true
IncludeCategories:
- Regex: '^".*'
Priority: 3
- Regex: '^<.*'
Priority: 2
SortIncludes: true
......@@ -38,7 +38,8 @@
// INTERPRETER_LOCAL is used for non-api symbols.
// #ifdef INTERPRETER_DLL // defined if INTERPRETER is compiled as a DLL
#ifdef INTERPRETER_DLL_EXPORTS // defined if we are building the INTERPRETER DLL (instead of using it)
#ifdef INTERPRETER_DLL_EXPORTS // defined if we are building the INTERPRETER DLL (instead of using
// it)
#define INTERPRETER_API INTERPRETER_HELPER_DLL_EXPORT
#else
#define INTERPRETER_API INTERPRETER_HELPER_DLL_IMPORT
......
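The INTERPRETER_API / INTERPRETER_LOCAL macros follow the usual shared-library symbol-visibility pattern. A generic, hedged sketch of that pattern with simplified MYLIB names (not the exact nGraph header):

// Generic visibility helpers: __declspec on Windows, ELF visibility attributes elsewhere.
#if defined(_WIN32) || defined(__CYGWIN__)
#define MYLIB_HELPER_DLL_EXPORT __declspec(dllexport)
#define MYLIB_HELPER_DLL_IMPORT __declspec(dllimport)
#define MYLIB_HELPER_DLL_LOCAL
#else
#define MYLIB_HELPER_DLL_EXPORT __attribute__((visibility("default")))
#define MYLIB_HELPER_DLL_IMPORT __attribute__((visibility("default")))
#define MYLIB_HELPER_DLL_LOCAL __attribute__((visibility("hidden")))
#endif

#ifdef MYLIB_DLL_EXPORTS // defined only while building the library itself
#define MYLIB_API MYLIB_HELPER_DLL_EXPORT
#else // consumers of the library import the symbols instead
#define MYLIB_API MYLIB_HELPER_DLL_IMPORT
#endif
#define MYLIB_LOCAL MYLIB_HELPER_DLL_LOCAL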
#
# OVERRIDE TO STYLE: Comments wrap.
#
BasedOnStyle: LLVM
IndentWidth: 4
UseTab: Never
Language: Cpp
Standard: Cpp11
AccessModifierOffset: -4
AlignConsecutiveDeclarations: false
AlignConsecutiveAssignments: false
AlignTrailingComments: true
AllowShortBlocksOnASingleLine: true
AllowShortCaseLabelsOnASingleLine: true
AllowShortFunctionsOnASingleLine: Inline
AlwaysBreakBeforeMultilineStrings: true
AlwaysBreakTemplateDeclarations: true
BinPackArguments: false
BinPackParameters: false
BreakBeforeBraces: Allman
BreakConstructorInitializersBeforeComma: true
ColumnLimit: 100
#CommentPragmas: '.*'
IndentCaseLabels: false
IndentWrappedFunctionNames: true
KeepEmptyLinesAtTheStartOfBlocks: false
NamespaceIndentation: All
PointerAlignment: Left
SpaceAfterCStyleCast: false
SpaceBeforeAssignmentOperators: true
SpaceBeforeParens: ControlStatements
SpaceInEmptyParentheses: false
SpacesInAngles: false
SpacesInCStyleCastParentheses: false
SpacesInParentheses: false
SpacesInSquareBrackets: false
SortIncludes: false
ReflowComments: true
IncludeCategories:
- Regex: '^".*'
Priority: 3
- Regex: '^<.*'
Priority: 2
SortIncludes: true
#
# OVERRIDE TO STYLE: Comments wrap.
#
BasedOnStyle: LLVM
IndentWidth: 4
UseTab: Never
Language: Cpp
Standard: Cpp11
AccessModifierOffset: -4
AlignConsecutiveDeclarations: false
AlignConsecutiveAssignments: false
AlignTrailingComments: true
AllowShortBlocksOnASingleLine: true
AllowShortCaseLabelsOnASingleLine: true
AllowShortFunctionsOnASingleLine: Inline
AlwaysBreakBeforeMultilineStrings: true
AlwaysBreakTemplateDeclarations: true
BinPackArguments: false
BinPackParameters: false
BreakBeforeBraces: Allman
BreakConstructorInitializersBeforeComma: true
ColumnLimit: 100
#CommentPragmas: '.*'
IndentCaseLabels: false
IndentWrappedFunctionNames: true
KeepEmptyLinesAtTheStartOfBlocks: false
NamespaceIndentation: All
PointerAlignment: Left
SpaceAfterCStyleCast: false
SpaceBeforeAssignmentOperators: true
SpaceBeforeParens: ControlStatements
SpaceInEmptyParentheses: false
SpacesInAngles: false
SpacesInCStyleCastParentheses: false
SpacesInParentheses: false
SpacesInSquareBrackets: false
SortIncludes: false
ReflowComments: true
IncludeCategories:
- Regex: '^".*'
Priority: 3
- Regex: '^<.*'
Priority: 2
SortIncludes: true
......@@ -32,13 +32,16 @@
#endif
#endif
// Now we use the generic helper definitions above to define PLAIDML_BACKEND_API and PLAIDML_BACKEND_LOCAL.
// Now we use the generic helper definitions above to define PLAIDML_BACKEND_API and
// PLAIDML_BACKEND_LOCAL.
//
// PLAIDML_BACKEND_API is used for the public API symbols. It either DLL imports or DLL exports
// (or does nothing for static build)
// PLAIDML_BACKEND_LOCAL is used for non-api symbols.
// #ifdef PLAIDML_BACKEND_DLL // defined if PLAIDML_BACKEND is compiled as a DLL
#ifdef PLAIDML_BACKEND_DLL_EXPORTS // defined if we are building the PLAIDML_BACKEND DLL (instead of using it)
#ifdef PLAIDML_BACKEND_DLL_EXPORTS // defined if we are building the PLAIDML_BACKEND DLL (instead of
// using it)
#define PLAIDML_BACKEND_API PLAIDML_BACKEND_HELPER_DLL_EXPORT
#else
#define PLAIDML_BACKEND_API PLAIDML_BACKEND_HELPER_DLL_IMPORT
......
......@@ -50,6 +50,7 @@ public:
private:
std::mutex m_mu;
// N.B. The key here is the original source function, *not* the copy that's been processed by the compilation passes.
// N.B. The key here is the original source function, *not* the copy that's been processed by
// the compilation passes.
std::unordered_map<std::shared_ptr<Function>, std::shared_ptr<PlaidML_Executable>> m_cache;
};
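The cache described by this comment is a mutex-guarded map keyed on the original source function. A hedged sketch of the lookup-or-compile pattern it implies, with simplified stand-in types (not the actual PlaidML backend code):

#include <memory>
#include <mutex>
#include <unordered_map>

struct Function {};   // stand-in for ngraph::Function
struct Executable {}; // stand-in for PlaidML_Executable

class CompilationCache
{
public:
    template <typename Compiler>
    std::shared_ptr<Executable> get_or_compile(const std::shared_ptr<Function>& source,
                                               Compiler compile)
    {
        std::lock_guard<std::mutex> lock(m_mu);
        auto it = m_cache.find(source);
        if (it != m_cache.end())
        {
            return it->second;
        }
        // Key on the original source function, not on any pass-processed copy of it.
        auto exe = compile(source);
        m_cache.emplace(source, exe);
        return exe;
    }

private:
    std::mutex m_mu;
    std::unordered_map<std::shared_ptr<Function>, std::shared_ptr<Executable>> m_cache;
};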
......@@ -28,9 +28,10 @@ namespace ngraph
{
namespace op
{
// Implements NumPy-style broadcast semantics by passing its single argument through to its
// output and pretending that this changes the shape. The creator of this node is responsible
// for ensuring that the downstream operation will perform a NumPy-style broadcast.
// Implements NumPy-style broadcast semantics by passing its single argument through
// to its output and pretending that this changes the shape. The creator of this
// node is responsible for ensuring that the downstream operation will perform a
// NumPy-style broadcast.
class ImplicitBroadcast;
}
}
......
......@@ -75,8 +75,8 @@ namespace ngraph
}
}))
.set(builder::ContractionInput{"I"}.add_indices("d", 0, dim_limit)))
.add( // Compare the input against the (broadcasted) max values, and select the indices
// where the max val occurs
.add( // Compare the input against the (broadcasted) max values, and select
// the indices where the max val occurs
builder::Elementwise{"SelValIdxs",
"I == SelVal ? index(I, " + reduction_axis_str +
") : D" + reduction_axis_str})
......
......@@ -52,8 +52,8 @@ ngraph::runtime::plaidml::pass::ImplicitBroadcast::ImplicitBroadcast()
if (src_shape.size())
{
// Create a reshape operation to get the right target broadcast shape. (Note that a zero-D tensor
// or constant can be passed directly into the ImplicitBroadcast op).
// Create a reshape operation to get the right target broadcast shape. (Note that a
// zero-D tensor or constant can be passed directly into the ImplicitBroadcast op).
AxisVector reshape_order;
Shape reshape_shape;
std::size_t input_dim = 0;
......@@ -76,9 +76,9 @@ ngraph::runtime::plaidml::pass::ImplicitBroadcast::ImplicitBroadcast()
auto implicit_broadcast =
std::make_shared<plaidml::op::ImplicitBroadcast>(src, broadcast->get_shape());
// N.B. We don't use replace_node() here, since it's important to only replace the broadcast with an
// implicit broadcast when the consuming operation is an elementwise operation, since PlaidML
// contractions don't provide implicit broadcast semantics.
// N.B. We don't use replace_node() here, since it's important to only replace the broadcast
// with an implicit broadcast when the consuming operation is an elementwise operation,
// since PlaidML contractions don't provide implicit broadcast semantics.
bool result = false;
for (size_t i = 0; i < broadcast->get_output_size(); ++i)
{
......