Commit e83c2ffa authored by Adam Procter, committed by Scott Cyphers

clang-format: Combine several already-approved PRs (#3488)

* Enable new clang-format comment rules for src/ngraph/runtime/{dynamic,generic_cpu,gpuh,interpreter,nop}

* Update clang-format comment rules for GPU BE

* Update clang-format comment rules for src/ngraph/runtime/plaidml

* Update clang-format comment rules for src/ngraph/runtime/intelgpu

* Fix a couple of demangles I missed
parent 7133540a
#
# OVERRIDE TO STYLE: Comments wrap.
#
BasedOnStyle: LLVM
IndentWidth: 4
UseTab: Never
Language: Cpp
Standard: Cpp11
AccessModifierOffset: -4
AlignConsecutiveDeclarations: false
AlignConsecutiveAssignments: false
AlignTrailingComments: true
AllowShortBlocksOnASingleLine: true
AllowShortCaseLabelsOnASingleLine: true
AllowShortFunctionsOnASingleLine: Inline
AlwaysBreakBeforeMultilineStrings: true
AlwaysBreakTemplateDeclarations: true
BinPackArguments: false
BinPackParameters: false
BreakBeforeBraces: Allman
BreakConstructorInitializersBeforeComma: true
ColumnLimit: 100
#CommentPragmas: '.*'
IndentCaseLabels: false
IndentWrappedFunctionNames: true
KeepEmptyLinesAtTheStartOfBlocks: false
NamespaceIndentation: All
PointerAlignment: Left
SpaceAfterCStyleCast: false
SpaceBeforeAssignmentOperators: true
SpaceBeforeParens: ControlStatements
SpaceInEmptyParentheses: false
SpacesInAngles: false
SpacesInCStyleCastParentheses: false
SpacesInParentheses: false
SpacesInSquareBrackets: false
SortIncludes: false
ReflowComments: true
IncludeCategories:
    - Regex: '^".*'
      Priority: 3
    - Regex: '^<.*'
      Priority: 2
SortIncludes: true
#
# OVERRIDE TO STYLE: Comments wrap.
#
BasedOnStyle: LLVM
IndentWidth: 4
UseTab: Never
Language: Cpp
Standard: Cpp11
AccessModifierOffset: -4
AlignConsecutiveDeclarations: false
AlignConsecutiveAssignments: false
AlignTrailingComments: true
AllowShortBlocksOnASingleLine: true
AllowShortCaseLabelsOnASingleLine: true
AllowShortFunctionsOnASingleLine: Inline
AlwaysBreakBeforeMultilineStrings: true
AlwaysBreakTemplateDeclarations: true
BinPackArguments: false
BinPackParameters: false
BreakBeforeBraces: Allman
BreakConstructorInitializersBeforeComma: true
ColumnLimit: 100
#CommentPragmas: '.*'
IndentCaseLabels: false
IndentWrappedFunctionNames: true
KeepEmptyLinesAtTheStartOfBlocks: false
NamespaceIndentation: All
PointerAlignment: Left
SpaceAfterCStyleCast: false
SpaceBeforeAssignmentOperators: true
SpaceBeforeParens: ControlStatements
SpaceInEmptyParentheses: false
SpacesInAngles: false
SpacesInCStyleCastParentheses: false
SpacesInParentheses: false
SpacesInSquareBrackets: false
SortIncludes: false
ReflowComments: true
IncludeCategories:
    - Regex: '^".*'
      Priority: 3
    - Regex: '^<.*'
      Priority: 2
SortIncludes: true
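The only settings that matter for the hunks that follow are ColumnLimit: 100 and ReflowComments: true: clang-format rebreaks any comment that, counting its indentation, overruns 100 columns, and leaves the code itself alone. As a quick illustration, drawn from the first hunk below (this page strips the original indentation, so the lengths are approximate):

// Before: with its real indentation, the first comment line overruns the 100-column limit.
//     // Get the sizes of the dot axes. It's easiest to pull them from arg1 because they're
//     // right up front.
//
// After: clang-format rewraps the text at the limit, keeping the "// " prefix on each line.
//     // Get the sizes of the dot axes. It's easiest to pull them from arg1
//     // because they're right up front.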
@@ -56,8 +56,8 @@ namespace ngraph
}
else
{
-// Get the sizes of the dot axes. It's easiest to pull them from arg1 because they're
-// right up front.
+// Get the sizes of the dot axes. It's easiest to pull them from arg1
+// because they're right up front.
Shape dot_axis_sizes(reduction_axes_count);
std::copy(arg1_shape.begin(),
arg1_shape.begin() + reduction_axes_count,
@@ -67,7 +67,8 @@ namespace ngraph
CoordinateTransform arg1_transform(arg1_shape);
CoordinateTransform output_transform(out_shape);
-// Create coordinate transforms for arg0 and arg1 that throw away the dotted axes.
+// Create coordinate transforms for arg0 and arg1 that throw away the dotted
+// axes.
size_t arg0_projected_rank = arg0_shape.size() - reduction_axes_count;
size_t arg1_projected_rank = arg1_shape.size() - reduction_axes_count;
@@ -84,15 +85,16 @@ namespace ngraph
CoordinateTransform arg0_projected_transform(arg0_projected_shape);
CoordinateTransform arg1_projected_transform(arg1_projected_shape);
-// Create a coordinate transform that allows us to iterate over all possible values
-// for the dotted axes.
+// Create a coordinate transform that allows us to iterate over all possible
+// values for the dotted axes.
CoordinateTransform dot_axes_transform(dot_axis_sizes);
for (const Coordinate& arg0_projected_coord : arg0_projected_transform)
{
for (const Coordinate& arg1_projected_coord : arg1_projected_transform)
{
-// The output coordinate is just the concatenation of the projected coordinates.
+// The output coordinate is just the concatenation of the projected
+// coordinates.
Coordinate out_coord(arg0_projected_coord.size() +
arg1_projected_coord.size());
@@ -116,8 +118,9 @@ namespace ngraph
arg0_coord.begin());
for (const Coordinate& dot_axis_positions : dot_axes_transform)
{
-// In order to find the points to multiply together, we need to inject our current
-// positions along the dotted axes back into the projected arg0 and arg1 coordinates.
+// In order to find the points to multiply together, we need to
+// inject our current positions along the dotted axes back into
+// the projected arg0 and arg1 coordinates.
std::copy(dot_axis_positions.begin(),
dot_axis_positions.end(),
arg0_it);
...
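For readers unfamiliar with the reference kernel being re-commented above: the output coordinate of the generalized dot is the concatenation of the projected (non-dotted) coordinates of the two arguments, and the dotted axes are iterated separately and injected back into both input coordinates. A minimal standalone sketch of that idea, specialized to ordinary 2-D row-major matrices with one reduction axis (an illustration only, not the ngraph reference implementation):

#include <cstddef>
#include <vector>

// Plain 2-D dot (matrix multiply): the output coordinate {i, j} is the concatenation of the
// projected arg0 coordinate {i} and the projected arg1 coordinate {j}; k walks the dotted axis
// and is injected back into both input coordinates.
std::vector<float> dot_2d(const std::vector<float>& arg0, // row-major, shape {m, n}
                          const std::vector<float>& arg1, // row-major, shape {n, p}
                          std::size_t m,
                          std::size_t n,
                          std::size_t p)
{
    std::vector<float> out(m * p, 0.0f);
    for (std::size_t i = 0; i < m; ++i)         // projected arg0 coordinate
    {
        for (std::size_t j = 0; j < p; ++j)     // projected arg1 coordinate
        {
            float sum = 0.0f;
            for (std::size_t k = 0; k < n; ++k) // position along the dotted axis
            {
                sum += arg0[i * n + k] * arg1[k * p + j];
            }
            out[i * p + j] = sum;               // output coordinate = {i} followed by {j}
        }
    }
    return out;
}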
#
# OVERRIDE TO STYLE: Comments wrap.
#
BasedOnStyle: LLVM
IndentWidth: 4
UseTab: Never
Language: Cpp
Standard: Cpp11
AccessModifierOffset: -4
AlignConsecutiveDeclarations: false
AlignConsecutiveAssignments: false
AlignTrailingComments: true
AllowShortBlocksOnASingleLine: true
AllowShortCaseLabelsOnASingleLine: true
AllowShortFunctionsOnASingleLine: Inline
AlwaysBreakBeforeMultilineStrings: true
AlwaysBreakTemplateDeclarations: true
BinPackArguments: false
BinPackParameters: false
BreakBeforeBraces: Allman
BreakConstructorInitializersBeforeComma: true
ColumnLimit: 100
#CommentPragmas: '.*'
IndentCaseLabels: false
IndentWrappedFunctionNames: true
KeepEmptyLinesAtTheStartOfBlocks: false
NamespaceIndentation: All
PointerAlignment: Left
SpaceAfterCStyleCast: false
SpaceBeforeAssignmentOperators: true
SpaceBeforeParens: ControlStatements
SpaceInEmptyParentheses: false
SpacesInAngles: false
SpacesInCStyleCastParentheses: false
SpacesInParentheses: false
SpacesInSquareBrackets: false
SortIncludes: false
ReflowComments: true
IncludeCategories:
    - Regex: '^".*'
      Priority: 3
    - Regex: '^<.*'
      Priority: 2
SortIncludes: true
@@ -107,7 +107,7 @@ size_t runtime::gpu::CUDAEmitter::build_concat(const std::string& dtype,
// a launch primitive for it based on the input tensor shape
// but do not recompile the kernel. otherwise, do it all:
// recompile the kernel and then create the primutive
-size_t split_input_size = 256; //max num of inputs fit 4KB parameter space: 256 * 8 + 7 * ?
+size_t split_input_size = 256; // max num of inputs fit 4KB parameter space: 256 * 8 + 7 * ?
size_t residue = input_num % split_input_size;
std::stringstream kernel_name_1;
std::stringstream kernel_name_2;
@@ -243,8 +243,9 @@ size_t runtime::gpu::CUDAEmitter::build_topk(const std::vector<element::Type>& d
// __device__ void set_value(float val){value = val;}
//
// };
-// Based on the datatypes, the max size of the struct can be 16 bytes. Any arbitrary size of the struct can
-// therfore be given by 'shared_struct_bytes' as calculated below accounting for structure padding
+// Based on the datatypes, the max size of the struct can be 16 bytes. Any arbitrary size of the
+// struct can therfore be given by 'shared_struct_bytes' as calculated below accounting for
+// structure padding
size_t shared_struct_bytes = (((dtypes[0].size() + index_elem_type.size()) <= 8) ? 8 : 16);
size_t shared_data_bytes = num_cols * shared_struct_bytes;
@@ -332,7 +333,7 @@ size_t runtime::gpu::CUDAEmitter::build_topk(const std::vector<element::Type>& d
1,
1,
shared_data_bytes, // shared mem
-nullptr, //stream
+nullptr, // stream
args_list,
nullptr)); // arguments
debug_sync();
@@ -849,9 +850,9 @@ size_t runtime::gpu::CUDAEmitter::build_reshape_3d(const std::array<std::string,
std::vector<uint32_t> block_size(3, 0);
// TODO: currently we set it to 16, will add tuning method later
uint32_t block_size_x = 16;
-block_size[0] = block_size_x; //x
-block_size[2] = (input_order[2] == 0) ? block_size_x : 1; //z
-block_size[1] = (block_size[2] == block_size_x) ? 1 : block_size_x; //y
+block_size[0] = block_size_x; // x
+block_size[2] = (input_order[2] == 0) ? block_size_x : 1; // z
+block_size[1] = (block_size[2] == block_size_x) ? 1 : block_size_x; // y
uint32_t aligned_grid_size_x = align_to_block_size(input_shape[2], block_size[0]);
uint32_t aligned_grid_size_y = align_to_block_size(input_shape[1], block_size[1]);
uint32_t aligned_grid_size_z = align_to_block_size(input_shape[0], block_size[2]);
@@ -1571,7 +1572,7 @@ size_t runtime::gpu::CUDAEmitter::build_primitive(const op::MaxPool* node)
input_shape_padded =
runtime::gpu::get_padded_shape(input_shape, padding_below, padding_above, {});
padded_size = shape_size(input_shape_padded);
-//currntly we set this to float point only, need to add other datatype support later
+// currntly we set this to float point only, need to add other datatype support later
float pad_value = std::numeric_limits<float>::lowest();
std::vector<float> temp(padded_size, pad_value);
GPUAllocator allocator = m_primitive_emitter->get_memory_allocator();
@@ -1609,7 +1610,8 @@ size_t runtime::gpu::CUDAEmitter::build_primitive(const op::MaxPool* node)
// std::vector<void*>{pad_buffer}.data());
// gpu::invoke_primitive(
-// m_ctx, conv_index, std::vector<void*>{pad_buffer, inputs[1]}.data(), outputs);
+// m_ctx, conv_index, std::vector<void*>{pad_buffer, inputs[1]}.data(),
+// outputs);
void* pad_buffer = runtime::gpu::invoke_memory_primitive(m_ctx, idx_workspace);
gpu::invoke_primitive(m_ctx,
@@ -1701,8 +1703,8 @@ size_t runtime::gpu::CUDAEmitter::build_softmax(const std::vector<element::Type>
}});
return this->m_primitive_emitter->register_primitive(memset, hash);
}
-// if reduce not include last axis, this is a heuristic to choose by reduce axis for better cache
-// a more accurate but slow way is to tune with actual kernel
+// if reduce not include last axis, this is a heuristic to choose by reduce axis for better
+// cache. a more accurate but slow way is to tune with actual kernel
else if (reduce_strides_in_input.back() != 1)
{
// TODO: currently we set it to 64, will add tuning method later
@@ -1817,10 +1819,11 @@ size_t runtime::gpu::CUDAEmitter::build_reduce_to_nd(const std::vector<element::
const char* kernel)
{
std::vector<std::string> dtypes_str = get_string_vector(dtypes);
-//if call from reduce, this is duplicated
+// if call from reduce, this is duplicated
NVShape simplified_reduce_axis;
NVShape simplified_input_shape;
-// simplified_reduce_axis will not be empty, since we checked if input size is same as output size in gpu_emitter
+// simplified_reduce_axis will not be empty, since we checked if input size is same as output
+// size in gpu_emitter
simplify_reduce_shape(input_shape, reduce_axis, simplified_input_shape, simplified_reduce_axis);
size_t rank = simplified_input_shape.size();
size_t reduce_rank = simplified_reduce_axis.size();
@@ -2070,7 +2073,8 @@ size_t runtime::gpu::CUDAEmitter::build_reduce(const std::vector<element::Type>&
{
NVShape simplified_reduce_axis;
NVShape simplified_input_shape;
-// simplified_reduce_axis will not be empty, since we checked if input size is same as output size in gpu_emitter
+// simplified_reduce_axis will not be empty, since we checked if input size is same as output
+// size in gpu_emitter
simplify_reduce_shape(input_shape, reduce_axis, simplified_input_shape, simplified_reduce_axis);
size_t rank = simplified_input_shape.size();
@@ -2166,8 +2170,8 @@ size_t runtime::gpu::CUDAEmitter::build_reduce(const std::vector<element::Type>&
}
else
{
-//if the data size is large, call reduce_to_scalar_acc first and then reduce_to_scalar.
-//other wise, call reduce to scalar directly.
+// if the data size is large, call reduce_to_scalar_acc first and then reduce_to_scalar.
+// other wise, call reduce to scalar directly.
const uint32_t unroll_size = 8;
if (nthreads > nthreads_acc * (unroll_size + 1))
{
@@ -2804,12 +2808,15 @@ size_t runtime::gpu::CUDAEmitter::build_convolution(const std::array<std::string
}
// launch arguments:
-// each output pixel is its own block. if the batch size is greater than reg_tile_size * sm_tile_size, a single
-// output pixel is spread over multiple blocks along the batch axis so that memory coordination is not required
-// each block consists of 2 warps in an 8 x 8 array used for accessing the SM block of the GEMM
+// each output pixel is its own block. if the batch size is greater than reg_tile_size *
+// sm_tile_size, a single output pixel is spread over multiple blocks along the batch axis so
+// that memory coordination is not required each block consists of 2 warps in an 8 x 8 array
+// used for accessing the SM block of the GEMM
// do_i = output pixel coordinates
-// grid = (do_1*do_2*...*do_N*ceil_div(N, REG_TILE_SIZE*SM_TILE_SIZE), ceil_div(K, REG_TILE_SIZE*SM_TILE_SIZE), 1)
+// grid = (do_1*do_2*...*do_N*ceil_div(N, REG_TILE_SIZE*SM_TILE_SIZE),
+// ceil_div(K, REG_TILE_SIZE*SM_TILE_SIZE),
+// 1)
// block = (8, 8, 1)
dim3 blocks(output_pixels * idiv_ceil(N, reg_tile_size * sm_tile_size),
idiv_ceil(K, reg_tile_size * sm_tile_size),
@@ -3060,7 +3067,7 @@ void* runtime::gpu::CUDAEmitter::get_init_reduce_val(std::string reduce_op, std:
}
else
{
-//not defined.
+// not defined.
throw std::runtime_error(data_type + "currently not supportted with init value.");
}
}
...
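The build_convolution comment above spells out the launch geometry: grid = (do_1*...*do_N * ceil_div(N, REG_TILE_SIZE*SM_TILE_SIZE), ceil_div(K, REG_TILE_SIZE*SM_TILE_SIZE), 1) with an 8 x 8 block. A small standalone sketch of that arithmetic follows; conv_grid and GridDims are made-up names, and idiv_ceil is re-derived here as plain ceiling division rather than taken from the ngraph sources:

#include <cstdint>

// Ceiling division, the role idiv_ceil plays in the hunk above (assumed behaviour).
static uint32_t idiv_ceil(uint32_t a, uint32_t b)
{
    return (a + b - 1) / b;
}

struct GridDims // hypothetical holder for the three grid extents
{
    uint32_t x, y, z;
};

// grid = (do_1*...*do_N * ceil_div(N, tile), ceil_div(K, tile), 1), block = (8, 8, 1):
// every output pixel owns a block, and large batch (N) / output-channel (K) extents are
// split across additional blocks in tiles of reg_tile_size * sm_tile_size.
GridDims conv_grid(uint32_t output_pixels, // do_1 * do_2 * ... * do_N
                   uint32_t N,             // batch size
                   uint32_t K,             // output channels
                   uint32_t reg_tile_size,
                   uint32_t sm_tile_size)
{
    const uint32_t tile = reg_tile_size * sm_tile_size;
    return GridDims{output_pixels * idiv_ceil(N, tile), idiv_ceil(K, tile), 1};
}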
@@ -228,9 +228,11 @@ namespace ngraph
NVShape input_shape,
const char* op,
const char* kernel);
-/// \brief This is the preprocess for reduce to scalar if the data size is large than a number.
+/// \brief This is the preprocess for reduce to scalar if the data size is large
+/// than a number.
/// The number can be tuned based on hardware.
-/// This cuda kernel will accumulate reduction to a certain number of bins depends on hardware.
+/// This cuda kernel will accumulate reduction to a certain number of bins depends
+/// on hardware.
size_t build_reduce_to_scalar_acc(const std::vector<element::Type>& dtypes,
NVShape input_shape,
NVShape output_shape,
@@ -239,7 +241,8 @@ namespace ngraph
const char* kernel);
/// \brief Simplifed reduce shape and reduce axis, remove dimsion size 1,
/// combine two or more adjacent reduce/nonreduce axis.
-/// the simplified reduce shape and reduce axis will make index caculation simplier in cuda kernel.
+/// the simplified reduce shape and reduce axis will make index caculation simplier
+/// in cuda kernel.
/// example:
/// {1 1 2 2} with reduce axis {3} simplifiy to: {2 2} with reduce_axis {1};
/// {2 3 4} with reduce axis {0 1} simplify to {6 4} with reduce_axis {0};
@@ -248,13 +251,15 @@ namespace ngraph
NVShape reduce_axis,
NVShape& simplified_shape,
NVShape& simplified_reduce_axis);
-/// \brief Seperate input_shape to reduced_shape and non_reduce_shape, and calcuate strides for them
-/// and strides in input. This help caculate input index and output index for cuda kernel.
+/// \brief Seperate input_shape to reduced_shape and non_reduce_shape, and calcuate
+/// strides for them and strides in input. This help caculate input index and
+/// output index for cuda kernel.
/// example:
/// input_shape {2 3 4 5} with reduce_axis {0 2}:
/// input_strides: {60, 20, 5, 1}
/// reduce_shape {2 4}, reduce_strides {4 1}, reduce_strides_in_input {60 5}
-/// non_reduce_shape {3 5}, non_reduce_strides {5 1}, non_reduce_strides_in_input {20 1}
+/// non_reduce_shape {3 5}, non_reduce_strides {5 1}, non_reduce_strides_in_input
+/// {20 1}
void get_reduce_strides(NVShape input_shape,
NVShape reduce_axis,
NVShape& non_reduce_shape,
@@ -264,8 +269,8 @@ namespace ngraph
NVShape& reduce_strides,
NVShape& reduce_strides_in_input);
-/// \brief Calculate magic and shift part of an shape vector (denomitor), change divide to multiply
-/// in cuda kernel.
+/// \brief Calculate magic and shift part of an shape vector (denomitor), change
+/// divide to multiply in cuda kernel.
void div_to_mul(const NVShape& shape,
std::vector<int>& magic,
std::vector<int>& shift);
...
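The doc comments above already give a worked example for get_reduce_strides (input_shape {2 3 4 5}, reduce_axis {0 2}). The following standalone sketch reproduces just that stride bookkeeping; split_reduce_strides and row_major_strides are illustrative names rather than the ngraph helpers, and the strides of the reduce/non-reduce shapes themselves are computed the same way and omitted here:

#include <cstddef>
#include <set>
#include <vector>

using Shape = std::vector<std::size_t>;

// Row-major strides: strides[i] is the product of all dimensions to the right of i,
// so {2, 3, 4, 5} -> {60, 20, 5, 1}.
Shape row_major_strides(const Shape& shape)
{
    Shape strides(shape.size(), 1);
    for (std::size_t i = shape.size(); i-- > 1;)
    {
        strides[i - 1] = strides[i] * shape[i];
    }
    return strides;
}

// Split the input strides by whether each axis is reduced. For {2, 3, 4, 5} with reduce
// axes {0, 2} this yields reduce_strides_in_input {60, 5} and non_reduce_strides_in_input
// {20, 1}, matching the example in the header above.
void split_reduce_strides(const Shape& input_shape,
                          const std::set<std::size_t>& reduce_axis,
                          Shape& reduce_strides_in_input,
                          Shape& non_reduce_strides_in_input)
{
    const Shape input_strides = row_major_strides(input_shape);
    for (std::size_t axis = 0; axis < input_shape.size(); ++axis)
    {
        if (reduce_axis.count(axis) != 0)
        {
            reduce_strides_in_input.push_back(input_strides[axis]);
        }
        else
        {
            non_reduce_strides_in_input.push_back(input_strides[axis]);
        }
    }
}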
@@ -27,8 +27,8 @@
#include <stdint.h>
#include <string>
-//why use "do...while.."
-//https://stackoverflow.com/questions/154136/why-use-apparently-meaningless-do-while-and-if-else-statements-in-macros
+// why use "do...while.."
+// https://stackoverflow.com/questions/154136/why-use-apparently-meaningless-do-while-and-if-else-statements-in-macros
#define NVRTC_SAFE_CALL_NO_THROW(x) \
do \
{ \
...
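The Stack Overflow link in that comment explains the do { ... } while (0) idiom used by NVRTC_SAFE_CALL_NO_THROW. A short illustration of why it matters; LOG_TWICE is a made-up macro used only for the example, not the real NVRTC wrapper body:

#include <cstdio>

// Without the do/while, a multi-statement macro expansion breaks when the macro is used as
// the single statement of an if/else.
#define LOG_TWICE_BAD(msg)                                                                         \
    std::puts(msg);                                                                                \
    std::puts(msg)

#define LOG_TWICE_GOOD(msg)                                                                        \
    do                                                                                             \
    {                                                                                              \
        std::puts(msg);                                                                            \
        std::puts(msg);                                                                            \
    } while (0)

int main()
{
    bool verbose = false;
    // With LOG_TWICE_BAD here, the second puts() would escape the if and always run, and the
    // `else` branch would not even compile. The do/while(0) form behaves like a single
    // statement and still requires the trailing semicolon.
    if (verbose)
        LOG_TWICE_GOOD("hello");
    else
        std::puts("quiet");
    return 0;
}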
@@ -882,7 +882,7 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::MaxPool* node)
input_shape_padded =
runtime::gpu::get_padded_shape(input_shape, padding_below, padding_above, {});
padded_size = shape_size(input_shape_padded);
-//currntly we set this to float point only, need to add other datatype support later
+// currntly we set this to float point only, need to add other datatype support later
float pad_value = std::numeric_limits<float>::lowest();
std::vector<float> temp(padded_size, pad_value);
GPUAllocator allocator = m_primitive_emitter->get_memory_allocator();
@@ -1125,7 +1125,8 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::gpu::Rnn* node)
sequence_lengths.data(),
pad_value));
-// TO DO: with rnn projection layers the third dimension of the hidden_shape should be recProjSize
+// TO DO: with rnn projection layers the third dimension of the hidden_shape should be
+// recProjSize
cudnnTensorFormat_t format = CUDNN_TENSOR_NCHW;
uint32_t num_layers = node->get_num_fused_layers() * direction;
Shape hidden_shape{num_layers, batch_size, hidden_size};
...
@@ -348,7 +348,7 @@ void runtime::gpu::CudaKernelBuilder::get_softmax_op(CodeWriter& writer,
writer << "uint32_t init_in_idx = in_idx;\n";
int64_t last_r_idx = static_cast<int64_t>(reduce_rank) - 1;
-//find max
+// find max
writer << data_types[1] << " r_max = in[init_in_idx];\n";
writer << data_types[1] << " input_i;\n";
@@ -406,7 +406,7 @@ void runtime::gpu::CudaKernelBuilder::get_softmax_op(CodeWriter& writer,
}
writer.block_end();
-//exp and sum , https://en.wikipedia.org/wiki/Kahan_summation_algorithm
+// exp and sum , https://en.wikipedia.org/wiki/Kahan_summation_algorithm
writer << data_types[1] << " r_sum = 0;\n";
writer << data_types[1] << " c = 0;\n";
writer << data_types[1] << " y;\n";
@@ -457,7 +457,7 @@ void runtime::gpu::CudaKernelBuilder::get_softmax_op(CodeWriter& writer,
}
writer.block_end();
-//divide
+// divide
writer.block_begin();
for (int64_t j = 0; j < last_r_idx; j++)
{
@@ -638,7 +638,7 @@ void runtime::gpu::CudaKernelBuilder::get_softmax_block_reduce_op(
writer << "r_max = sdata[tid];\n";
}
writer.block_end();
-//accumulate WARPSIZE threads
+// accumulate WARPSIZE threads
for (int i = (WARPSIZE >> 1); i >= 1; i >>= 1)
{
if (num_of_warp > i)
@@ -660,7 +660,7 @@ void runtime::gpu::CudaKernelBuilder::get_softmax_block_reduce_op(
writer << "__syncthreads();\n";
writer << "r_max = sdata[0];\n";
-//exp and sum , https://en.wikipedia.org/wiki/Kahan_summation_algorithm
+// exp and sum , https://en.wikipedia.org/wiki/Kahan_summation_algorithm
writer << data_types[1] << " r_sum = 0;\n";
writer << data_types[1] << " c = 0;\n";
writer << data_types[1] << " y;\n";
@@ -718,7 +718,7 @@ void runtime::gpu::CudaKernelBuilder::get_softmax_block_reduce_op(
writer << "r_sum = sdata[tid];\n";
}
writer.block_end();
-//accumulate WARPSIZE = 32 threads
+// accumulate WARPSIZE = 32 threads
for (int i = (WARPSIZE >> 1); i >= 1; i >>= 1)
{
if (num_of_warp > i)
@@ -771,7 +771,7 @@ void runtime::gpu::CudaKernelBuilder::get_softmax_block_reduce_op(
return;
}
-//each thread calculate the whole reduction of one output
+// each thread calculate the whole reduction of one output
void runtime::gpu::CudaKernelBuilder::get_reduce_to_nd_op(
CodeWriter& writer,
const std::string& name,
@@ -920,7 +920,7 @@ void runtime::gpu::CudaKernelBuilder::get_reduce_to_scalar_op(
writer << "r = in[input_idx];\n";
writer << "input_idx += step;\n";
writer.block_end();
-//accumulate reduction to blockDim.x threads
+// accumulate reduction to blockDim.x threads
if (stable_sum)
{
writer << data_types[1] << " c = 0;\n";
@@ -946,7 +946,7 @@ void runtime::gpu::CudaKernelBuilder::get_reduce_to_scalar_op(
}
writer.block_end();
-//accumulate WARPSIZE threads for each warp
+// accumulate WARPSIZE threads for each warp
for (int i = (WARPSIZE >> 1); i >= 1; i >>= 1)
{
if (block_size_x > i)
@@ -976,7 +976,7 @@ void runtime::gpu::CudaKernelBuilder::get_reduce_to_scalar_op(
writer << "r = sdata[tid];\n";
}
writer.block_end();
-//accumulate WARPSIZE threads
+// accumulate WARPSIZE threads
for (int i = (WARPSIZE >> 1); i >= 1; i >>= 1)
{
if (num_of_warp > i)
@@ -1034,7 +1034,7 @@ void runtime::gpu::CudaKernelBuilder::get_reduce_to_scalar_acc_op(
writer << "r = in[input_idx];\n";
writer << "input_idx += step;\n";
writer.block_end();
-//accumulate reduction to step threads
+// accumulate reduction to step threads
if (stable_sum)
{
writer << data_types[1] << " c = 0;\n";
...
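Both softmax generators comment their exp-and-sum step with a link to Kahan summation; the emitted kernels carry the running sum in r_sum with compensation c and scratch y. A small CPU sketch of that compensated sum, using the same variable names (this mirrors the textbook algorithm, not the generated CUDA):

#include <cmath>
#include <vector>

// Kahan (compensated) summation of exp(x - max), as used in the softmax sum step above.
float kahan_exp_sum(const std::vector<float>& in, float r_max)
{
    float r_sum = 0.0f; // running sum
    float c = 0.0f;     // running compensation for lost low-order bits
    for (float input_i : in)
    {
        float y = std::exp(input_i - r_max) - c; // subtract the max first for stability
        float t = r_sum + y;                     // low-order bits of y are lost here...
        c = (t - r_sum) - y;                     // ...and recovered into c
        r_sum = t;
    }
    return r_sum;
}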
@@ -95,9 +95,11 @@ namespace ngraph
size_t non_reduce_rank,
size_t reduce_rank);
-/// \brief This is the preprocess to reduce to scalar if the input data size is large than a number.
+/// \brief This is the preprocess to reduce to scalar if the input data size is
+/// large than a number.
/// The number can be tuned based on hardware.
-/// This cuda kernel will accumulate reduction to a certain number of bins depends on hardware.
+/// This cuda kernel will accumulate reduction to a certain number of bins depends
+/// on hardware.
/// stable kahan sum is been used for float point sum.
/// no initial value needed since we load one input value as initial
/// not support 0 sized input
@@ -221,8 +223,9 @@ namespace ngraph
size_t rank,
bool register_arguments = false);
-/// \brief Given kernel input variables i_* produce register variables o_coordinates{i}
-/// of the non-reduced tensor and return the string name of integer index into reduced tensor
+/// \brief Given kernel input variables i_* produce register variables
+/// o_coordinates{i} of the non-reduced tensor and return the string name of
+/// integer index into reduced tensor
static std::string
collective_coordinate_transform_helper(CodeWriter& writer,
std::string i_thread_index,
...
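build_reduce_to_scalar_acc is documented above as a preprocessing step that accumulates a large input into a fixed number of bins, which the final reduce-to-scalar kernel then collapses. A CPU sketch of that two-stage shape (reduce_to_scalar_two_stage is a made-up name; the real kernels run the first stage with one thread per bin and use the Kahan sum mentioned above):

#include <cstddef>
#include <vector>

// Stage 1: strided accumulation into num_bins partial sums (one per GPU thread).
// Stage 2: reduce the bins to the final scalar.
float reduce_to_scalar_two_stage(const std::vector<float>& in, std::size_t num_bins)
{
    std::vector<float> bins(num_bins, 0.0f);
    for (std::size_t tid = 0; tid < num_bins; ++tid)
    {
        // "Thread" tid sums elements tid, tid + num_bins, tid + 2 * num_bins, ...
        for (std::size_t i = tid; i < in.size(); i += num_bins)
        {
            bins[tid] += in[i];
        }
    }
    float result = 0.0f;
    for (float b : bins) // the final reduce-to-scalar pass over the bins
    {
        result += b;
    }
    return result;
}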
@@ -1043,7 +1043,7 @@ std::string runtime::gpu::GPU_Emitter::emit_Reshape(EMIT_ARGS)
auto input_order = reshape->get_input_order();
size_t result_shape_product = shape_size(result_shape);
-//for a zero-size tensor, or change from 1^m shape to 1^n shape, just do a copy
+// for a zero-size tensor, or change from 1^m shape to 1^n shape, just do a copy
if (!reshape->get_is_transpose() || result_shape_product < 2)
{
auto& host_emitter = compiled_function->get_primitive_emitter()->get_host_emitter();
@@ -1052,7 +1052,7 @@ std::string runtime::gpu::GPU_Emitter::emit_Reshape(EMIT_ARGS)
return compiled_function->add_to_runtime(index, function_name, args, out);
}
-//combine inordered dimensons after reorder in shape, update output shape and input order
+// combine inordered dimensons after reorder in shape, update output shape and input order
Shape in_order_map(arg_rank, 0);
for (int i = 0; i < arg_rank - 1; i++)
{
@@ -1089,7 +1089,7 @@ std::string runtime::gpu::GPU_Emitter::emit_Reshape(EMIT_ARGS)
}
}
-//eleminate dimenson size = 1, update input order and output shape
+// eleminate dimenson size = 1, update input order and output shape
Shape new_arg_shape;
Shape new_result_shape;
Shape new_idx_map(combine_rank, 0);
...
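emit_Reshape simplifies a transposing reshape before generating a kernel: it combines dimensions that stay in order and eliminates dimensions of size 1. A sketch of just the size-1 elimination step; drop_unit_dims is a hypothetical helper rather than the ngraph code, and the in-order combining step is omitted:

#include <cstddef>
#include <vector>

using Shape = std::vector<std::size_t>;

// Size-1 axes contribute nothing to a transpose, so they can be removed from the input shape
// and the permutation renumbered to refer only to the surviving axes.
void drop_unit_dims(const Shape& arg_shape,
                    const Shape& input_order, // permutation producing the output
                    Shape& new_arg_shape,
                    Shape& new_input_order)
{
    // Map each kept axis of arg_shape to its new, compacted index.
    std::vector<std::size_t> new_index(arg_shape.size(), 0);
    std::size_t next = 0;
    for (std::size_t i = 0; i < arg_shape.size(); ++i)
    {
        if (arg_shape[i] != 1)
        {
            new_index[i] = next++;
            new_arg_shape.push_back(arg_shape[i]);
        }
    }
    // Keep only the permutation entries that refer to surviving axes, renumbered.
    for (std::size_t axis : input_order)
    {
        if (arg_shape[axis] != 1)
        {
            new_input_order.push_back(new_index[axis]);
        }
    }
}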
@@ -463,9 +463,9 @@ void runtime::gpu::GPUExternalFunction::emit_functions()
m_variable_name_map[tv->get_name()] = ss.str();
auto res = dynamic_pointer_cast<ngraph::op::Result>(op);
-//keep assigning different outputs to a result descriptor
-//op::Result emitter will check if in and out descriptors are the same
-//and skip a copy
+// keep assigning different outputs to a result descriptor
+// op::Result emitter will check if in and out descriptors are the same
+// and skip a copy
auto input_node = res->get_inputs().at(0).get_output().get_node();
if (!input_node->is_constant() && !input_node->is_parameter())
{
...
@@ -155,7 +155,8 @@ std::string runtime::gpu::GPUInternalFunction::add_to_runtime(
{
primitive_invocation = [args, out, primitive_index](GPUCallFrame& call_frame,
GPURuntimeContext* ctx) mutable {
-// here, these inputs and outputs could be any of [constant, input, output, intermediate]
+// here, these inputs and outputs could be any of [constant, input, output,
+// intermediate]
auto inputs = call_frame.get_tensor_io(args);
auto outputs = call_frame.get_tensor_io(out);
runtime::gpu::invoke_primitive(ctx, primitive_index, inputs.data(), outputs.data());
@@ -165,7 +166,8 @@ std::string runtime::gpu::GPUInternalFunction::add_to_runtime(
{
primitive_invocation = [this, args, out, primitive_index](GPUCallFrame& call_frame,
GPURuntimeContext* ctx) mutable {
-// here, these inputs and outputs could be any of [constant, input, output, intermediate]
+// here, these inputs and outputs could be any of [constant, input, output,
+// intermediate]
auto inputs = call_frame.get_tensor_io(args);
auto outputs = call_frame.get_tensor_io(out);
*m_trace << "(";
@@ -277,9 +279,9 @@ void runtime::gpu::GPUInternalFunction::build_functions()
m_variable_name_map[tv->get_name()] = std::make_tuple(TensorRole::OUTPUT, i, ss.str());
auto res = dynamic_pointer_cast<ngraph::op::Result>(op);
-//keep assigning different outputs to a result descriptor
-//op::Result emitter will check if in and out descriptors are the same
-//and skip a copy
+// keep assigning different outputs to a result descriptor
+// op::Result emitter will check if in and out descriptors are the same
+// and skip a copy
auto input_node = res->get_inputs().at(0).get_output().get_node();
if (!input_node->is_constant() && !input_node->is_parameter())
{
@@ -288,7 +290,7 @@ void runtime::gpu::GPUInternalFunction::build_functions()
auto output_name = ss.str();
m_variable_name_map[itv->get_name()] =
std::make_tuple(TensorRole::OUTPUT, i, ss.str());
-//propagate_in_place_output(&(res->get_inputs().at(0).get_output()), output_name);
+// propagate_in_place_output(&(res->get_inputs().at(0).get_output()), output_name);
}
}
...
@@ -43,7 +43,8 @@ namespace ngraph
GPUKernelArgs(const GPUKernelArgs& args);
//
-// Add a placeholder parameter for a tensor pointer which will be resolved at runtime.
+// Add a placeholder parameter for a tensor pointer which will be resolved at
+// runtime.
//
GPUKernelArgs& add_placeholder(const std::string& type, const std::string& name);
...
@@ -25,11 +25,12 @@ namespace ngraph
{
namespace gpu
{
-// This is RNN op, which is formed by the fusion of multiple RNN cells ( LSTM/ GRU/ vanilla RNN)
-// across multiple time slices
+// This is RNN op, which is formed by the fusion of multiple RNN cells ( LSTM/ GRU/
+// vanilla RNN) across multiple time slices
// INPUTS:
-// [0] - {X0, X1...., Xt} input tensor of layout TNC, Shape{num_fused_layers*batch_size, feature_size}
+// [0] - {X0, X1...., Xt} input tensor of layout TNC, Shape{num_fused_layers*batch_size,
+// feature_size}
// [1] - recurrent input tensor ht_1 of Shape{sequence length*batch_size, feature_size}
// [2] - flat parameter tensor consisting of weights and biases for each layer
// {W_x^0 | W_h^0 | W_x^1 | W_h^1 | ... | B_x^0 | B_h^0 | B_x^1 | B_h^1 }
@@ -39,12 +40,16 @@ namespace ngraph
// src_sequence_length - this will be same as number_of_timesteps
// src_layer_feature_size - feature size w.r.to input tensor
// src_iter_feature_size - feature size w.r.to hidden state
-// num_cell_states - number of recurrent state tensor states , LSTM = 2, GRU = 1, vanilla RNN = 1
+// num_cell_states - number of recurrent state tensor states , LSTM = 2, GRU = 1,
+// vanilla RNN = 1
// OUTPUT VALUE: A tuple with the following structure:
-// [0] - ht, sequence-wise output tensor with shape (sequence_length*batch_size, feature_size) .
-// [1] - hf, layer-wise output tensor with shape (num_fused_layers*batch_size, feature_size) .
-// [2] - ct output cell state tensor with the same shape as states i.e (sequence_length*batch_size, feature_size)
+// [0] - ht, sequence-wise output tensor with shape (sequence_length*batch_size,
+// feature_size) .
+// [1] - hf, layer-wise output tensor with shape (num_fused_layers*batch_size,
+// feature_size) .
+// [2] - ct output cell state tensor with the same shape as states i.e
+// (sequence_length*batch_size, feature_size)
class Rnn : public Op
{
...
@@ -34,7 +34,8 @@ bool ngraph::runtime::gpu::pass::BatchNormCache::run_on_function(
{
if (auto bnbp = std::dynamic_pointer_cast<op::BatchNormTrainingBackprop>(n))
{
-// batch norm bprop annotations are used to indicate if variance is in inverse stddev format
+// batch norm bprop annotations are used to indicate if variance is in inverse stddev
+// format
auto op_annotations =
std::make_shared<ngraph::runtime::gpu::BatchNormBackpropAnnotations>();
...
@@ -57,7 +57,7 @@
using namespace ngraph;
void ngraph::runtime::gpu::pass::LSTMFusion::construct_sigmoid()
{
-//construct variance
+// construct variance
auto input = std::make_shared<pattern::op::Label>(element::f32, Shape{3, 4});
auto neg_input = std::make_shared<op::Negative>(input);
auto exp_neg_input = std::make_shared<op::Exp>(neg_input);
@@ -69,7 +69,7 @@ void ngraph::runtime::gpu::pass::LSTMFusion::construct_sigmoid()
auto add_exp = std::make_shared<op::Add>(exp_neg_input, broadcast_constant);
auto divide_1_over_exp = std::make_shared<op::Divide>(broadcast_constant, add_exp);
-//Define a call back that needs to called once the DFG matches the pattern
+// Define a call back that needs to called once the DFG matches the pattern
auto callback = [input](pattern::Matcher& m) {
NGRAPH_DEBUG << "In a callback for construct_fprop_sigmoid pattern against "
<< m.get_match_root()->get_name();
@@ -153,7 +153,7 @@ void ngraph::runtime::gpu::pass::LSTMFusion::construct_lstm_fprop()
auto input_slice_0 = std::make_shared<op::Slice>(X, Coordinate{0, 0}, Coordinate{10, 100});
auto forget_gate = std::make_shared<op::Sigmoid>(input_slice_0);
-//ct-1 -> cell state
+// ct-1 -> cell state
auto ct_1 = std::make_shared<pattern::op::Label>(element::f32, Shape{10, 100});
auto multiply_forget_gate_ct_1 = std::make_shared<op::Multiply>(forget_gate, ct_1);
@@ -176,7 +176,7 @@ void ngraph::runtime::gpu::pass::LSTMFusion::construct_lstm_fprop()
auto ht = std::make_shared<op::Multiply>(output_gate, tanh_2);
auto ht_label = std::make_shared<pattern::op::Label>(ht, nullptr, NodeVector{ht});
-//Define a call back that needs to called once the DFG matches the pattern
+// Define a call back that needs to called once the DFG matches the pattern
auto callback = [ct_label,
input_xt,
weights_i2h,
@@ -211,8 +211,8 @@ void ngraph::runtime::gpu::pass::LSTMFusion::construct_lstm_fprop()
RETURN_IF_FALSE(bias_i2h->get_shape().size() == 1 && bias_h2h->get_shape().size() == 1,
"Bias should have rank of 1 for Rnn op");
-// Determine which is ht_1 and xt. but if both xt and ht_1 have the same shape we need to capture this
-// reliably in the RNN fusion.
+// Determine which is ht_1 and xt. but if both xt and ht_1 have the same shape we need to
+// capture this reliably in the RNN fusion.
std::shared_ptr<op::gpu::Rnn> lstm = nullptr;
bool intermediate_lstm = false;
if (std::dynamic_pointer_cast<op::GetOutputElement>(pattern_map[ct_1]))
@@ -411,7 +411,8 @@ void ngraph::runtime::gpu::pass::RNNFusion::construct_rnn_lstm_fprop()
if (std::dynamic_pointer_cast<op::Broadcast>(xt_node_array[xt_node_array.size() - 1]) &&
std::dynamic_pointer_cast<op::Constant>(
xt_node_array[xt_node_array.size() - 1]->get_argument(0)))
-// here xt is determined to be the hidden (recurrent) input data and so ht is the feedforward input
+// here xt is determined to be the hidden (recurrent) input data and so ht is the
+// feedforward input
{
// concatenate the sequence inputs for a given layer
std::vector<std::shared_ptr<pattern::op::Label>> src_layer_labels{ht_1};
@@ -425,7 +426,8 @@ void ngraph::runtime::gpu::pass::RNNFusion::construct_rnn_lstm_fprop()
hidden_ht_array[hidden_ht_array.size() - 1]) &&
std::dynamic_pointer_cast<op::Constant>(
hidden_ht_array[hidden_ht_array.size() - 1]->get_argument(0)))
-// here ht is determined to be the hidden (recurrent) input data and so xt is the feedforward input
+// here ht is determined to be the hidden (recurrent) input data and so xt is the
+// feedforward input
{
std::vector<std::shared_ptr<pattern::op::Label>> src_layer_labels{xt};
src_layer = compute_rnn_args(src_layer_labels, m, true);
@@ -502,10 +504,11 @@ void ngraph::runtime::gpu::pass::RNNFusion::construct_rnn_lstm_fprop()
auto layer_rnn_ht = std::make_shared<op::GetOutputElement>(rnn, 1);
auto layer_rnn_ct = std::make_shared<op::GetOutputElement>(rnn, 2);
-//slice the rnn ht's
+// slice the rnn ht's
size_t start_index = 0;
size_t end_index = batch_size;
-// capture the slices in the reverse order, so it corrosponds to lstm_goes order captured by the Pattern matcher
+// capture the slices in the reverse order, so it corrosponds to lstm_goes order captured by
+// the Pattern matcher
for (size_t i = 0; i < num_of_lstm_matched; i++)
{
ht_slice_per_timestep[i] = (std::make_shared<op::Slice>(
@@ -574,7 +577,7 @@ void ngraph::runtime::gpu::pass::RNNFusion::construct_rnn_lstm_fprop()
}
}
-//now go through the lstm goe_0 consumers and replace them with the slice
+// now go through the lstm goe_0 consumers and replace them with the slice
for (auto& node : lstm_goe0_user)
{
for (size_t i = 0; i < node->get_input_size(); i++)
...
#
# OVERRIDE TO STYLE: Comments wrap.
#
BasedOnStyle: LLVM
IndentWidth: 4
UseTab: Never
Language: Cpp
Standard: Cpp11
AccessModifierOffset: -4
AlignConsecutiveDeclarations: false
AlignConsecutiveAssignments: false
AlignTrailingComments: true
AllowShortBlocksOnASingleLine: true
AllowShortCaseLabelsOnASingleLine: true
AllowShortFunctionsOnASingleLine: Inline
AlwaysBreakBeforeMultilineStrings: true
AlwaysBreakTemplateDeclarations: true
BinPackArguments: false
BinPackParameters: false
BreakBeforeBraces: Allman
BreakConstructorInitializersBeforeComma: true
ColumnLimit: 100
#CommentPragmas: '.*'
IndentCaseLabels: false
IndentWrappedFunctionNames: true
KeepEmptyLinesAtTheStartOfBlocks: false
NamespaceIndentation: All
PointerAlignment: Left
SpaceAfterCStyleCast: false
SpaceBeforeAssignmentOperators: true
SpaceBeforeParens: ControlStatements
SpaceInEmptyParentheses: false
SpacesInAngles: false
SpacesInCStyleCastParentheses: false
SpacesInParentheses: false
SpacesInSquareBrackets: false
SortIncludes: false
ReflowComments: true
IncludeCategories:
    - Regex: '^".*'
      Priority: 3
    - Regex: '^<.*'
      Priority: 2
SortIncludes: true
#
# OVERRIDE TO STYLE: Comments wrap.
#
BasedOnStyle: LLVM
IndentWidth: 4
UseTab: Never
Language: Cpp
Standard: Cpp11
AccessModifierOffset: -4
AlignConsecutiveDeclarations: false
AlignConsecutiveAssignments: false
AlignTrailingComments: true
AllowShortBlocksOnASingleLine: true
AllowShortCaseLabelsOnASingleLine: true
AllowShortFunctionsOnASingleLine: Inline
AlwaysBreakBeforeMultilineStrings: true
AlwaysBreakTemplateDeclarations: true
BinPackArguments: false
BinPackParameters: false
BreakBeforeBraces: Allman
BreakConstructorInitializersBeforeComma: true
ColumnLimit: 100
#CommentPragmas: '.*'
IndentCaseLabels: false
IndentWrappedFunctionNames: true
KeepEmptyLinesAtTheStartOfBlocks: false
NamespaceIndentation: All
PointerAlignment: Left
SpaceAfterCStyleCast: false
SpaceBeforeAssignmentOperators: true
SpaceBeforeParens: ControlStatements
SpaceInEmptyParentheses: false
SpacesInAngles: false
SpacesInCStyleCastParentheses: false
SpacesInParentheses: false
SpacesInSquareBrackets: false
SortIncludes: false
ReflowComments: true
IncludeCategories:
    - Regex: '^".*'
      Priority: 3
    - Regex: '^<.*'
      Priority: 2
SortIncludes: true
@@ -1817,7 +1817,8 @@ shared_ptr<runtime::Executable>
if ((pad_below.at(0) == pad_above.at(0)) && (pad_below.at(1) == pad_above.at(1)))
{
-// symmetric padding case temporally excluded (custom kernel executed) due to stability issues
+// symmetric padding case temporally excluded (custom kernel executed) due to
+// stability issues
const CoordinateDiff& pad_below_for = conv_op->get_padding_below_forward();
input_offset_xy = -pad_below_for.at(0);
}
...
@@ -289,7 +289,7 @@ static CustomKernels::krnl_info do_create_variance_back(const string& output_nam
writer.block_begin();
{ // Main function body
-gws.push_back(1); //input_shape.at(0));
+gws.push_back(1); // input_shape.at(0));
// Channel axis loop
writer << "\nconst uint i" << channel_axis << " = get_global_id(" << channel_axis
<< "); /* channel_axis trip count " << input_shape.at(channel_axis) << "*/\n";
...
#
# OVERRIDE TO STYLE: Comments wrap.
#
BasedOnStyle: LLVM
IndentWidth: 4
UseTab: Never
Language: Cpp
Standard: Cpp11
AccessModifierOffset: -4
AlignConsecutiveDeclarations: false
AlignConsecutiveAssignments: false
AlignTrailingComments: true
AllowShortBlocksOnASingleLine: true
AllowShortCaseLabelsOnASingleLine: true
AllowShortFunctionsOnASingleLine: Inline
AlwaysBreakBeforeMultilineStrings: true
AlwaysBreakTemplateDeclarations: true
BinPackArguments: false
BinPackParameters: false
BreakBeforeBraces: Allman
BreakConstructorInitializersBeforeComma: true
ColumnLimit: 100
#CommentPragmas: '.*'
IndentCaseLabels: false
IndentWrappedFunctionNames: true
KeepEmptyLinesAtTheStartOfBlocks: false
NamespaceIndentation: All
PointerAlignment: Left
SpaceAfterCStyleCast: false
SpaceBeforeAssignmentOperators: true
SpaceBeforeParens: ControlStatements
SpaceInEmptyParentheses: false
SpacesInAngles: false
SpacesInCStyleCastParentheses: false
SpacesInParentheses: false
SpacesInSquareBrackets: false
SortIncludes: false
ReflowComments: true
IncludeCategories:
    - Regex: '^".*'
      Priority: 3
    - Regex: '^<.*'
      Priority: 2
SortIncludes: true
@@ -38,7 +38,8 @@
// INTERPRETER_LOCAL is used for non-api symbols.
// #ifdef INTERPRETER_DLL // defined if INTERPRETER is compiled as a DLL
-#ifdef INTERPRETER_DLL_EXPORTS // defined if we are building the INTERPRETER DLL (instead of using it)
+#ifdef INTERPRETER_DLL_EXPORTS // defined if we are building the INTERPRETER DLL (instead of using
+// it)
#define INTERPRETER_API INTERPRETER_HELPER_DLL_EXPORT
#else
#define INTERPRETER_API INTERPRETER_HELPER_DLL_IMPORT
...
#
# OVERRIDE TO STYLE: Comments wrap.
#
BasedOnStyle: LLVM
IndentWidth: 4
UseTab: Never
Language: Cpp
Standard: Cpp11
AccessModifierOffset: -4
AlignConsecutiveDeclarations: false
AlignConsecutiveAssignments: false
AlignTrailingComments: true
AllowShortBlocksOnASingleLine: true
AllowShortCaseLabelsOnASingleLine: true
AllowShortFunctionsOnASingleLine: Inline
AlwaysBreakBeforeMultilineStrings: true
AlwaysBreakTemplateDeclarations: true
BinPackArguments: false
BinPackParameters: false
BreakBeforeBraces: Allman
BreakConstructorInitializersBeforeComma: true
ColumnLimit: 100
#CommentPragmas: '.*'
IndentCaseLabels: false
IndentWrappedFunctionNames: true
KeepEmptyLinesAtTheStartOfBlocks: false
NamespaceIndentation: All
PointerAlignment: Left
SpaceAfterCStyleCast: false
SpaceBeforeAssignmentOperators: true
SpaceBeforeParens: ControlStatements
SpaceInEmptyParentheses: false
SpacesInAngles: false
SpacesInCStyleCastParentheses: false
SpacesInParentheses: false
SpacesInSquareBrackets: false
SortIncludes: false
ReflowComments: true
IncludeCategories:
    - Regex: '^".*'
      Priority: 3
    - Regex: '^<.*'
      Priority: 2
SortIncludes: true
#
# OVERRIDE TO STYLE: Comments wrap.
#
BasedOnStyle: LLVM
IndentWidth: 4
UseTab: Never
Language: Cpp
Standard: Cpp11
AccessModifierOffset: -4
AlignConsecutiveDeclarations: false
AlignConsecutiveAssignments: false
AlignTrailingComments: true
AllowShortBlocksOnASingleLine: true
AllowShortCaseLabelsOnASingleLine: true
AllowShortFunctionsOnASingleLine: Inline
AlwaysBreakBeforeMultilineStrings: true
AlwaysBreakTemplateDeclarations: true
BinPackArguments: false
BinPackParameters: false
BreakBeforeBraces: Allman
BreakConstructorInitializersBeforeComma: true
ColumnLimit: 100
#CommentPragmas: '.*'
IndentCaseLabels: false
IndentWrappedFunctionNames: true
KeepEmptyLinesAtTheStartOfBlocks: false
NamespaceIndentation: All
PointerAlignment: Left
SpaceAfterCStyleCast: false
SpaceBeforeAssignmentOperators: true
SpaceBeforeParens: ControlStatements
SpaceInEmptyParentheses: false
SpacesInAngles: false
SpacesInCStyleCastParentheses: false
SpacesInParentheses: false
SpacesInSquareBrackets: false
SortIncludes: false
ReflowComments: true
IncludeCategories:
    - Regex: '^".*'
      Priority: 3
    - Regex: '^<.*'
      Priority: 2
SortIncludes: true
@@ -32,13 +32,16 @@
#endif
#endif
-// Now we use the generic helper definitions above to define PLAIDML_BACKEND_API and PLAIDML_BACKEND_LOCAL.
+// Now we use the generic helper definitions above to define PLAIDML_BACKEND_API and
+// PLAIDML_BACKEND_LOCAL.
+//
// PLAIDML_BACKEND_API is used for the public API symbols. It either DLL imports or DLL exports
// (or does nothing for static build)
// PLAIDML_BACKEND_LOCAL is used for non-api symbols.
// #ifdef PLAIDML_BACKEND_DLL // defined if PLAIDML_BACKEND is compiled as a DLL
-#ifdef PLAIDML_BACKEND_DLL_EXPORTS // defined if we are building the PLAIDML_BACKEND DLL (instead of using it)
+#ifdef PLAIDML_BACKEND_DLL_EXPORTS // defined if we are building the PLAIDML_BACKEND DLL (instead of
+// using it)
#define PLAIDML_BACKEND_API PLAIDML_BACKEND_HELPER_DLL_EXPORT
#else
#define PLAIDML_BACKEND_API PLAIDML_BACKEND_HELPER_DLL_IMPORT
...
@@ -50,6 +50,7 @@ public:
private:
std::mutex m_mu;
-// N.B. The key here is the original source function, *not* the copy that's been processed by the compilation passes.
+// N.B. The key here is the original source function, *not* the copy that's been processed by
+// the compilation passes.
std::unordered_map<std::shared_ptr<Function>, std::shared_ptr<PlaidML_Executable>> m_cache;
};
@@ -28,9 +28,10 @@ namespace ngraph
{
namespace op
{
-// Implements NumPy-style broadcast semantics by passing its single argument through to its
-// output and pretending that this changes the shape. The creator of this node is responsible
-// for ensuring that the downstream operation will perform a NumPy-style broadcast.
+// Implements NumPy-style broadcast semantics by passing its single argument through
+// to its output and pretending that this changes the shape. The creator of this
+// node is responsible for ensuring that the downstream operation will perform a
+// NumPy-style broadcast.
class ImplicitBroadcast;
}
}
...
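The ImplicitBroadcast op relies on the downstream elementwise operation applying NumPy-style broadcast semantics. As a reminder of what that rule is, a small sketch that computes the broadcast result shape of two shapes (broadcast_shape is an illustrative helper, not part of the PlaidML backend):

#include <algorithm>
#include <cstddef>
#include <stdexcept>
#include <vector>

using Shape = std::vector<std::size_t>;

// NumPy-style rule: align shapes at their trailing dimensions; each pair of dimensions must
// either match or contain a 1, and a 1 stretches to the other size.
Shape broadcast_shape(const Shape& a, const Shape& b)
{
    Shape result(std::max(a.size(), b.size()), 1);
    for (std::size_t i = 0; i < result.size(); ++i)
    {
        std::size_t da = i < a.size() ? a[a.size() - 1 - i] : 1; // trailing-aligned dim of a
        std::size_t db = i < b.size() ? b[b.size() - 1 - i] : 1; // trailing-aligned dim of b
        if (da != db && da != 1 && db != 1)
        {
            throw std::runtime_error("shapes are not broadcast-compatible");
        }
        result[result.size() - 1 - i] = std::max(da, db);
    }
    return result;
}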
@@ -75,8 +75,8 @@ namespace ngraph
}
}))
.set(builder::ContractionInput{"I"}.add_indices("d", 0, dim_limit)))
-.add( // Compare the input against the (broadcasted) max values, and select the indices
-// where the max val occurs
+.add( // Compare the input against the (broadcasted) max values, and select
+// the indices where the max val occurs
builder::Elementwise{"SelValIdxs",
"I == SelVal ? index(I, " + reduction_axis_str +
") : D" + reduction_axis_str})
...
@@ -52,8 +52,8 @@ ngraph::runtime::plaidml::pass::ImplicitBroadcast::ImplicitBroadcast()
if (src_shape.size())
{
-// Create a reshape operation to get the right target broadcast shape. (Note that a zero-D tensor
-// or constant can be passed directly into the ImplicitBroadcast op).
+// Create a reshape operation to get the right target broadcast shape. (Note that a
+// zero-D tensor or constant can be passed directly into the ImplicitBroadcast op).
AxisVector reshape_order;
Shape reshape_shape;
std::size_t input_dim = 0;
@@ -76,9 +76,9 @@ ngraph::runtime::plaidml::pass::ImplicitBroadcast::ImplicitBroadcast()
auto implicit_broadcast =
std::make_shared<plaidml::op::ImplicitBroadcast>(src, broadcast->get_shape());
-// N.B. We don't use replace_node() here, since it's important to only replace the broadcast with an
-// implicit broadcast when the consuming operation is an elementwise operation, since PlaidML
-// contractions don't provide implicit broadcast semantics.
+// N.B. We don't use replace_node() here, since it's important to only replace the broadcast
+// with an implicit broadcast when the consuming operation is an elementwise operation,
+// since PlaidML contractions don't provide implicit broadcast semantics.
bool result = false;
for (size_t i = 0; i < broadcast->get_output_size(); ++i)
{
...