Unverified Commit 392ef0e1 authored by Scott Cyphers, committed by GitHub

Merge branch 'master' into leona/doc_v0.25.1

parents e9101633 e83c2ffa
#
# OVERRIDE TO STYLE: Comments wrap.
#
BasedOnStyle: LLVM
IndentWidth: 4
UseTab: Never
Language: Cpp
Standard: Cpp11
AccessModifierOffset: -4
AlignConsecutiveDeclarations: false
AlignConsecutiveAssignments: false
AlignTrailingComments: true
AllowShortBlocksOnASingleLine: true
AllowShortCaseLabelsOnASingleLine: true
AllowShortFunctionsOnASingleLine: Inline
AlwaysBreakBeforeMultilineStrings: true
AlwaysBreakTemplateDeclarations: true
BinPackArguments: false
BinPackParameters: false
BreakBeforeBraces: Allman
BreakConstructorInitializersBeforeComma: true
ColumnLimit: 100
#CommentPragmas: '.*'
IndentCaseLabels: false
IndentWrappedFunctionNames: true
KeepEmptyLinesAtTheStartOfBlocks: false
NamespaceIndentation: All
PointerAlignment: Left
SpaceAfterCStyleCast: false
SpaceBeforeAssignmentOperators: true
SpaceBeforeParens: ControlStatements
SpaceInEmptyParentheses: false
SpacesInAngles: false
SpacesInCStyleCastParentheses: false
SpacesInParentheses: false
SpacesInSquareBrackets: false
SortIncludes: false
ReflowComments: true
IncludeCategories:
- Regex: '^".*'
Priority: 3
- Regex: '^<.*'
Priority: 2
SortIncludes: true
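For illustration only (not part of the diff), a small C++ fragment formatted under these settings (Allman braces, 4-space indents, namespace indentation, left pointer alignment, 100-column comment reflow) comes out roughly as follows; the Formatter class is invented for the example.

#include <string>
#include <utility>

namespace example
{
    // ReflowComments: true and ColumnLimit: 100 let clang-format re-wrap long comments such as
    // this one so they stay inside the configured column limit.
    class Formatter
    {
    public:
        explicit Formatter(std::string name)
            : m_name(std::move(name))
        {
        }

        // PointerAlignment: Left keeps the '*' next to the type rather than the variable name.
        const std::string* name() const { return &m_name; }

    private:
        std::string m_name;
    };
}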
#
# OVERRIDE TO STYLE: Comments wrap.
#
BasedOnStyle: LLVM
IndentWidth: 4
UseTab: Never
Language: Cpp
Standard: Cpp11
AccessModifierOffset: -4
AlignConsecutiveDeclarations: false
AlignConsecutiveAssignments: false
AlignTrailingComments: true
AllowShortBlocksOnASingleLine: true
AllowShortCaseLabelsOnASingleLine: true
AllowShortFunctionsOnASingleLine: Inline
AlwaysBreakBeforeMultilineStrings: true
AlwaysBreakTemplateDeclarations: true
BinPackArguments: false
BinPackParameters: false
BreakBeforeBraces: Allman
BreakConstructorInitializersBeforeComma: true
ColumnLimit: 100
#CommentPragmas: '.*'
IndentCaseLabels: false
IndentWrappedFunctionNames: true
KeepEmptyLinesAtTheStartOfBlocks: false
NamespaceIndentation: All
PointerAlignment: Left
SpaceAfterCStyleCast: false
SpaceBeforeAssignmentOperators: true
SpaceBeforeParens: ControlStatements
SpaceInEmptyParentheses: false
SpacesInAngles: false
SpacesInCStyleCastParentheses: false
SpacesInParentheses: false
SpacesInSquareBrackets: false
SortIncludes: false
ReflowComments: true
IncludeCategories:
- Regex: '^".*'
Priority: 3
- Regex: '^<.*'
Priority: 2
SortIncludes: true
......@@ -56,8 +56,8 @@ namespace ngraph
}
else
{
// Get the sizes of the dot axes. It's easiest to pull them from arg1 because they're
// right up front.
// Get the sizes of the dot axes. It's easiest to pull them from arg1
// because they're right up front.
Shape dot_axis_sizes(reduction_axes_count);
std::copy(arg1_shape.begin(),
arg1_shape.begin() + reduction_axes_count,
......@@ -67,7 +67,8 @@ namespace ngraph
CoordinateTransform arg1_transform(arg1_shape);
CoordinateTransform output_transform(out_shape);
// Create coordinate transforms for arg0 and arg1 that throw away the dotted axes.
// Create coordinate transforms for arg0 and arg1 that throw away the dotted
// axes.
size_t arg0_projected_rank = arg0_shape.size() - reduction_axes_count;
size_t arg1_projected_rank = arg1_shape.size() - reduction_axes_count;
......@@ -84,15 +85,16 @@ namespace ngraph
CoordinateTransform arg0_projected_transform(arg0_projected_shape);
CoordinateTransform arg1_projected_transform(arg1_projected_shape);
// Create a coordinate transform that allows us to iterate over all possible values
// for the dotted axes.
// Create a coordinate transform that allows us to iterate over all possible
// values for the dotted axes.
CoordinateTransform dot_axes_transform(dot_axis_sizes);
for (const Coordinate& arg0_projected_coord : arg0_projected_transform)
{
for (const Coordinate& arg1_projected_coord : arg1_projected_transform)
{
// The output coordinate is just the concatenation of the projected coordinates.
// The output coordinate is just the concatenation of the projected
// coordinates.
Coordinate out_coord(arg0_projected_coord.size() +
arg1_projected_coord.size());
......@@ -116,8 +118,9 @@ namespace ngraph
arg0_coord.begin());
for (const Coordinate& dot_axis_positions : dot_axes_transform)
{
// In order to find the points to multiply together, we need to inject our current
// positions along the dotted axes back into the projected arg0 and arg1 coordinates.
// In order to find the points to multiply together, we need to
// inject our current positions along the dotted axes back into
// the projected arg0 and arg1 coordinates.
std::copy(dot_axis_positions.begin(),
dot_axis_positions.end(),
arg0_it);
......
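As a standalone sketch (not the reference kernel above), the projected-coordinate scheme these comments describe, specialized to 2-D inputs with reduction_axes_count == 1, is just the familiar matrix product: the output coordinate (i, j) is the concatenation of arg0's projected coordinate (i) and arg1's projected coordinate (j), and the dotted-axis position p is injected back into both input coordinates.

#include <cstddef>
#include <vector>

// Minimal illustration of the dot kernel's coordinate scheme for the 2-D case
// (reduction_axes_count == 1): arg0 is m x k, arg1 is k x n, out is m x n, all row-major.
void dot_2d(const std::vector<float>& arg0,
            const std::vector<float>& arg1,
            std::vector<float>& out,
            std::size_t m,
            std::size_t k,
            std::size_t n)
{
    for (std::size_t i = 0; i < m; i++)         // projected arg0 coordinate
    {
        for (std::size_t j = 0; j < n; j++)     // projected arg1 coordinate
        {
            float sum = 0;
            for (std::size_t p = 0; p < k; p++) // position along the dotted axis
            {
                // Inject the dotted-axis position back into both input coordinates.
                sum += arg0[i * k + p] * arg1[p * n + j];
            }
            // The output coordinate (i, j) is the concatenation of the projected coordinates.
            out[i * n + j] = sum;
        }
    }
}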
#
# OVERRIDE TO STYLE: Comments wrap.
#
BasedOnStyle: LLVM
IndentWidth: 4
UseTab: Never
Language: Cpp
Standard: Cpp11
AccessModifierOffset: -4
AlignConsecutiveDeclarations: false
AlignConsecutiveAssignments: false
AlignTrailingComments: true
AllowShortBlocksOnASingleLine: true
AllowShortCaseLabelsOnASingleLine: true
AllowShortFunctionsOnASingleLine: Inline
AlwaysBreakBeforeMultilineStrings: true
AlwaysBreakTemplateDeclarations: true
BinPackArguments: false
BinPackParameters: false
BreakBeforeBraces: Allman
BreakConstructorInitializersBeforeComma: true
ColumnLimit: 100
#CommentPragmas: '.*'
IndentCaseLabels: false
IndentWrappedFunctionNames: true
KeepEmptyLinesAtTheStartOfBlocks: false
NamespaceIndentation: All
PointerAlignment: Left
SpaceAfterCStyleCast: false
SpaceBeforeAssignmentOperators: true
SpaceBeforeParens: ControlStatements
SpaceInEmptyParentheses: false
SpacesInAngles: false
SpacesInCStyleCastParentheses: false
SpacesInParentheses: false
SpacesInSquareBrackets: false
SortIncludes: false
ReflowComments: true
IncludeCategories:
- Regex: '^".*'
Priority: 3
- Regex: '^<.*'
Priority: 2
SortIncludes: true
......@@ -107,7 +107,7 @@ size_t runtime::gpu::CUDAEmitter::build_concat(const std::string& dtype,
// a launch primitive for it based on the input tensor shape
// but do not recompile the kernel. otherwise, do it all:
// recompile the kernel and then create the primitive
size_t split_input_size = 256; //max num of inputs fit 4KB parameter space: 256 * 8 + 7 * ?
size_t split_input_size = 256; // max num of inputs fit 4KB parameter space: 256 * 8 + 7 * ?
size_t residue = input_num % split_input_size;
std::stringstream kernel_name_1;
std::stringstream kernel_name_2;
......@@ -243,8 +243,9 @@ size_t runtime::gpu::CUDAEmitter::build_topk(const std::vector<element::Type>& d
// __device__ void set_value(float val){value = val;}
//
// };
// Based on the datatypes, the max size of the struct can be 16 bytes. Any arbitrary size of the struct can
// therefore be given by 'shared_struct_bytes' as calculated below, accounting for structure padding
// Based on the datatypes, the max size of the struct can be 16 bytes. Any arbitrary size of the
// struct can therefore be given by 'shared_struct_bytes' as calculated below, accounting for
// structure padding
size_t shared_struct_bytes = (((dtypes[0].size() + index_elem_type.size()) <= 8) ? 8 : 16);
size_t shared_data_bytes = num_cols * shared_struct_bytes;
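The rounding above reflects ordinary struct padding: a struct holding one value and one index is padded to the alignment of its largest member, so a 4-byte value paired with a 4-byte index costs 8 bytes, while any pairing that involves an 8-byte member costs 16. A small host-side check of that reasoning (the Entry template is a stand-in, not the actual kernel struct):

#include <cstdint>
#include <cstdio>

// Mirrors the kernel-side pair described in the comment above: one data value plus one index.
template <typename ValueT, typename IndexT>
struct Entry
{
    ValueT value;
    IndexT index;
};

int main()
{
    // 4-byte value + 4-byte index: 8 bytes, no padding needed.
    std::printf("float/int32 entry: %zu bytes\n", sizeof(Entry<float, int32_t>));
    // 8-byte value + 4-byte index: padded up to 16 bytes to keep 8-byte alignment.
    std::printf("double/int32 entry: %zu bytes\n", sizeof(Entry<double, int32_t>));
    return 0;
}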
......@@ -332,7 +333,7 @@ size_t runtime::gpu::CUDAEmitter::build_topk(const std::vector<element::Type>& d
1,
1,
shared_data_bytes, // shared mem
nullptr, //stream
nullptr, // stream
args_list,
nullptr)); // arguments
debug_sync();
......@@ -849,9 +850,9 @@ size_t runtime::gpu::CUDAEmitter::build_reshape_3d(const std::array<std::string,
std::vector<uint32_t> block_size(3, 0);
// TODO: currently we set it to 16, will add tuning method later
uint32_t block_size_x = 16;
block_size[0] = block_size_x; //x
block_size[2] = (input_order[2] == 0) ? block_size_x : 1; //z
block_size[1] = (block_size[2] == block_size_x) ? 1 : block_size_x; //y
block_size[0] = block_size_x; // x
block_size[2] = (input_order[2] == 0) ? block_size_x : 1; // z
block_size[1] = (block_size[2] == block_size_x) ? 1 : block_size_x; // y
uint32_t aligned_grid_size_x = align_to_block_size(input_shape[2], block_size[0]);
uint32_t aligned_grid_size_y = align_to_block_size(input_shape[1], block_size[1]);
uint32_t aligned_grid_size_z = align_to_block_size(input_shape[0], block_size[2]);
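A hedged sketch of what align_to_block_size is assumed to compute here: a ceiling division that turns an element count into the number of blocks needed along one grid dimension (the real helper may differ in details such as validation).

#include <cstdint>

// Assumed behaviour: number of block_size-wide blocks needed to cover total elements.
uint32_t align_to_block_size(uint32_t total, uint32_t block_size)
{
    return (total + block_size - 1) / block_size;
}

// Example: covering 100 elements with 16-wide blocks gives align_to_block_size(100, 16) == 7.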
......@@ -1571,7 +1572,7 @@ size_t runtime::gpu::CUDAEmitter::build_primitive(const op::MaxPool* node)
input_shape_padded =
runtime::gpu::get_padded_shape(input_shape, padding_below, padding_above, {});
padded_size = shape_size(input_shape_padded);
//currently we set this to floating point only; need to add other datatype support later
// currently we set this to floating point only; need to add other datatype support later
float pad_value = std::numeric_limits<float>::lowest();
std::vector<float> temp(padded_size, pad_value);
GPUAllocator allocator = m_primitive_emitter->get_memory_allocator();
......@@ -1609,7 +1610,8 @@ size_t runtime::gpu::CUDAEmitter::build_primitive(const op::MaxPool* node)
// std::vector<void*>{pad_buffer}.data());
// gpu::invoke_primitive(
// m_ctx, conv_index, std::vector<void*>{pad_buffer, inputs[1]}.data(), outputs);
// m_ctx, conv_index, std::vector<void*>{pad_buffer, inputs[1]}.data(),
// outputs);
void* pad_buffer = runtime::gpu::invoke_memory_primitive(m_ctx, idx_workspace);
gpu::invoke_primitive(m_ctx,
......@@ -1701,8 +1703,8 @@ size_t runtime::gpu::CUDAEmitter::build_softmax(const std::vector<element::Type>
}});
return this->m_primitive_emitter->register_primitive(memset, hash);
}
// if reduce does not include the last axis, this is a heuristic to choose by reduce axis for better cache
// a more accurate but slower way is to tune with the actual kernel
// if reduce does not include the last axis, this is a heuristic to choose by reduce axis for
// better cache. a more accurate but slower way is to tune with the actual kernel
else if (reduce_strides_in_input.back() != 1)
{
// TODO: currently we set it to 64, will add tuning method later
......@@ -1817,10 +1819,11 @@ size_t runtime::gpu::CUDAEmitter::build_reduce_to_nd(const std::vector<element::
const char* kernel)
{
std::vector<std::string> dtypes_str = get_string_vector(dtypes);
//if call from reduce, this is duplicated
// if call from reduce, this is duplicated
NVShape simplified_reduce_axis;
NVShape simplified_input_shape;
// simplified_reduce_axis will not be empty, since we checked if input size is same as output size in gpu_emitter
// simplified_reduce_axis will not be empty, since we checked if input size is same as output
// size in gpu_emitter
simplify_reduce_shape(input_shape, reduce_axis, simplified_input_shape, simplified_reduce_axis);
size_t rank = simplified_input_shape.size();
size_t reduce_rank = simplified_reduce_axis.size();
......@@ -2070,7 +2073,8 @@ size_t runtime::gpu::CUDAEmitter::build_reduce(const std::vector<element::Type>&
{
NVShape simplified_reduce_axis;
NVShape simplified_input_shape;
// simplified_reduce_axis will not be empty, since we checked if input size is same as output size in gpu_emitter
// simplified_reduce_axis will not be empty, since we checked if input size is same as output
// size in gpu_emitter
simplify_reduce_shape(input_shape, reduce_axis, simplified_input_shape, simplified_reduce_axis);
size_t rank = simplified_input_shape.size();
......@@ -2166,8 +2170,8 @@ size_t runtime::gpu::CUDAEmitter::build_reduce(const std::vector<element::Type>&
}
else
{
//if the data size is large, call reduce_to_scalar_acc first and then reduce_to_scalar.
//otherwise, call reduce to scalar directly.
// if the data size is large, call reduce_to_scalar_acc first and then reduce_to_scalar.
// otherwise, call reduce to scalar directly.
const uint32_t unroll_size = 8;
if (nthreads > nthreads_acc * (unroll_size + 1))
{
......@@ -2804,12 +2808,15 @@ size_t runtime::gpu::CUDAEmitter::build_convolution(const std::array<std::string
}
// launch arguments:
// each output pixel is its own block. if the batch size is greater than reg_tile_size * sm_tile_size, a single
// output pixel is spread over multiple blocks along the batch axis so that memory coordination is not required
// each block consists of 2 warps in an 8 x 8 array used for accessing the SM block of the GEMM
// each output pixel is its own block. if the batch size is greater than reg_tile_size *
// sm_tile_size, a single output pixel is spread over multiple blocks along the batch axis so
// that memory coordination is not required each block consists of 2 warps in an 8 x 8 array
// used for accessing the SM block of the GEMM
// do_i = output pixel coordinates
// grid = (do_1*do_2*...*do_N*ceil_div(N, REG_TILE_SIZE*SM_TILE_SIZE), ceil_div(K, REG_TILE_SIZE*SM_TILE_SIZE), 1)
// grid = (do_1*do_2*...*do_N*ceil_div(N, REG_TILE_SIZE*SM_TILE_SIZE),
// ceil_div(K, REG_TILE_SIZE*SM_TILE_SIZE),
// 1)
// block = (8, 8, 1)
dim3 blocks(output_pixels * idiv_ceil(N, reg_tile_size * sm_tile_size),
idiv_ceil(K, reg_tile_size * sm_tile_size),
......@@ -3060,7 +3067,7 @@ void* runtime::gpu::CUDAEmitter::get_init_reduce_val(std::string reduce_op, std:
}
else
{
//not defined.
// not defined.
throw std::runtime_error(data_type + "currently not supportted with init value.");
}
}
......
......@@ -228,9 +228,11 @@ namespace ngraph
NVShape input_shape,
const char* op,
const char* kernel);
/// \brief This is the preprocess for reduce to scalar if the data size is larger than a number.
/// \brief This is the preprocess for reduce to scalar if the data size is larger
/// than a number.
/// The number can be tuned based on hardware.
/// This cuda kernel will accumulate reduction to a certain number of bins depending on hardware.
/// This cuda kernel will accumulate reduction to a certain number of bins depending
/// on hardware.
size_t build_reduce_to_scalar_acc(const std::vector<element::Type>& dtypes,
NVShape input_shape,
NVShape output_shape,
......@@ -239,7 +241,8 @@ namespace ngraph
const char* kernel);
/// \brief Simplified reduce shape and reduce axis: remove dimensions of size 1,
/// combine two or more adjacent reduce/non-reduce axes.
/// The simplified reduce shape and reduce axis will make index calculation simpler in the cuda kernel.
/// The simplified reduce shape and reduce axis will make index calculation simpler
/// in the cuda kernel.
/// example:
/// {1 1 2 2} with reduce axis {3} simplifies to {2 2} with reduce_axis {1};
/// {2 3 4} with reduce axis {0 1} simplifies to {6 4} with reduce_axis {0};
......@@ -248,13 +251,15 @@ namespace ngraph
NVShape reduce_axis,
NVShape& simplified_shape,
NVShape& simplified_reduce_axis);
/// \brief Separate input_shape into reduced_shape and non_reduce_shape, and calculate strides for them
/// and strides in input. This helps calculate the input index and output index for the cuda kernel.
/// \brief Separate input_shape into reduced_shape and non_reduce_shape, and calculate
/// strides for them and strides in input. This helps calculate the input index and
/// output index for the cuda kernel.
/// example:
/// input_shape {2 3 4 5} with reduce_axis {0 2}:
/// input_strides: {60, 20, 5, 1}
/// reduce_shape {2 4}, reduce_strides {4 1}, reduce_strides_in_input {60 5}
/// non_reduce_shape {3 5}, non_reduce_strides {5 1}, non_reduce_strides_in_input {20 1}
/// non_reduce_shape {3 5}, non_reduce_strides {5 1}, non_reduce_strides_in_input
/// {20 1}
void get_reduce_strides(NVShape input_shape,
NVShape reduce_axis,
NVShape& non_reduce_shape,
......@@ -264,8 +269,8 @@ namespace ngraph
NVShape& reduce_strides,
NVShape& reduce_strides_in_input);
/// \brief Calculate magic and shift part of a shape vector (denominator), change divide to multiply
/// in cuda kernel.
/// \brief Calculate magic and shift part of a shape vector (denominator), change
/// divide to multiply in cuda kernel.
void div_to_mul(const NVShape& shape,
std::vector<int>& magic,
std::vector<int>& shift);
......
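A standalone sketch of the stride bookkeeping these declarations describe, using plain std::vector in place of NVShape and a std::set for the reduce axes; running it on the documented example, input_shape {2, 3, 4, 5} with reduce_axis {0, 2}, reproduces the strides listed above.

#include <cstddef>
#include <set>
#include <vector>

using Shape = std::vector<std::size_t>;

// Row-major strides: stride[i] is the number of elements spanned by one step along axis i.
static Shape row_major_strides(const Shape& shape)
{
    Shape strides(shape.size(), 1);
    for (std::size_t i = shape.size(); i-- > 1;)
    {
        strides[i - 1] = strides[i] * shape[i];
    }
    return strides;
}

// Split input_shape into reduce and non-reduce parts and compute strides for each, plus the
// corresponding strides measured in the original input.
static void get_reduce_strides_sketch(const Shape& input_shape,
                                      const std::set<std::size_t>& reduce_axis,
                                      Shape& non_reduce_shape,
                                      Shape& non_reduce_strides,
                                      Shape& non_reduce_strides_in_input,
                                      Shape& reduce_shape,
                                      Shape& reduce_strides,
                                      Shape& reduce_strides_in_input)
{
    Shape input_strides = row_major_strides(input_shape);
    for (std::size_t axis = 0; axis < input_shape.size(); axis++)
    {
        if (reduce_axis.count(axis))
        {
            reduce_shape.push_back(input_shape[axis]);
            reduce_strides_in_input.push_back(input_strides[axis]);
        }
        else
        {
            non_reduce_shape.push_back(input_shape[axis]);
            non_reduce_strides_in_input.push_back(input_strides[axis]);
        }
    }
    reduce_strides = row_major_strides(reduce_shape);
    non_reduce_strides = row_major_strides(non_reduce_shape);
}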
......@@ -27,8 +27,8 @@
#include <stdint.h>
#include <string>
//why use "do...while.."
//https://stackoverflow.com/questions/154136/why-use-apparently-meaningless-do-while-and-if-else-statements-in-macros
// why use "do...while.."
// https://stackoverflow.com/questions/154136/why-use-apparently-meaningless-do-while-and-if-else-statements-in-macros
#define NVRTC_SAFE_CALL_NO_THROW(x) \
do \
{ \
......
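The do { ... } while (0) idiom referenced above makes a multi-statement macro expand to exactly one statement, so it composes correctly with if/else. A minimal generic illustration (a made-up CHECK macro, not the actual NVRTC_SAFE_CALL body):

#include <cstdio>
#include <cstdlib>

// Wrapping the body in do { ... } while (0) makes CHECK(x); behave like a single statement.
// Without the wrapper, the semicolon after the expanded braces would end the if statement early
// and orphan the else below.
#define CHECK(cond)                                                                                \
    do                                                                                             \
    {                                                                                              \
        if (!(cond))                                                                               \
        {                                                                                          \
            std::fprintf(stderr, "check failed: %s\n", #cond);                                     \
            std::abort();                                                                          \
        }                                                                                          \
    } while (0)

int main()
{
    int device_count = 1;
    if (device_count > 0)
        CHECK(device_count > 0); // expands to one statement...
    else
        std::puts("no devices"); // ...so this else still binds to the if above
    return 0;
}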
......@@ -882,7 +882,7 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::MaxPool* node)
input_shape_padded =
runtime::gpu::get_padded_shape(input_shape, padding_below, padding_above, {});
padded_size = shape_size(input_shape_padded);
//currently we set this to floating point only; need to add other datatype support later
// currently we set this to floating point only; need to add other datatype support later
float pad_value = std::numeric_limits<float>::lowest();
std::vector<float> temp(padded_size, pad_value);
GPUAllocator allocator = m_primitive_emitter->get_memory_allocator();
......@@ -1125,7 +1125,8 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::gpu::Rnn* node)
sequence_lengths.data(),
pad_value));
// TO DO: with rnn projection layers the third dimension of the hidden_shape should be recProjSize
// TO DO: with rnn projection layers the third dimension of the hidden_shape should be
// recProjSize
cudnnTensorFormat_t format = CUDNN_TENSOR_NCHW;
uint32_t num_layers = node->get_num_fused_layers() * direction;
Shape hidden_shape{num_layers, batch_size, hidden_size};
......
......@@ -348,7 +348,7 @@ void runtime::gpu::CudaKernelBuilder::get_softmax_op(CodeWriter& writer,
writer << "uint32_t init_in_idx = in_idx;\n";
int64_t last_r_idx = static_cast<int64_t>(reduce_rank) - 1;
//find max
// find max
writer << data_types[1] << " r_max = in[init_in_idx];\n";
writer << data_types[1] << " input_i;\n";
......@@ -406,7 +406,7 @@ void runtime::gpu::CudaKernelBuilder::get_softmax_op(CodeWriter& writer,
}
writer.block_end();
//exp and sum , https://en.wikipedia.org/wiki/Kahan_summation_algorithm
// exp and sum , https://en.wikipedia.org/wiki/Kahan_summation_algorithm
writer << data_types[1] << " r_sum = 0;\n";
writer << data_types[1] << " c = 0;\n";
writer << data_types[1] << " y;\n";
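The Kahan (compensated) summation the comment links to is what the r_sum, c, and y variables above implement. A plain host-side C++ sketch of the same accumulation applied to the softmax exp-and-sum step (not the emitted CUDA):

#include <cmath>
#include <vector>

// Compensated (Kahan) summation of exp(x - x_max): c carries the low-order bits lost when y is
// added to the running sum, mirroring the r_sum / c / y variables emitted above.
float kahan_exp_sum(const std::vector<float>& values, float x_max)
{
    float r_sum = 0.0f;
    float c = 0.0f;
    for (float x : values)
    {
        float y = std::exp(x - x_max) - c;
        float t = r_sum + y;
        c = (t - r_sum) - y;
        r_sum = t;
    }
    return r_sum;
}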
......@@ -457,7 +457,7 @@ void runtime::gpu::CudaKernelBuilder::get_softmax_op(CodeWriter& writer,
}
writer.block_end();
//divide
// divide
writer.block_begin();
for (int64_t j = 0; j < last_r_idx; j++)
{
......@@ -638,7 +638,7 @@ void runtime::gpu::CudaKernelBuilder::get_softmax_block_reduce_op(
writer << "r_max = sdata[tid];\n";
}
writer.block_end();
//accumulate WARPSIZE threads
// accumulate WARPSIZE threads
for (int i = (WARPSIZE >> 1); i >= 1; i >>= 1)
{
if (num_of_warp > i)
......@@ -660,7 +660,7 @@ void runtime::gpu::CudaKernelBuilder::get_softmax_block_reduce_op(
writer << "__syncthreads();\n";
writer << "r_max = sdata[0];\n";
//exp and sum , https://en.wikipedia.org/wiki/Kahan_summation_algorithm
// exp and sum , https://en.wikipedia.org/wiki/Kahan_summation_algorithm
writer << data_types[1] << " r_sum = 0;\n";
writer << data_types[1] << " c = 0;\n";
writer << data_types[1] << " y;\n";
......@@ -718,7 +718,7 @@ void runtime::gpu::CudaKernelBuilder::get_softmax_block_reduce_op(
writer << "r_sum = sdata[tid];\n";
}
writer.block_end();
//accumulate WARPSIZE = 32 threads
// accumulate WARPSIZE = 32 threads
for (int i = (WARPSIZE >> 1); i >= 1; i >>= 1)
{
if (num_of_warp > i)
......@@ -771,7 +771,7 @@ void runtime::gpu::CudaKernelBuilder::get_softmax_block_reduce_op(
return;
}
//each thread calculate the whole reduction of one output
// each thread calculate the whole reduction of one output
void runtime::gpu::CudaKernelBuilder::get_reduce_to_nd_op(
CodeWriter& writer,
const std::string& name,
......@@ -920,7 +920,7 @@ void runtime::gpu::CudaKernelBuilder::get_reduce_to_scalar_op(
writer << "r = in[input_idx];\n";
writer << "input_idx += step;\n";
writer.block_end();
//accumulate reduction to blockDim.x threads
// accumulate reduction to blockDim.x threads
if (stable_sum)
{
writer << data_types[1] << " c = 0;\n";
......@@ -946,7 +946,7 @@ void runtime::gpu::CudaKernelBuilder::get_reduce_to_scalar_op(
}
writer.block_end();
//accumulate WARPSIZE threads for each warp
// accumulate WARPSIZE threads for each warp
for (int i = (WARPSIZE >> 1); i >= 1; i >>= 1)
{
if (block_size_x > i)
......@@ -976,7 +976,7 @@ void runtime::gpu::CudaKernelBuilder::get_reduce_to_scalar_op(
writer << "r = sdata[tid];\n";
}
writer.block_end();
//accumulate WARPSIZE threads
// accumulate WARPSIZE threads
for (int i = (WARPSIZE >> 1); i >= 1; i >>= 1)
{
if (num_of_warp > i)
......@@ -1034,7 +1034,7 @@ void runtime::gpu::CudaKernelBuilder::get_reduce_to_scalar_acc_op(
writer << "r = in[input_idx];\n";
writer << "input_idx += step;\n";
writer.block_end();
//accumulate reduction to step threads
// accumulate reduction to step threads
if (stable_sum)
{
writer << data_types[1] << " c = 0;\n";
......
......@@ -95,9 +95,11 @@ namespace ngraph
size_t non_reduce_rank,
size_t reduce_rank);
/// \brief This is the preprocess to reduce to scalar if the input data size is larger than a number.
/// \brief This is the preprocess to reduce to scalar if the input data size is
/// larger than a number.
/// The number can be tuned based on hardware.
/// This cuda kernel will accumulate reduction to a certain number of bins depending on hardware.
/// This cuda kernel will accumulate reduction to a certain number of bins depending
/// on hardware.
/// stable kahan sum is used for floating point sums.
/// no initial value needed since we load one input value as initial
/// does not support 0-sized input
......@@ -221,8 +223,9 @@ namespace ngraph
size_t rank,
bool register_arguments = false);
/// \brief Given kernel input variables i_* produce register variables o_coordinates{i}
/// of the non-reduced tensor and return the string name of integer index into reduced tensor
/// \brief Given kernel input variables i_* produce register variables
/// o_coordinates{i} of the non-reduced tensor and return the string name of
/// integer index into reduced tensor
static std::string
collective_coordinate_transform_helper(CodeWriter& writer,
std::string i_thread_index,
......
......@@ -1043,7 +1043,7 @@ std::string runtime::gpu::GPU_Emitter::emit_Reshape(EMIT_ARGS)
auto input_order = reshape->get_input_order();
size_t result_shape_product = shape_size(result_shape);
//for a zero-size tensor, or change from 1^m shape to 1^n shape, just do a copy
// for a zero-size tensor, or change from 1^m shape to 1^n shape, just do a copy
if (!reshape->get_is_transpose() || result_shape_product < 2)
{
auto& host_emitter = compiled_function->get_primitive_emitter()->get_host_emitter();
......@@ -1052,7 +1052,7 @@ std::string runtime::gpu::GPU_Emitter::emit_Reshape(EMIT_ARGS)
return compiled_function->add_to_runtime(index, function_name, args, out);
}
//combine in-order dimensions after reorder in shape, update output shape and input order
// combine in-order dimensions after reorder in shape, update output shape and input order
Shape in_order_map(arg_rank, 0);
for (int i = 0; i < arg_rank - 1; i++)
{
......@@ -1089,7 +1089,7 @@ std::string runtime::gpu::GPU_Emitter::emit_Reshape(EMIT_ARGS)
}
}
//eliminate dimensions of size 1, update input order and output shape
// eliminate dimensions of size 1, update input order and output shape
Shape new_arg_shape;
Shape new_result_shape;
Shape new_idx_map(combine_rank, 0);
......
......@@ -463,9 +463,9 @@ void runtime::gpu::GPUExternalFunction::emit_functions()
m_variable_name_map[tv->get_name()] = ss.str();
auto res = dynamic_pointer_cast<ngraph::op::Result>(op);
//keep assigning different outputs to a result descriptor
//op::Result emitter will check if in and out descriptors are the same
//and skip a copy
// keep assigning different outputs to a result descriptor
// op::Result emitter will check if in and out descriptors are the same
// and skip a copy
auto input_node = res->get_inputs().at(0).get_output().get_node();
if (!input_node->is_constant() && !input_node->is_parameter())
{
......
......@@ -155,7 +155,8 @@ std::string runtime::gpu::GPUInternalFunction::add_to_runtime(
{
primitive_invocation = [args, out, primitive_index](GPUCallFrame& call_frame,
GPURuntimeContext* ctx) mutable {
// here, these inputs and outputs could be any of [constant, input, output, intermediate]
// here, these inputs and outputs could be any of [constant, input, output,
// intermediate]
auto inputs = call_frame.get_tensor_io(args);
auto outputs = call_frame.get_tensor_io(out);
runtime::gpu::invoke_primitive(ctx, primitive_index, inputs.data(), outputs.data());
......@@ -165,7 +166,8 @@ std::string runtime::gpu::GPUInternalFunction::add_to_runtime(
{
primitive_invocation = [this, args, out, primitive_index](GPUCallFrame& call_frame,
GPURuntimeContext* ctx) mutable {
// here, these inputs and outputs could be any of [constant, input, output, intermediate]
// here, these inputs and outputs could be any of [constant, input, output,
// intermediate]
auto inputs = call_frame.get_tensor_io(args);
auto outputs = call_frame.get_tensor_io(out);
*m_trace << "(";
......@@ -277,9 +279,9 @@ void runtime::gpu::GPUInternalFunction::build_functions()
m_variable_name_map[tv->get_name()] = std::make_tuple(TensorRole::OUTPUT, i, ss.str());
auto res = dynamic_pointer_cast<ngraph::op::Result>(op);
//keep assigning different outputs to a result descriptor
//op::Result emitter will check if in and out descriptors are the same
//and skip a copy
// keep assigning different outputs to a result descriptor
// op::Result emitter will check if in and out descriptors are the same
// and skip a copy
auto input_node = res->get_inputs().at(0).get_output().get_node();
if (!input_node->is_constant() && !input_node->is_parameter())
{
......@@ -288,7 +290,7 @@ void runtime::gpu::GPUInternalFunction::build_functions()
auto output_name = ss.str();
m_variable_name_map[itv->get_name()] =
std::make_tuple(TensorRole::OUTPUT, i, ss.str());
//propagate_in_place_output(&(res->get_inputs().at(0).get_output()), output_name);
// propagate_in_place_output(&(res->get_inputs().at(0).get_output()), output_name);
}
}
......
......@@ -43,7 +43,8 @@ namespace ngraph
GPUKernelArgs(const GPUKernelArgs& args);
//
// Add a placeholder parameter for a tensor pointer which will be resolved at runtime.
// Add a placeholder parameter for a tensor pointer which will be resolved at
// runtime.
//
GPUKernelArgs& add_placeholder(const std::string& type, const std::string& name);
......
......@@ -25,11 +25,12 @@ namespace ngraph
{
namespace gpu
{
// This is the RNN op, which is formed by the fusion of multiple RNN cells (LSTM/GRU/vanilla RNN)
// across multiple time slices
// This is the RNN op, which is formed by the fusion of multiple RNN cells (LSTM/GRU/
// vanilla RNN) across multiple time slices
// INPUTS:
// [0] - {X0, X1...., Xt} input tensor of layout TNC, Shape{num_fused_layers*batch_size, feature_size}
// [0] - {X0, X1...., Xt} input tensor of layout TNC, Shape{num_fused_layers*batch_size,
// feature_size}
// [1] - recurrent input tensor ht_1 of Shape{sequence length*batch_size, feature_size}
// [2] - flat parameter tensor consisting of weights and biases for each layer
// {W_x^0 | W_h^0 | W_x^1 | W_h^1 | ... | B_x^0 | B_h^0 | B_x^1 | B_h^1 }
......@@ -39,12 +40,16 @@ namespace ngraph
// src_sequence_length - this will be same as number_of_timesteps
// src_layer_feature_size - feature size w.r.to input tensor
// src_iter_feature_size - feature size w.r.to hidden state
// num_cell_states - number of recurrent state tensor states , LSTM = 2, GRU = 1, vanilla RNN = 1
// num_cell_states - number of recurrent state tensor states , LSTM = 2, GRU = 1,
// vanilla RNN = 1
// OUTPUT VALUE: A tuple with the following structure:
// [0] - ht, sequence-wise output tensor with shape (sequence_length*batch_size, feature_size) .
// [1] - hf, layer-wise output tensor with shape (num_fused_layers*batch_size, feature_size) .
// [2] - ct output cell state tensor with the same shape as states i.e (sequence_length*batch_size, feature_size)
// [0] - ht, sequence-wise output tensor with shape (sequence_length*batch_size,
// feature_size) .
// [1] - hf, layer-wise output tensor with shape (num_fused_layers*batch_size,
// feature_size) .
// [2] - ct output cell state tensor with the same shape as states i.e
// (sequence_length*batch_size, feature_size)
class Rnn : public Op
{
......
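As a quick worked example of the output shapes documented above, with purely illustrative values (sequence_length = 10, batch_size = 32, num_fused_layers = 1, feature_size = 100):

#include <cstddef>
#include <cstdio>

int main()
{
    // Illustrative values only; not taken from any particular model.
    std::size_t sequence_length = 10;
    std::size_t batch_size = 32;
    std::size_t num_fused_layers = 1;
    std::size_t feature_size = 100;

    // [0] ht: sequence-wise output, {sequence_length * batch_size, feature_size} = {320, 100}
    std::printf("ht: {%zu, %zu}\n", sequence_length * batch_size, feature_size);
    // [1] hf: layer-wise output, {num_fused_layers * batch_size, feature_size} = {32, 100}
    std::printf("hf: {%zu, %zu}\n", num_fused_layers * batch_size, feature_size);
    // [2] ct: cell state, same shape as the sequence-wise output per the description above
    std::printf("ct: {%zu, %zu}\n", sequence_length * batch_size, feature_size);
    return 0;
}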
......@@ -34,7 +34,8 @@ bool ngraph::runtime::gpu::pass::BatchNormCache::run_on_function(
{
if (auto bnbp = std::dynamic_pointer_cast<op::BatchNormTrainingBackprop>(n))
{
// batch norm bprop annotations are used to indicate if variance is in inverse stddev format
// batch norm bprop annotations are used to indicate if variance is in inverse stddev
// format
auto op_annotations =
std::make_shared<ngraph::runtime::gpu::BatchNormBackpropAnnotations>();
......
......@@ -57,7 +57,7 @@
using namespace ngraph;
void ngraph::runtime::gpu::pass::LSTMFusion::construct_sigmoid()
{
//construct variance
// construct variance
auto input = std::make_shared<pattern::op::Label>(element::f32, Shape{3, 4});
auto neg_input = std::make_shared<op::Negative>(input);
auto exp_neg_input = std::make_shared<op::Exp>(neg_input);
......@@ -69,7 +69,7 @@ void ngraph::runtime::gpu::pass::LSTMFusion::construct_sigmoid()
auto add_exp = std::make_shared<op::Add>(exp_neg_input, broadcast_constant);
auto divide_1_over_exp = std::make_shared<op::Divide>(broadcast_constant, add_exp);
//Define a call back that needs to called once the DFG matches the pattern
// Define a call back that needs to called once the DFG matches the pattern
auto callback = [input](pattern::Matcher& m) {
NGRAPH_DEBUG << "In a callback for construct_fprop_sigmoid pattern against "
<< m.get_match_root()->get_name();
......@@ -153,7 +153,7 @@ void ngraph::runtime::gpu::pass::LSTMFusion::construct_lstm_fprop()
auto input_slice_0 = std::make_shared<op::Slice>(X, Coordinate{0, 0}, Coordinate{10, 100});
auto forget_gate = std::make_shared<op::Sigmoid>(input_slice_0);
//ct-1 -> cell state
// ct-1 -> cell state
auto ct_1 = std::make_shared<pattern::op::Label>(element::f32, Shape{10, 100});
auto multiply_forget_gate_ct_1 = std::make_shared<op::Multiply>(forget_gate, ct_1);
......@@ -176,7 +176,7 @@ void ngraph::runtime::gpu::pass::LSTMFusion::construct_lstm_fprop()
auto ht = std::make_shared<op::Multiply>(output_gate, tanh_2);
auto ht_label = std::make_shared<pattern::op::Label>(ht, nullptr, NodeVector{ht});
//Define a call back that needs to called once the DFG matches the pattern
// Define a call back that needs to called once the DFG matches the pattern
auto callback = [ct_label,
input_xt,
weights_i2h,
......@@ -211,8 +211,8 @@ void ngraph::runtime::gpu::pass::LSTMFusion::construct_lstm_fprop()
RETURN_IF_FALSE(bias_i2h->get_shape().size() == 1 && bias_h2h->get_shape().size() == 1,
"Bias should have rank of 1 for Rnn op");
// Determine which is ht_1 and which is xt. But if both xt and ht_1 have the same shape we need to capture this
// reliably in the RNN fusion.
// Determine which is ht_1 and which is xt. But if both xt and ht_1 have the same shape we need to
// capture this reliably in the RNN fusion.
std::shared_ptr<op::gpu::Rnn> lstm = nullptr;
bool intermediate_lstm = false;
if (std::dynamic_pointer_cast<op::GetOutputElement>(pattern_map[ct_1]))
......@@ -411,7 +411,8 @@ void ngraph::runtime::gpu::pass::RNNFusion::construct_rnn_lstm_fprop()
if (std::dynamic_pointer_cast<op::Broadcast>(xt_node_array[xt_node_array.size() - 1]) &&
std::dynamic_pointer_cast<op::Constant>(
xt_node_array[xt_node_array.size() - 1]->get_argument(0)))
// here xt is determined to be the hidden (recurrent) input data and so ht is the feedforward input
// here xt is determined to be the hidden (recurrent) input data and so ht is the
// feedforward input
{
// concatenate the sequence inputs for a given layer
std::vector<std::shared_ptr<pattern::op::Label>> src_layer_labels{ht_1};
......@@ -425,7 +426,8 @@ void ngraph::runtime::gpu::pass::RNNFusion::construct_rnn_lstm_fprop()
hidden_ht_array[hidden_ht_array.size() - 1]) &&
std::dynamic_pointer_cast<op::Constant>(
hidden_ht_array[hidden_ht_array.size() - 1]->get_argument(0)))
// here ht is determined to be the hidden (recurrent) input data and so xt is the feedforward input
// here ht is determined to be the hidden (recurrent) input data and so xt is the
// feedforward input
{
std::vector<std::shared_ptr<pattern::op::Label>> src_layer_labels{xt};
src_layer = compute_rnn_args(src_layer_labels, m, true);
......@@ -502,10 +504,11 @@ void ngraph::runtime::gpu::pass::RNNFusion::construct_rnn_lstm_fprop()
auto layer_rnn_ht = std::make_shared<op::GetOutputElement>(rnn, 1);
auto layer_rnn_ct = std::make_shared<op::GetOutputElement>(rnn, 2);
//slice the rnn ht's
// slice the rnn ht's
size_t start_index = 0;
size_t end_index = batch_size;
// capture the slices in the reverse order, so it corresponds to the lstm_goes order captured by the Pattern matcher
// capture the slices in the reverse order, so it corresponds to the lstm_goes order captured by
// the Pattern matcher
for (size_t i = 0; i < num_of_lstm_matched; i++)
{
ht_slice_per_timestep[i] = (std::make_shared<op::Slice>(
......@@ -574,7 +577,7 @@ void ngraph::runtime::gpu::pass::RNNFusion::construct_rnn_lstm_fprop()
}
}
//now go through the lstm goe_0 consumers and replace them with the slice
// now go through the lstm goe_0 consumers and replace them with the slice
for (auto& node : lstm_goe0_user)
{
for (size_t i = 0; i < node->get_input_size(); i++)
......
#
# OVERRIDE TO STYLE: Comments wrap.
#
BasedOnStyle: LLVM
IndentWidth: 4
UseTab: Never
Language: Cpp
Standard: Cpp11
AccessModifierOffset: -4
AlignConsecutiveDeclarations: false
AlignConsecutiveAssignments: false
AlignTrailingComments: true
AllowShortBlocksOnASingleLine: true
AllowShortCaseLabelsOnASingleLine: true
AllowShortFunctionsOnASingleLine: Inline
AlwaysBreakBeforeMultilineStrings: true
AlwaysBreakTemplateDeclarations: true
BinPackArguments: false
BinPackParameters: false
BreakBeforeBraces: Allman
BreakConstructorInitializersBeforeComma: true
ColumnLimit: 100
#CommentPragmas: '.*'
IndentCaseLabels: false
IndentWrappedFunctionNames: true
KeepEmptyLinesAtTheStartOfBlocks: false
NamespaceIndentation: All
PointerAlignment: Left
SpaceAfterCStyleCast: false
SpaceBeforeAssignmentOperators: true
SpaceBeforeParens: ControlStatements
SpaceInEmptyParentheses: false
SpacesInAngles: false
SpacesInCStyleCastParentheses: false
SpacesInParentheses: false
SpacesInSquareBrackets: false
SortIncludes: false
ReflowComments: true
IncludeCategories:
- Regex: '^".*'
Priority: 3
- Regex: '^<.*'
Priority: 2
SortIncludes: true
#
# OVERRIDE TO STYLE: Comments wrap.
#
BasedOnStyle: LLVM
IndentWidth: 4
UseTab: Never
Language: Cpp
Standard: Cpp11
AccessModifierOffset: -4
AlignConsecutiveDeclarations: false
AlignConsecutiveAssignments: false
AlignTrailingComments: true
AllowShortBlocksOnASingleLine: true
AllowShortCaseLabelsOnASingleLine: true
AllowShortFunctionsOnASingleLine: Inline
AlwaysBreakBeforeMultilineStrings: true
AlwaysBreakTemplateDeclarations: true
BinPackArguments: false
BinPackParameters: false
BreakBeforeBraces: Allman
BreakConstructorInitializersBeforeComma: true
ColumnLimit: 100
#CommentPragmas: '.*'
IndentCaseLabels: false
IndentWrappedFunctionNames: true
KeepEmptyLinesAtTheStartOfBlocks: false
NamespaceIndentation: All
PointerAlignment: Left
SpaceAfterCStyleCast: false
SpaceBeforeAssignmentOperators: true
SpaceBeforeParens: ControlStatements
SpaceInEmptyParentheses: false
SpacesInAngles: false
SpacesInCStyleCastParentheses: false
SpacesInParentheses: false
SpacesInSquareBrackets: false
SortIncludes: false
ReflowComments: true
IncludeCategories:
- Regex: '^".*'
Priority: 3
- Regex: '^<.*'
Priority: 2
SortIncludes: true
......@@ -1817,7 +1817,8 @@ shared_ptr<runtime::Executable>
if ((pad_below.at(0) == pad_above.at(0)) && (pad_below.at(1) == pad_above.at(1)))
{
// symmetric padding case temporarily excluded (custom kernel executed) due to stability issues
// symmetric padding case temporarily excluded (custom kernel executed) due to
// stability issues
const CoordinateDiff& pad_below_for = conv_op->get_padding_below_forward();
input_offset_xy = -pad_below_for.at(0);
}
......
......@@ -289,7 +289,7 @@ static CustomKernels::krnl_info do_create_variance_back(const string& output_nam
writer.block_begin();
{ // Main function body
gws.push_back(1); //input_shape.at(0));
gws.push_back(1); // input_shape.at(0));
// Channel axis loop
writer << "\nconst uint i" << channel_axis << " = get_global_id(" << channel_axis
<< "); /* channel_axis trip count " << input_shape.at(channel_axis) << "*/\n";
......
#
# OVERRIDE TO STYLE: Comments wrap.
#
BasedOnStyle: LLVM
IndentWidth: 4
UseTab: Never
Language: Cpp
Standard: Cpp11
AccessModifierOffset: -4
AlignConsecutiveDeclarations: false
AlignConsecutiveAssignments: false
AlignTrailingComments: true
AllowShortBlocksOnASingleLine: true
AllowShortCaseLabelsOnASingleLine: true
AllowShortFunctionsOnASingleLine: Inline
AlwaysBreakBeforeMultilineStrings: true
AlwaysBreakTemplateDeclarations: true
BinPackArguments: false
BinPackParameters: false
BreakBeforeBraces: Allman
BreakConstructorInitializersBeforeComma: true
ColumnLimit: 100
#CommentPragmas: '.*'
IndentCaseLabels: false
IndentWrappedFunctionNames: true
KeepEmptyLinesAtTheStartOfBlocks: false
NamespaceIndentation: All
PointerAlignment: Left
SpaceAfterCStyleCast: false
SpaceBeforeAssignmentOperators: true
SpaceBeforeParens: ControlStatements
SpaceInEmptyParentheses: false
SpacesInAngles: false
SpacesInCStyleCastParentheses: false
SpacesInParentheses: false
SpacesInSquareBrackets: false
SortIncludes: false
ReflowComments: true
IncludeCategories:
- Regex: '^".*'
Priority: 3
- Regex: '^<.*'
Priority: 2
SortIncludes: true
......@@ -38,7 +38,8 @@
// INTERPRETER_LOCAL is used for non-api symbols.
// #ifdef INTERPRETER_DLL // defined if INTERPRETER is compiled as a DLL
#ifdef INTERPRETER_DLL_EXPORTS // defined if we are building the INTERPRETER DLL (instead of using it)
#ifdef INTERPRETER_DLL_EXPORTS // defined if we are building the INTERPRETER DLL (instead of using
// it)
#define INTERPRETER_API INTERPRETER_HELPER_DLL_EXPORT
#else
#define INTERPRETER_API INTERPRETER_HELPER_DLL_IMPORT
......
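The INTERPRETER_API / INTERPRETER_LOCAL macros follow the usual shared-library symbol-visibility pattern. A generic, hedged sketch of that pattern with simplified MYLIB names (not the exact nGraph header):

// Generic visibility helpers: __declspec on Windows, ELF visibility attributes elsewhere.
#if defined(_WIN32) || defined(__CYGWIN__)
#define MYLIB_HELPER_DLL_EXPORT __declspec(dllexport)
#define MYLIB_HELPER_DLL_IMPORT __declspec(dllimport)
#define MYLIB_HELPER_DLL_LOCAL
#else
#define MYLIB_HELPER_DLL_EXPORT __attribute__((visibility("default")))
#define MYLIB_HELPER_DLL_IMPORT __attribute__((visibility("default")))
#define MYLIB_HELPER_DLL_LOCAL __attribute__((visibility("hidden")))
#endif

#ifdef MYLIB_DLL_EXPORTS // defined only while building the library itself
#define MYLIB_API MYLIB_HELPER_DLL_EXPORT
#else // consumers of the library import the symbols instead
#define MYLIB_API MYLIB_HELPER_DLL_IMPORT
#endif
#define MYLIB_LOCAL MYLIB_HELPER_DLL_LOCAL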
#
# OVERRIDE TO STYLE: Comments wrap.
#
BasedOnStyle: LLVM
IndentWidth: 4
UseTab: Never
Language: Cpp
Standard: Cpp11
AccessModifierOffset: -4
AlignConsecutiveDeclarations: false
AlignConsecutiveAssignments: false
AlignTrailingComments: true
AllowShortBlocksOnASingleLine: true
AllowShortCaseLabelsOnASingleLine: true
AllowShortFunctionsOnASingleLine: Inline
AlwaysBreakBeforeMultilineStrings: true
AlwaysBreakTemplateDeclarations: true
BinPackArguments: false
BinPackParameters: false
BreakBeforeBraces: Allman
BreakConstructorInitializersBeforeComma: true
ColumnLimit: 100
#CommentPragmas: '.*'
IndentCaseLabels: false
IndentWrappedFunctionNames: true
KeepEmptyLinesAtTheStartOfBlocks: false
NamespaceIndentation: All
PointerAlignment: Left
SpaceAfterCStyleCast: false
SpaceBeforeAssignmentOperators: true
SpaceBeforeParens: ControlStatements
SpaceInEmptyParentheses: false
SpacesInAngles: false
SpacesInCStyleCastParentheses: false
SpacesInParentheses: false
SpacesInSquareBrackets: false
SortIncludes: false
ReflowComments: true
IncludeCategories:
- Regex: '^".*'
Priority: 3
- Regex: '^<.*'
Priority: 2
SortIncludes: true
#
# OVERRIDE TO STYLE: Comments wrap.
#
BasedOnStyle: LLVM
IndentWidth: 4
UseTab: Never
Language: Cpp
Standard: Cpp11
AccessModifierOffset: -4
AlignConsecutiveDeclarations: false
AlignConsecutiveAssignments: false
AlignTrailingComments: true
AllowShortBlocksOnASingleLine: true
AllowShortCaseLabelsOnASingleLine: true
AllowShortFunctionsOnASingleLine: Inline
AlwaysBreakBeforeMultilineStrings: true
AlwaysBreakTemplateDeclarations: true
BinPackArguments: false
BinPackParameters: false
BreakBeforeBraces: Allman
BreakConstructorInitializersBeforeComma: true
ColumnLimit: 100
#CommentPragmas: '.*'
IndentCaseLabels: false
IndentWrappedFunctionNames: true
KeepEmptyLinesAtTheStartOfBlocks: false
NamespaceIndentation: All
PointerAlignment: Left
SpaceAfterCStyleCast: false
SpaceBeforeAssignmentOperators: true
SpaceBeforeParens: ControlStatements
SpaceInEmptyParentheses: false
SpacesInAngles: false
SpacesInCStyleCastParentheses: false
SpacesInParentheses: false
SpacesInSquareBrackets: false
SortIncludes: false
ReflowComments: true
IncludeCategories:
- Regex: '^".*'
Priority: 3
- Regex: '^<.*'
Priority: 2
SortIncludes: true
......@@ -32,13 +32,16 @@
#endif
#endif
// Now we use the generic helper definitions above to define PLAIDML_BACKEND_API and PLAIDML_BACKEND_LOCAL.
// Now we use the generic helper definitions above to define PLAIDML_BACKEND_API and
// PLAIDML_BACKEND_LOCAL.
//
// PLAIDML_BACKEND_API is used for the public API symbols. It either DLL imports or DLL exports
// (or does nothing for static build)
// PLAIDML_BACKEND_LOCAL is used for non-api symbols.
// #ifdef PLAIDML_BACKEND_DLL // defined if PLAIDML_BACKEND is compiled as a DLL
#ifdef PLAIDML_BACKEND_DLL_EXPORTS // defined if we are building the PLAIDML_BACKEND DLL (instead of using it)
#ifdef PLAIDML_BACKEND_DLL_EXPORTS // defined if we are building the PLAIDML_BACKEND DLL (instead of
// using it)
#define PLAIDML_BACKEND_API PLAIDML_BACKEND_HELPER_DLL_EXPORT
#else
#define PLAIDML_BACKEND_API PLAIDML_BACKEND_HELPER_DLL_IMPORT
......
......@@ -50,6 +50,7 @@ public:
private:
std::mutex m_mu;
// N.B. The key here is the original source function, *not* the copy that's been processed by the compilation passes.
// N.B. The key here is the original source function, *not* the copy that's been processed by
// the compilation passes.
std::unordered_map<std::shared_ptr<Function>, std::shared_ptr<PlaidML_Executable>> m_cache;
};
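The cache described by this comment is a mutex-guarded map keyed on the original source function. A hedged sketch of the lookup-or-compile pattern it implies, with simplified stand-in types (not the actual PlaidML backend code):

#include <memory>
#include <mutex>
#include <unordered_map>

struct Function {};   // stand-in for ngraph::Function
struct Executable {}; // stand-in for PlaidML_Executable

class CompilationCache
{
public:
    template <typename Compiler>
    std::shared_ptr<Executable> get_or_compile(const std::shared_ptr<Function>& source,
                                               Compiler compile)
    {
        std::lock_guard<std::mutex> lock(m_mu);
        auto it = m_cache.find(source);
        if (it != m_cache.end())
        {
            return it->second;
        }
        // Key on the original source function, not on any pass-processed copy of it.
        auto exe = compile(source);
        m_cache.emplace(source, exe);
        return exe;
    }

private:
    std::mutex m_mu;
    std::unordered_map<std::shared_ptr<Function>, std::shared_ptr<Executable>> m_cache;
};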
......@@ -28,9 +28,10 @@ namespace ngraph
{
namespace op
{
// Implements NumPy-style broadcast semantics by passing its single argument through to its
// output and pretending that this changes the shape. The creator of this node is responsible
// for ensuring that the downstream operation will perform a NumPy-style broadcast.
// Implements NumPy-style broadcast semantics by passing its single argument through
// to its output and pretending that this changes the shape. The creator of this
// node is responsible for ensuring that the downstream operation will perform a
// NumPy-style broadcast.
class ImplicitBroadcast;
}
}
......
......@@ -75,8 +75,8 @@ namespace ngraph
}
}))
.set(builder::ContractionInput{"I"}.add_indices("d", 0, dim_limit)))
.add( // Compare the input against the (broadcasted) max values, and select the indices
// where the max val occurs
.add( // Compare the input against the (broadcasted) max values, and select
// the indices where the max val occurs
builder::Elementwise{"SelValIdxs",
"I == SelVal ? index(I, " + reduction_axis_str +
") : D" + reduction_axis_str})
......
......@@ -52,8 +52,8 @@ ngraph::runtime::plaidml::pass::ImplicitBroadcast::ImplicitBroadcast()
if (src_shape.size())
{
// Create a reshape operation to get the right target broadcast shape. (Note that a zero-D tensor
// or constant can be passed directly into the ImplicitBroadcast op).
// Create a reshape operation to get the right target broadcast shape. (Note that a
// zero-D tensor or constant can be passed directly into the ImplicitBroadcast op).
AxisVector reshape_order;
Shape reshape_shape;
std::size_t input_dim = 0;
......@@ -76,9 +76,9 @@ ngraph::runtime::plaidml::pass::ImplicitBroadcast::ImplicitBroadcast()
auto implicit_broadcast =
std::make_shared<plaidml::op::ImplicitBroadcast>(src, broadcast->get_shape());
// N.B. We don't use replace_node() here, since it's important to only replace the broadcast with an
// implicit broadcast when the consuming operation is an elementwise operation, since PlaidML
// contractions don't provide implicit broadcast semantics.
// N.B. We don't use replace_node() here, since it's important to only replace the broadcast
// with an implicit broadcast when the consuming operation is an elementwise operation,
// since PlaidML contractions don't provide implicit broadcast semantics.
bool result = false;
for (size_t i = 0; i < broadcast->get_output_size(); ++i)
{
......