Commit 36dd64ad authored by Adam Procter

Merge remote-tracking branch 'origin/master' into r0.9

parents 413e9617 c579e245
......@@ -68,7 +68,7 @@ function build_ngraph() {
make install || return 1
cd "${ngraph_directory}/ngraph/python"
if [ ! -d ./pybind11 ]; then
git clone --recursive -b allow-nonconstructible-holders https://github.com/jagerman/pybind11.git
git clone --recursive https://github.com/pybind/pybind11.git
fi
export PYBIND_HEADERS_PATH="${ngraph_directory}/ngraph/python/pybind11"
export NGRAPH_CPP_BUILD_PATH="${ngraph_directory}/ngraph_dist"
......
......@@ -39,7 +39,7 @@ RUN make install
# Prepare nGraph Python API
WORKDIR /root/ngraph/python
RUN git clone --recursive -b allow-nonconstructible-holders https://github.com/jagerman/pybind11.git
RUN git clone --recursive https://github.com/pybind/pybind11.git
ENV NGRAPH_CPP_BUILD_PATH /root/ngraph_dist
ENV LD_LIBRARY_PATH /root/ngraph_dist/lib
ENV PYBIND_HEADERS_PATH /root/ngraph/python/pybind11
......
......@@ -558,3 +558,16 @@ bool ngraph::is_strided(const Strides& strides)
{
return std::any_of(strides.begin(), strides.end(), [](size_t stride) { return stride != 1; });
}
bool ngraph::is_valid_rank(const std::shared_ptr<Node>& node, std::vector<size_t> valid_ranks)
{
auto node_rank = node->get_shape().size();
for (auto rank : valid_ranks)
{
if (rank == node_rank)
{
return true;
}
}
return false;
}
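
For context, a minimal sketch of how this helper is meant to be called (the parameter node here is hypothetical; any node with a static shape works):

// Hypothetical caller: accept only 2D or 3D tensors, as the RNN fusion pass below does.
auto param = std::make_shared<op::Parameter>(element::f32, Shape{4, 8});
if (!ngraph::is_valid_rank(param, {2, 3}))
{
    throw ngraph_error("expected a 2D or 3D tensor");
}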
......@@ -315,4 +315,6 @@ namespace ngraph
bool possibly_overwritten(Node* node);
bool is_strided(const Strides& strides);
bool is_valid_rank(const std::shared_ptr<Node>& node, std::vector<size_t> valid_ranks);
}
......@@ -103,10 +103,11 @@ namespace ngraph
void validate_and_infer_elementwise_logical();
Node(const std::string& node_type, const NodeVector& arguments, size_t output_size = 1);
virtual ~Node();
virtual void generate_adjoints(autodiff::Adjoints& adjoints, const NodeVector& deltas) {}
public:
virtual ~Node();
void revalidate_and_infer_types() { validate_and_infer_types(); }
// Called after transition
void delayed_validate_and_infer_types();
......
......@@ -110,6 +110,7 @@ set(SRC
pass/cpu_post_layout_optimizations.cpp
pass/cpu_rnn_fusion.cpp
pass/cpu_workspace_insertion.cpp
pass/cpu_reshape_sinking.cpp
)
if (NOT NGRAPH_DEX_ONLY)
......
......@@ -19,6 +19,7 @@
#include "ngraph/runtime/cpu/kernel/softmax.hpp"
#include "ngraph/runtime/cpu/mkldnn_invoke.hpp"
#include "ngraph/runtime/cpu/mkldnn_utils.hpp"
#include "ngraph/runtime/reference/softmax.hpp"
using namespace std;
using namespace ngraph;
......@@ -131,8 +132,35 @@ namespace ngraph
};
functors.emplace_back(functor);
}
else if (arg_shape.size() == 4 && axes.size() == 3)
{
std::function<decltype(runtime::cpu::kernel::softmax_4d_3rd<float>)> kernel;
SELECT_KERNEL(kernel,
args[0].get_element_type(),
runtime::cpu::kernel::softmax_4d_3rd);
auto functor = [&, kernel, arg_shape, axes](CPURuntimeContext* ctx) {
kernel(arg_tensor, out_tensor, arg_shape, axes);
};
functors.emplace_back(functor);
}
else if (softmax->get_element_type() == element::f32)
{
NGRAPH_WARN << "Falling back to refernce kernel for softmax " << arg_shape
<< " over " << axes;
auto functor = [&, arg_shape, axes](CPURuntimeContext* ctx) {
runtime::reference::softmax<float>(static_cast<float*>(arg_tensor),
static_cast<float*>(out_tensor),
arg_shape,
axes);
};
functors.emplace_back(functor);
}
else
{
NGRAPH_ERR << "Unsupported Softmax " << arg_shape << " over " << axes
<< " in cpu buiilder";
throw ngraph_error("Unsupported Softmax");
}
}
......
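
The reference fallback above computes the usual numerically stable softmax over the reduction axes. As an illustration only (not the actual runtime::reference::softmax, which handles arbitrary axis sets and ranks), the computation for the common case of reducing over the last axis of a row-major buffer looks like this:

#include <algorithm>
#include <cmath>
#include <cstddef>

// Softmax over the last axis of a row-major [rows x cols] buffer.
void softmax_last_axis(const float* in, float* out, size_t rows, size_t cols)
{
    for (size_t r = 0; r < rows; r++)
    {
        const float* row_in = in + r * cols;
        float* row_out = out + r * cols;
        // Subtract the row max before exponentiating for numerical stability.
        float max_val = *std::max_element(row_in, row_in + cols);
        float sum = 0.0f;
        for (size_t c = 0; c < cols; c++)
        {
            row_out[c] = std::exp(row_in[c] - max_val);
            sum += row_out[c];
        }
        for (size_t c = 0; c < cols; c++)
        {
            row_out[c] /= sum;
        }
    }
}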
......@@ -126,6 +126,7 @@
#include "ngraph/pass/manager.hpp"
#include "ngraph/pass/memory_layout.hpp"
#include "ngraph/pass/nop_elimination.hpp"
#include "ngraph/pass/zero_dim_tensor_elimination.hpp"
#include "ngraph/runtime/aligned_buffer.hpp"
#include "ngraph/runtime/cpu/cpu_backend.hpp"
#include "ngraph/runtime/cpu/cpu_builder.hpp"
......@@ -1001,6 +1002,7 @@ void runtime::cpu::CPU_ExternalFunction::register_common_passes(ngraph::pass::Ma
{
pass_manager.register_pass<ngraph::pass::LikeReplacement>();
pass_manager.register_pass<ngraph::pass::NopElimination>();
pass_manager.register_pass<ngraph::pass::ZeroDimTensorElimination>();
// TODO (pruthvi): Enable all the disabled RNN fusion graph passes after fixing
// failing mxnet unit tests.
// pass_manager.register_pass<runtime::cpu::pass::LSTMFusion>();
......@@ -1013,7 +1015,7 @@ void runtime::cpu::CPU_ExternalFunction::register_common_passes(ngraph::pass::Ma
pass_manager.register_pass<ngraph::pass::CommonSubexpressionElimination>();
pass_manager.register_pass<ngraph::pass::CoreFusion>();
pass_manager.register_pass<runtime::cpu::pass::CPUFusion>();
pass_manager.register_pass<runtime::cpu::pass::CPUHorizontalFusion>();
// pass_manager.register_pass<runtime::cpu::pass::CPUHorizontalFusion>();
pass_manager.register_pass<runtime::cpu::pass::CPUCollapseDims>();
NodeVector nv_cwi; // We don't need CPUWorkspaceInsertion to return a list of indices
pass_manager.register_pass<runtime::cpu::pass::CPUWorkspaceInsertion>(nv_cwi, false);
......
......@@ -16,6 +16,9 @@
#pragma once
#include <cstddef>
#include <cstring>
namespace ngraph
{
namespace runtime
......
......@@ -21,6 +21,7 @@
#include "ngraph/axis_set.hpp"
#include "ngraph/runtime/cpu/kernel/eigen_thread_pool.hpp"
#include "ngraph/shape.hpp"
namespace ngraph
{
......@@ -147,6 +148,15 @@ namespace ngraph
{
softmax<ElementType, 3, 2>(input, output, input_shape, softmax_axes);
}
template <typename ElementType>
void softmax_4d_3rd(void* input,
void* output,
const Shape& input_shape,
const AxisSet& softmax_axes)
{
softmax<ElementType, 4, 3>(input, output, input_shape, softmax_axes);
}
}
}
}
......
......@@ -311,6 +311,20 @@ bool runtime::cpu::mkldnn_utils::is_perm_sorted(const Strides& a, const AxisVect
mkldnn::memory::desc runtime::cpu::mkldnn_utils::create_blocked_mkldnn_md(
const Shape& dims, const Strides& strides, const ngraph::element::Type type)
{
if (dims.size() > TENSOR_MAX_DIMS || strides.size() > TENSOR_MAX_DIMS)
{
throw ngraph_error("In create_blocked_mkldnn_md: Dimensions (dims, stride): (" +
std::to_string(dims.size()) + ", " + std::to_string(strides.size()) +
") exceed maximum supported by MKLDNN " +
std::to_string(TENSOR_MAX_DIMS));
}
if (dims.size() != strides.size())
{
throw ngraph_error("In create_blocked_mkldnn_md: Rank mismatch between shape and strides " +
std::to_string(dims.size()) + " " + std::to_string(strides.size()));
}
memory::dims dim(dims.begin(), dims.end());
memory::dims stride(strides.begin(), strides.end());
memory::data_type dtype = get_mkldnn_data_type(type);
......@@ -515,7 +529,7 @@ memory::desc runtime::cpu::mkldnn_utils::expand_blocked_md(const memory::desc& i
size_t k = 0;
for (size_t i = 0, j = 0; j < md.ndims; j++)
{
if (j == axis_list[k])
if (k < axis_list.size() && j == axis_list[k])
{
k++;
md.dims[j] = 1;
......@@ -531,7 +545,8 @@ memory::desc runtime::cpu::mkldnn_utils::expand_blocked_md(const memory::desc& i
}
else
{
md.layout_desc.blocking.strides[1][j] = 0;
md.layout_desc.blocking.strides[1][j] =
in.data.layout_desc.blocking.strides[0][in.data.ndims - 1];
size_t nelems = 1;
for (size_t idx = 0; idx < in.data.ndims; idx++)
nelems *= in.data.dims[idx];
......
......@@ -124,7 +124,7 @@ void ngraph::runtime::cpu::pass::CPUHorizontalFusion::cpu_conv_horizontal_fusion
NGRAPH_DEBUG << "conv_horizontal_fusion: slice shape " << slice_shape << "\n";
auto lower_bounds = Coordinate{0, index, 0, 0};
index += slice_shape[1];
auto upper_bounds = Coordinate{slice_shape[0], index, slice_shape[2], slice_shape[2]};
auto upper_bounds = Coordinate{slice_shape[0], index, slice_shape[2], slice_shape[3]};
NGRAPH_DEBUG << "conv_horizontal_fusion: lower_bounds " << lower_bounds << "\n";
NGRAPH_DEBUG << "conv_horizontal_fusion: upper_bounds " << upper_bounds << "\n";
auto slice =
......
......@@ -1533,7 +1533,18 @@ namespace ngraph
}
else
{
set_native_layouts(external_function, node);
if (mkldnn_utils::get_input_mkldnn_md(node.get(), 0).data.format ==
mkldnn_format_undef)
{
set_native_layouts(external_function, node);
}
else
{
auto input_md = mkldnn_utils::get_input_mkldnn_md(node.get(), 0);
vector<memory::desc> o_mds;
o_mds.push_back(input_md);
set_output_layouts(node, o_mds);
}
}
}
......@@ -1775,33 +1786,36 @@ namespace ngraph
auto result_shape = slice->get_output_shape(0);
auto input_md = mkldnn_utils::get_input_mkldnn_md(node.get(), 0);
auto input_pd = mkldnn::memory::primitive_desc(
input_md, runtime::cpu::mkldnn_utils::global_cpu_engine);
auto dims = mkldnn::memory::dims(result_shape.begin(), result_shape.end());
auto offsets =
mkldnn::memory::dims(lower_bounds.begin(), lower_bounds.end());
NGRAPH_DEBUG << "input memory format: " << input_md.data.format << "\n";
auto result_format =
static_cast<mkldnn::memory::format>(input_md.data.format);
try
{
// MKLDNN currently doesn't support views for blocked layouts
// when the dims and offsets are not divisible by the block size
auto view_md = mkldnn::view::primitive_desc(input_pd, dims, offsets)
.dst_primitive_desc()
.desc();
vector<memory::desc> o_mds;
o_mds.push_back(view_md);
set_output_layouts(node, o_mds);
}
catch (const mkldnn::error& e)
// check lower bounds and output shape
for (auto i = 0; i < input_md.data.ndims; i++)
{
if (e.status == mkldnn_unimplemented)
auto block_size = input_md.data.layout_desc.blocking.block_dims[i];
if (block_size != 0 && (lower_bounds[i] % block_size != 0 ||
result_shape[i] % block_size != 0))
{
NGRAPH_DEBUG << "slice: number of channels in lower bounds or "
"output shape is not multiple of block size, "
"set native layout\n";
set_native_layouts(external_function, node);
return;
}
else
{
throw ngraph_error(e.message);
}
}
if (result_format == mkldnn::memory::blocked)
{
set_native_layouts(external_function, node);
}
else
{
vector<memory::desc> o_mds;
auto result_desc = mkldnn_utils::create_default_mkldnn_md(
node.get(), 0, true, result_format);
o_mds.push_back(result_desc);
set_output_layouts(node, o_mds);
}
}
else
......
......@@ -23,6 +23,7 @@
#include <unordered_map>
#include "cpu_mat_fusion.hpp"
#include "ngraph/graph_util.hpp"
#include "ngraph/op/add.hpp"
#include "ngraph/op/broadcast.hpp"
#include "ngraph/op/concat.hpp"
......@@ -147,6 +148,26 @@ bool runtime::cpu::pass::CPURnnMatFusion::run_on_function(std::shared_ptr<Functi
auto matched_weight = matcher_v2->get_pattern_map()[W]->get_argument(0);
auto matched_data = matcher_v2->get_pattern_map()[input_data];
auto matched_bias = matcher_v2->get_pattern_map()[b]->get_argument(0);
std::vector<size_t> supported_ranks{2, 3};
if (!ngraph::is_valid_rank(matcher_v2->get_match_root(), supported_ranks))
{
NGRAPH_DEBUG << "Add (mat_fusion_v2) " << matcher_v2->get_match_root()->get_name()
<< " isn't 2D or 3D";
continue;
}
if (!ngraph::is_valid_rank(matched_weight, supported_ranks))
{
NGRAPH_DEBUG << "Weights (mat_fusion_v2) " << matched_weight << " isn't 2D or 3D";
continue;
}
if (!ngraph::is_valid_rank(matched_data, supported_ranks))
{
NGRAPH_DEBUG << "Data (mat_fusion_v2) " << matched_data << " isn't 2D or 3D";
continue;
}
map_weights_to_pattern[matched_weight].push_back(matcher_v2->get_match_root());
map_weights_bias_to_data[std::make_pair(matched_weight, matched_bias)].push_back(
matched_data);
......@@ -234,6 +255,7 @@ bool runtime::cpu::pass::CPURnnMatFusion::run_on_function(std::shared_ptr<Functi
concated_data, data_order, Shape{data_shape[0] * data_shape[1], data_shape[2]});
}
auto new_input_node = data_shape.size() == 2 ? concated_data : input_reshape_node;
NGRAPH_ASSERT(new_input_node);
auto w_reshape_node = std::make_shared<op::Reshape>(
weights, AxisVector{1, 0}, Shape{w_shape[1], w_shape[0]});
auto new_dot = std::make_shared<op::Dot>(new_input_node, w_reshape_node);
......@@ -248,8 +270,15 @@ bool runtime::cpu::pass::CPURnnMatFusion::run_on_function(std::shared_ptr<Functi
size_t end_index = batch_size;
for (auto& matched_root_node : map_weights_to_pattern[weights])
{
auto slice_node = std::make_shared<op::Slice>(
std::shared_ptr<Node> slice_node = std::make_shared<op::Slice>(
new_add_bias, Coordinate{start_index, 0}, Coordinate{end_index, shape_axis_1});
if (matched_root_node->get_shape().size() != 2)
{
NGRAPH_ASSERT(matched_root_node->get_shape().size() == 3);
slice_node = std::make_shared<op::Reshape>(
slice_node, AxisVector{0, 1}, matched_root_node->get_shape());
}
start_index += batch_size;
end_index += batch_size;
NGRAPH_DEBUG << "Replacing op " << matched_root_node->get_name() << " with "
......
//*****************************************************************************
// Copyright 2017-2018 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#pragma once
#include "ngraph/pass/pass.hpp"
namespace ngraph
{
namespace runtime
{
namespace cpu
{
namespace pass
{
class CPUReshapeSinking : public ngraph::pass::FunctionPass
{
public:
bool run_on_function(std::shared_ptr<ngraph::Function> function) override;
};
}
}
}
}
......@@ -170,13 +170,15 @@ size_t runtime::gpu::CUDAEmitter::build_concat(const std::vector<std::string>& d
size_t runtime::gpu::CUDAEmitter::build_onehot(const std::array<std::string, 2>& dtypes,
NVShape input_shape,
NVShape output_shape,
size_t one_hot_axis)
size_t one_hot_axis,
size_t output_datatype_size)
{
std::stringstream kernel_name;
kernel_name << "onehot_" << join(dtypes, "_");
std::string hash = kernel_name.str() + "_i_" + join(input_shape, "_") + "_o_" +
join(output_shape, "_") + std::to_string(one_hot_axis);
join(output_shape, "_") + "_axis_" + std::to_string(one_hot_axis) +
"_datasize_" + std::to_string(output_datatype_size);
// For backwards compatibility we currently use two unordered maps
// 1. one looks up the compiled cuda kernel (CudaFunctionPool)
// 2. the other looks to see if this kernel is already in the primitive list
......@@ -206,18 +208,19 @@ size_t runtime::gpu::CUDAEmitter::build_onehot(const std::array<std::string, 2>&
uint32_t block_size_x = 64;
uint32_t aligned_grid_size_x = align_to_block_size(nthreads, block_size_x);
uint32_t repeat_times = static_cast<uint32_t>(output_shape[one_hot_axis]);
uint32_t repeat_size = 1;
uint32_t hot_axis_shape = static_cast<uint32_t>(output_shape[one_hot_axis]);
uint32_t hot_axis_stride = 1;
for (size_t i = one_hot_axis + 1; i < output_shape.size(); i++)
{
repeat_size *= output_shape[i];
hot_axis_stride *= output_shape[i];
}
uint32_t output_size = static_cast<uint32_t>(shape_size(output_shape) * output_datatype_size);
// create the launch primitive
std::unique_ptr<gpu::primitive> kernel_launch(
new gpu::primitive{[=](void** inputs, void** outputs) mutable {
std::vector<void*> args_list{
&inputs[0], &outputs[0], &repeat_size, &repeat_times, &nthreads};
&inputs[0], &outputs[0], &hot_axis_stride, &hot_axis_shape, &nthreads};
runtime::gpu::cuda_memset(outputs[0], 0, output_size);
CUDA_SAFE_CALL(cuLaunchKernel(*compiled_kernel.get(),
aligned_grid_size_x,
1,
......@@ -1780,8 +1783,9 @@ size_t runtime::gpu::CUDAEmitter::build_primitive(const op::Softmax* node)
auto output_type = out[0].get_element_type().c_type_string();
auto exp_index = build_elementwise<ngraph::op::Exp>({input_type, output_type}, input_shape);
std::vector<element::Type> dtypes{args[0].get_element_type(), out[0].get_element_type()};
auto reduce_index = cudnn_emitter->build_reduce_forward(
CUDNN_REDUCE_TENSOR_ADD, output_type, input_shape, axes);
CUDNN_REDUCE_TENSOR_ADD, dtypes, input_shape, axes, CUDNNEmitter::ReductionMode::Reduce);
size_t divide_index = build_softmax_divide(
std::vector<std::string>(3, output_type), input_shape, reduced_shape, axes_flag);
......
......@@ -99,7 +99,8 @@ namespace ngraph
size_t build_onehot(const std::array<std::string, 2>& dtypes,
NVShape input_shape,
NVShape output_shape,
size_t one_hot_axis);
size_t one_hot_axis,
size_t output_datatype_size);
size_t build_reverse(const std::array<std::string, 2>& dtypes,
NVShape input_shape,
......
......@@ -72,6 +72,19 @@ namespace ngraph
Backward
};
enum class ReductionMode
{
Reduce,
ArgReduce
};
enum class algo_search
{
HEURISTIC,
EXPLICIT,
NONE
};
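
A note on ReductionMode: ArgReduce corresponds to asking cuDNN for the flattened indices of the reduced elements rather than their values. A hedged sketch of the descriptor setup difference (cudnnSetReduceTensorDescriptor and its enums are standard cuDNN; reduce_desc, reduce_op and mode are assumed to be in scope):

// Reduce keeps the reduced values; ArgReduce requests flattened indices instead.
CUDNN_SAFE_CALL(cudnnSetReduceTensorDescriptor(
    reduce_desc,
    reduce_op, // e.g. CUDNN_REDUCE_TENSOR_MAX
    CUDNN_DATA_FLOAT,
    CUDNN_NOT_PROPAGATE_NAN,
    mode == ReductionMode::ArgReduce ? CUDNN_REDUCE_TENSOR_FLATTENED_INDICES
                                     : CUDNN_REDUCE_TENSOR_NO_INDICES,
    CUDNN_32BIT_INDICES));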
size_t build_convolution(const std::string& dtype,
const Shape& input_tensor_shape,
const Shape& input_filter_shape,
......@@ -79,30 +92,33 @@ namespace ngraph
const Strides& window_movement_strides,
const Strides& window_dilation_strides,
const Shape& padding_below,
const bool find_algo = false);
size_t build_convolution_backward_data(const std::string& dtype,
const Shape& input_filter_shape,
const Shape& input_tensor_shape,
const Shape& output_tensor_shape,
const Strides& window_movement_strides,
const Strides& window_dilation_strides,
const Shape& padding_below,
const bool find_algo = false);
size_t build_convolution_backward_filter(const std::string& dtype,
const Shape& input_tensor_shape_0,
const Shape& input_tensor_shape_1,
const Shape& output_filter_shape,
const Strides& window_movement_strides,
const Strides& window_dilation_strides,
const Shape& padding_below,
const bool find_algo = false);
const algo_search find_algo = algo_search::NONE);
size_t build_convolution_backward_data(
const std::string& dtype,
const Shape& input_filter_shape,
const Shape& input_tensor_shape,
const Shape& output_tensor_shape,
const Strides& window_movement_strides,
const Strides& window_dilation_strides,
const Shape& padding_below,
const algo_search find_algo = algo_search::NONE);
size_t build_convolution_backward_filter(
const std::string& dtype,
const Shape& input_tensor_shape_0,
const Shape& input_tensor_shape_1,
const Shape& output_filter_shape,
const Strides& window_movement_strides,
const Strides& window_dilation_strides,
const Shape& padding_below,
const algo_search find_algo = algo_search::NONE);
size_t build_reduce_forward(const cudnnReduceTensorOp_t& reduce_op,
const std::string& dtype,
const std::vector<element::Type>& dtypes,
const Shape& input_shape,
const AxisSet& reduction_axes);
const AxisSet& reduction_axes,
const ReductionMode& reduction_mode);
size_t build_tensor_op(const cudnnOpTensorOp_t& tensor_op,
const std::string& dtype,
......@@ -154,6 +170,7 @@ namespace ngraph
void* get_data_by_type(cudnnDataType_t data_type, double value);
cudnnDataType_t get_cudnn_datatype(std::string dtype);
cudnnDataType_t get_cudnn_datatype(const element::Type& dtype);
cudnnTensorDescriptor_t&
tensor_descriptor_from_shape(const Shape& shape,
......@@ -178,6 +195,24 @@ namespace ngraph
cudnnConvolutionMode_t mode,
cudnnDataType_t data_type);
template <typename PERF_TYPE, typename ALGO_TYPE>
ALGO_TYPE
select_cudnn_algo(const std::vector<PERF_TYPE>& perf_results,
size_t workspace_byte = std::numeric_limits<size_t>::max())
{
for (auto i = 0; i != perf_results.size(); ++i)
{
auto const& result = perf_results[i];
if (result.status == CUDNN_STATUS_SUCCESS &&
result.memory <= workspace_byte)
{
return result.algo;
}
}
throw ngraph_error(
"No suitable cuDNN algorithm was found for the requested operation.");
}
CUDNNDescriptors m_descriptors;
CUDNNHostParameters m_host_parameters;
......
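
For reference, a hedged sketch of how select_cudnn_algo above would typically be driven (cudnnFindConvolutionForwardAlgorithm and cudnnConvolutionFwdAlgoPerf_t are standard cuDNN APIs; the handle, descriptors, and CUDNN_SAFE_CALL wrapper are assumed to be set up elsewhere):

// Assumes handle, x_desc, w_desc, conv_desc and y_desc are already configured.
int returned = 0;
std::vector<cudnnConvolutionFwdAlgoPerf_t> results(CUDNN_CONVOLUTION_FWD_ALGO_COUNT);
CUDNN_SAFE_CALL(cudnnFindConvolutionForwardAlgorithm(
    handle, x_desc, w_desc, conv_desc, y_desc,
    static_cast<int>(results.size()), &returned, results.data()));
results.resize(returned);
// Pick the fastest algorithm whose workspace fits; throws if none qualifies.
auto algo = select_cudnn_algo<cudnnConvolutionFwdAlgoPerf_t,
                              cudnnConvolutionFwdAlgo_t>(results);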
......@@ -435,15 +435,24 @@ void runtime::gpu::CudaKernelBuilder::get_onehot_op(codegen::CodeWriter& writer,
const std::array<std::string, 2>& data_types)
{
writer << "extern \"C\" __global__ void cuda_" << name << "(" << data_types[0] << "* in, "
<< data_types[1] << "* out, uint32_t m, uint32_t k, uint32_t n)\n";
<< data_types[1]
<< "* out, uint32_t hot_axis_stride, uint32_t hot_axis_shape, uint32_t n)\n";
writer.block_begin();
{
writer << "uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n";
writer << "if (tid < n)\n";
writer.block_begin();
{
writer << "uint32_t idx = (tid / m) * m * k + (m * in[tid]) + tid % m;\n";
writer << "out[idx] = 1;\n";
writer << "int32_t in_pixel = static_cast<int32_t>(in[tid]);\n";
writer << "if(in_pixel >= 0 && in_pixel < hot_axis_shape)\n";
writer.block_begin();
{
writer << "uint32_t idx = tid / hot_axis_stride * hot_axis_stride * hot_axis_shape "
"+ (hot_axis_stride * in_pixel) + tid % "
"hot_axis_stride;\n";
writer << "out[idx] = 1;\n";
}
writer.block_end();
}
writer.block_end();
}
......
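
The rewritten one-hot kernel above bounds-checks each input index before scattering. A host-side C++ equivalent of the same index arithmetic, for clarity (illustrative only; the real kernel runs one CUDA thread per input element, and the builder zero-fills the output beforehand via cuda_memset):

void one_hot_host(const int32_t* in, float* out, uint32_t n,
                  uint32_t hot_axis_stride, uint32_t hot_axis_shape)
{
    for (uint32_t tid = 0; tid < n; tid++)
    {
        int32_t in_pixel = in[tid];
        // Out-of-range class indices are silently skipped, matching the kernel.
        if (in_pixel >= 0 && in_pixel < static_cast<int32_t>(hot_axis_shape))
        {
            uint32_t idx = tid / hot_axis_stride * hot_axis_stride * hot_axis_shape +
                           hot_axis_stride * in_pixel + tid % hot_axis_stride;
            out[idx] = 1;
        }
    }
}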
......@@ -164,12 +164,45 @@ void runtime::gpu::GPU_Emitter::emit_And(EMIT_ARGS)
void runtime::gpu::GPU_Emitter::emit_ArgMax(EMIT_ARGS)
{
throw unsupported_op("Unsupported op '" + node->description() + "'");
cudnnReduceTensorOp_t reduce_op = CUDNN_REDUCE_TENSOR_MAX;
runtime::gpu::GPU_Emitter::emit_ArgReduce(
external_function, writer, node, args, out, reduce_op);
}
void runtime::gpu::GPU_Emitter::emit_ArgMin(EMIT_ARGS)
{
throw unsupported_op("Unsupported op '" + node->description() + "'");
cudnnReduceTensorOp_t reduce_op = CUDNN_REDUCE_TENSOR_MIN;
runtime::gpu::GPU_Emitter::emit_ArgReduce(
external_function, writer, node, args, out, reduce_op);
}
void runtime::gpu::GPU_Emitter::emit_ArgReduce(EMIT_ARGS, cudnnReduceTensorOp_t reduce_mode)
{
if (out[0].get_size() == 0)
{
return;
}
auto argmax = static_cast<const ngraph::op::ArgMax*>(node);
std::vector<size_t> axes{argmax->get_reduction_axis()};
auto axis_set = AxisSet(axes);
std::vector<element::Type> dtypes{args[0].get_element_type(), out[0].get_element_type()};
writer.block_begin();
{
auto& cudnn_emitter = external_function->get_primitive_emitter()->get_cudnn_emitter();
auto index = cudnn_emitter->build_reduce_forward(reduce_mode,
dtypes,
args[0].get_shape(),
axis_set,
CUDNNEmitter::ReductionMode::ArgReduce);
writer << "void* input[] = {" << node_names(args) << "};\n";
writer << "void* output[] = {" << node_names(out) << "};\n";
writer << "gpu::invoke_primitive(ctx, " << index << ", input, output);\n";
}
writer.block_end();
}
void runtime::gpu::GPU_Emitter::emit_Asin(EMIT_ARGS)
......@@ -792,13 +825,17 @@ void runtime::gpu::GPU_Emitter::emit_OneHot(EMIT_ARGS)
auto onehot = static_cast<const ngraph::op::OneHot*>(node);
auto arg_shape = args[0].get_shape();
auto result_shape = out[0].get_shape();
auto output_datatype_size = out[0].get_element_type().size();
size_t idx = onehot->get_one_hot_axis();
writer.block_begin();
{
auto& cuda_emitter = external_function->get_primitive_emitter()->get_cuda_emitter();
auto index = cuda_emitter->build_onehot(
{{args[0].get_type(), out[0].get_type()}}, arg_shape, result_shape, idx);
auto index = cuda_emitter->build_onehot({{args[0].get_type(), out[0].get_type()}},
arg_shape,
result_shape,
idx,
output_datatype_size);
writer.block_begin();
writer << "void* input[] = {" << node_names(args) << "};\n";
......@@ -852,6 +889,7 @@ void runtime::gpu::GPU_Emitter::emit_Power(EMIT_ARGS)
void runtime::gpu::GPU_Emitter::emit_Product(EMIT_ARGS)
{
const ngraph::op::Product* product = static_cast<const ngraph::op::Product*>(node);
writer.block_begin();
{
if (out[0].get_size() != 0)
......@@ -873,12 +911,16 @@ void runtime::gpu::GPU_Emitter::emit_Product(EMIT_ARGS)
// descriptors for tensors with <= 4 dimensions
else
{
std::vector<element::Type> dtypes{args[0].get_element_type(),
out[0].get_element_type()};
auto& cudnn_emitter =
external_function->get_primitive_emitter()->get_cudnn_emitter();
auto index = cudnn_emitter->build_reduce_forward(CUDNN_REDUCE_TENSOR_MUL,
out[0].get_type(),
args[0].get_shape(),
product->get_reduction_axes());
auto index =
cudnn_emitter->build_reduce_forward(CUDNN_REDUCE_TENSOR_MUL,
dtypes,
args[0].get_shape(),
product->get_reduction_axes(),
CUDNNEmitter::ReductionMode::Reduce);
writer << "void* input[] = {" << node_names(args) << "};\n";
writer << "void* output[] = {" << node_names(out) << "};\n";
......@@ -967,14 +1009,16 @@ void runtime::gpu::GPU_Emitter::emit_Reduce(EMIT_ARGS)
reduce_tensor_op = f_ptr->second;
}
}
std::vector<element::Type> dtypes{args[0].get_element_type(),
out[0].get_element_type()};
auto& cudnn_emitter =
external_function->get_primitive_emitter()->get_cudnn_emitter();
auto reduce_index =
cudnn_emitter->build_reduce_forward(reduce_tensor_op,
out[0].get_type(),
dtypes,
args[0].get_shape(),
reduce_op->get_reduction_axes());
reduce_op->get_reduction_axes(),
CUDNNEmitter::ReductionMode::Reduce);
writer << "void* input[] = {" << node_names(args) << "};\n";
writer << "void* output[] = {" << node_names(out) << "};\n";
......
......@@ -75,6 +75,8 @@ namespace ngraph
writer.block_end();
}
static void emit_ArgReduce(EMIT_ARGS, cudnnReduceTensorOp_t);
private:
/// \brief Create a list of node names for each arg in args
/// \param args list of tensor arguments
......
......@@ -16,6 +16,7 @@
#pragma once
#include <cinttypes>
#include <list>
namespace ngraph
......
......@@ -31,8 +31,6 @@ backwards_avgpool_n1_c1_hw4x4
backwards_avgpool_n2_c2_hw4x4
max_pool_3d
avg_pool_3d
argmin_trivial
argmax_trivial
topk_1d_max_all
topk_1d_max_partial
topk_1d_max_one
......
......@@ -17,6 +17,7 @@
#pragma once
#include <cstddef>
#include <string>
namespace ngraph
{
......
......@@ -488,3 +488,13 @@ AxisVector ngraph::get_default_order(size_t rank)
std::iota(begin(default_order), end(default_order), 0);
return default_order;
}
AxisVector ngraph::get_permutation_to_default_order(const AxisVector& axis_order)
{
AxisVector out(axis_order.size());
for (size_t i = 0; i < axis_order.size(); i++)
{
out.at(axis_order[i]) = i;
}
return out;
}
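
A quick worked example of the new helper: it returns the inverse permutation, i.e. the axis order that undoes axis_order:

// With axis_order {0, 3, 1, 2}, output position i is fed by input axis axis_order[i];
// the result maps each input axis back to the position it landed at.
AxisVector order{0, 3, 1, 2};
AxisVector inverse = ngraph::get_permutation_to_default_order(order);
// inverse == AxisVector{0, 2, 3, 1}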
......@@ -204,6 +204,8 @@ namespace ngraph
AxisVector get_default_order(size_t rank);
AxisVector get_default_order(const Shape& shape);
AxisVector get_permutation_to_default_order(const AxisVector& axis_order);
/*
* Return type struct for cache_fprop, with the modified fprop and bprop
* functions
......
......@@ -29,6 +29,7 @@ set(SRC
element_type.cpp
file_util.cpp
graph_partition.cpp
includes.cpp
inliner.cpp
input_output_assign.cpp
main.cpp
......@@ -49,6 +50,9 @@ set(SRC
zero_dim_tensor_elimination.cpp
)
set_source_files_properties(includes.cpp PROPERTIES COMPILE_DEFINITIONS
NGRAPH_INCLUDES="${PROJECT_SOURCE_DIR}/src/ngraph")
if (NGRAPH_ONNX_IMPORT_ENABLE)
list(APPEND SRC onnx_import.cpp)
if (NGRAPH_ONNXIFI_ENABLE)
......@@ -69,7 +73,7 @@ add_subdirectory(files)
add_subdirectory(util)
if(NGRAPH_CPU_ENABLE)
set(SRC ${SRC} backend_performance.cpp cpu_fusion.cpp cpu_test.cpp)
set(SRC ${SRC} backend_performance.cpp cpu_fusion.cpp cpu_test.cpp cpu_reshape_sinking.cpp)
endif()
if(NGRAPH_GPU_ENABLE)
......
......@@ -890,6 +890,7 @@ TEST(cpu_fusion, conv_bias_relu_n2c1h2w2_2)
EXPECT_TRUE(test::all_close(cpu_results.at(0), int_results.at(0)));
}
#if 0
TEST(cpu_fusion, conv_horizontal_fusion)
{
Shape shape_a{2, 1, 6, 6};
......@@ -940,6 +941,7 @@ TEST(cpu_fusion, conv_horizontal_fusion)
size_t cpu_cb = count_ops_of_type<op::ConvolutionBias>(cpu_f);
ASSERT_EQ(cpu_cb, 1);
}
#endif
// ConvolutionBiasAdd relies on an in-place fused MKLDNN kernel.
// Need to ensure that it is fused only when in-place buffer allocation is feasible
......
//*****************************************************************************
// Copyright 2017-2018 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#include <algorithm>
#include <cstdio>
#include <iostream>
#include <list>
#include <memory>
#include "gtest/gtest.h"
#include "ngraph/autodiff/adjoints.hpp"
#include "ngraph/file_util.hpp"
#include "ngraph/graph_util.hpp"
#include "ngraph/log.hpp"
#include "ngraph/ngraph.hpp"
#include "ngraph/op/batch_norm.hpp"
#include "ngraph/op/get_output_element.hpp"
#include "ngraph/op/parameter.hpp"
#include "ngraph/pass/core_fusion.hpp"
#include "ngraph/pass/cse.hpp"
#include "ngraph/pass/manager.hpp"
#include "ngraph/pass/reshape_elimination.hpp"
#include "ngraph/pass/visualize_tree.hpp"
#include "ngraph/runtime/cpu/pass/cpu_fusion.hpp"
#include "ngraph/runtime/cpu/pass/cpu_reshape_sinking.hpp"
#include "ngraph/serializer.hpp"
#include "ngraph/util.hpp"
#include "nlohmann/json.hpp"
#include "util/all_close.hpp"
#include "util/autodiff/backprop_function.hpp"
#include "util/autodiff/numeric_compare.hpp"
#include "util/ndarray.hpp"
#include "util/random.hpp"
#include "util/test_tools.hpp"
using namespace ngraph;
using namespace std;
TEST(cpu_reshape_sinking, edge_splitting)
{
// checks that Reshapes are pushed through op::Abs but are stopped by Sum
Shape shape_nhwc{16, 28, 28, 1};
Shape shape_nchw{16, 1, 28, 28};
auto a = make_shared<op::Parameter>(element::i32, shape_nhwc);
auto reshape = make_shared<op::Reshape>(a, AxisVector{0, 3, 1, 2}, shape_nchw);
auto absn = make_shared<op::Abs>(reshape);
auto absn2 = make_shared<op::Abs>(absn);
auto sum = make_shared<op::Sum>(reshape, AxisSet{0, 1, 2, 3});
auto func = make_shared<Function>(NodeVector{absn2, sum}, op::ParameterVector{a});
pass::Manager pass_manager;
//size_t before_count = count_ops_of_type<op::Reshape>(func);
pass_manager.register_pass<pass::VisualizeTree>("before.pdf");
pass_manager.register_pass<runtime::cpu::pass::CPUReshapeSinking>();
pass_manager.register_pass<pass::ReshapeElimination>();
pass_manager.register_pass<pass::CommonSubexpressionElimination>();
pass_manager.register_pass<pass::VisualizeTree>("after.pdf");
pass_manager.run_passes(func);
ASSERT_EQ(func->get_results().at(1)->get_argument(0), sum);
auto new_reshape =
std::dynamic_pointer_cast<op::Reshape>(func->get_results().at(0)->get_argument(0));
ASSERT_TRUE(new_reshape);
ASSERT_EQ(new_reshape->get_shape(), shape_nchw);
}
TEST(cpu_reshape_sinking, mnist_conv)
{
const string json_path = file_util::path_join(SERIALIZED_ZOO, "tf_conv_mnist_nhwc.json");
const string json_string = file_util::read_file_to_string(json_path);
stringstream ss(json_string);
shared_ptr<Function> func = ngraph::deserialize(ss);
pass::Manager pass_manager;
size_t before_count = count_ops_of_type<op::Reshape>(func);
//pass_manager.register_pass<pass::VisualizeTree>("before.pdf");
pass_manager.register_pass<runtime::cpu::pass::CPUReshapeSinking>();
pass_manager.register_pass<pass::ReshapeElimination>();
pass_manager.register_pass<pass::CommonSubexpressionElimination>();
//pass_manager.register_pass<pass::CoreFusion>();
//pass_manager.register_pass<runtime::cpu::pass::CPUFusion>();
//pass_manager.register_pass<pass::VisualizeTree>("after.pdf");
pass_manager.run_passes(func);
size_t after_count = count_ops_of_type<op::Reshape>(func);
ASSERT_LE(after_count, before_count);
}
......@@ -195,8 +195,9 @@ TEST(cpu_test, mkldnn_layouts)
EXPECT_EQ(vector<float>{expected_result}, rv);
}
TEST(cpu_test, reshape_squeeze)
TEST(cpu_test, reshape_layout_optimizations1)
{
// Squeeze outermost dimension
auto make_function = []() -> std::shared_ptr<Function> {
auto A = make_shared<op::Parameter>(element::f32, Shape{1, 16, 2, 2});
auto B = make_shared<op::Parameter>(element::f32, Shape{32, 16, 1, 1});
......@@ -233,8 +234,9 @@ TEST(cpu_test, reshape_squeeze)
}
}
TEST(cpu_test, reshape_expand)
TEST(cpu_test, reshape_layout_optimizations2)
{
// ExpandDims - innermost and internal dims
auto make_function = []() -> std::shared_ptr<Function> {
auto A = make_shared<op::Parameter>(element::f32, Shape{1, 16, 2, 2});
auto B = make_shared<op::Parameter>(element::f32, Shape{32, 16, 1, 1});
......@@ -271,8 +273,9 @@ TEST(cpu_test, reshape_expand)
}
}
TEST(cpu_test, reshape_squeeze_padded)
TEST(cpu_test, reshape_layout_optimizations3)
{
// Squeeze padded dimension
auto make_function = []() -> std::shared_ptr<Function> {
auto A = make_shared<op::Parameter>(element::f32, Shape{1, 16, 2, 2});
auto B = make_shared<op::Parameter>(element::f32, Shape{1, 16, 1, 1});
......@@ -310,8 +313,9 @@ TEST(cpu_test, reshape_squeeze_padded)
}
}
TEST(cpu_test, reshape_expand_squeeze)
TEST(cpu_test, reshape_layout_optimizations4)
{
// Squeeze and expand dimensions. Ensure no extra conversions downstream
auto make_function = []() -> std::shared_ptr<Function> {
auto A = make_shared<op::Parameter>(element::f32, Shape{1, 16, 1, 8});
auto B1 = make_shared<op::Parameter>(element::f32, Shape{32, 16, 1, 1});
......@@ -322,7 +326,7 @@ TEST(cpu_test, reshape_expand_squeeze)
CoordinateDiff{0, 0},
CoordinateDiff{0, 0},
Strides{1, 1});
auto squeeze = make_shared<op::Reshape>(conv1, AxisVector{0, 1, 2, 3}, Shape{1, 32, 8});
auto squeeze = make_shared<op::Reshape>(conv1, AxisVector{0, 1, 2, 3}, Shape{32, 1, 8});
auto relu = make_shared<op::Relu>(squeeze);
auto expand = make_shared<op::Reshape>(relu, AxisVector{0, 1, 2}, Shape{1, 32, 1, 8});
auto B2 = make_shared<op::Parameter>(element::f32, Shape{8, 32, 1, 1});
......@@ -357,3 +361,120 @@ TEST(cpu_test, reshape_expand_squeeze)
}
EXPECT_LE(count_ops_of_type<runtime::cpu::op::ConvertLayout>(cpu_f), 4);
}
TEST(cpu_test, reshape_layout_optimizations5)
{
auto make_function = []() -> std::shared_ptr<Function> {
auto A = make_shared<op::Parameter>(element::f32, Shape{1, 16, 1, 8});
auto B1 = make_shared<op::Parameter>(element::f32, Shape{32, 16, 1, 1});
auto conv1 = make_shared<op::Convolution>(A,
B1,
Strides{1, 1},
Strides{1, 1},
CoordinateDiff{0, 0},
CoordinateDiff{0, 0},
Strides{1, 1});
auto expand =
make_shared<op::Reshape>(conv1, AxisVector{0, 1, 2, 3}, Shape{1, 1, 32, 1, 8});
auto relu = make_shared<op::Relu>(expand);
auto squeeze =
make_shared<op::Reshape>(relu, AxisVector{0, 1, 2, 3, 4}, Shape{1, 32, 1, 8});
auto B2 = make_shared<op::Parameter>(element::f32, Shape{8, 32, 1, 1});
auto conv2 = make_shared<op::Convolution>(squeeze,
B2,
Strides{1, 1},
Strides{1, 1},
CoordinateDiff{0, 0},
CoordinateDiff{0, 0},
Strides{1, 1});
return make_shared<Function>(NodeVector{conv2}, op::ParameterVector{A, B1, B2});
};
auto backend = runtime::Backend::create("CPU");
auto cpu_f = make_function();
auto int_f = make_function();
test::Uniform<float> rng(-100.0f, 100.0f);
vector<vector<float>> args;
for (shared_ptr<op::Parameter> param : cpu_f->get_parameters())
{
vector<float> tensor_val(shape_size(param->get_shape()));
rng.initialize(tensor_val);
args.push_back(tensor_val);
}
auto int_results = execute(int_f, args, "INTERPRETER");
auto cpu_results = execute(cpu_f, args, "CPU");
for (size_t i = 0; i < cpu_results.size(); i++)
{
EXPECT_TRUE(test::all_close(cpu_results.at(i), int_results.at(i), 1.0e-4f, 1.0e-4f));
}
EXPECT_LE(count_ops_of_type<runtime::cpu::op::ConvertLayout>(cpu_f), 4);
}
TEST(cpu_test, reshape_layout_optimizations6)
{
// Expand outermost dimension after a reduction. Ensure no extra conversions downstream
auto make_function = []() -> std::shared_ptr<Function> {
auto A = make_shared<op::Parameter>(element::f32, Shape{2, 4, 3, 2});
auto mul = make_shared<op::Multiply>(A, A);
auto sum = make_shared<op::Sum>(mul, AxisVector{0});
auto reshape = make_shared<op::Reshape>(sum, AxisVector{0, 1, 2}, Shape{1, 4, 3, 2});
auto sqrt = make_shared<op::Sqrt>(reshape);
return make_shared<Function>(NodeVector{sqrt}, op::ParameterVector{A});
};
auto backend = runtime::Backend::create("CPU");
auto cpu_f = make_function();
auto int_f = make_function();
test::Uniform<float> rng(-100.0f, 100.0f);
vector<vector<float>> args;
for (shared_ptr<op::Parameter> param : cpu_f->get_parameters())
{
vector<float> tensor_val(shape_size(param->get_shape()));
rng.initialize(tensor_val);
args.push_back(tensor_val);
}
auto int_results = execute(int_f, args, "INTERPRETER");
auto cpu_results = execute(cpu_f, args, "CPU");
for (size_t i = 0; i < cpu_results.size(); i++)
{
EXPECT_TRUE(test::all_close(cpu_results.at(i), int_results.at(i)));
}
EXPECT_EQ(count_ops_of_type<runtime::cpu::op::ConvertLayout>(cpu_f), 0);
}
TEST(cpu_test, reshape_layout_optimizations7)
{
// Expand multiple dimensions. Ensure no extra conversions downstream
auto make_function = []() -> std::shared_ptr<Function> {
auto A = make_shared<op::Parameter>(element::f32, Shape{1, 4, 10, 6, 10});
auto mul = make_shared<op::Multiply>(A, A);
auto sum = make_shared<op::Sum>(mul, AxisVector{0, 1});
auto reshape = make_shared<op::Reshape>(sum, AxisVector{0, 1, 2}, Shape{1, 1, 10, 6, 10});
return make_shared<Function>(NodeVector{reshape}, op::ParameterVector{A});
};
auto backend = runtime::Backend::create("CPU");
auto cpu_f = make_function();
auto int_f = make_function();
test::Uniform<float> rng(-100.0f, 100.0f);
vector<vector<float>> args;
for (shared_ptr<op::Parameter> param : cpu_f->get_parameters())
{
vector<float> tensor_val(shape_size(param->get_shape()));
rng.initialize(tensor_val);
args.push_back(tensor_val);
}
auto int_results = execute(int_f, args, "INTERPRETER");
auto cpu_results = execute(cpu_f, args, "CPU");
for (size_t i = 0; i < cpu_results.size(); i++)
{
EXPECT_TRUE(test::all_close(cpu_results.at(i), int_results.at(i)));
}
EXPECT_EQ(count_ops_of_type<runtime::cpu::op::ConvertLayout>(cpu_f), 0);
}
[{
"name" : "Function_0",
"ops" : [
{
"cacheable" : false,
"element_type" : "float",
"inputs" : [],
"name" : "Parameter_12",
"op" : "Parameter",
"outputs" : ["Parameter_12_0"],
"shape" : [ 2, 224, 224, 3 ]
},
{
"cacheable" : false,
"element_type" : "float",
"inputs" : [],
"name" : "Parameter_11",
"op" : "Parameter",
"outputs" : ["Parameter_11_0"],
"shape" : [10]
},
{
"cacheable" : false,
"element_type" : "float",
"inputs" : [],
"name" : "Parameter_10",
"op" : "Parameter",
"outputs" : ["Parameter_10_0"],
"shape" : [ 37632, 10 ]
},
{
"cacheable" : false,
"element_type" : "float",
"inputs" : [],
"name" : "Parameter_9",
"op" : "Parameter",
"outputs" : ["Parameter_9_0"],
"shape" : [3]
},
{
"cacheable" : false,
"element_type" : "float",
"inputs" : [],
"name" : "Parameter_8",
"op" : "Parameter",
"outputs" : ["Parameter_8_0"],
"shape" : [3]
},
{
"cacheable" : false,
"element_type" : "float",
"inputs" : [],
"name" : "Parameter_7",
"op" : "Parameter",
"outputs" : ["Parameter_7_0"],
"shape" : [3]
},
{
"cacheable" : false,
"element_type" : "float",
"inputs" : [],
"name" : "Parameter_6",
"op" : "Parameter",
"outputs" : ["Parameter_6_0"],
"shape" : [3]
},
{
"cacheable" : false,
"element_type" : "float",
"inputs" : [],
"name" : "Parameter_5",
"op" : "Parameter",
"outputs" : ["Parameter_5_0"],
"shape" : [ 3, 3, 3, 3 ]
},
{
"cacheable" : false,
"element_type" : "float",
"inputs" : [],
"name" : "Parameter_4",
"op" : "Parameter",
"outputs" : ["Parameter_4_0"],
"shape" : [3]
},
{
"cacheable" : false,
"element_type" : "float",
"inputs" : [],
"name" : "Parameter_3",
"op" : "Parameter",
"outputs" : ["Parameter_3_0"],
"shape" : [3]
},
{
"cacheable" : false,
"element_type" : "float",
"inputs" : [],
"name" : "Parameter_2",
"op" : "Parameter",
"outputs" : ["Parameter_2_0"],
"shape" : [3]
},
{
"cacheable" : false,
"element_type" : "float",
"inputs" : [],
"name" : "Parameter_1",
"op" : "Parameter",
"outputs" : ["Parameter_1_0"],
"shape" : [3]
},
{
"cacheable" : false,
"element_type" : "float",
"inputs" : [],
"name" : "Parameter_0",
"op" : "Parameter",
"outputs" : ["Parameter_0_0"],
"shape" : [ 3, 3, 3, 3 ]
},
{
"input_order" : [ 0, 3, 1, 2 ],
"inputs" : ["Parameter_12"],
"name" : "Reshape_13",
"op" : "Reshape",
"output_shape" : [ 2, 3, 224, 224 ],
"outputs" : ["Reshape_13_0"]
},
{
"axes" : [0],
"inputs" : ["Parameter_11"],
"name" : "Broadcast_36",
"op" : "Broadcast",
"outputs" : ["Broadcast_36_0"],
"shape" : [ 2, 10 ]
},
{
"input_order" : [ 3, 2, 0, 1 ],
"inputs" : ["Parameter_5"],
"name" : "Reshape_22",
"op" : "Reshape",
"output_shape" : [ 3, 3, 3, 3 ],
"outputs" : ["Reshape_22_0"]
},
{
"input_order" : [ 3, 2, 0, 1 ],
"inputs" : ["Parameter_0"],
"name" : "Reshape_14",
"op" : "Reshape",
"output_shape" : [ 3, 3, 3, 3 ],
"outputs" : ["Reshape_14_0"]
},
{
"data_dilation_strides" : [ 1, 1 ],
"inputs" : [ "Reshape_13", "Reshape_14" ],
"name" : "Convolution_15",
"op" : "Convolution",
"outputs" : ["Convolution_15_0"],
"padding_above" : [ 1, 1 ],
"padding_below" : [ 1, 1 ],
"window_dilation_strides" : [ 1, 1 ],
"window_movement_strides" : [ 1, 1 ]
},
{
"input_order" : [ 0, 2, 3, 1 ],
"inputs" : ["Convolution_15"],
"name" : "Reshape_16",
"op" : "Reshape",
"output_shape" : [ 2, 224, 224, 3 ],
"outputs" : ["Reshape_16_0"]
},
{
"input_order" : [ 0, 3, 1, 2 ],
"inputs" : ["Reshape_16"],
"name" : "Reshape_17",
"op" : "Reshape",
"output_shape" : [ 2, 3, 224, 224 ],
"outputs" : ["Reshape_17_0"]
},
{
"eps" : 1.0009999641624745e-05,
"inputs" : [
"Parameter_1", "Parameter_2", "Reshape_17", "Parameter_3",
"Parameter_4"
],
"name" : "BatchNorm_18",
"op" : "BatchNorm",
"outputs" : ["BatchNorm_18_0"],
"training" : false
},
{
"input_order" : [ 0, 2, 3, 1 ],
"inputs" : ["BatchNorm_18"],
"name" : "Reshape_19",
"op" : "Reshape",
"output_shape" : [ 2, 224, 224, 3 ],
"outputs" : ["Reshape_19_0"]
},
{
"inputs" : ["Reshape_19"],
"name" : "Relu_20",
"op" : "Relu",
"outputs" : ["Relu_20_0"]
},
{
"input_order" : [ 0, 3, 1, 2 ],
"inputs" : ["Relu_20"],
"name" : "Reshape_21",
"op" : "Reshape",
"output_shape" : [ 2, 3, 224, 224 ],
"outputs" : ["Reshape_21_0"]
},
{
"data_dilation_strides" : [ 1, 1 ],
"inputs" : [ "Reshape_21", "Reshape_22" ],
"name" : "Convolution_23",
"op" : "Convolution",
"outputs" : ["Convolution_23_0"],
"padding_above" : [ 1, 1 ],
"padding_below" : [ 1, 1 ],
"window_dilation_strides" : [ 1, 1 ],
"window_movement_strides" : [ 1, 1 ]
},
{
"input_order" : [ 0, 2, 3, 1 ],
"inputs" : ["Convolution_23"],
"name" : "Reshape_24",
"op" : "Reshape",
"output_shape" : [ 2, 224, 224, 3 ],
"outputs" : ["Reshape_24_0"]
},
{
"input_order" : [ 0, 3, 1, 2 ],
"inputs" : ["Reshape_24"],
"name" : "Reshape_25",
"op" : "Reshape",
"output_shape" : [ 2, 3, 224, 224 ],
"outputs" : ["Reshape_25_0"]
},
{
"eps" : 1.0009999641624745e-05,
"inputs" : [
"Parameter_6", "Parameter_7", "Reshape_25", "Parameter_8",
"Parameter_9"
],
"name" : "BatchNorm_26",
"op" : "BatchNorm",
"outputs" : ["BatchNorm_26_0"],
"training" : false
},
{
"input_order" : [ 0, 2, 3, 1 ],
"inputs" : ["BatchNorm_26"],
"name" : "Reshape_27",
"op" : "Reshape",
"output_shape" : [ 2, 224, 224, 3 ],
"outputs" : ["Reshape_27_0"]
},
{
"inputs" : [ "Reshape_27", "Parameter_12" ],
"name" : "Add_28",
"op" : "Add",
"outputs" : ["Add_28_0"]
},
{
"inputs" : ["Add_28"],
"name" : "Relu_29",
"op" : "Relu",
"outputs" : ["Relu_29_0"]
},
{
"input_order" : [ 0, 3, 1, 2 ],
"inputs" : ["Relu_29"],
"name" : "Reshape_30",
"op" : "Reshape",
"output_shape" : [ 2, 3, 224, 224 ],
"outputs" : ["Reshape_30_0"]
},
{
"include_padding_in_avg_computation" : false,
"inputs" : ["Reshape_30"],
"name" : "AvgPool_31",
"op" : "AvgPool",
"outputs" : ["AvgPool_31_0"],
"padding_above" : [ 0, 0 ],
"padding_below" : [ 0, 0 ],
"window_movement_strides" : [ 2, 2 ],
"window_shape" : [ 2, 2 ]
},
{
"input_order" : [ 0, 2, 3, 1 ],
"inputs" : ["AvgPool_31"],
"name" : "Reshape_32",
"op" : "Reshape",
"output_shape" : [ 2, 112, 112, 3 ],
"outputs" : ["Reshape_32_0"]
},
{
"input_order" : [ 0, 1, 2, 3 ],
"inputs" : ["Reshape_32"],
"name" : "Reshape_34",
"op" : "Reshape",
"output_shape" : [ 2, 37632 ],
"outputs" : ["Reshape_34_0"]
},
{
"inputs" : [ "Reshape_34", "Parameter_10" ],
"name" : "Dot_35",
"op" : "Dot",
"outputs" : ["Dot_35_0"],
"reduction_axes_count" : 1
},
{
"inputs" : [ "Dot_35", "Broadcast_36" ],
"name" : "Add_37",
"op" : "Add",
"outputs" : ["Add_37_0"]
},
{
"inputs" : ["Add_37"],
"name" : "Result_38",
"op" : "Result",
"outputs" : ["Result_38_0"]
}
],
"parameters" : [
"Parameter_0", "Parameter_1", "Parameter_2", "Parameter_3", "Parameter_4",
"Parameter_5", "Parameter_6", "Parameter_7", "Parameter_8", "Parameter_9",
"Parameter_10", "Parameter_11", "Parameter_12"
],
"result" : ["Result_38"]
}]
\ No newline at end of file