merge and resolve conflict with origin master

3d53e58a · fenglei.tian · 39dc384d · b5467550 · 3d53e58a · 3d53e58a
Commit 3d53e58a authored Mar 13, 2018 by fenglei.tian
20 changed files
--- a/src/ngraph/runtime/cpu/cpu_emitter.cpp
+++ b/src/ngraph/runtime/cpu/cpu_emitter.cpp
@@ -3279,6 +3279,45 @@ namespace ngraph
                       << to_string(sigmoid_index) << ");\n";
            }

+            template <>
+            void CPU_Emitter::EMITTER_DECL(ngraph::op::SigmoidBackprop)
+            {
+                auto input_shape = args[0].get_shape();
+                auto delta_shape = args[1].get_shape();
+                auto result_shape = out[0].get_shape();
+                int input_1d_size = static_cast<int>(shape_size(input_shape));
+                int delta_1d_size = static_cast<int>(shape_size(delta_shape));
+                int result_1d_size = static_cast<int>(shape_size(result_shape));
+
+                auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
+                auto input_desc = mkldnn::memory::desc(
+                    {input_1d_size},
+                    mkldnn_utils::get_mkldnn_data_type(args[0].get_element_type()),
+                    mkldnn::memory::format::x);
+                auto delta_desc = mkldnn::memory::desc(
+                    {delta_1d_size},
+                    mkldnn_utils::get_mkldnn_data_type(args[1].get_element_type()),
+                    mkldnn::memory::format::x);
+                auto result_desc = mkldnn::memory::desc(
+                    {result_1d_size},
+                    mkldnn_utils::get_mkldnn_data_type(out[0].get_element_type()),
+                    mkldnn::memory::format::x);
+
+                size_t sigmoid_index =
+                    mkldnn_emitter->build_sigmoid_backward(input_desc, delta_desc, result_desc);
+
+                auto& deps = mkldnn_emitter->get_primitive_deps(sigmoid_index);
+                writer << "cpu::mkldnn_utils::set_memory_ptr(ctx, " << to_string(deps[0]) << ", "
+                       << args[0].get_name() << ");\n";
+                writer << "cpu::mkldnn_utils::set_memory_ptr(ctx, " << to_string(deps[1]) << ", "
+                       << args[1].get_name() << ");\n";
+                writer << "cpu::mkldnn_utils::set_memory_ptr(ctx, " << to_string(deps[2]) << ", "
+                       << out[0].get_name() << ");\n";
+
+                writer << "cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, "
+                       << to_string(sigmoid_index) << ");\n";
+            }
+
            template <>
            void CPU_Emitter::EMITTER_DECL(ngraph::op::Softmax)
            {

--- a/src/ngraph/runtime/cpu/cpu_external_function.cpp
+++ b/src/ngraph/runtime/cpu/cpu_external_function.cpp
@@ -252,6 +252,7 @@ static const runtime::cpu::OpMap dispatcher{
    {TI(ngraph::op::ReluBackprop), &runtime::cpu::CPU_Emitter::emit<op::ReluBackprop>},
    {TI(ngraph::op::Sigmoid), &runtime::cpu::CPU_Emitter::emit<op::Sigmoid>},
    {TI(ngraph::op::Softmax), &runtime::cpu::CPU_Emitter::emit<op::Softmax>},
+    {TI(ngraph::op::SigmoidBackprop), &runtime::cpu::CPU_Emitter::emit<op::SigmoidBackprop>},
 };

 runtime::cpu::CPU_ExternalFunction::CPU_ExternalFunction(

--- a/src/ngraph/runtime/cpu/mkldnn_emitter.cpp
+++ b/src/ngraph/runtime/cpu/mkldnn_emitter.cpp
@@ -513,6 +513,32 @@ size_t MKLDNNEmitter::build_sigmoid_forward(const mkldnn::memory::desc& input_de
    return primitive_index;
 }

+size_t MKLDNNEmitter::build_sigmoid_backward(const mkldnn::memory::desc& input_desc,
+                                             const mkldnn::memory::desc& delta_desc,
+                                             const mkldnn::memory::desc& result_desc)
+{
+    size_t input_index = build_memory_primitive(input_desc);
+    size_t delta_index = build_memory_primitive(delta_desc);
+    size_t result_index = build_memory_primitive(result_desc);
+
+    // sigmoid forward primitive desc
+    mkldnn::eltwise_forward::primitive_desc sigmoid_fwd_pd =
+        mkldnn::eltwise_forward::primitive_desc(
+            {mkldnn::prop_kind::forward, mkldnn::algorithm::eltwise_logistic, input_desc, 0, 0},
+            mkldnn_utils::global_cpu_engine);
+
+    size_t primitive_index = insert_primitive(new mkldnn::eltwise_backward(
+        {{mkldnn::algorithm::eltwise_logistic, delta_desc, input_desc, 0, 0},
+         mkldnn_utils::global_cpu_engine,
+         sigmoid_fwd_pd},
+        *m_mkldnn_primitives[input_index],
+        *m_mkldnn_primitives[delta_index],
+        *m_mkldnn_primitives[result_index]));
+
+    m_primitive_deps[primitive_index] = {input_index, delta_index, result_index};
+    return primitive_index;
+}
+
 size_t MKLDNNEmitter::build_elementwise_add(
    const mkldnn::memory::desc& input0_data_desc,
    const mkldnn::memory::desc& input1_data_desc,

--- a/src/ngraph/runtime/cpu/mkldnn_emitter.hpp
+++ b/src/ngraph/runtime/cpu/mkldnn_emitter.hpp
@@ -153,6 +153,10 @@ namespace ngraph
                size_t build_sigmoid_forward(const mkldnn::memory::desc& input_desc,
                                             const mkldnn::memory::desc& result_desc);

+                size_t build_sigmoid_backward(const mkldnn::memory::desc& input_desc,
+                                              const mkldnn::memory::desc& delta_desc,
+                                              const mkldnn::memory::desc& result_desc);
+
                size_t build_elementwise_add(
                    const mkldnn::memory::desc& input0_data_desc,
                    const mkldnn::memory::desc& input1_data_desc,

--- a/src/ngraph/runtime/cpu/ops/sigmoid.cpp
+++ b/src/ngraph/runtime/cpu/ops/sigmoid.cpp
@@ -35,3 +35,24 @@ ngraph::op::Sigmoid::Sigmoid(std::shared_ptr<ngraph::Node> input)
 {
    add_output(input->get_element_type(), m_shape_input);
 }
+
+ngraph::op::SigmoidBackprop::SigmoidBackprop(std::shared_ptr<Node> arg, std::shared_ptr<Node> delta)
+    : RequiresTensorViewArgs("SigmoidBackprop", {arg, delta})
+{
+    if (arg->get_element_type() != delta->get_element_type())
+    {
+        throw ngraph_error("Argument and delta element types for Sigmoid backprop do not match");
+    }
+    if (arg->get_shape() != delta->get_shape())
+    {
+        throw ngraph_error("Argument and delta shape for Sigmoid backprop do not match");
+    }
+    set_value_type_checked(delta->get_element_type(), delta->get_shape());
+}
+
+void ngraph::op::Sigmoid::generate_adjoints(ngraph::autodiff::Adjoints& adjoints,
+                                            const std::shared_ptr<Node>& delta)
+{
+    auto backprop = std::make_shared<op::SigmoidBackprop>(get_input_op(0), delta);
+    adjoints.add_delta(get_input_op(0), backprop);
+}
--- a/src/ngraph/runtime/cpu/ops/sigmoid.hpp
+++ b/src/ngraph/runtime/cpu/ops/sigmoid.hpp
@@ -17,6 +17,7 @@
 #pragma once

 #include "ngraph/ops/util/requires_tensor_view_args.hpp"
+#include "ngraph/util.hpp"

 namespace ngraph
 {
@@ -29,9 +30,32 @@ namespace ngraph
            Shape get_input_shape() const { return m_shape_input; }
            virtual std::shared_ptr<Node>
                copy_with_new_args(const NodeVector& new_args) const override;
+            virtual void generate_adjoints(autodiff::Adjoints& adjoints,
+                                           const std::shared_ptr<Node>& delta) override;

        private:
            Shape m_shape_input;
        };
+
+        /// \brief Elementwise SigmoidBackprop operation.
+        ///
+        class SigmoidBackprop : public util::RequiresTensorViewArgs
+        {
+        public:
+            /// \brief Constructs a SigmoidBackprop operation.
+            ///
+            /// \param arg Node that produces the Sigmoid forward input tensor.
+            SigmoidBackprop(std::shared_ptr<ngraph::Node> arg, std::shared_ptr<ngraph::Node> delta);
+
+            virtual std::shared_ptr<Node>
+                copy_with_new_args(const NodeVector& new_args) const override
+            {
+                if (new_args.size() != 2)
+                {
+                    throw ngraph_error("Incorrect number of new arguments");
+                }
+                return std::make_shared<SigmoidBackprop>(new_args.at(0), new_args.at(1));
+            }
+        };
    }
 }
--- a/src/ngraph/runtime/cpu/pass/cpu_assignment.cpp
+++ b/src/ngraph/runtime/cpu/pass/cpu_assignment.cpp
@@ -316,6 +316,19 @@ namespace ngraph
                    }
                }

+                template <>
+                void CPUAssignment::ASSIGN_DECL(ngraph::op::SigmoidBackprop)
+                {
+                    auto sigmoid = static_cast<op::SigmoidBackprop*>(node);
+                    if (node->get_input_element_type(0) == element::f32)
+                    {
+                        auto op_annotations =
+                            std::make_shared<ngraph::runtime::cpu::CPUOpAnnotations>();
+                        op_annotations->set_mkldnn_op(true);
+                        sigmoid->set_op_annotations(op_annotations);
+                    }
+                }
+
                template <>
                void CPUAssignment::ASSIGN_DECL(ngraph::op::ReluBackprop)
                {
@@ -386,6 +399,8 @@ static const runtime::cpu::pass::AssignOpMap s_dispatcher{
    {TI(ngraph::op::ReluBackprop),
     &runtime::cpu::pass::CPUAssignment::assign<ngraph::op::ReluBackprop>},
    {TI(ngraph::op::Sigmoid), &runtime::cpu::pass::CPUAssignment::assign<ngraph::op::Sigmoid>},
+    {TI(ngraph::op::SigmoidBackprop),
+     &runtime::cpu::pass::CPUAssignment::assign<ngraph::op::SigmoidBackprop>},
 };

 bool runtime::cpu::pass::CPUAssignment::run_on_call_graph(

--- a/src/ngraph/runtime/cpu/pass/cpu_fusion.cpp
+++ b/src/ngraph/runtime/cpu/pass/cpu_fusion.cpp
@@ -568,6 +568,57 @@ void ngraph::runtime::cpu::pass::CPUFusion::construct_sigmoid()
    this->add_matcher(m);
 }

+void ngraph::runtime::cpu::pass::CPUFusion::construct_sigmoid_bprop()
+{
+    // construct exp(-input)
+    auto input = std::make_shared<pattern::op::Label>(element::f32, Shape{3, 4});
+    auto neg_input = std::make_shared<op::Negative>(input);
+    auto exp_neg_input = std::make_shared<op::Exp>(neg_input);
+
+    // broadcast a scalar constant to the input shape
+    auto constant = std::make_shared<pattern::op::Label>(element::f32, Shape{});
+    auto broadcast_constant = std::make_shared<op::Broadcast>(constant, Shape{3, 4}, AxisSet{0, 1});
+
+    auto add_exp = std::make_shared<op::Add>(exp_neg_input, broadcast_constant);
+    // //auto divide_1_over_exp = std::make_shared<op::Divide>(broadcast_constant, add_exp);
+    auto sigmoid_fwd = std::make_shared<pattern::op::Label>(element::f32, Shape{3, 4});
+
+    auto delta = std::make_shared<pattern::op::Label>(element::f32, Shape{3, 4});
+    auto neg_delta = std::make_shared<op::Negative>(delta);
+
+    auto multiply_sigmoid_delta = std::make_shared<op::Multiply>(sigmoid_fwd, neg_delta);
+    auto divide_2 = std::make_shared<op::Divide>(multiply_sigmoid_delta, add_exp);
+
+    auto multiply_2 = std::make_shared<op::Multiply>(divide_2, exp_neg_input);
+    auto negtive_2 = std::make_shared<op::Negative>(multiply_2);
+
+    // Define a callback that is called once the DFG matches the pattern
+    ngraph::pattern::gr_callback_fn callback =
+        [input, delta](pattern::Matcher& m) -> std::shared_ptr<Node> {
+        NGRAPH_DEBUG << "In a callback for construct_fprop_sigmoid pattern against "
+                     << m.match_root()->get_name();
+        auto pattern_map = m.get_pattern_map();
+        if (m.match_root()->get_element_type() != element::f32)
+        {
+            NGRAPH_DEBUG << "mpattern = " << m.match_root()->get_name() << " type is not float!";
+            return nullptr;
+        }
+
+        if (m.match_root()->get_shape().size() != pattern_map[input]->get_shape().size())
+        {
+            NGRAPH_DEBUG << "mpattern = " << m.match_root()->get_name()
+                         << "input= " << pattern_map[input]->get_name() << "size dont match!";
+            return nullptr;
+        }
+        auto dsigmoid =
+            std::make_shared<op::SigmoidBackprop>(pattern_map[input], pattern_map[delta]);
+        return dsigmoid;
+    };
+
+    auto m = std::make_shared<ngraph::pattern::Matcher>(negtive_2, callback);
+    this->add_matcher(m);
+}
+
 void ngraph::runtime::cpu::pass::CPUFusion::construct_conv_bias()
 {
    Shape shape{2, 2, 1, 1};

--- a/src/ngraph/runtime/cpu/pass/cpu_fusion.hpp
+++ b/src/ngraph/runtime/cpu/pass/cpu_fusion.hpp
@@ -44,6 +44,7 @@ public:
        construct_zero_padded_reshaped_conv();
        construct_zero_padded_conv();
        construct_sigmoid();
+        construct_sigmoid_bprop();
        construct_conv_bias();
    }

@@ -53,6 +54,7 @@ private:
    void construct_conv_bias();
    void construct_fprop_bn();
    void construct_sigmoid();
+    void construct_sigmoid_bprop();
    void construct_zero_padded_reshaped_conv();
    void construct_zero_padded_conv();
 };
--- a/src/ngraph/runtime/cpu/pass/cpu_layout.cpp
+++ b/src/ngraph/runtime/cpu/pass/cpu_layout.cpp
@@ -960,6 +960,29 @@ namespace ngraph
                    }
                }

+                template <>
+                void CPULayout::LAYOUT_DECL(ngraph::op::SigmoidBackprop)
+                {
+                    if (runtime::cpu::mkldnn_utils::use_mkldnn_kernel(node.get()))
+                    {
+                        auto input_layout =
+                            runtime::cpu::mkldnn_utils::get_input_mkldnn_format(node.get(), 0);
+                        vector<memory::format> prim_input_formats;
+                        vector<memory::format> prim_output_formats;
+                        //ensure delta and input have same layout
+                        prim_input_formats.push_back(input_layout);
+                        prim_input_formats.push_back(input_layout);
+                        prim_output_formats.push_back(input_layout);
+                        node =
+                            insert_input_conversions(external_function, node, prim_input_formats);
+                        set_output_layouts(node, prim_output_formats);
+                    }
+                    else
+                    {
+                        set_default_layouts(external_function, node);
+                    }
+                }
+
                template <>
                void CPULayout::LAYOUT_DECL(ngraph::op::ReluBackprop)
                {
@@ -1095,6 +1118,8 @@ static const runtime::cpu::pass::LayoutOpMap s_dispatcher{
    {TI(ngraph::op::ReluBackprop),
     &runtime::cpu::pass::CPULayout::layout<ngraph::op::ReluBackprop>},
    {TI(ngraph::op::Sigmoid), &runtime::cpu::pass::CPULayout::layout<ngraph::op::Sigmoid>},
+    {TI(ngraph::op::SigmoidBackprop),
+     &runtime::cpu::pass::CPULayout::layout<ngraph::op::SigmoidBackprop>},
 };

 bool runtime::cpu::pass::CPULayout::run_on_call_graph(const std::list<std::shared_ptr<Node>>& nodes)

--- a/src/ngraph/runtime/gpu/gpu_cuda_context_manager.cpp
+++ b/src/ngraph/runtime/gpu/gpu_cuda_context_manager.cpp
@@ -19,25 +19,18 @@

 #include "ngraph/runtime/gpu/gpu_cuda_context_manager.hpp"

-namespace ngraph
+using namespace ngraph;
+
+runtime::gpu::CudaContextManager& runtime::gpu::CudaContextManager::instance()
 {
-    namespace runtime
-    {
-        namespace gpu
-        {
-            CudaContextManager& CudaContextManager::instance()
-            {
-                static CudaContextManager manager;
-                return manager;
-            }
+    static CudaContextManager manager;
+    return manager;
+}

-            CudaContextManager::CudaContextManager()
-            {
-                CUDA_SAFE_CALL(cuInit(0));
-                CUDA_SAFE_CALL(cuDeviceGet(&m_device, 0));
-                CUDA_SAFE_CALL(cuCtxCreate(&m_context, 0, m_device));
-                m_context_ptr = std::make_shared<CUcontext>(m_context);
-            }
-        }
-    }
+runtime::gpu::CudaContextManager::CudaContextManager()
+{
+    CUDA_SAFE_CALL(cuInit(0));
+    CUDA_SAFE_CALL(cuDeviceGet(&m_device, 0));
+    CUDA_SAFE_CALL(cuCtxCreate(&m_context, 0, m_device));
+    m_context_ptr = std::make_shared<CUcontext>(m_context);
 }
--- a/src/ngraph/runtime/gpu/gpu_cuda_function_builder.cpp
+++ b/src/ngraph/runtime/gpu/gpu_cuda_function_builder.cpp
@@ -20,46 +20,39 @@
 #include "ngraph/runtime/gpu/gpu_cuda_function_builder.hpp"
 #include "ngraph/runtime/gpu/gpu_util.hpp"

-namespace ngraph
+using namespace ngraph;
+
+std::shared_ptr<CUfunction> runtime::gpu::CudaFunctionBuilder::get(const std::string& name,
+                                                                   const std::string& kernel,
+                                                                   int number_of_options,
+                                                                   const char** options)
 {
-    namespace runtime
-    {
-        namespace gpu
-        {
-            std::shared_ptr<CUfunction> CudaFunctionBuilder::get(const std::string& name,
-                                                                 const std::string& kernel,
-                                                                 int number_of_options,
-                                                                 const char** options)
-            {
-                nvrtcProgram prog;
-                NVRTC_SAFE_CALL(nvrtcCreateProgram(&prog,
-                                                   kernel.c_str(),
-                                                   "op.cu",
-                                                   0,      // numHeaders
-                                                   NULL,   // headers
-                                                   NULL)); // includeNames
+    nvrtcProgram prog;
+    NVRTC_SAFE_CALL(nvrtcCreateProgram(&prog,
+                                       kernel.c_str(),
+                                       "op.cu",
+                                       0,      // numHeaders
+                                       NULL,   // headers
+                                       NULL)); // includeNames

-                nvrtcResult compile_result = nvrtcCompileProgram(prog, number_of_options, options);
+    nvrtcResult compile_result = nvrtcCompileProgram(prog, number_of_options, options);

-                if (compile_result != NVRTC_SUCCESS)
-                {
-                    throw std::runtime_error("compile error: \n" + kernel + "\n options");
-                }
+    if (compile_result != NVRTC_SUCCESS)
+    {
+        throw std::runtime_error("compile error: \n" + kernel + "\n options");
+    }

-                size_t ptx_size;
-                NVRTC_SAFE_CALL(nvrtcGetPTXSize(prog, &ptx_size));
-                char* ptx = new char[ptx_size];
-                NVRTC_SAFE_CALL(nvrtcGetPTX(
-                    prog,
+    size_t ptx_size;
+    NVRTC_SAFE_CALL(nvrtcGetPTXSize(prog, &ptx_size));
+    char* ptx = new char[ptx_size];
+    NVRTC_SAFE_CALL(
+        nvrtcGetPTX(prog,
                    ptx)); // Load the generated PTX and get a handle to the parent kernel.
-                NVRTC_SAFE_CALL(nvrtcDestroyProgram(&prog)); // Destroy the program.
+    NVRTC_SAFE_CALL(nvrtcDestroyProgram(&prog)); // Destroy the program.

-                CUmodule module;
-                CUfunction function;
-                CUDA_SAFE_CALL(cuModuleLoadDataEx(&module, ptx, 0, 0, 0));
-                CUDA_SAFE_CALL(cuModuleGetFunction(&function, module, name.c_str()));
-                return std::make_shared<CUfunction>(function);
-            }
-        }
-    }
+    CUmodule module;
+    CUfunction function;
+    CUDA_SAFE_CALL(cuModuleLoadDataEx(&module, ptx, 0, 0, 0));
+    CUDA_SAFE_CALL(cuModuleGetFunction(&function, module, name.c_str()));
+    return std::make_shared<CUfunction>(function);
 }
--- a/src/ngraph/runtime/gpu/gpu_cuda_function_pool.cpp
+++ b/src/ngraph/runtime/gpu/gpu_cuda_function_pool.cpp
@@ -26,40 +26,31 @@

 static const std::string s_output_dir = "gpu_codegen";

-namespace ngraph
+using namespace ngraph;
+
+runtime::gpu::CudaFunctionPool& runtime::gpu::CudaFunctionPool::instance()
 {
-    namespace runtime
-    {
-        namespace gpu
-        {
-            CudaFunctionPool& CudaFunctionPool::instance()
-            {
-                static CudaFunctionPool pool;
-                return pool;
-            }
+    static CudaFunctionPool pool;
+    return pool;
+}

-            void CudaFunctionPool::set(const std::string& name, const std::string& kernel)
-            {
-                const char* opts[] = {"--gpu-architecture=compute_35",
-                                      "--relocatable-device-code=true"};
-                std::string filename =
-                    file_util::path_join(s_output_dir, "cuda_kernel_" + name + "_codegen.cu");
-                std::ofstream out(filename);
-                out << kernel;
-                out.close();
-                m_function_map.insert(
-                    {name, CudaFunctionBuilder::get("cuda_" + name, kernel, 2, opts)});
-            }
+void runtime::gpu::CudaFunctionPool::set(const std::string& name, const std::string& kernel)
+{
+    const char* opts[] = {"--gpu-architecture=compute_35", "--relocatable-device-code=true"};
+    std::string filename =
+        file_util::path_join(s_output_dir, "cuda_kernel_" + name + "_codegen.cu");
+    std::ofstream out(filename);
+    out << kernel;
+    out.close();
+    m_function_map.insert({name, CudaFunctionBuilder::get("cuda_" + name, kernel, 2, opts)});
+}

-            std::shared_ptr<CUfunction> CudaFunctionPool::get(const std::string& name)
-            {
-                auto it = m_function_map.find(name);
-                if (it != m_function_map.end())
-                {
-                    return (*it).second;
-                }
-                return nullptr;
-            }
-        }
+std::shared_ptr<CUfunction> runtime::gpu::CudaFunctionPool::get(const std::string& name)
+{
+    auto it = m_function_map.find(name);
+    if (it != m_function_map.end())
+    {
+        return (*it).second;
    }
+    return nullptr;
 }
--- a/src/ngraph/runtime/gpu/gpu_cuda_kernel_builder.cpp
+++ b/src/ngraph/runtime/gpu/gpu_cuda_kernel_builder.cpp
@@ -16,74 +16,67 @@
 #include "ngraph/runtime/gpu/gpu_cuda_kernel_builder.hpp"
 #include "ngraph/codegen/code_writer.hpp"

-namespace ngraph
+using namespace ngraph;
+
+void runtime::gpu::CudaKernelBuilder::get_elementwise_op(codegen::CodeWriter& writer,
+                                                         const std::string& name,
+                                                         const std::string& data_type,
+                                                         const std::string& op,
+                                                         const size_t& num_inputs)
 {
-    namespace runtime
+    writer << "extern \"C\" __global__ void cuda_" << name << "(";
+    for (size_t i = 0; i < num_inputs; i++)
+    {
+        writer << data_type << "* in" << i << ", ";
+    }
+    writer << data_type << "* out,"
+           << "size_t n)\n";
+    writer << "{\n";
+    writer.indent++;
    {
-        namespace gpu
+        writer << "size_t tid = blockIdx.x * blockDim.x + threadIdx.x; \n";
+        writer << "if (tid < n)\n";
+        writer << "{\n";
+        writer.indent++;
        {
-            void CudaKernelBuilder::get_elementwise_op(codegen::CodeWriter& writer,
-                                                       const std::string& name,
-                                                       const std::string& data_type,
-                                                       const std::string& op,
-                                                       const size_t& num_inputs)
+            writer << "out[tid] = " << op << "(";
+            for (size_t i = 0; i < num_inputs - 1; i++)
            {
-                writer << "extern \"C\" __global__ void cuda_" << name << "(";
-                for (size_t i = 0; i < num_inputs; i++)
-                {
-                    writer << data_type << "* in" << i << ", ";
-                }
-                writer << data_type << "* out,"
-                       << "size_t n)\n";
-                writer << "{\n";
-                writer.indent++;
-                {
-                    writer << "size_t tid = blockIdx.x * blockDim.x + threadIdx.x; \n";
-                    writer << "if (tid < n)\n";
-                    writer << "{\n";
-                    writer.indent++;
-                    {
-                        writer << "out[tid] = " << op << "(";
-                        for (size_t i = 0; i < num_inputs - 1; i++)
-                        {
-                            writer << "in" << i << "[tid], ";
-                        }
-                        writer << "in" << num_inputs - 1 << "[tid]);\n";
-                    }
-                    writer.indent--;
-                    writer << "}\n";
-                }
-                writer.indent--;
-                writer << "}\n";
-
-                return;
+                writer << "in" << i << "[tid], ";
            }
+            writer << "in" << num_inputs - 1 << "[tid]);\n";
+        }
+        writer.indent--;
+        writer << "}\n";
+    }
+    writer.indent--;
+    writer << "}\n";

-            void CudaKernelBuilder::get_device_helper(codegen::CodeWriter& writer,
-                                                      const std::string& name,
-                                                      const std::string& data_type,
-                                                      const std::string& math_kernel,
-                                                      const size_t& num_inputs)
-            {
-                if (math_kernel.size())
-                {
-                    writer << "__device__ " << data_type << " " << name << "(";
-                    for (size_t i = 0; i < num_inputs - 1; i++)
-                    {
-                        writer << data_type << " x" << i << ", ";
-                    }
-                    writer << data_type << " x" << num_inputs - 1;
-                    writer << ")\n";
-                    writer << "{\n";
-                    writer.indent++;
-                    {
-                        writer << "return " + math_kernel << ";\n";
-                    }
-                    writer.indent--;
-                    writer << "}\n";
-                }
-                return;
-            }
+    return;
+}
+
+void runtime::gpu::CudaKernelBuilder::get_device_helper(codegen::CodeWriter& writer,
+                                                        const std::string& name,
+                                                        const std::string& data_type,
+                                                        const std::string& math_kernel,
+                                                        const size_t& num_inputs)
+{
+    if (math_kernel.size())
+    {
+        writer << "__device__ " << data_type << " " << name << "(";
+        for (size_t i = 0; i < num_inputs - 1; i++)
+        {
+            writer << data_type << " x" << i << ", ";
+        }
+        writer << data_type << " x" << num_inputs - 1;
+        writer << ")\n";
+        writer << "{\n";
+        writer.indent++;
+        {
+            writer << "return " + math_kernel << ";\n";
        }
+        writer.indent--;
+        writer << "}\n";
    }
+    return;
 }
--- a/src/ngraph/runtime/gpu/gpu_cuda_kernel_emitters.cpp
+++ b/src/ngraph/runtime/gpu/gpu_cuda_kernel_emitters.cpp
@@ -20,26 +20,22 @@
 #include "ngraph/runtime/gpu/gpu_cuda_kernel_emitters.hpp"
 #include "ngraph/runtime/gpu/gpu_cuda_kernel_ops.hpp"

-namespace ngraph
+using namespace ngraph;
+void runtime::gpu::emit_broadcast(
+    void* in, void* out, size_t repeat_size, size_t repeat_times, size_t count)
 {
-    namespace runtime
+    std::string name = "broadcast";
+    // Create an instance of nvrtcProgram with the code string.
+    if (CudaFunctionPool::instance().get(name) == nullptr)
    {
-        namespace gpu
-        {
-            void emit_broadcast(
-                void* in, void* out, size_t repeat_size, size_t repeat_times, size_t count)
-            {
-                std::string name = "broadcast";
-                // Create an instance of nvrtcProgram with the code string.
-                if (CudaFunctionPool::instance().get(name) == nullptr)
-                {
-                    std::string kernel;
-                    std::string data_type("float");
+        std::string kernel;
+        std::string data_type("float");

-                    kernel = R"(
+        kernel = R"(
 extern "C" __global__
-void cuda_)" + name + "(" + data_type +
-                             "* in, " + data_type + "* out, size_t m, size_t k, size_t n)\n" + R"(
+void cuda_)" + name +
+                 "(" + data_type + "* in, " + data_type + "* out, size_t m, size_t k, size_t n)\n" +
+                 R"(
 {
    size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
    if(tid < n)
@@ -48,28 +44,25 @@ void cuda_)" + name + "(" + data_type +
        out[tid] = in[idx];
    }
 })";
-                    CudaFunctionPool::instance().set(name, kernel);
-                }
+        CudaFunctionPool::instance().set(name, kernel);
+    }

-                //convert runtime ptr to driver api ptr
-                CUdeviceptr d_ptr_in, d_ptr_out;
-                d_ptr_in = (CUdeviceptr)in;
-                d_ptr_out = (CUdeviceptr)out;
+    //convert runtime ptr to driver api ptr
+    CUdeviceptr d_ptr_in, d_ptr_out;
+    d_ptr_in = CUdeviceptr(in);
+    d_ptr_out = CUdeviceptr(out);

-                void* args_list[] = {&d_ptr_in, &d_ptr_out, &repeat_size, &repeat_times, &count};
-                CUDA_SAFE_CALL(cuLaunchKernel(*CudaFunctionPool::instance().get(name).get(),
-                                              count,
-                                              1,
-                                              1, // grid dim
-                                              1,
-                                              1,
-                                              1, // block dim
-                                              0,
-                                              NULL, // shared mem and stream
-                                              args_list,
-                                              0));  // arguments
-                CUDA_SAFE_CALL(cuCtxSynchronize()); // Retrieve and print output.
-            }
-        }
-    }
+    void* args_list[] = {&d_ptr_in, &d_ptr_out, &repeat_size, &repeat_times, &count};
+    CUDA_SAFE_CALL(cuLaunchKernel(*CudaFunctionPool::instance().get(name).get(),
+                                  static_cast<unsigned int>(count),
+                                  1,
+                                  1, // grid dim
+                                  1,
+                                  1,
+                                  1, // block dim
+                                  0,
+                                  NULL, // shared mem and stream
+                                  args_list,
+                                  0));  // arguments
+    CUDA_SAFE_CALL(cuCtxSynchronize()); // Retrieve and print output.
 }
--- a/src/ngraph/runtime/gpu/gpu_emitter.cpp
+++ b/src/ngraph/runtime/gpu/gpu_emitter.cpp
@@ -518,15 +518,11 @@ cudnnSetOpTensorDescriptor(opTensorDesc,
                writer.indent++;
                auto arg_shape = args[0].get_shape();
                auto arg_rank = arg_shape.size();
-
                auto result_shape = out[0].get_shape();
-                auto& result_element_type = out[0].get_element_type();
-
                auto input_order = reshape->get_input_order();
-
                bool same_layout = is_sorted(input_order.begin(), input_order.end());
-
                size_t result_shape_product = 1;
+
                for (auto i : result_shape)
                {
                    result_shape_product *= i;

--- a/src/ngraph/runtime/gpu/gpu_external_function.cpp
+++ b/src/ngraph/runtime/gpu/gpu_external_function.cpp
@@ -114,6 +114,7 @@
 #include "ngraph/runtime/gpu/gpu_kernel_emitters.hpp"

 using namespace std;
+using namespace ngraph;

 static const string s_output_dir = "gpu_codegen";

@@ -159,119 +160,113 @@ static StaticInitializers s_static_initializers;

 #define TI(x) type_index(typeid(x))

-namespace ngraph
+static const runtime::gpu::OpMap dispatcher{
+    {TI(ngraph::op::Add), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Add>},
+    {TI(ngraph::op::Dot), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Dot>},
+    {TI(ngraph::op::Multiply), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Multiply>},
+    {TI(ngraph::op::Parameter), &runtime::gpu::GPU_Emitter::nop},
+    {TI(ngraph::op::Abs), &runtime::gpu::GPU_Emitter::EmitElementwise},
+    {TI(ngraph::op::Concat), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Concat>},
+    {TI(ngraph::op::Divide), &runtime::gpu::GPU_Emitter::EmitElementwise},
+    {TI(ngraph::op::Equal), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Equal>},
+    {TI(ngraph::op::GetOutputElement),
+     &runtime::gpu::GPU_Emitter::emit<ngraph::op::GetOutputElement>},
+    {TI(ngraph::op::Greater), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Greater>},
+    {TI(ngraph::op::GreaterEq), &runtime::gpu::GPU_Emitter::emit<ngraph::op::GreaterEq>},
+    {TI(ngraph::op::Less), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Less>},
+    {TI(ngraph::op::LessEq), &runtime::gpu::GPU_Emitter::emit<ngraph::op::LessEq>},
+    {TI(ngraph::op::Log), &runtime::gpu::GPU_Emitter::EmitElementwise},
+    {TI(ngraph::op::Maximum), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Maximum>},
+    {TI(ngraph::op::Minimum), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Minimum>},
+    {TI(ngraph::op::Negative), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Negative>},
+    {TI(ngraph::op::NotEqual), &runtime::gpu::GPU_Emitter::emit<ngraph::op::NotEqual>},
+    {TI(ngraph::op::Power), &runtime::gpu::GPU_Emitter::EmitElementwise},
+    {TI(ngraph::op::Select), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Select>},
+    {TI(ngraph::op::Subtract), &runtime::gpu::GPU_Emitter::EmitElementwise},
+    {TI(ngraph::op::Broadcast), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Broadcast>},
+    {TI(ngraph::op::Convert), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Convert>},
+    {TI(ngraph::op::Constant), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Constant>},
+    {TI(ngraph::op::Reshape), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Reshape>},
+    {TI(ngraph::op::FunctionCall), &runtime::gpu::GPU_Emitter::emit<ngraph::op::FunctionCall>},
+    {TI(ngraph::op::Reduce), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Reduce>},
+    {TI(ngraph::op::Sign), &runtime::gpu::GPU_Emitter::EmitElementwise},
+    {TI(ngraph::op::Slice), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Slice>},
+    {TI(ngraph::op::Sum), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Sum>},
+    {TI(ngraph::op::Exp), &runtime::gpu::GPU_Emitter::EmitElementwise},
+    {TI(ngraph::op::Sin), &runtime::gpu::GPU_Emitter::EmitElementwise},
+    {TI(ngraph::op::Sinh), &runtime::gpu::GPU_Emitter::EmitElementwise},
+    {TI(ngraph::op::Cos), &runtime::gpu::GPU_Emitter::EmitElementwise},
+    {TI(ngraph::op::Cosh), &runtime::gpu::GPU_Emitter::EmitElementwise},
+    {TI(ngraph::op::Tan), &runtime::gpu::GPU_Emitter::EmitElementwise},
+    {TI(ngraph::op::Tanh), &runtime::gpu::GPU_Emitter::EmitElementwise},
+    {TI(ngraph::op::Asin), &runtime::gpu::GPU_Emitter::EmitElementwise},
+    {TI(ngraph::op::Acos), &runtime::gpu::GPU_Emitter::EmitElementwise},
+    {TI(ngraph::op::Atan), &runtime::gpu::GPU_Emitter::EmitElementwise},
+    {TI(ngraph::op::ReplaceSlice), &runtime::gpu::GPU_Emitter::emit<ngraph::op::ReplaceSlice>},
+    {TI(ngraph::op::OneHot), &runtime::gpu::GPU_Emitter::emit<ngraph::op::OneHot>},
+    {TI(ngraph::op::Floor), &runtime::gpu::GPU_Emitter::EmitElementwise},
+    {TI(ngraph::op::Ceiling), &runtime::gpu::GPU_Emitter::EmitElementwise},
+    {TI(ngraph::op::Sqrt), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Sqrt>},
+    {TI(ngraph::op::Convolution), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Convolution>},
+    {TI(ngraph::op::ConvolutionBackpropFilters),
+     &runtime::gpu::GPU_Emitter::emit<ngraph::op::ConvolutionBackpropFilters>},
+    {TI(ngraph::op::ConvolutionBackpropData),
+     &runtime::gpu::GPU_Emitter::emit<ngraph::op::ConvolutionBackpropData>},
+    {TI(ngraph::op::Not), &runtime::gpu::GPU_Emitter::EmitElementwise},
+    {TI(ngraph::op::MaxPool), &runtime::gpu::GPU_Emitter::emit<ngraph::op::MaxPool>},
+    {TI(ngraph::op::Reverse), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Reverse>},
+    {TI(ngraph::op::Result), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Result>},
+    {TI(ngraph::op::ReduceWindow), &runtime::gpu::GPU_Emitter::emit<ngraph::op::ReduceWindow>},
+    {TI(ngraph::op::SelectAndScatter),
+     &runtime::gpu::GPU_Emitter::emit<ngraph::op::SelectAndScatter>},
+    {TI(ngraph::op::AvgPool), &runtime::gpu::GPU_Emitter::emit<ngraph::op::AvgPool>},
+    {TI(ngraph::op::AvgPoolBackprop),
+     &runtime::gpu::GPU_Emitter::emit<ngraph::op::AvgPoolBackprop>},
+    {TI(ngraph::op::Pad), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Pad>},
+    {TI(ngraph::op::BatchNorm), &runtime::gpu::GPU_Emitter::emit<ngraph::op::BatchNorm>},
+    {TI(ngraph::op::BatchNormBackprop),
+     &runtime::gpu::GPU_Emitter::emit<ngraph::op::BatchNormBackprop>},
+    {TI(ngraph::op::MaxPoolBackprop),
+     &runtime::gpu::GPU_Emitter::emit<ngraph::op::MaxPoolBackprop>},
+    {TI(ngraph::op::Product), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Product>},
+    {TI(ngraph::op::Max), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Max>},
+    {TI(ngraph::op::Min), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Min>},
+    {TI(ngraph::op::Relu), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Relu>},
+    {TI(ngraph::op::ReluBackprop), &runtime::gpu::GPU_Emitter::emit<ngraph::op::ReluBackprop>},
+    {TI(ngraph::op::Softmax), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Softmax>},
+};
+
+runtime::gpu::GPU_ExternalFunction::GPU_ExternalFunction(
+    const shared_ptr<ngraph::Function>& function, bool release_function)
+    : ngraph::runtime::ExternalFunction(function, release_function)
+    , m_compiled_function(nullptr)
+    , m_emit_timing(std::getenv("NGRAPH_GPU_EMIT_TIMING") != nullptr)
 {
-    namespace runtime
-    {
-        namespace gpu
-        {
-            static const OpMap dispatcher{
-                {TI(ngraph::op::Add), &GPU_Emitter::emit<ngraph::op::Add>},
-                {TI(ngraph::op::Dot), &GPU_Emitter::emit<ngraph::op::Dot>},
-                {TI(ngraph::op::Multiply), &GPU_Emitter::emit<ngraph::op::Multiply>},
-                {TI(ngraph::op::Parameter), &GPU_Emitter::nop},
-                {TI(ngraph::op::Abs), &GPU_Emitter::EmitElementwise},
-                {TI(ngraph::op::Concat), &GPU_Emitter::emit<ngraph::op::Concat>},
-                {TI(ngraph::op::Divide), &GPU_Emitter::EmitElementwise},
-                {TI(ngraph::op::Equal), &GPU_Emitter::emit<ngraph::op::Equal>},
-                {TI(ngraph::op::GetOutputElement),
-                 &GPU_Emitter::emit<ngraph::op::GetOutputElement>},
-                {TI(ngraph::op::Greater), &GPU_Emitter::emit<ngraph::op::Greater>},
-                {TI(ngraph::op::GreaterEq), &GPU_Emitter::emit<ngraph::op::GreaterEq>},
-                {TI(ngraph::op::Less), &GPU_Emitter::emit<ngraph::op::Less>},
-                {TI(ngraph::op::LessEq), &GPU_Emitter::emit<ngraph::op::LessEq>},
-                {TI(ngraph::op::Log), &GPU_Emitter::EmitElementwise},
-                {TI(ngraph::op::Maximum), &GPU_Emitter::emit<ngraph::op::Maximum>},
-                {TI(ngraph::op::Minimum), &GPU_Emitter::emit<ngraph::op::Minimum>},
-                {TI(ngraph::op::Negative), &GPU_Emitter::emit<ngraph::op::Negative>},
-                {TI(ngraph::op::NotEqual), &GPU_Emitter::emit<ngraph::op::NotEqual>},
-                {TI(ngraph::op::Power), &GPU_Emitter::EmitElementwise},
-                {TI(ngraph::op::Select), &GPU_Emitter::emit<ngraph::op::Select>},
-                {TI(ngraph::op::Subtract), &GPU_Emitter::EmitElementwise},
-                {TI(ngraph::op::Broadcast), &GPU_Emitter::emit<ngraph::op::Broadcast>},
-                {TI(ngraph::op::Convert), &GPU_Emitter::emit<ngraph::op::Convert>},
-                {TI(ngraph::op::Constant), &GPU_Emitter::emit<ngraph::op::Constant>},
-                {TI(ngraph::op::Reshape), &GPU_Emitter::emit<ngraph::op::Reshape>},
-                {TI(ngraph::op::FunctionCall), &GPU_Emitter::emit<ngraph::op::FunctionCall>},
-                {TI(ngraph::op::Reduce), &GPU_Emitter::emit<ngraph::op::Reduce>},
-                {TI(ngraph::op::Sign), &GPU_Emitter::EmitElementwise},
-                {TI(ngraph::op::Slice), &GPU_Emitter::emit<ngraph::op::Slice>},
-                {TI(ngraph::op::Sum), &GPU_Emitter::emit<ngraph::op::Sum>},
-                {TI(ngraph::op::Exp), &GPU_Emitter::EmitElementwise},
-                {TI(ngraph::op::Sin), &GPU_Emitter::EmitElementwise},
-                {TI(ngraph::op::Sinh), &GPU_Emitter::EmitElementwise},
-                {TI(ngraph::op::Cos), &GPU_Emitter::EmitElementwise},
-                {TI(ngraph::op::Cosh), &GPU_Emitter::EmitElementwise},
-                {TI(ngraph::op::Tan), &GPU_Emitter::EmitElementwise},
-                {TI(ngraph::op::Tanh), &GPU_Emitter::EmitElementwise},
-                {TI(ngraph::op::Asin), &GPU_Emitter::EmitElementwise},
-                {TI(ngraph::op::Acos), &GPU_Emitter::EmitElementwise},
-                {TI(ngraph::op::Atan), &GPU_Emitter::EmitElementwise},
-                {TI(ngraph::op::ReplaceSlice), &GPU_Emitter::emit<ngraph::op::ReplaceSlice>},
-                {TI(ngraph::op::OneHot), &GPU_Emitter::emit<ngraph::op::OneHot>},
-                {TI(ngraph::op::Floor), &GPU_Emitter::EmitElementwise},
-                {TI(ngraph::op::Ceiling), &GPU_Emitter::EmitElementwise},
-                {TI(ngraph::op::Sqrt), &GPU_Emitter::emit<ngraph::op::Sqrt>},
-                {TI(ngraph::op::Convolution), &GPU_Emitter::emit<ngraph::op::Convolution>},
-                {TI(ngraph::op::ConvolutionBackpropFilters),
-                 &GPU_Emitter::emit<ngraph::op::ConvolutionBackpropFilters>},
-                {TI(ngraph::op::ConvolutionBackpropData),
-                 &GPU_Emitter::emit<ngraph::op::ConvolutionBackpropData>},
-                {TI(ngraph::op::Not), &GPU_Emitter::EmitElementwise},
-                {TI(ngraph::op::MaxPool), &GPU_Emitter::emit<ngraph::op::MaxPool>},
-                {TI(ngraph::op::Reverse), &GPU_Emitter::emit<ngraph::op::Reverse>},
-                {TI(ngraph::op::Result), &GPU_Emitter::emit<ngraph::op::Result>},
-                {TI(ngraph::op::ReduceWindow), &GPU_Emitter::emit<ngraph::op::ReduceWindow>},
-                {TI(ngraph::op::SelectAndScatter),
-                 &GPU_Emitter::emit<ngraph::op::SelectAndScatter>},
-                {TI(ngraph::op::AvgPool), &GPU_Emitter::emit<ngraph::op::AvgPool>},
-                {TI(ngraph::op::AvgPoolBackprop), &GPU_Emitter::emit<ngraph::op::AvgPoolBackprop>},
-                {TI(ngraph::op::Pad), &GPU_Emitter::emit<ngraph::op::Pad>},
-                {TI(ngraph::op::BatchNorm), &GPU_Emitter::emit<ngraph::op::BatchNorm>},
-                {TI(ngraph::op::BatchNormBackprop),
-                 &GPU_Emitter::emit<ngraph::op::BatchNormBackprop>},
-                {TI(ngraph::op::MaxPoolBackprop), &GPU_Emitter::emit<ngraph::op::MaxPoolBackprop>},
-                {TI(ngraph::op::Product), &GPU_Emitter::emit<ngraph::op::Product>},
-                {TI(ngraph::op::Max), &GPU_Emitter::emit<ngraph::op::Max>},
-                {TI(ngraph::op::Min), &GPU_Emitter::emit<ngraph::op::Min>},
-                {TI(ngraph::op::Relu), &GPU_Emitter::emit<ngraph::op::Relu>},
-                {TI(ngraph::op::ReluBackprop), &GPU_Emitter::emit<ngraph::op::ReluBackprop>},
-                {TI(ngraph::op::Softmax), &GPU_Emitter::emit<ngraph::op::Softmax>},
-            };
-
-            GPU_ExternalFunction::GPU_ExternalFunction(const shared_ptr<ngraph::Function>& function,
-                                                       bool release_function)
-                : ngraph::runtime::ExternalFunction(function, release_function)
-                , m_compiled_function(nullptr)
-                , m_emit_timing(std::getenv("NGRAPH_GPU_EMIT_TIMING") != nullptr)
-            {
-            }
+}

-            void GPU_ExternalFunction::compile()
-            {
-                if (m_is_compiled)
-                {
-                    return;
-                }
+void runtime::gpu::GPU_ExternalFunction::compile()
+{
+    if (m_is_compiled)
+    {
+        return;
+    }

-                string function_name = m_function->get_name();
-                string dump_filename =
-                    file_util::path_join(s_output_dir, function_name + "_ops.txt");
+    string function_name = m_function->get_name();
+    string dump_filename = file_util::path_join(s_output_dir, function_name + "_ops.txt");

-                pass::Manager pass_manager;
-                // pass_manager.register_pass<pass::TopologicalSort>();
-                // For now, just make everyone row-major.
-                pass_manager
-                    .register_pass<pass::AssignLayout<descriptor::layout::DenseTensorViewLayout>>();
-                pass_manager.register_pass<pass::Liveness>();
-                pass_manager.register_pass<pass::MemoryLayout>(64);
-                pass_manager.register_pass<pass::DumpSorted>(dump_filename);
-                pass_manager.run_passes(m_function);
+    pass::Manager pass_manager;
+    // pass_manager.register_pass<pass::TopologicalSort>();
+    // For now, just make everyone row-major.
+    pass_manager.register_pass<pass::AssignLayout<descriptor::layout::DenseTensorViewLayout>>();
+    pass_manager.register_pass<pass::Liveness>();
+    pass_manager.register_pass<pass::MemoryLayout>(64);
+    pass_manager.register_pass<pass::DumpSorted>(dump_filename);
+    pass_manager.run_passes(m_function);

-                codegen::CodeWriter writer;
+    codegen::CodeWriter writer;

-                writer +=
-                    R"(// Generated by the NGraph GPU backend
+    writer +=
+        R"(// Generated by the NGraph GPU backend
    #include <cublas_v2.h>
    #include <cuda.h>
    #include <cuda_runtime.h>
@@ -297,563 +292,516 @@ namespace ngraph
    #include "ngraph/util.hpp"
 )";

-                string pch_header_source = writer.get_code();
+    string pch_header_source = writer.get_code();

-                writer += R"(
+    writer += R"(
 using namespace ngraph;
 using namespace std;
    )";

-                if (m_emit_timing)
-                {
-                    writer << "// Declare debug timers\n";
-                    vector<string> names;
-                    for (shared_ptr<Function> current_function :
-                         pass_manager.get_state().get_functions())
-                    {
-                        for (shared_ptr<Node> node : current_function->get_ordered_ops())
-                        {
-                            if (!node->is_parameter() && !node->is_constant())
-                            {
-                                names.push_back(node->get_name());
-                            }
-                        }
-                    }
-                    for (const string& s : names)
-                    {
-                        writer << "ngraph::stopwatch timer_" << s << ";\n";
-                    }
-                    writer << "extern \"C\" size_t get_debug_timer_count() { return "
-                           << names.size() << "; }\n";
-                    writer << "extern \"C\" const char* get_debug_timer_name(size_t index)\n";
-                    writer << "{\n";
-                    writer.indent++;
-                    writer << "const char* rc;\n";
-                    writer << "switch(index)\n";
-                    writer << "{\n";
-                    for (size_t i = 0; i < names.size(); i++)
-                    {
-                        writer << "case " << i << ": rc = \"" << names[i] << "\"; break;\n";
-                    }
-                    writer << "default: rc = \"\";\n";
-                    writer << "}\n";
-                    writer << "return rc;\n";
-                    writer.indent--;
-                    writer << "}\n";
-                    writer
-                        << "extern \"C\" const size_t get_debug_timer_microseconds(size_t index)\n";
-                    writer << "{\n";
-                    writer.indent++;
-                    writer << "size_t rc;\n";
-                    writer << "switch(index)\n";
-                    writer << "{\n";
-                    for (size_t i = 0; i < names.size(); i++)
-                    {
-                        writer << "case " << i << ": rc = timer_" << names[i]
-                               << ".get_total_microseconds(); break;\n";
-                    }
-                    writer << "default: rc = 0;\n";
-                    writer << "}\n";
-                    writer << "return rc;\n";
-                    writer.indent--;
-                    writer << "}\n";
-                    writer
-                        << "extern \"C\" const size_t get_debug_timer_call_count(size_t index)\n";
-                    writer << "{\n";
-                    writer.indent++;
-                    writer << "size_t rc;\n";
-                    writer << "switch(index)\n";
-                    writer << "{\n";
-                    for (size_t i = 0; i < names.size(); i++)
-                    {
-                        writer << "case " << i << ": rc = timer_" << names[i]
-                               << ".get_call_count(); break;\n";
-                    }
-                    writer << "default: rc = 0;\n";
-                    writer << "}\n";
-                    writer << "return rc;\n";
-                    writer.indent--;
-                    writer << "}\n";
-                    writer << "\n";
-                }
-                //     // The "dso_handle" symbol is required by __cxa_atexit()
-                //     // which is enabled because the JIT uses it as the default mechanism
-                //     // to register cleanup handlers. We use it, and not atexit(), because
-                //     // atexit() happens too late, when the JIT is no longer alive
-
-                writer << "void *__dso_handle = 0;\n\n";
-                writer << "// Declare all constants\n";
-                for (shared_ptr<Function> current_function :
-                     pass_manager.get_state().get_functions())
+    if (m_emit_timing)
+    {
+        writer << "// Declare debug timers\n";
+        vector<string> names;
+        for (shared_ptr<Function> current_function : pass_manager.get_state().get_functions())
+        {
+            for (shared_ptr<Node> node : current_function->get_ordered_ops())
+            {
+                if (!node->is_parameter() && !node->is_constant())
                {
-                    for (shared_ptr<Node> node : current_function->get_ordered_ops())
-                    {
-                        const op::Constant* c = dynamic_cast<ngraph::op::Constant*>(node.get());
-                        if (c)
-                        {
-                            shared_ptr<descriptor::TensorView> tv =
-                                node->get_outputs()[0].get_tensor_view();
-                            auto c_value_strings = c->get_value_strings();
-                            writer << "static "
-                                   << tv->get_tensor().get_element_type().c_type_string() << " "
-                                   << tv->get_tensor().get_name() << "_cpu["
-                                   << c_value_strings.size() << "] =\n";
-                            writer << "{\n";
-                            writer.indent++;
-                            writer << emit_string_array(c_value_strings, 100 - writer.indent * 4);
-                            writer.indent--;
-                            writer << "\n};\n\n";
-                            writer << "static "
-                                   << tv->get_tensor().get_element_type().c_type_string() << " *"
-                                   << tv->get_tensor().get_name() << ";\n";
-                            m_variable_name_map[tv->get_tensor().get_name()] =
-                                tv->get_tensor().get_name();
-                        }
-                    }
+                    names.push_back(node->get_name());
                }
+            }
+        }
+        for (const string& s : names)
+        {
+            writer << "ngraph::stopwatch timer_" << s << ";\n";
+        }
+        writer << "extern \"C\" size_t get_debug_timer_count() { return " << names.size()
+               << "; }\n";
+        writer << "extern \"C\" const char* get_debug_timer_name(size_t index)\n";
+        writer << "{\n";
+        writer.indent++;
+        writer << "const char* rc;\n";
+        writer << "switch(index)\n";
+        writer << "{\n";
+        for (size_t i = 0; i < names.size(); i++)
+        {
+            writer << "case " << i << ": rc = \"" << names[i] << "\"; break;\n";
+        }
+        writer << "default: rc = \"\";\n";
+        writer << "}\n";
+        writer << "return rc;\n";
+        writer.indent--;
+        writer << "}\n";
+        writer << "extern \"C\" const size_t get_debug_timer_microseconds(size_t index)\n";
+        writer << "{\n";
+        writer.indent++;
+        writer << "size_t rc;\n";
+        writer << "switch(index)\n";
+        writer << "{\n";
+        for (size_t i = 0; i < names.size(); i++)
+        {
+            writer << "case " << i << ": rc = timer_" << names[i]
+                   << ".get_total_microseconds(); break;\n";
+        }
+        writer << "default: rc = 0;\n";
+        writer << "}\n";
+        writer << "return rc;\n";
+        writer.indent--;
+        writer << "}\n";
+        writer << "extern \"C\" const size_t get_debug_timer_call_count(size_t index)\n";
+        writer << "{\n";
+        writer.indent++;
+        writer << "size_t rc;\n";
+        writer << "switch(index)\n";
+        writer << "{\n";
+        for (size_t i = 0; i < names.size(); i++)
+        {
+            writer << "case " << i << ": rc = timer_" << names[i] << ".get_call_count(); break;\n";
+        }
+        writer << "default: rc = 0;\n";
+        writer << "}\n";
+        writer << "return rc;\n";
+        writer.indent--;
+        writer << "}\n";
+        writer << "\n";
+    }
+    // The "__dso_handle" symbol is required by __cxa_atexit(),
+    // which is enabled because the JIT uses it as the default mechanism
+    // to register cleanup handlers. We use it, and not atexit(), because
+    // atexit() happens too late, when the JIT is no longer alive.
+
+    writer << "void *__dso_handle = 0;\n\n";
+    writer << "// Declare all constants\n";
+    for (shared_ptr<Function> current_function : pass_manager.get_state().get_functions())
+    {
+        for (shared_ptr<Node> node : current_function->get_ordered_ops())
+        {
+            const op::Constant* c = dynamic_cast<ngraph::op::Constant*>(node.get());
+            if (c)
+            {
+                shared_ptr<descriptor::TensorView> tv = node->get_outputs()[0].get_tensor_view();
+                auto c_value_strings = c->get_value_strings();
+                writer << "static " << tv->get_tensor().get_element_type().c_type_string() << " "
+                       << tv->get_tensor().get_name() << "_cpu[" << c_value_strings.size()
+                       << "] =\n";
+                writer << "{\n";
+                writer.indent++;
+                writer << emit_string_array(c_value_strings, 100 - writer.indent * 4);
+                writer.indent--;
+                writer << "\n};\n\n";
+                writer << "static " << tv->get_tensor().get_element_type().c_type_string() << " *"
+                       << tv->get_tensor().get_name() << ";\n";
+                m_variable_name_map[tv->get_tensor().get_name()] = tv->get_tensor().get_name();
+            }
+        }
+    }

-                writer << "// Declare all functions\n";
-                for (shared_ptr<Function> f : pass_manager.get_state().get_functions())
-                {
-                    writer << "extern \"C\" void " << f->get_name()
-                           << "(void** inputs, void** outputs, "
-                              "cublasHandle_t& cublas_handle, "
-                              "cudnnHandle_t& cudnn_handle);\n";
-                }
+    writer << "// Declare all functions\n";
+    for (shared_ptr<Function> f : pass_manager.get_state().get_functions())
+    {
+        writer << "extern \"C\" void " << f->get_name() << "(void** inputs, void** outputs, "
+                                                           "cublasHandle_t& cublas_handle, "
+                                                           "cudnnHandle_t& cudnn_handle);\n";
+    }

-                writer << "\n";
+    writer << "\n";

-                unordered_map<Node*, string> match_functions;
-                for (shared_ptr<Function> current_function :
-                     pass_manager.get_state().get_functions())
+    unordered_map<Node*, string> match_functions;
+    for (shared_ptr<Function> current_function : pass_manager.get_state().get_functions())
+    {
+        set<string> output_names;
+        for (shared_ptr<Node> op : current_function->get_results())
+        {
+            shared_ptr<descriptor::TensorView> tv = op->get_output_tensor_view();
+            output_names.insert(tv->get_tensor().get_name());
+        }
+        const list<shared_ptr<Node>>& tmp = current_function->get_ordered_ops();
+        if (tmp.size() < 2)
+        {
+            // Since we are comparing ops there must be at least two ops to proceed.
+            continue;
+        }
+        vector<shared_ptr<Node>> op_list{tmp.begin(), tmp.end()};
+        for (size_t i = 0; i < op_list.size() - 1; i++)
+        {
+            if (op_list[i]->is_constant() || op_list[i]->is_parameter())
+            {
+                continue;
+            }
+            if (contains_key(match_functions, op_list[i].get()))
+            {
+                continue;
+            }
+            string match_function_name;
+            if (!match_function_name.empty())
+            {
+                writer << "static void " << match_function_name << "(";
+                writer.indent++;
+                // Work around a compiler warning: evaluating *node inside typeid() may have
+                // side effects with shared pointers, which is fine here, but clang warns about it.
+                auto& n = *op_list[i];
+                auto handler = dispatcher.find(type_index(typeid(n)));
+                vector<GPU_TensorViewWrapper> in;
+                size_t arg_index = 0;
+                set<string> arg_names;
+                for (const descriptor::Input& input : n.get_inputs())
                {
-                    bool temporaries_used = false;
-                    size_t worst_case_tmp_size = 0;
-
-                    set<string> output_names;
-                    for (shared_ptr<Node> op : current_function->get_results())
+                    const descriptor::Output& output = input.get_output();
+                    shared_ptr<descriptor::TensorView> tv = output.get_tensor_view();
+                    GPU_TensorViewWrapper tvw{tv, "_arg" + to_string(arg_index)};
+                    if (!contains(arg_names, tvw.get_name()))
                    {
-                        shared_ptr<descriptor::TensorView> tv = op->get_output_tensor_view();
-                        output_names.insert(tv->get_tensor().get_name());
-                    }
-                    const list<shared_ptr<Node>>& tmp = current_function->get_ordered_ops();
-                    if (tmp.size() < 2)
-                    {
-                        // Since we are comparing ops there must be at least two ops to proceed.
-                        continue;
-                    }
-                    vector<shared_ptr<Node>> op_list{tmp.begin(), tmp.end()};
-                    for (size_t i = 0; i < op_list.size() - 1; i++)
-                    {
-                        if (op_list[i]->is_constant() || op_list[i]->is_parameter())
-                        {
-                            continue;
-                        }
-                        if (contains_key(match_functions, op_list[i].get()))
-                        {
-                            continue;
-                        }
-                        string match_function_name;
-                        for (size_t j = i + 1; j < op_list.size(); j++)
-                        {
-                            if (0) //op_list[i]->is_functionally_identical(*op_list[j]))
-                            {
-                                if (match_function_name.empty())
-                                {
-                                    match_function_name = "func_" + op_list[i]->get_name();
-                                    match_functions.insert({op_list[i].get(), match_function_name});
-                                }
-                                match_functions.insert({op_list[j].get(), match_function_name});
-                            }
-                        }
-                        if (!match_function_name.empty())
+                        arg_names.insert(tvw.get_name());
+                        if (arg_index++ > 0)
                        {
-                            writer << "static void " << match_function_name << "(";
-                            writer.indent++;
-                            // Work around a compiler warning (*node inside typeid may have effects
-                            // with shared pointers, which is fine here but clang doesn't like it.)
-                            auto& n = *op_list[i];
-                            auto handler = dispatcher.find(type_index(typeid(n)));
-                            vector<GPU_TensorViewWrapper> in;
-                            size_t arg_index = 0;
-                            set<string> arg_names;
-                            for (const descriptor::Input& input : n.get_inputs())
-                            {
-                                const descriptor::Output& output = input.get_output();
-                                shared_ptr<descriptor::TensorView> tv = output.get_tensor_view();
-                                GPU_TensorViewWrapper tvw{tv, "_arg" + to_string(arg_index)};
-                                if (!contains(arg_names, tvw.get_name()))
-                                {
-                                    arg_names.insert(tvw.get_name());
-                                    if (arg_index++ > 0)
-                                    {
-                                        writer << ",";
-                                    }
-                                    writer << "\n";
-                                    writer << tvw.get_type() << "* " << tvw.get_name();
-                                }
-                                in.push_back(tvw);
-                            }
-                            vector<GPU_TensorViewWrapper> out;
-                            for (const descriptor::Output& output : n.get_outputs())
-                            {
-                                shared_ptr<descriptor::TensorView> tv = output.get_tensor_view();
-                                GPU_TensorViewWrapper tvw{tv, "_out" + to_string(arg_index)};
-                                if (arg_index++ > 0)
-                                {
-                                    writer << ",";
-                                }
-                                writer << "\n";
-                                writer << tvw.get_type() << "* " << tvw.get_name();
-                                out.push_back(tvw);
-                            }
-                            writer.indent--;
-                            writer << "\n)\n";
-                            writer << "{\n";
-                            writer.indent++;
-                            handler->second(this, writer, &n, in, out);
-                            writer.indent--;
-                            writer << "}\n";
+                            writer << ",";
                        }
+                        writer << "\n";
+                        writer << tvw.get_type() << "* " << tvw.get_name();
                    }
+                    in.push_back(tvw);
                }
-
-                for (shared_ptr<Function> current_function :
-                     pass_manager.get_state().get_functions())
+                vector<GPU_TensorViewWrapper> out;
+                for (const descriptor::Output& output : n.get_outputs())
                {
-                    set<string> output_names;
-                    for (shared_ptr<Node> op : current_function->get_results())
-                    {
-                        shared_ptr<descriptor::TensorView> tv = op->get_output_tensor_view();
-                        output_names.insert(tv->get_tensor().get_name());
-                    }
-                    set<descriptor::TensorView*> constants;
-                    for (shared_ptr<Node> node : current_function->get_ordered_ops())
-                    {
-                        if (dynamic_cast<ngraph::op::Constant*>(node.get()))
-                        {
-                            shared_ptr<descriptor::TensorView> tv =
-                                node->get_outputs()[0].get_tensor_view();
-                            constants.insert(tv.get());
-                        }
-                    }
-
-                    writer << "extern \"C\" void " << current_function->get_name();
-                    writer << "(void** inputs, void** outputs, cublasHandle_t& cublas_handle, "
-                              "cudnnHandle_t& "
-                              "cudnn_handle)\n";
-                    writer << "{\n";
-                    writer.indent++;
-
-                    for (shared_ptr<Function> current_function :
-                         pass_manager.get_state().get_functions())
-                    {
-                        for (shared_ptr<Node> node : current_function->get_ordered_ops())
-                        {
-                            const op::Constant* c = dynamic_cast<op::Constant*>(node.get());
-                            if (c)
-                            {
-                                shared_ptr<descriptor::TensorView> tv =
-                                    node->get_outputs()[0].get_tensor_view();
-                                writer << "if(" << tv->get_tensor().get_name() << " == NULL)\n";
-                                writer << "{\n";
-                                writer.indent++;
-                                writer << tv->get_tensor().get_name() << " = ("
-                                       << tv->get_tensor().get_element_type().c_type_string()
-                                       << " *) ngraph::runtime::gpu::create_gpu_buffer("
-                                       << tv->get_tensor().size() << ");\n";
-                                writer << "runtime::gpu::cuda_memcpyHtD("
-                                       << tv->get_tensor().get_name() << ", "
-                                       << tv->get_tensor().get_name() << "_cpu, "
-                                       << tv->get_tensor().size() << ");\n";
-                                writer.indent--;
-                                writer << "}\n";
-                            }
-                        }
-                    }
-                    bool temporaries_used = false;
-                    size_t worst_case_tmp_size = 0;
-                    for (shared_ptr<Node> node : current_function->get_ordered_ops())
+                    shared_ptr<descriptor::TensorView> tv = output.get_tensor_view();
+                    GPU_TensorViewWrapper tvw{tv, "_out" + to_string(arg_index)};
+                    if (arg_index++ > 0)
                    {
-                        if (node->liveness_new_list.size() > 0)
-                        {
-                            temporaries_used = true;
-                            for (descriptor::Tensor* tensor : node->liveness_new_list)
-                            {
-                                worst_case_tmp_size += tensor->size();
-                            }
-                        }
-                    }
-                    if (temporaries_used)
-                    {
-                        size_t temp_pool_size = current_function->get_temporary_pool_size();
-                        writer << "// Allocate the memory pool\n";
-                        // TODO memory pool malloc.
-                        writer << "void* pool_base_ptr = ngraph::runtime::gpu::create_gpu_buffer("
-                               << temp_pool_size << ");\n";
-
-                        // Add temporaries to the variable name map
-                        for (shared_ptr<Node> node : current_function->get_ordered_ops())
-                        {
-                            for (descriptor::Tensor* tensor : node->liveness_new_list)
-                            {
-                                stringstream ss;
-                                ss << "((" << tensor->get_element_type().c_type_string()
-                                   << "*)((char *)pool_base_ptr + " << tensor->get_pool_offset()
-                                   << "))";
-                                m_variable_name_map[tensor->get_name()] = ss.str();
-                            }
-                        }
+                        writer << ",";
                    }
+                    writer << "\n";
+                    writer << tvw.get_type() << "* " << tvw.get_name();
+                    out.push_back(tvw);
+                }
+                writer.indent--;
+                writer << "\n)\n";
+                writer << "{\n";
+                writer.indent++;
+                handler->second(this, writer, &n, in, out);
+                writer.indent--;
+                writer << "}\n";
+            }
+        }
+    }

-                    // Add inputs to the variable name map
-                    size_t arg_index = 0;
-                    for (shared_ptr<ngraph::op::Parameter> param :
-                         current_function->get_parameters())
-                    {
-                        for (size_t i = 0; i < param->get_output_size(); ++i)
-                        {
-                            shared_ptr<descriptor::TensorView> tv =
-                                param->get_output_tensor_view(i);
-                            const element::Type& et =
-                                tv->get_tensor_view_type()->get_element_type();
-                            string type = et.c_type_string();
-                            stringstream ss;
-                            ss << "((" << type << "*)(inputs[" << arg_index << "]))";
-                            m_variable_name_map[tv->get_tensor().get_name()] = ss.str();
-                            arg_index++;
-                        }
-                    }
+    for (shared_ptr<Function> current_function : pass_manager.get_state().get_functions())
+    {
+        set<string> output_names;
+        for (shared_ptr<Node> op : current_function->get_results())
+        {
+            shared_ptr<descriptor::TensorView> tv = op->get_output_tensor_view();
+            output_names.insert(tv->get_tensor().get_name());
+        }
+        set<descriptor::TensorView*> constants;
+        for (shared_ptr<Node> node : current_function->get_ordered_ops())
+        {
+            if (dynamic_cast<ngraph::op::Constant*>(node.get()))
+            {
+                shared_ptr<descriptor::TensorView> tv = node->get_outputs()[0].get_tensor_view();
+                constants.insert(tv.get());
+            }
+        }

-                    // create output alias map
-                    size_t output_index = 0;
-                    unordered_map<descriptor::TensorView*, vector<size_t>> output_alias_map;
-                    vector<size_t> aliases;
-                    for (size_t i = 0; i < current_function->get_output_size(); ++i)
-                    {
-                        shared_ptr<Node> op = current_function->get_output_op(i);
-                        shared_ptr<descriptor::TensorView> otv = op->get_output_tensor_view();
-                        vector<size_t>& al = output_alias_map[otv.get()];
-                        al.push_back(output_index);
-                        if (al.size() > 1)
-                        {
-                            aliases.push_back(output_index);
-                        }
-                        output_index++;
-                    }
+        writer << "extern \"C\" void " << current_function->get_name();
+        writer << "(void** inputs, void** outputs, cublasHandle_t& cublas_handle, "
+                  "cudnnHandle_t& "
+                  "cudnn_handle)\n";
+        writer << "{\n";
+        writer.indent++;

-                    // Add outputs to the variable name map
-                    output_index = 0;
-                    for (size_t i = 0; i < current_function->get_output_size(); ++i)
-                    {
-                        shared_ptr<Node> op = current_function->get_output_op(i);
-                        shared_ptr<descriptor::TensorView> tv = op->get_output_tensor_view();
-                        const element::Type& et = tv->get_tensor_view_type()->get_element_type();
-                        bool parameter_as_output = false;
-                        for (shared_ptr<ngraph::op::Parameter> param :
-                             current_function->get_parameters())
-                        {
-                            for (const descriptor::Output& pout : param->get_outputs())
-                            {
-                                shared_ptr<descriptor::TensorView> ptv = pout.get_tensor_view();
-                                if (tv == ptv)
-                                {
-                                    parameter_as_output = true;
-                                    writer
-                                        << "ngraph::runtime::gpu::cuda_memcpyDtD(reinterpret_cast<"
-                                        << et.c_type_string() << "*>(outputs[" << output_index
-                                        << "]), "
-                                        << m_variable_name_map[ptv->get_tensor().get_name()] << ", "
-                                        << ptv->get_tensor().size() << ");\n";
-                                    break;
-                                }
-                            }
-                        }
-                        if (!parameter_as_output && !contains(aliases, output_index))
-                        {
-                            if (contains(constants, tv.get()))
-                            {
-                                writer << "ngraph::runtime::gpu::cuda_memcpyHtD(outputs["
-                                       << output_index << "], " << tv->get_tensor().get_name()
-                                       << ", " << tv->get_tensor().size() << ");\n";
-                            }
-                            else
-                            {
-                                string type = et.c_type_string();
-                                stringstream ss;
-                                ss << "((" << type << "*)(outputs[" << output_index << "]))";
-                                m_variable_name_map[tv->get_tensor().get_name()] = ss.str();
-                            }
-                        }
-                        output_index++;
-                    }
+        for (shared_ptr<Node> node : current_function->get_ordered_ops())
+        {
+            const op::Constant* c = dynamic_cast<op::Constant*>(node.get());
+            if (c)
+            {
+                shared_ptr<descriptor::TensorView> tv = node->get_outputs()[0].get_tensor_view();
+                writer << "if(" << tv->get_tensor().get_name() << " == NULL)\n";
+                writer << "{\n";
+                writer.indent++;
+                writer << tv->get_tensor().get_name() << " = ("
+                       << tv->get_tensor().get_element_type().c_type_string()
+                       << " *) runtime::gpu::create_gpu_buffer(" << tv->get_tensor().size()
+                       << ");\n";
+                writer << "runtime::gpu::cuda_memcpyHtD(" << tv->get_tensor().get_name() << ", "
+                       << tv->get_tensor().get_name() << "_cpu, " << tv->get_tensor().size()
+                       << ");\n";
+                writer.indent--;
+                writer << "}\n";
+            }
+        }

-                    for (shared_ptr<Node> node : current_function->get_ordered_ops())
-                    {
-                        auto& n =
-                            *node; // Work around a compiler warning (*node inside typeid may have effects
-                        // with shared pointers, which is fine here but clang doesn't like it.)
-                        auto handler = dispatcher.find(type_index(typeid(n)));
-                        if (handler == dispatcher.end())
-                        {
-                            throw ngraph_error("Unhandled op during code generation : " +
-                                               node->description());
-                        }
-                        vector<GPU_TensorViewWrapper> in;
-                        for (const descriptor::Input& input : node->get_inputs())
-                        {
-                            const descriptor::Output& output = input.get_output();
-                            shared_ptr<descriptor::TensorView> tv = output.get_tensor_view();
-                            in.push_back(GPU_TensorViewWrapper(
-                                tv, m_variable_name_map[tv->get_tensor().get_name()]));
-                        }
-                        vector<GPU_TensorViewWrapper> out;
-                        for (const descriptor::Output& output : node->get_outputs())
-                        {
-                            shared_ptr<descriptor::TensorView> tv = output.get_tensor_view();
-                            out.push_back(GPU_TensorViewWrapper(
-                                tv, m_variable_name_map[tv->get_tensor().get_name()]));
-                        }
+        bool temporaries_used = false;
+        size_t worst_case_tmp_size = 0;
+        for (shared_ptr<Node> node : current_function->get_ordered_ops())
+        {
+            if (node->liveness_new_list.size() > 0)
+            {
+                temporaries_used = true;
+                for (descriptor::Tensor* tensor : node->liveness_new_list)
+                {
+                    worst_case_tmp_size += tensor->size();
+                }
+            }
+        }
+        if (temporaries_used)
+        {
+            size_t temp_pool_size = current_function->get_temporary_pool_size();
+            writer << "// Allocate the memory pool\n";
+            // TODO memory pool malloc.
+            writer << "void* pool_base_ptr = ngraph::runtime::gpu::create_gpu_buffer("
+                   << temp_pool_size << ");\n";
+
+            // Add temporaries to the variable name map
+            for (shared_ptr<Node> node : current_function->get_ordered_ops())
+            {
+                for (descriptor::Tensor* tensor : node->liveness_new_list)
+                {
+                    stringstream ss;
+                    ss << "((" << tensor->get_element_type().c_type_string()
+                       << "*)((char *)pool_base_ptr + " << tensor->get_pool_offset() << "))";
+                    m_variable_name_map[tensor->get_name()] = ss.str();
+                }
+            }
+        }

-                        // Emit operation prologue
-                        if (!node->is_parameter() && !node->is_constant())
-                        {
-                            if (m_emit_timing)
-                            {
-                                emit_debug_function_entry(writer, node.get(), in, out);
-                            }
-                        }
+        // Add inputs to the variable name map
+        size_t arg_index = 0;
+        for (shared_ptr<ngraph::op::Parameter> param : current_function->get_parameters())
+        {
+            for (size_t i = 0; i < param->get_output_size(); ++i)
+            {
+                shared_ptr<descriptor::TensorView> tv = param->get_output_tensor_view(i);
+                const element::Type& et = tv->get_tensor_view_type()->get_element_type();
+                string type = et.c_type_string();
+                stringstream ss;
+                ss << "((" << type << "*)(inputs[" << arg_index << "]))";
+                m_variable_name_map[tv->get_tensor().get_name()] = ss.str();
+                arg_index++;
+            }
+        }

-                        // Emit operation body
-                        string func_name;
-                        auto it = match_functions.find(node.get());
-                        if (it != match_functions.end())
-                        {
-                            func_name = it->second;
-                        }
-                        if (func_name.empty())
-                        {
-                            handler->second(this, writer, node.get(), in, out);
-                        }
-                        else
-                        {
-                            vector<string> names;
-                            for (const GPU_TensorViewWrapper& tv : in)
-                            {
-                                names.push_back(tv.get_name());
-                            }
-                            for (const GPU_TensorViewWrapper& tv : out)
-                            {
-                                names.push_back(tv.get_name());
-                            }
-                            writer << func_name << "(" << join(names) << ");\n";
-                        }
+        // create output alias map
+        size_t output_index = 0;
+        unordered_map<descriptor::TensorView*, vector<size_t>> output_alias_map;
+        vector<size_t> aliases;
+        for (size_t i = 0; i < current_function->get_output_size(); ++i)
+        {
+            shared_ptr<Node> op = current_function->get_output_op(i);
+            shared_ptr<descriptor::TensorView> otv = op->get_output_tensor_view();
+            vector<size_t>& al = output_alias_map[otv.get()];
+            al.push_back(output_index);
+            if (al.size() > 1)
+            {
+                aliases.push_back(output_index);
+            }
+            output_index++;
+        }

-                        // Emit operation epilogue
-                        if (!node->is_parameter() && !node->is_constant())
-                        {
-                            if (m_emit_timing)
-                            {
-                                emit_debug_function_exit(writer, node.get(), in, out);
-                            }
-                        }
+        // Add outputs to the variable name map
+        output_index = 0;
+        for (size_t i = 0; i < current_function->get_output_size(); ++i)
+        {
+            shared_ptr<Node> op = current_function->get_output_op(i);
+            shared_ptr<descriptor::TensorView> tv = op->get_output_tensor_view();
+            const element::Type& et = tv->get_tensor_view_type()->get_element_type();
+            bool parameter_as_output = false;
+            for (shared_ptr<ngraph::op::Parameter> param : current_function->get_parameters())
+            {
+                for (const descriptor::Output& pout : param->get_outputs())
+                {
+                    shared_ptr<descriptor::TensorView> ptv = pout.get_tensor_view();
+                    if (tv == ptv)
+                    {
+                        parameter_as_output = true;
+                        writer << "ngraph::runtime::gpu::cuda_memcpyDtD(reinterpret_cast<"
+                               << et.c_type_string() << "*>(outputs[" << output_index << "]), "
+                               << m_variable_name_map[ptv->get_tensor().get_name()] << ", "
+                               << ptv->get_tensor().size() << ");\n";
+                        break;
                    }
-                    writer.indent--;
-                    // End generated function
-                    writer += "}\n\n";
                }
-                // TODO: Cleanup and make this a utility function
-
-                file_util::make_directory(s_output_dir);
-                string filename =
-                    file_util::path_join(s_output_dir, function_name + "_codegen.cpp");
-                ofstream out(filename);
-                string code = writer.get_code();
-                out << code;
-                out.close();
-
-                m_compiler.reset(new codegen::Compiler());
-                m_execution_engine.reset(new codegen::ExecutionEngine());
-
-                m_compiler->set_precompiled_header_source(pch_header_source);
-
-                auto codegen_module = m_compiler->compile(code);
-
-                if (codegen_module == nullptr)
+            }
+            if (!parameter_as_output && !contains(aliases, output_index))
+            {
+                if (contains(constants, tv.get()))
                {
-                    throw runtime_error("function failed to compile");
+                    writer << "ngraph::runtime::gpu::cuda_memcpyHtD(outputs[" << output_index
+                           << "], " << tv->get_tensor().get_name() << ", "
+                           << tv->get_tensor().size() << ");\n";
                }
-                m_execution_engine->add_module(codegen_module);
-                m_execution_engine->finalize();
-                m_compiled_function =
-                    m_execution_engine->find_function<EntryPoint_t>(function_name);
-                assert(m_compiled_function);
-
-                m_is_compiled = true;
-                if (m_release_function)
+                else
                {
-                    release_function();
+                    string type = et.c_type_string();
+                    stringstream ss;
+                    ss << "((" << type << "*)(outputs[" << output_index << "]))";
+                    m_variable_name_map[tv->get_tensor().get_name()] = ss.str();
                }
            }
+            output_index++;
+        }

-            void GPU_ExternalFunction::handle_output_alias(
-                codegen::CodeWriter& writer,
-                const Node& node,
-                const unordered_map<descriptor::TensorView*, vector<size_t>>& output_alias_map)
+        for (shared_ptr<Node> node : current_function->get_ordered_ops())
+        {
+            auto& n = *node; // Work around a compiler warning (*node inside typeid may have effects
+            // with shared pointers, which is fine here but clang doesn't like it.)
+            auto handler = dispatcher.find(type_index(typeid(n)));
+            if (handler == dispatcher.end())
+            {
+                throw ngraph_error("Unhandled op during code generation : " + node->description());
+            }
+            vector<GPU_TensorViewWrapper> in;
+            for (const descriptor::Input& input : node->get_inputs())
            {
-                for (const descriptor::Output& output : node.get_outputs())
+                const descriptor::Output& output = input.get_output();
+                shared_ptr<descriptor::TensorView> tv = output.get_tensor_view();
+                in.push_back(
+                    GPU_TensorViewWrapper(tv, m_variable_name_map[tv->get_tensor().get_name()]));
+            }
+            vector<GPU_TensorViewWrapper> out;
+            for (const descriptor::Output& output : node->get_outputs())
+            {
+                shared_ptr<descriptor::TensorView> tv = output.get_tensor_view();
+                out.push_back(
+                    GPU_TensorViewWrapper(tv, m_variable_name_map[tv->get_tensor().get_name()]));
+            }
+
+            // Emit operation prologue
+            if (!node->is_parameter() && !node->is_constant())
+            {
+                if (m_emit_timing)
                {
-                    shared_ptr<descriptor::TensorView> otv = output.get_tensor_view();
-                    auto it = output_alias_map.find(otv.get());
-                    if (it != output_alias_map.end())
-                    {
-                        const vector<size_t>& outputs = it->second;
-                        if (outputs.size() > 1)
-                        {
-                            writer << "{    // handle output alias for previous op\n";
-                            writer.indent++;
-                            for (size_t i = 1; i < outputs.size(); i++)
-                            {
-                                writer << "ngraph::runtime::gpu::cuda_memcpyDtD(static_cast<void*>("
-                                          "outputs["
-                                       << outputs[i] << "]), static_cast<void*>(outputs["
-                                       << outputs[0] << "]), " << otv->get_tensor().size()
-                                       << ");\n";
-                            }
-                            writer.indent--;
-                            writer << "}\n";
-                        }
-                    }
+                    emit_debug_function_entry(writer, node.get(), in, out);
                }
            }

-            shared_ptr<ngraph::runtime::CallFrame> GPU_ExternalFunction::make_call_frame()
+            // Emit operation body
+            string func_name;
+            auto it = match_functions.find(node.get());
+            if (it != match_functions.end())
+            {
+                func_name = it->second;
+            }
+            if (func_name.empty())
            {
-                if (!m_is_compiled)
+                handler->second(this, writer, node.get(), in, out);
+            }
+            else
+            {
+                vector<string> names;
+                for (const GPU_TensorViewWrapper& tv : in)
                {
-                    compile();
+                    names.push_back(tv.get_name());
                }
-
-                return make_shared<GPU_CallFrame>(shared_from_this(), m_compiled_function);
+                for (const GPU_TensorViewWrapper& tv : out)
+                {
+                    names.push_back(tv.get_name());
+                }
+                writer << func_name << "(" << join(names) << ");\n";
            }

-            void GPU_ExternalFunction::emit_debug_function_entry(
-                codegen::CodeWriter& writer,
-                Node* node,
-                const std::vector<GPU_TensorViewWrapper>& in,
-                const std::vector<GPU_TensorViewWrapper>& out)
+            // Emit operation epilogue
+            if (!node->is_parameter() && !node->is_constant())
            {
-                writer << "timer_" << node->get_name() << ".start();\n";
+                if (m_emit_timing)
+                {
+                    emit_debug_function_exit(writer, node.get(), in, out);
+                }
            }
+        }
+
+        writer.indent--;
+        // End generated function
+        writer += "}\n\n";
+    }
+    // TODO: Cleanup and make this a utility function
+
+    file_util::make_directory(s_output_dir);
+    string filename = file_util::path_join(s_output_dir, function_name + "_codegen.cpp");
+    ofstream out(filename);
+    string code = writer.get_code();
+    out << code;
+    out.close();
+
+    m_compiler.reset(new codegen::Compiler());
+    m_execution_engine.reset(new codegen::ExecutionEngine());
+
+    m_compiler->set_precompiled_header_source(pch_header_source);
+
+    auto codegen_module = m_compiler->compile(code);
+
+    if (codegen_module == nullptr)
+    {
+        throw runtime_error("function failed to compile");
+    }
+    m_execution_engine->add_module(codegen_module);
+    m_execution_engine->finalize();
+    m_compiled_function = m_execution_engine->find_function<EntryPoint_t>(function_name);
+    assert(m_compiled_function);

-            void GPU_ExternalFunction::emit_debug_function_exit(
-                codegen::CodeWriter& writer,
-                Node* node,
-                const std::vector<GPU_TensorViewWrapper>& in,
-                const std::vector<GPU_TensorViewWrapper>& out)
+    m_is_compiled = true;
+    if (m_release_function)
+    {
+        release_function();
+    }
+}
+
+void runtime::gpu::GPU_ExternalFunction::handle_output_alias(
+    codegen::CodeWriter& writer, // receives the emitted copy statements
+    const Node& node, // node whose output tensor views are looked up in the map
+    const unordered_map<descriptor::TensorView*, vector<size_t>>& output_alias_map) // values are emitted as indices into the generated `outputs` array
+{ // Emits cuda_memcpyDtD calls for every output tensor view of `node` that maps to more than one index.
+    for (const descriptor::Output& output : node.get_outputs())
+    {
+        shared_ptr<descriptor::TensorView> otv = output.get_tensor_view();
+        auto it = output_alias_map.find(otv.get()); // keyed by raw TensorView pointer
+        if (it != output_alias_map.end())
+        {
+            const vector<size_t>& outputs = it->second;
+            if (outputs.size() > 1) // a single index means nothing to duplicate
            {
-                writer << "timer_" << node->get_name() << ".stop();\n";
+                writer << "{    // handle output alias for previous op\n";
+                writer.indent++;
+                for (size_t i = 1; i < outputs.size(); i++) // index 0 is the other operand of every copy, so start at 1
+                {
+                    writer << "ngraph::runtime::gpu::cuda_memcpyDtD(static_cast<void*>("
+                              "outputs["
+                           << outputs[i] << "]), static_cast<void*>(outputs[" << outputs[0]
+                           << "]), " << otv->get_tensor().size() << ");\n"; // NOTE(review): presumably (dst, src, size), i.e. outputs[0] is copied into outputs[i] - confirm against cuda_memcpyDtD
+                }
+                writer.indent--;
+                writer << "}\n"; // closes the scope opened in the generated code above
            }
        }
    }
 }
+
+shared_ptr<ngraph::runtime::CallFrame> runtime::gpu::GPU_ExternalFunction::make_call_frame()
+{ // Returns a new GPU_CallFrame for this external function, calling compile() first when m_is_compiled is false.
+    if (!m_is_compiled) // compile lazily, on the first request for a call frame
+    {
+        compile();
+    }
+
+    return make_shared<GPU_CallFrame>(shared_from_this(), m_compiled_function); // the frame receives a shared_ptr to this object plus m_compiled_function
+}
+
+void runtime::gpu::GPU_ExternalFunction::emit_debug_function_entry(
+    codegen::CodeWriter& writer, // destination for the emitted statement
+    Node* node, // supplies the name used to form the timer identifier
+    const std::vector<GPU_TensorViewWrapper>& in, // not used in this body
+    const std::vector<GPU_TensorViewWrapper>& out) // not used in this body
+{ // Emits a "timer_<node name>.start();" statement into the generated code.
+    writer << "timer_" << node->get_name() << ".start();\n"; // NOTE(review): assumes a matching timer_<name> object is declared elsewhere in the generated source - confirm
+}
+
+void runtime::gpu::GPU_ExternalFunction::emit_debug_function_exit(
+    codegen::CodeWriter& writer, // destination for the emitted statement
+    Node* node, // supplies the name used to form the timer identifier
+    const std::vector<GPU_TensorViewWrapper>& in, // not used in this body
+    const std::vector<GPU_TensorViewWrapper>& out) // not used in this body
+{ // Emits a "timer_<node name>.stop();" statement into the generated code.
+    writer << "timer_" << node->get_name() << ".stop();\n"; // counterpart of the ".start()" statement written by emit_debug_function_entry
+}
--- a/src/ngraph/runtime/gpu/gpu_tensor_view.cpp
+++ b/src/ngraph/runtime/gpu/gpu_tensor_view.cpp
@@ -41,7 +41,7 @@ runtime::gpu::GPU_TensorView::GPU_TensorView(const ngraph::element::Type& elemen
    m_buffer_size = shape_size(shape) * element_type.size();
    if (m_buffer_size > 0)
    {
-        cudaMalloc((void**)&m_allocated_buffer_pool, m_buffer_size);
+        cudaMalloc(static_cast<void**>(&m_allocated_buffer_pool), m_buffer_size);
    }
 }


--- a/src/ngraph/runtime/gpu/gpu_util.cpp
+++ b/src/ngraph/runtime/gpu/gpu_util.cpp
@@ -50,7 +50,7 @@ void runtime::gpu::check_cuda_errors(CUresult err)
 void* runtime::gpu::create_gpu_buffer(size_t buffer_size)
 {
-    void* allocated_buffer_pool;
-    cudaMalloc((void**)&allocated_buffer_pool, buffer_size);
+    // Requests buffer_size bytes from cudaMalloc and returns the resulting pointer.
+    // Start from nullptr: the cudaMalloc status is not inspected here, so without
+    // this a failed allocation would hand an indeterminate pointer to the caller.
+    void* allocated_buffer_pool = nullptr;
+    cudaMalloc(static_cast<void**>(&allocated_buffer_pool), buffer_size);
     return allocated_buffer_pool;
 }


--- a/test/cpu_fusion.cpp
+++ b/test/cpu_fusion.cpp
@@ -48,6 +48,8 @@
 #include "ngraph/util.hpp"
 #include "nlohmann/json.hpp"
 #include "util/all_close.hpp"
+#include "util/autodiff/backprop_function.hpp"
+#include "util/autodiff/numeric_compare.hpp"
 #include "util/matcher.hpp"
 #include "util/test_tools.hpp"

@@ -914,3 +916,47 @@ TEST(cpu_fusion, sigmoid_n1c1h4)
    vector<float> expected{0.73105858f, 0.98201379f, 0.73105858f, 0.98201379f};
    ASSERT_TRUE(read_vector<float>(result) == expected);
 }
+
+// Deserializes the mxnet forward-sigmoid graph, derives its backprop function,
+// compiles it for the CPU backend, and expects exactly one SigmoidBackprop op.
+TEST(cpu_fusion, sigmoid_bprop_fusion)
+{
+    // Load the serialized forward graph from the model zoo.
+    const string model_path =
+        file_util::path_join(SERIALIZED_ZOO, "mxnet/Graph_fprop_sigmoid.json");
+    stringstream model_stream(file_util::read_file_to_string(model_path));
+    shared_ptr<Function> fprop = ngraph::deserialize(model_stream);
+
+    // Derive the backprop function that is compiled and inspected below.
+    auto bprop = autodiff::backprop_function(fprop);
+
+    // Compile and create a call frame before counting ops.
+    // NOTE(review): presumably compilation is what runs the fusion pass -- confirm.
+    auto cpu_manager = runtime::Manager::get("CPU");
+    auto compiled = cpu_manager->compile(bprop);
+    auto cpu_backend = cpu_manager->allocate_backend();
+    auto call_frame = cpu_backend->make_call_frame(compiled);
+
+    size_t ccg = count_ops_of_type<op::SigmoidBackprop>(bprop);
+    ASSERT_EQ(ccg, 1);
+}
+
+// Runs a standalone SigmoidBackprop op on the CPU backend with 1x1x4 f32
+// tensors and compares the output against precomputed values.
+TEST(cpu_fusion, sigmoid_bprop_n1c1h4)
+{
+    // Both parameters share the same 1x1x4 shape.
+    const Shape tensor_shape{1, 1, 4};
+    auto input = make_shared<op::Parameter>(element::f32, tensor_shape);
+    auto delta = make_shared<op::Parameter>(element::f32, tensor_shape);
+    auto bprop_node = make_shared<op::SigmoidBackprop>(input, delta);
+    auto func = make_shared<Function>(bprop_node, op::ParameterVector{input, delta});
+
+    auto cpu_manager = runtime::Manager::get("CPU");
+    auto compiled = cpu_manager->compile(func);
+    auto cpu_backend = cpu_manager->allocate_backend();
+    auto call_frame = cpu_backend->make_call_frame(compiled);
+
+    // Input tensor: x = {1, 4, 1, 4}.
+    shared_ptr<runtime::TensorView> input_tv =
+        cpu_backend->make_primary_tensor_view(element::f32, input->get_shape());
+    copy_data(input_tv, vector<float>{1.0f, 4.0f, 1.0f, 4.0f});
+
+    // Delta tensor: all ones.
+    shared_ptr<runtime::TensorView> delta_tv =
+        cpu_backend->make_primary_tensor_view(element::f32, delta->get_shape());
+    copy_data(delta_tv, vector<float>{1.0f, 1.0f, 1.0f, 1.0f});
+
+    shared_ptr<runtime::TensorView> result =
+        cpu_backend->make_primary_tensor_view(element::f32, input->get_shape());
+
+    call_frame->call({input_tv, delta_tv}, {result});
+
+    // These values equal sigmoid(x) * (1 - sigmoid(x)) * delta for x = 1 and x = 4.
+    vector<float> expected{0.196612f, 0.0176627f, 0.196612f, 0.0176627f};
+    EXPECT_TRUE(test::all_close(expected, read_vector<float>(result)));
+}