Unverified Commit 4203a832 authored by Jayaram Bobba, committed by GitHub

Merge branch 'master' into jbobba/maxpool-layouts

parents e37677c0 529362b5
...@@ -75,10 +75,5 @@ namespace ngraph ...@@ -75,10 +75,5 @@ namespace ngraph
protected: protected:
std::unordered_map<Node*, std::shared_ptr<Node>> m_adjoint_map; std::unordered_map<Node*, std::shared_ptr<Node>> m_adjoint_map;
}; };
/// @brief Returns a FunctionSpec for the backprop derivative of its argument.
/// @param f is f(X_i...)
/// @returns f'(X_i..., c) where f'(x_i, ..., c)_j is backprop for X_j
std::shared_ptr<Function> backprop_function(const std::shared_ptr<Function>& f);
} }
} }
...@@ -21,21 +21,20 @@ ...@@ -21,21 +21,20 @@
ngraph::op::BatchNorm::BatchNorm(double eps, ngraph::op::BatchNorm::BatchNorm(double eps,
std::shared_ptr<ngraph::Node> gamma, std::shared_ptr<ngraph::Node> gamma,
std::shared_ptr<ngraph::Node> beta, std::shared_ptr<ngraph::Node> beta,
std::shared_ptr<ngraph::Node> input, std::shared_ptr<ngraph::Node> input)
std::shared_ptr<ngraph::Node> mean, : RequiresTensorViewArgs("BatchNorm", {gamma, beta, input})
std::shared_ptr<ngraph::Node> variance)
: RequiresTensorViewArgs("BatchNorm", {gamma, beta, input, mean, variance})
, m_bn_input_shape(input->get_shape()) , m_bn_input_shape(input->get_shape())
, m_bn_variance_shape(variance->get_shape())
, m_bn_mean_shape(mean->get_shape())
, m_epsilon(eps) , m_epsilon(eps)
{ {
add_output(input->get_element_type(), m_bn_input_shape);
if (m_bn_input_shape.size() < 2) if (m_bn_input_shape.size() < 2)
{ {
throw ngraph_error("input tensor to batchnorm much have tensor of atleast rank 2"); throw ngraph_error("input tensor to batchnorm much have tensor of atleast rank 2");
else
{
this->m_bn_variance_shape.push_back(input->get_shape()[1]);
this->m_bn_mean_shape.push_back(input->get_shape()[1]);
}
if (m_bn_input_shape[1] == 0) if (m_bn_input_shape[1] == 0)
{ {
...@@ -49,51 +48,27 @@ ngraph::op::BatchNorm::BatchNorm(double eps, ...@@ -49,51 +48,27 @@ ngraph::op::BatchNorm::BatchNorm(double eps,
throw ngraph_error("gamma, beta, mean, variance shoud have all rank 1"); throw ngraph_error("gamma, beta, mean, variance shoud have all rank 1");
// assuming input shape (N, C, H, W), check if the size of mean and
// variance are equal to channel axis
if (mean->get_shape()[0] != m_bn_input_shape[1])
{
throw ngraph_error("mean size is not equal to input channel size");
}
if (variance->get_shape()[0] != m_bn_input_shape[1])
{
throw ngraph_error("variance size is not equal to input channel size");
}
if (variance->get_shape().size() != mean->get_shape().size())
{
throw ngraph_error("mean and variance rank does not match");
}
if (gamma->get_shape().size() != beta->get_shape().size()) if (gamma->get_shape().size() != beta->get_shape().size())
{ {
throw ngraph_error("gamma and beta rank does not match"); throw ngraph_error("gamma and beta rank does not match");
} }
if (input->get_element_type() != mean->get_element_type())
{
throw ngraph_error("input tensor and mean element type does not match");
}
if (input->get_element_type() != variance->get_element_type())
{
throw ngraph_error("input tensor and variance element type does not match");
}
if (gamma->get_element_type() != beta->get_element_type()) if (gamma->get_element_type() != beta->get_element_type())
{ {
throw ngraph_error("gamma and beta element type does not match"); throw ngraph_error("gamma and beta element type does not match");
} }
add_output(input->get_element_type(), m_bn_input_shape);
add_output(input->get_element_type(), m_bn_mean_shape);
add_output(input->get_element_type(), m_bn_variance_shape);
} }
std::shared_ptr<ngraph::Node> std::shared_ptr<ngraph::Node>
ngraph::op::BatchNorm::copy_with_new_args(const NodeVector& new_args) const ngraph::op::BatchNorm::copy_with_new_args(const NodeVector& new_args) const
{ {
if (new_args.size() != 5) if (new_args.size() != 3)
throw ngraph_error("Incorrect number of new arguments"); throw ngraph_error("Incorrect number of new arguments");
return std::make_shared<BatchNorm>( return std::make_shared<BatchNorm>(m_epsilon, new_args.at(0), new_args.at(1), new_args.at(2));
m_epsilon, new_args.at(0), new_args.at(1), new_args.at(2), new_args.at(3), new_args.at(4));
} }
ngraph::op::BatchNormBackprop::BatchNormBackprop(double eps, ngraph::op::BatchNormBackprop::BatchNormBackprop(double eps,
...@@ -174,10 +149,10 @@ void ngraph::op::BatchNorm::generate_adjoints(autodiff::Adjoints& adjoints, ...@@ -174,10 +149,10 @@ void ngraph::op::BatchNorm::generate_adjoints(autodiff::Adjoints& adjoints,
auto gamma = get_input_op(0); auto gamma = get_input_op(0);
auto beta = get_input_op(1); auto beta = get_input_op(1);
auto input = get_input_op(2); auto input = get_input_op(2);
auto mean = get_input_op(3); auto mean = std::make_shared<op::GetOutputElement>(shared_from_this(), 1);
auto variance = get_input_op(4); auto var = std::make_shared<op::GetOutputElement>(shared_from_this(), 2);
auto bbn = std::make_shared<op::BatchNormBackprop>( auto bbn = std::make_shared<op::BatchNormBackprop>(
get_eps_value(), gamma, beta, input, mean, variance, delta); get_eps_value(), gamma, beta, input, mean, var, delta);
auto dinput = std::make_shared<op::GetOutputElement>(bbn, 0); auto dinput = std::make_shared<op::GetOutputElement>(bbn, 0);
auto dgamma = std::make_shared<op::GetOutputElement>(bbn, 1); auto dgamma = std::make_shared<op::GetOutputElement>(bbn, 1);
auto dbeta = std::make_shared<op::GetOutputElement>(bbn, 2); auto dbeta = std::make_shared<op::GetOutputElement>(bbn, 2);
......
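With this change the forward BatchNorm takes only gamma, beta, and the input tensor, and publishes three outputs (normalized result, batch mean, batch variance) that consumers retrieve through op::GetOutputElement. A minimal sketch of the new construction pattern, mirroring the updated cpu_fusion tests later in this diff; the shapes, epsilon value, and helper name are illustrative only:

#include <memory>
#include "ngraph/ngraph.hpp"
#include "ngraph/ops/batch_norm.hpp"
#include "ngraph/ops/get_output_element.hpp"

using namespace ngraph;

// Sketch: build a Function exposing all three BatchNorm outputs.
std::shared_ptr<Function> make_bn_fprop_function()
{
    // NCHW input with C = 2; gamma and beta are rank-1 tensors of length C.
    auto input = std::make_shared<op::Parameter>(element::f32, Shape{1, 2, 2, 2});
    auto gamma = std::make_shared<op::Parameter>(element::f32, Shape{2});
    auto beta = std::make_shared<op::Parameter>(element::f32, Shape{2});
    double eps = 0.001;

    auto bn = std::make_shared<op::BatchNorm>(eps, gamma, beta, input);

    // Output 0: normalized tensor, output 1: batch mean, output 2: batch variance.
    auto normalized = std::make_shared<op::GetOutputElement>(bn, 0);
    auto mean = std::make_shared<op::GetOutputElement>(bn, 1);
    auto variance = std::make_shared<op::GetOutputElement>(bn, 2);

    return std::make_shared<Function>(NodeVector{normalized, mean, variance},
                                      op::ParameterVector{input, gamma, beta});
}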
...@@ -33,9 +33,7 @@ namespace ngraph ...@@ -33,9 +33,7 @@ namespace ngraph
BatchNorm(double eps, BatchNorm(double eps,
std::shared_ptr<Node> gamma, std::shared_ptr<Node> gamma,
std::shared_ptr<Node> beta, std::shared_ptr<Node> beta,
std::shared_ptr<Node> input, std::shared_ptr<Node> input);
std::shared_ptr<Node> mean,
std::shared_ptr<Node> variance);
const Shape& get_inputs_shape() const { return m_bn_input_shape; } const Shape& get_inputs_shape() const { return m_bn_input_shape; }
const Shape& get_variance_shape() const { return m_bn_variance_shape; } const Shape& get_variance_shape() const { return m_bn_variance_shape; }
......
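On the backward path, generate_adjoints now feeds the op's own mean and variance outputs into BatchNormBackprop instead of taking them as two extra graph inputs. A hedged sketch of that wiring in isolation, following the adjoint code above; the helper name and its parameter list are illustrative, not part of the ngraph API:

#include <memory>
#include "ngraph/ngraph.hpp"
#include "ngraph/ops/batch_norm.hpp"
#include "ngraph/ops/get_output_element.hpp"

using namespace ngraph;

// Sketch: derive the three BatchNorm gradients from a forward node and a delta.
NodeVector make_bn_bprop(double eps,
                         const std::shared_ptr<Node>& gamma,
                         const std::shared_ptr<Node>& beta,
                         const std::shared_ptr<Node>& input,
                         const std::shared_ptr<op::BatchNorm>& bn,
                         const std::shared_ptr<Node>& delta)
{
    // Mean and variance come from the forward node's own outputs 1 and 2.
    auto mean = std::make_shared<op::GetOutputElement>(bn, 1);
    auto variance = std::make_shared<op::GetOutputElement>(bn, 2);

    auto bbn = std::make_shared<op::BatchNormBackprop>(
        eps, gamma, beta, input, mean, variance, delta);

    // BatchNormBackprop outputs: 0 = dinput, 1 = dgamma, 2 = dbeta.
    return NodeVector{std::make_shared<op::GetOutputElement>(bbn, 0),
                      std::make_shared<op::GetOutputElement>(bbn, 1),
                      std::make_shared<op::GetOutputElement>(bbn, 2)};
}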
...@@ -301,14 +301,26 @@ namespace ngraph ...@@ -301,14 +301,26 @@ namespace ngraph
auto gamma_shape = args[0].get_shape(); auto gamma_shape = args[0].get_shape();
auto beta_shape = args[1].get_shape(); auto beta_shape = args[1].get_shape();
auto input_shape = args[2].get_shape(); auto input_shape = args[2].get_shape();
auto mean_shape = args[3].get_shape();
auto variance_shape = args[4].get_shape();
auto result_shape = out[0].get_shape(); auto result_shape = out[0].get_shape();
auto mean_shape = out[1].get_shape();
auto variance_shape = out[2].get_shape();
// get input element type // get input element type
const string& et = runtime::cpu::mkldnn_utils::get_mkldnn_data_type_string( const string& et = runtime::cpu::mkldnn_utils::get_mkldnn_data_type_string(
args[2].get_element_type()); args[2].get_element_type());
const string& gamma_format = runtime::cpu::mkldnn_utils::get_mkldnn_format_string(
runtime::cpu::mkldnn_utils::get_input_mkldnn_format(node, 0));
const string& beta_format = runtime::cpu::mkldnn_utils::get_mkldnn_format_string(
runtime::cpu::mkldnn_utils::get_input_mkldnn_format(node, 1));
if (gamma_format.compare("memory::format::x") != 0 &&
beta_format.compare("memory::format::x") != 0)
{
throw std::runtime_error(
"gamma layout->" + gamma_format + ", beta layout->" + beta_format +
" should match and both should have memory::format::x format");
}
writer << "{\n"; writer << "{\n";
writer.indent++; writer.indent++;
...@@ -329,16 +341,20 @@ namespace ngraph ...@@ -329,16 +341,20 @@ namespace ngraph
// get the eps value from the bn node // get the eps value from the bn node
writer << "auto epsilon = " << batchnorm->get_eps_value() << ";\n"; writer << "auto epsilon = " << batchnorm->get_eps_value() << ";\n";
const string& input_format = runtime::cpu::mkldnn_utils::get_mkldnn_format_string(
runtime::cpu::mkldnn_utils::get_input_mkldnn_format(node, 2));
const string& result_format = runtime::cpu::mkldnn_utils::get_mkldnn_format_string(
runtime::cpu::mkldnn_utils::get_output_mkldnn_format(node, 0));
// Bind to CPU engine // Bind to CPU engine
writer << "engine cpu_engine = engine(engine::cpu, 0);\n"; writer << "engine cpu_engine = engine(engine::cpu, 0);\n";
// create memory descriptors // create memory descriptors
writer << "memory::desc input_data_desc = memory::desc({" << join(input_shape) writer << "memory::desc input_data_desc = memory::desc({" << join(input_shape)
<< "}, " << et << ", memory::format::nchw);\n"; << "}, " << et << ", " << input_format << ");\n";
// TODO define weights by stacking gamma and beta values // TODO define weights by stacking gamma and beta values
writer << "memory::desc weights_desc = memory::desc({" << join(weights_shape) writer << "memory::desc weights_desc = memory::desc({" << join(weights_shape)
<< "}, " << et << ", memory::format::nc);\n"; << "}, " << et << ", memory::format::nc);\n";
writer << "memory::desc result_desc = memory::desc({" << join(result_shape) << "}, " writer << "memory::desc result_desc = memory::desc({" << join(result_shape) << "}, "
<< et << ", memory::format::nchw);\n"; << et << ", " << result_format << ");\n";
writer << "memory::desc mean_desc = memory::desc({" << join(mean_shape) << "}, " writer << "memory::desc mean_desc = memory::desc({" << join(mean_shape) << "}, "
<< et << ", memory::format::x);\n"; << et << ", memory::format::x);\n";
writer << "memory::desc variance_desc = memory::desc({" << join(variance_shape) writer << "memory::desc variance_desc = memory::desc({" << join(variance_shape)
...@@ -349,17 +365,17 @@ namespace ngraph ...@@ -349,17 +365,17 @@ namespace ngraph
<< args[2].get_name() << ");\n"; << args[2].get_name() << ");\n";
writer << "memory weights = memory({weights_desc, cpu_engine}, bn_weights.data()" writer << "memory weights = memory({weights_desc, cpu_engine}, bn_weights.data()"
<< ");\n"; << ");\n";
writer << "memory mean = memory({mean_desc, cpu_engine}, " << args[3].get_name()
<< ");\n";
writer << "memory variance = memory({variance_desc, cpu_engine}, "
<< args[4].get_name() << ");\n";
writer << "memory result = memory({result_desc, cpu_engine}, " << out[0].get_name() writer << "memory result = memory({result_desc, cpu_engine}, " << out[0].get_name()
<< ");\n"; << ");\n";
writer << "memory mean = memory({mean_desc, cpu_engine}, " << out[1].get_name()
<< ");\n";
writer << "memory variance = memory({variance_desc, cpu_engine}, "
<< out[2].get_name() << ");\n";
// create batchnorm descriptor // create batchnorm descriptor
writer << "batch_normalization_forward::desc bn_fprop_desc = " writer << "batch_normalization_forward::desc bn_fprop_desc = "
"batch_normalization_forward::desc(forward_training," "batch_normalization_forward::desc(forward_training,"
<< "input_data_desc, epsilon, use_global_stats|use_scale_shift);\n"; << "input_data_desc, epsilon, use_scale_shift);\n";
// bn fprop primitive descriptor // bn fprop primitive descriptor
writer writer
<< "batch_normalization_forward::primitive_desc bn_fprop_prim_desc = " << "batch_normalization_forward::primitive_desc bn_fprop_prim_desc = "
...@@ -368,8 +384,8 @@ namespace ngraph ...@@ -368,8 +384,8 @@ namespace ngraph
// create a batchnorm fprop primitive // create a batchnorm fprop primitive
writer << "batch_normalization_forward bn_fprop = " writer << "batch_normalization_forward bn_fprop = "
"batch_normalization_forward(bn_fprop_prim_desc, " "batch_normalization_forward(bn_fprop_prim_desc, "
"primitive::at(input_data),primitive::at(mean), primitive::at(variance)," "primitive::at(input_data),"
<< "primitive::at(weights), result); \n"; << "primitive::at(weights), result, mean, variance); \n";
// create stream and execute // create stream and execute
writer << "stream s = stream(stream::kind::eager);\n" writer << "stream s = stream(stream::kind::eager);\n"
......
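The emitted code now runs the MKLDNN fprop primitive in forward_training mode with use_scale_shift only (no use_global_stats), so the primitive produces mean and variance into out[1] and out[2] rather than consuming them. For orientation, a hedged sketch of the MKLDNN 0.x call sequence that the generated strings above spell out; the raw pointers, shapes, and formats here are placeholders, not the emitter's actual variables:

#include "mkldnn.hpp"

using namespace mkldnn;

// Illustrative only: the same sequence the code generator writes as strings.
void batchnorm_fprop_sketch(float* input_ptr, float* weights_ptr,
                            float* out_ptr, float* mean_ptr, float* var_ptr,
                            double epsilon)
{
    engine cpu_engine(engine::cpu, 0);

    // Placeholder shapes: NCHW input with C = 2; weights = gamma and beta stacked (2 x C).
    memory::desc input_data_desc({1, 2, 2, 2}, memory::data_type::f32, memory::format::nchw);
    memory::desc weights_desc({2, 2}, memory::data_type::f32, memory::format::nc);
    memory::desc result_desc({1, 2, 2, 2}, memory::data_type::f32, memory::format::nchw);
    memory::desc mean_desc({2}, memory::data_type::f32, memory::format::x);
    memory::desc variance_desc({2}, memory::data_type::f32, memory::format::x);

    memory input_data({input_data_desc, cpu_engine}, input_ptr);
    memory weights({weights_desc, cpu_engine}, weights_ptr);
    memory result({result_desc, cpu_engine}, out_ptr);
    memory mean({mean_desc, cpu_engine}, mean_ptr);
    memory variance({variance_desc, cpu_engine}, var_ptr);

    // forward_training + use_scale_shift: mean and variance are produced, not consumed.
    batch_normalization_forward::desc bn_fprop_desc(
        forward_training, input_data_desc, epsilon, use_scale_shift);
    batch_normalization_forward::primitive_desc bn_fprop_prim_desc(bn_fprop_desc, cpu_engine);

    batch_normalization_forward bn_fprop(bn_fprop_prim_desc,
                                         primitive::at(input_data),
                                         primitive::at(weights),
                                         result, mean, variance);

    stream(stream::kind::eager).submit({bn_fprop}).wait();
}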
...@@ -30,6 +30,7 @@ ...@@ -30,6 +30,7 @@
#include "ngraph/ops/convolution.hpp" #include "ngraph/ops/convolution.hpp"
#include "ngraph/ops/divide.hpp" #include "ngraph/ops/divide.hpp"
#include "ngraph/ops/dot.hpp" #include "ngraph/ops/dot.hpp"
#include "ngraph/ops/get_output_element.hpp"
#include "ngraph/ops/multiply.hpp" #include "ngraph/ops/multiply.hpp"
#include "ngraph/ops/pad.hpp" #include "ngraph/ops/pad.hpp"
#include "ngraph/ops/parameter.hpp" #include "ngraph/ops/parameter.hpp"
...@@ -301,14 +302,12 @@ void ngraph::runtime::cpu::pass::CPUFusion::construct_fprop_bn() ...@@ -301,14 +302,12 @@ void ngraph::runtime::cpu::pass::CPUFusion::construct_fprop_bn()
// get epsilon value // get epsilon value
auto eps_ptr = std::dynamic_pointer_cast<op::Constant>(pattern_map[eps_label]); auto eps_ptr = std::dynamic_pointer_cast<op::Constant>(pattern_map[eps_label]);
double epsilon = *(reinterpret_cast<const double*>(eps_ptr->get_data_ptr())); double epsilon = *(reinterpret_cast<const double*>(eps_ptr->get_data_ptr()));
auto bn_node = std::shared_ptr<Node>(new op::BatchNorm(epsilon, auto bn_node = std::make_shared<op::BatchNorm>(
pattern_map[gamma_label], epsilon, pattern_map[gamma_label], pattern_map[beta_label], pattern_map[input]);
pattern_map[beta_label],
pattern_map[input], auto normalized_output = std::shared_ptr<Node>(new op::GetOutputElement(bn_node, 0));
pattern_map[mean_label],
pattern_map[variance_label])); return normalized_output;
return bn_node;
}; };
auto m = std::make_shared<ngraph::pattern::Matcher>(add_beta, callback); auto m = std::make_shared<ngraph::pattern::Matcher>(add_beta, callback);
......
...@@ -13,7 +13,6 @@ ...@@ -13,7 +13,6 @@
* See the License for the specific language governing permissions and * See the License for the specific language governing permissions and
* limitations under the License. * limitations under the License.
*******************************************************************************/ *******************************************************************************/
#include "ngraph/runtime/gpu/gpu_cuda_kernel_builder.hpp" #include "ngraph/runtime/gpu/gpu_cuda_kernel_builder.hpp"
namespace ngraph namespace ngraph
...@@ -22,10 +21,10 @@ namespace ngraph ...@@ -22,10 +21,10 @@ namespace ngraph
{ {
namespace gpu namespace gpu
{ {
void CudaKernelBuilder::get_1_element_op(const std::string& name, void CudaKernelBuilder::get_unary_elementwise_op(const std::string& name,
const std::string& data_type, const std::string& data_type,
const std::string& op, const std::string& op,
std::string& kernel) std::string& kernel)
{ {
kernel = R"( kernel = R"(
extern "C" __global__ extern "C" __global__
...@@ -40,10 +39,10 @@ out[tid] =)" + op + "(in[tid]);\n" + ...@@ -40,10 +39,10 @@ out[tid] =)" + op + "(in[tid]);\n" +
return; return;
} }
void CudaKernelBuilder::get_2_element_op(const std::string& name, void CudaKernelBuilder::get_binary_elementwise_op(const std::string& name,
const std::string& data_type, const std::string& data_type,
const std::string& op, const std::string& op,
std::string& kernel) std::string& kernel)
{ {
kernel = R"( kernel = R"(
extern "C" __global__ extern "C" __global__
...@@ -60,10 +59,11 @@ out[tid] = in1[tid] )" + op + ...@@ -60,10 +59,11 @@ out[tid] = in1[tid] )" + op +
return; return;
} }
void CudaKernelBuilder::get_n_element_op(const std::string& name, void
const std::string& data_type, CudaKernelBuilder::get_arbitrary_elementwise_op(const std::string& name,
const std::vector<std::string>& ops, const std::string& data_type,
std::string& kernel) const std::vector<std::string>& ops,
std::string& kernel)
{ {
kernel = ""; kernel = "";
return; return;
......
...@@ -28,20 +28,20 @@ namespace ngraph ...@@ -28,20 +28,20 @@ namespace ngraph
class CudaKernelBuilder class CudaKernelBuilder
{ {
public: public:
static void get_1_element_op(const std::string& name, static void get_unary_elementwise_op(const std::string& name,
const std::string& data_type, const std::string& data_type,
const std::string& op, const std::string& op,
std::string& kernel); std::string& kernel);
static void get_2_element_op(const std::string& name, static void get_binary_elementwise_op(const std::string& name,
const std::string& data_type, const std::string& data_type,
const std::string& op, const std::string& op,
std::string& kernel); std::string& kernel);
static void get_n_element_op(const std::string& name, static void get_arbitrary_elementwise_op(const std::string& name,
const std::string& data_type, const std::string& data_type,
const std::vector<std::string>& ops, const std::vector<std::string>& ops,
std::string& kernel); std::string& kernel);
}; };
} }
} }
......
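The three builder entry points now name their arity explicitly instead of counting operands. A hedged sketch of how a caller might drive them to obtain NVRTC source strings; the operator strings and the second input name are assumptions chosen to match the kernel fragments visible above:

#include <string>
#include <vector>
#include "ngraph/runtime/gpu/gpu_cuda_kernel_builder.hpp"

using ngraph::runtime::gpu::CudaKernelBuilder;

void build_example_kernels()
{
    std::string unary_kernel;
    // Generates a kernel whose body is out[tid] = fabsf(in[tid]);
    CudaKernelBuilder::get_unary_elementwise_op("abs", "float", "fabsf", unary_kernel);

    std::string binary_kernel;
    // The op string is spliced in as an infix operator: out[tid] = in1[tid] + in2[tid];
    CudaKernelBuilder::get_binary_elementwise_op("add", "float", "+", binary_kernel);

    // The n-ary variant currently produces an empty kernel string (see its
    // definition above); the interface is in place for future fused ops.
    std::string nary_kernel;
    CudaKernelBuilder::get_arbitrary_elementwise_op("fused", "float", {"+", "*"}, nary_kernel);
}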
...@@ -17,10 +17,8 @@ ...@@ -17,10 +17,8 @@
#include <algorithm> #include <algorithm>
#include <map> #include <map>
#include "ngraph/runtime/gpu/gpu_cuda_function_builder.hpp"
#include "ngraph/runtime/gpu/gpu_cuda_function_pool.hpp"
#include "ngraph/runtime/gpu/gpu_cuda_kernel_builder.hpp"
#include "ngraph/runtime/gpu/gpu_cuda_kernel_emitters.hpp" #include "ngraph/runtime/gpu/gpu_cuda_kernel_emitters.hpp"
#include "ngraph/runtime/gpu/gpu_cuda_kernel_ops.hpp"
namespace ngraph namespace ngraph
{ {
...@@ -28,40 +26,6 @@ namespace ngraph ...@@ -28,40 +26,6 @@ namespace ngraph
{ {
namespace gpu namespace gpu
{ {
void emit_abs(void* in, void* out, size_t count)
{
std::string name = "abs";
// Create an instance of nvrtcProgram with the code string.
if (CudaFunctionPool::instance().get(name) == nullptr)
{
const char* opts[] = {"--gpu-architecture=compute_35",
"--relocatable-device-code=true"};
std::string kernel;
CudaKernelBuilder::get_1_element_op(name, "float", "fabsf", kernel);
CudaFunctionPool::instance().set(
name, CudaFunctionBuilder::get("cuda_" + name, kernel, 2, opts));
}
//convert runtime ptr to driver api ptr
CUdeviceptr d_ptr_in, d_ptr_out;
d_ptr_in = (CUdeviceptr)in;
d_ptr_out = (CUdeviceptr)out;
void* args_list[] = {&d_ptr_in, &d_ptr_out, &count};
CUDA_SAFE_CALL(cuLaunchKernel(*CudaFunctionPool::instance().get(name).get(),
count,
1,
1, // grid dim
1,
1,
1, // block dim
0,
NULL, // shared mem and stream
args_list,
0)); // arguments
CUDA_SAFE_CALL(cuCtxSynchronize()); // Retrieve and print output.
}
void emit_broadcast( void emit_broadcast(
void* in, void* out, size_t repeat_size, size_t repeat_times, size_t count) void* in, void* out, size_t repeat_size, size_t repeat_times, size_t count)
{ {
......
...@@ -18,6 +18,9 @@ ...@@ -18,6 +18,9 @@
#include "ngraph/codegen/code_writer.hpp" #include "ngraph/codegen/code_writer.hpp"
#include "ngraph/coordinate.hpp" #include "ngraph/coordinate.hpp"
#include "ngraph/runtime/gpu/gpu_cuda_function_builder.hpp"
#include "ngraph/runtime/gpu/gpu_cuda_function_pool.hpp"
#include "ngraph/runtime/gpu/gpu_cuda_kernel_builder.hpp"
#include "ngraph/strides.hpp" #include "ngraph/strides.hpp"
namespace ngraph namespace ngraph
...@@ -26,9 +29,46 @@ namespace ngraph ...@@ -26,9 +29,46 @@ namespace ngraph
{ {
namespace gpu namespace gpu
{ {
void emit_abs(void* in, void* out, size_t count); template <typename T>
struct CudaOpMap;
void emit_broadcast( void emit_broadcast(
void* in, void* out, size_t repeat_size, size_t repeat_times, size_t count); void* in, void* out, size_t repeat_size, size_t repeat_times, size_t count);
template <typename T>
void emit_unary_elementwise_op(void* in, void* out, size_t count, std::string name)
{
// Create an instance of nvrtcProgram with the code string.
if (CudaFunctionPool::instance().get(name) == nullptr)
{
const char* opts[] = {"--gpu-architecture=compute_35",
"--relocatable-device-code=true"};
std::string kernel;
CudaKernelBuilder::get_unary_elementwise_op(
name, "float", CudaOpMap<T>::op, kernel);
CudaFunctionPool::instance().set(
name, CudaFunctionBuilder::get("cuda_" + name, kernel, 2, opts));
}
//convert runtime ptr to driver api ptr
CUdeviceptr d_ptr_in, d_ptr_out;
d_ptr_in = (CUdeviceptr)in;
d_ptr_out = (CUdeviceptr)out;
void* args_list[] = {&d_ptr_in, &d_ptr_out, &count};
CUDA_SAFE_CALL(cuLaunchKernel(*CudaFunctionPool::instance().get(name).get(),
count,
1,
1, // grid dim
1,
1,
1, // block dim
0,
NULL, // shared mem and stream
args_list,
0)); // arguments
CUDA_SAFE_CALL(cuCtxSynchronize()); // block until the kernel has completed
}
} }
} }
} }
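emit_unary_elementwise_op is a function template keyed on the op type, so the per-op emit_abs-style wrappers collapse into one entry point: the op type selects the device function through CudaOpMap<T>::op (declared in gpu_cuda_kernel_ops.hpp below), and the name string keys the compiled-kernel cache in CudaFunctionPool. A hedged sketch of a direct call; the wrapper name and the device pointers are placeholders:

#include <cstddef>
#include "ngraph/runtime/gpu/gpu_cuda_kernel_emitters.hpp"
#include "ngraph/runtime/gpu/gpu_cuda_kernel_ops.hpp"

// Sketch: compile (and cache) a float kernel applying CudaOpMap<op::Abs>::op,
// i.e. fabsf, to each element, then launch it over `count` elements.
void launch_abs(void* device_in, void* device_out, size_t count)
{
    ngraph::runtime::gpu::emit_unary_elementwise_op<ngraph::op::Abs>(
        device_in, device_out, count, "abs");
}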
/*******************************************************************************
* Copyright 2017-2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#pragma once
namespace ngraph
{
namespace op
{
class Abs;
class Acos;
class Asin;
class Atan;
class Ceiling;
class Cos;
class Cosh;
class Exp;
class Floor;
class Log;
class Sin;
class Sinh;
class Tan;
class Tanh;
// Unimplemented or unused in favor of cuDNN impl.
class Max;
class Min;
class Negative;
class Not;
class Sign;
class Sqrt;
}
namespace runtime
{
namespace gpu
{
template <>
struct CudaOpMap<ngraph::op::Abs>
{
static constexpr const char* op = "fabsf";
};
template <>
struct CudaOpMap<ngraph::op::Acos>
{
static constexpr const char* op = "acosf";
};
template <>
struct CudaOpMap<ngraph::op::Asin>
{
static constexpr const char* op = "asinf";
};
template <>
struct CudaOpMap<ngraph::op::Atan>
{
static constexpr const char* op = "atanf";
};
template <>
struct CudaOpMap<ngraph::op::Ceiling>
{
static constexpr const char* op = "ceilf";
};
template <>
struct CudaOpMap<ngraph::op::Cos>
{
static constexpr const char* op = "cosf";
};
template <>
struct CudaOpMap<ngraph::op::Cosh>
{
static constexpr const char* op = "coshf";
};
template <>
struct CudaOpMap<ngraph::op::Exp>
{
static constexpr const char* op = "expf";
};
template <>
struct CudaOpMap<ngraph::op::Floor>
{
static constexpr const char* op = "floorf";
};
template <>
struct CudaOpMap<ngraph::op::Log>
{
static constexpr const char* op = "logf";
};
template <>
struct CudaOpMap<ngraph::op::Max>
{
static constexpr const char* op = "fmaxf";
};
template <>
struct CudaOpMap<ngraph::op::Min>
{
static constexpr const char* op = "fminf";
};
template <>
struct CudaOpMap<ngraph::op::Sin>
{
static constexpr const char* op = "sinf";
};
template <>
struct CudaOpMap<ngraph::op::Sinh>
{
static constexpr const char* op = "sinhf";
};
template <>
struct CudaOpMap<ngraph::op::Sqrt>
{
static constexpr const char* op = "sqrtf";
};
template <>
struct CudaOpMap<ngraph::op::Tan>
{
static constexpr const char* op = "tanf";
};
template <>
struct CudaOpMap<ngraph::op::Tanh>
{
static constexpr const char* op = "tanhf";
};
}
}
}
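The trait table above pairs each unary op type with the CUDA intrinsic name that get_unary_elementwise_op splices into the kernel body, so supporting a new op is a matter of adding a specialization and a dispatcher entry rather than another emitter method. A small host-side check of the mapping, purely illustrative:

#include <cstdio>
#include "ngraph/runtime/gpu/gpu_cuda_kernel_emitters.hpp"
#include "ngraph/runtime/gpu/gpu_cuda_kernel_ops.hpp"

// Print a few of the op-to-intrinsic mappings defined by the specializations above.
void print_op_mappings()
{
    std::printf("Abs  -> %s\n", ngraph::runtime::gpu::CudaOpMap<ngraph::op::Abs>::op);  // fabsf
    std::printf("Tanh -> %s\n", ngraph::runtime::gpu::CudaOpMap<ngraph::op::Tanh>::op); // tanhf
    std::printf("Sqrt -> %s\n", ngraph::runtime::gpu::CudaOpMap<ngraph::op::Sqrt>::op); // sqrtf
}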
...@@ -45,8 +45,8 @@ namespace ngraph ...@@ -45,8 +45,8 @@ namespace ngraph
static void EMITTER_DECL(EmitMultiply); static void EMITTER_DECL(EmitMultiply);
static void EMITTER_DECL(EmitGetOutputElement); static void EMITTER_DECL(EmitGetOutputElement);
static void EMITTER_DECL(EmitXLAGetTupleElement); static void EMITTER_DECL(EmitXLAGetTupleElement);
static void EMITTER_DECL(EmitUnaryElementwise);
static void EMITTER_DECL(EmitTuple); static void EMITTER_DECL(EmitTuple);
static void EMITTER_DECL(EmitAbs);
static void EMITTER_DECL(EmitConcat); static void EMITTER_DECL(EmitConcat);
static void EMITTER_DECL(EmitDivide); static void EMITTER_DECL(EmitDivide);
static void EMITTER_DECL(EmitEqual); static void EMITTER_DECL(EmitEqual);
...@@ -54,7 +54,6 @@ namespace ngraph ...@@ -54,7 +54,6 @@ namespace ngraph
static void EMITTER_DECL(EmitGreaterEq); static void EMITTER_DECL(EmitGreaterEq);
static void EMITTER_DECL(EmitLess); static void EMITTER_DECL(EmitLess);
static void EMITTER_DECL(EmitLessEq); static void EMITTER_DECL(EmitLessEq);
static void EMITTER_DECL(EmitLog);
static void EMITTER_DECL(EmitMaximum); static void EMITTER_DECL(EmitMaximum);
static void EMITTER_DECL(EmitMinimum); static void EMITTER_DECL(EmitMinimum);
static void EMITTER_DECL(EmitNegative); static void EMITTER_DECL(EmitNegative);
...@@ -67,31 +66,18 @@ namespace ngraph ...@@ -67,31 +66,18 @@ namespace ngraph
static void EMITTER_DECL(EmitReshape); static void EMITTER_DECL(EmitReshape);
static void EMITTER_DECL(EmitFunctionCall); static void EMITTER_DECL(EmitFunctionCall);
static void EMITTER_DECL(EmitReduce); static void EMITTER_DECL(EmitReduce);
static void EMITTER_DECL(EmitSign);
static void EMITTER_DECL(EmitSlice); static void EMITTER_DECL(EmitSlice);
static void EMITTER_DECL(EmitSum); static void EMITTER_DECL(EmitSum);
static void EMITTER_DECL(EmitExp);
static void EMITTER_DECL(EmitSin);
static void EMITTER_DECL(EmitSinh);
static void EMITTER_DECL(EmitCos);
static void EMITTER_DECL(EmitCosh);
static void EMITTER_DECL(EmitTan);
static void EMITTER_DECL(EmitTanh);
static void EMITTER_DECL(EmitAsin);
static void EMITTER_DECL(EmitAcos);
static void EMITTER_DECL(EmitAtan);
static void EMITTER_DECL(EmitPower); static void EMITTER_DECL(EmitPower);
static void EMITTER_DECL(EmitReplaceSlice); static void EMITTER_DECL(EmitReplaceSlice);
static void EMITTER_DECL(EmitOneHot); static void EMITTER_DECL(EmitOneHot);
static void EMITTER_DECL(EmitFloor);
static void EMITTER_DECL(EmitCeiling);
static void EMITTER_DECL(EmitSqrt); static void EMITTER_DECL(EmitSqrt);
static void EMITTER_DECL(EmitConvolution); static void EMITTER_DECL(EmitConvolution);
static void EMITTER_DECL(EmitNot);
static void EMITTER_DECL(EmitMaxPool); static void EMITTER_DECL(EmitMaxPool);
static void EMITTER_DECL(EmitReverse); static void EMITTER_DECL(EmitReverse);
static void EMITTER_DECL(EmitReduceWindow); static void EMITTER_DECL(EmitReduceWindow);
static void EMITTER_DECL(EmitSelectAndScatter); static void EMITTER_DECL(EmitSelectAndScatter);
static void EMITTER_DECL(EmitResult);
}; };
} }
} }
......
...@@ -151,7 +151,7 @@ static const runtime::gpu::OpMap dispatcher{ ...@@ -151,7 +151,7 @@ static const runtime::gpu::OpMap dispatcher{
{TI(ngraph::op::Dot), &runtime::gpu::GPU_Emitter::EmitDot}, {TI(ngraph::op::Dot), &runtime::gpu::GPU_Emitter::EmitDot},
{TI(ngraph::op::Multiply), &runtime::gpu::GPU_Emitter::EmitMultiply}, {TI(ngraph::op::Multiply), &runtime::gpu::GPU_Emitter::EmitMultiply},
{TI(ngraph::op::Parameter), &runtime::gpu::GPU_Emitter::EmitNop}, {TI(ngraph::op::Parameter), &runtime::gpu::GPU_Emitter::EmitNop},
{TI(ngraph::op::Abs), &runtime::gpu::GPU_Emitter::EmitAbs}, {TI(ngraph::op::Abs), &runtime::gpu::GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::Concat), &runtime::gpu::GPU_Emitter::EmitConcat}, {TI(ngraph::op::Concat), &runtime::gpu::GPU_Emitter::EmitConcat},
{TI(ngraph::op::Divide), &runtime::gpu::GPU_Emitter::EmitDivide}, {TI(ngraph::op::Divide), &runtime::gpu::GPU_Emitter::EmitDivide},
{TI(ngraph::op::Equal), &runtime::gpu::GPU_Emitter::EmitEqual}, {TI(ngraph::op::Equal), &runtime::gpu::GPU_Emitter::EmitEqual},
...@@ -159,7 +159,7 @@ static const runtime::gpu::OpMap dispatcher{ ...@@ -159,7 +159,7 @@ static const runtime::gpu::OpMap dispatcher{
{TI(ngraph::op::GreaterEq), &runtime::gpu::GPU_Emitter::EmitGreaterEq}, {TI(ngraph::op::GreaterEq), &runtime::gpu::GPU_Emitter::EmitGreaterEq},
{TI(ngraph::op::Less), &runtime::gpu::GPU_Emitter::EmitLess}, {TI(ngraph::op::Less), &runtime::gpu::GPU_Emitter::EmitLess},
{TI(ngraph::op::LessEq), &runtime::gpu::GPU_Emitter::EmitLessEq}, {TI(ngraph::op::LessEq), &runtime::gpu::GPU_Emitter::EmitLessEq},
{TI(ngraph::op::Log), &runtime::gpu::GPU_Emitter::EmitLog}, {TI(ngraph::op::Log), &runtime::gpu::GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::Maximum), &runtime::gpu::GPU_Emitter::EmitMaximum}, {TI(ngraph::op::Maximum), &runtime::gpu::GPU_Emitter::EmitMaximum},
{TI(ngraph::op::Minimum), &runtime::gpu::GPU_Emitter::EmitMinimum}, {TI(ngraph::op::Minimum), &runtime::gpu::GPU_Emitter::EmitMinimum},
{TI(ngraph::op::Negative), &runtime::gpu::GPU_Emitter::EmitNegative}, {TI(ngraph::op::Negative), &runtime::gpu::GPU_Emitter::EmitNegative},
...@@ -173,30 +173,31 @@ static const runtime::gpu::OpMap dispatcher{ ...@@ -173,30 +173,31 @@ static const runtime::gpu::OpMap dispatcher{
{TI(ngraph::op::Reshape), &runtime::gpu::GPU_Emitter::EmitReshape}, {TI(ngraph::op::Reshape), &runtime::gpu::GPU_Emitter::EmitReshape},
{TI(ngraph::op::FunctionCall), &runtime::gpu::GPU_Emitter::EmitFunctionCall}, {TI(ngraph::op::FunctionCall), &runtime::gpu::GPU_Emitter::EmitFunctionCall},
{TI(ngraph::op::Reduce), &runtime::gpu::GPU_Emitter::EmitReduce}, {TI(ngraph::op::Reduce), &runtime::gpu::GPU_Emitter::EmitReduce},
{TI(ngraph::op::Sign), &runtime::gpu::GPU_Emitter::EmitSign}, {TI(ngraph::op::Sign), &runtime::gpu::GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::Slice), &runtime::gpu::GPU_Emitter::EmitSlice}, {TI(ngraph::op::Slice), &runtime::gpu::GPU_Emitter::EmitSlice},
{TI(ngraph::op::Sum), &runtime::gpu::GPU_Emitter::EmitSum}, {TI(ngraph::op::Sum), &runtime::gpu::GPU_Emitter::EmitSum},
{TI(ngraph::op::Exp), &runtime::gpu::GPU_Emitter::EmitExp}, {TI(ngraph::op::Exp), &runtime::gpu::GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::Sin), &runtime::gpu::GPU_Emitter::EmitSin}, {TI(ngraph::op::Sin), &runtime::gpu::GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::Sinh), &runtime::gpu::GPU_Emitter::EmitSinh}, {TI(ngraph::op::Sinh), &runtime::gpu::GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::Cos), &runtime::gpu::GPU_Emitter::EmitCos}, {TI(ngraph::op::Cos), &runtime::gpu::GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::Cosh), &runtime::gpu::GPU_Emitter::EmitCosh}, {TI(ngraph::op::Cosh), &runtime::gpu::GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::Tan), &runtime::gpu::GPU_Emitter::EmitTan}, {TI(ngraph::op::Tan), &runtime::gpu::GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::Tanh), &runtime::gpu::GPU_Emitter::EmitTanh}, {TI(ngraph::op::Tanh), &runtime::gpu::GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::Asin), &runtime::gpu::GPU_Emitter::EmitAsin}, {TI(ngraph::op::Asin), &runtime::gpu::GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::Acos), &runtime::gpu::GPU_Emitter::EmitAcos}, {TI(ngraph::op::Acos), &runtime::gpu::GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::Atan), &runtime::gpu::GPU_Emitter::EmitAtan}, {TI(ngraph::op::Atan), &runtime::gpu::GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::ReplaceSlice), &runtime::gpu::GPU_Emitter::EmitReplaceSlice}, {TI(ngraph::op::ReplaceSlice), &runtime::gpu::GPU_Emitter::EmitReplaceSlice},
{TI(ngraph::op::OneHot), &runtime::gpu::GPU_Emitter::EmitOneHot}, {TI(ngraph::op::OneHot), &runtime::gpu::GPU_Emitter::EmitOneHot},
{TI(ngraph::op::Floor), &runtime::gpu::GPU_Emitter::EmitFloor}, {TI(ngraph::op::Floor), &runtime::gpu::GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::Ceiling), &runtime::gpu::GPU_Emitter::EmitCeiling}, {TI(ngraph::op::Ceiling), &runtime::gpu::GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::Sqrt), &runtime::gpu::GPU_Emitter::EmitSqrt}, {TI(ngraph::op::Sqrt), &runtime::gpu::GPU_Emitter::EmitSqrt},
{TI(ngraph::op::Convolution), &runtime::gpu::GPU_Emitter::EmitConvolution}, {TI(ngraph::op::Convolution), &runtime::gpu::GPU_Emitter::EmitConvolution},
{TI(ngraph::op::Not), &runtime::gpu::GPU_Emitter::EmitNot}, {TI(ngraph::op::Not), &runtime::gpu::GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::MaxPool), &runtime::gpu::GPU_Emitter::EmitMaxPool}, {TI(ngraph::op::MaxPool), &runtime::gpu::GPU_Emitter::EmitMaxPool},
{TI(ngraph::op::Reverse), &runtime::gpu::GPU_Emitter::EmitReverse}, {TI(ngraph::op::Reverse), &runtime::gpu::GPU_Emitter::EmitReverse},
{TI(ngraph::op::ReduceWindow), &runtime::gpu::GPU_Emitter::EmitReduceWindow}, {TI(ngraph::op::ReduceWindow), &runtime::gpu::GPU_Emitter::EmitReduceWindow},
{TI(ngraph::op::SelectAndScatter), &runtime::gpu::GPU_Emitter::EmitSelectAndScatter}, {TI(ngraph::op::SelectAndScatter), &runtime::gpu::GPU_Emitter::EmitSelectAndScatter},
{TI(ngraph::op::Result), &runtime::gpu::GPU_Emitter::EmitResult},
}; };
runtime::gpu::GPU_ExternalFunction::GPU_ExternalFunction( runtime::gpu::GPU_ExternalFunction::GPU_ExternalFunction(
...@@ -250,6 +251,7 @@ void runtime::gpu::GPU_ExternalFunction::compile() ...@@ -250,6 +251,7 @@ void runtime::gpu::GPU_ExternalFunction::compile()
#include "ngraph/pass/memory_layout.hpp" #include "ngraph/pass/memory_layout.hpp"
#include "ngraph/runtime/aligned_buffer.hpp" #include "ngraph/runtime/aligned_buffer.hpp"
#include "ngraph/runtime/gpu/gpu_cuda_kernel_emitters.hpp" #include "ngraph/runtime/gpu/gpu_cuda_kernel_emitters.hpp"
#include "ngraph/runtime/gpu/gpu_cuda_kernel_ops.hpp"
#include "ngraph/runtime/gpu/gpu_util.hpp" #include "ngraph/runtime/gpu/gpu_util.hpp"
#include "ngraph/util.hpp" #include "ngraph/util.hpp"
)"; )";
...@@ -346,12 +348,15 @@ using namespace std; ...@@ -346,12 +348,15 @@ using namespace std;
shared_ptr<descriptor::TensorView> tv = node->get_outputs()[0].get_tensor_view(); shared_ptr<descriptor::TensorView> tv = node->get_outputs()[0].get_tensor_view();
auto c_value_strings = c->get_value_strings(); auto c_value_strings = c->get_value_strings();
writer << "static " << tv->get_tensor().get_element_type().c_type_string() << " " writer << "static " << tv->get_tensor().get_element_type().c_type_string() << " "
<< tv->get_tensor().get_name() << "[" << c_value_strings.size() << "] =\n"; << tv->get_tensor().get_name() << "_cpu[" << c_value_strings.size()
<< "] =\n";
writer << "{\n"; writer << "{\n";
writer.indent++; writer.indent++;
writer << emit_string_array(c_value_strings, 100 - writer.indent * 4); writer << emit_string_array(c_value_strings, 100 - writer.indent * 4);
writer.indent--; writer.indent--;
writer << "\n};\n\n"; writer << "\n};\n\n";
writer << "static " << tv->get_tensor().get_element_type().c_type_string() << " *"
<< tv->get_tensor().get_name() << ";\n";
m_variable_name_map[tv->get_tensor().get_name()] = tv->get_tensor().get_name(); m_variable_name_map[tv->get_tensor().get_name()] = tv->get_tensor().get_name();
} }
} }
...@@ -485,6 +490,26 @@ using namespace std; ...@@ -485,6 +490,26 @@ using namespace std;
writer << "{\n"; writer << "{\n";
writer.indent++; writer.indent++;
for (shared_ptr<Function> current_function : pass_manager.get_state().get_functions())
{
for (shared_ptr<Node> node : current_function->get_ordered_ops())
{
const op::Constant* c = dynamic_cast<op::Constant*>(node.get());
if (c)
{
shared_ptr<descriptor::TensorView> tv =
node->get_outputs()[0].get_tensor_view();
writer << "if(" << tv->get_tensor().get_name() << " == NULL)\n";
writer << "{\n";
writer.indent++;
writer << "runtime::gpu::cuda_memcpyHtD(" << tv->get_tensor().get_name() << ", "
<< tv->get_tensor().get_name() << "_cpu, " << tv->get_tensor().size()
<< ");\n";
writer.indent--;
writer << "}\n";
}
}
}
bool temporaries_used = false; bool temporaries_used = false;
size_t worst_case_tmp_size = 0; size_t worst_case_tmp_size = 0;
for (shared_ptr<Node> node : current_function->get_ordered_ops()) for (shared_ptr<Node> node : current_function->get_ordered_ops())
...@@ -657,7 +682,6 @@ using namespace std; ...@@ -657,7 +682,6 @@ using namespace std;
// Emit operation epilogue // Emit operation epilogue
if (!node->is_parameter() && !node->is_constant()) if (!node->is_parameter() && !node->is_constant())
{ {
handle_output_alias(writer, *node, output_alias_map);
if (m_emit_timing) if (m_emit_timing)
{ {
emit_debug_function_exit(writer, node.get(), in, out); emit_debug_function_exit(writer, node.get(), in, out);
......
...@@ -328,7 +328,7 @@ static shared_ptr<ngraph::Function> ...@@ -328,7 +328,7 @@ static shared_ptr<ngraph::Function>
else if (node_op == "BatchNorm") else if (node_op == "BatchNorm")
{ {
auto epsilon = node_js.at("eps").get<double>(); auto epsilon = node_js.at("eps").get<double>();
node = make_shared<op::BatchNorm>(epsilon, args[0], args[1], args[2], args[3], args[4]); node = make_shared<op::BatchNorm>(epsilon, args[0], args[1], args[2]);
} }
else if (node_op == "BatchNormBackprop") else if (node_op == "BatchNormBackprop")
{ {
......
...@@ -1305,6 +1305,7 @@ TEST(${BACKEND_NAME}, backwards_slice) ...@@ -1305,6 +1305,7 @@ TEST(${BACKEND_NAME}, backwards_slice)
TEST(${BACKEND_NAME}, backwards_softmax_all) TEST(${BACKEND_NAME}, backwards_softmax_all)
{ {
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
auto manager = runtime::Manager::get("${BACKEND_NAME}"); auto manager = runtime::Manager::get("${BACKEND_NAME}");
auto backend = manager->allocate_backend(); auto backend = manager->allocate_backend();
...@@ -1322,6 +1323,7 @@ TEST(${BACKEND_NAME}, backwards_softmax_all) ...@@ -1322,6 +1323,7 @@ TEST(${BACKEND_NAME}, backwards_softmax_all)
TEST(${BACKEND_NAME}, backwards_softmax_axis) TEST(${BACKEND_NAME}, backwards_softmax_axis)
{ {
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
auto manager = runtime::Manager::get("${BACKEND_NAME}"); auto manager = runtime::Manager::get("${BACKEND_NAME}");
auto backend = manager->allocate_backend(); auto backend = manager->allocate_backend();
...@@ -1339,6 +1341,7 @@ TEST(${BACKEND_NAME}, backwards_softmax_axis) ...@@ -1339,6 +1341,7 @@ TEST(${BACKEND_NAME}, backwards_softmax_axis)
TEST(${BACKEND_NAME}, backwards_softmax_underflow) TEST(${BACKEND_NAME}, backwards_softmax_underflow)
{ {
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
auto manager = runtime::Manager::get("${BACKEND_NAME}"); auto manager = runtime::Manager::get("${BACKEND_NAME}");
auto backend = manager->allocate_backend(); auto backend = manager->allocate_backend();
...@@ -1358,6 +1361,7 @@ TEST(${BACKEND_NAME}, backwards_softmax_underflow) ...@@ -1358,6 +1361,7 @@ TEST(${BACKEND_NAME}, backwards_softmax_underflow)
TEST(${BACKEND_NAME}, backwards_softmax_3d) TEST(${BACKEND_NAME}, backwards_softmax_3d)
{ {
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
auto manager = runtime::Manager::get("${BACKEND_NAME}"); auto manager = runtime::Manager::get("${BACKEND_NAME}");
auto backend = manager->allocate_backend(); auto backend = manager->allocate_backend();
......
...@@ -120,6 +120,7 @@ TEST(${BACKEND_NAME}, component_cleanup) ...@@ -120,6 +120,7 @@ TEST(${BACKEND_NAME}, component_cleanup)
TEST(${BACKEND_NAME}, aliased_output) TEST(${BACKEND_NAME}, aliased_output)
{ {
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape{2, 2}; Shape shape{2, 2};
auto A = make_shared<op::Parameter>(element::f32, shape); auto A = make_shared<op::Parameter>(element::f32, shape);
auto B = make_shared<op::Parameter>(element::f32, shape); auto B = make_shared<op::Parameter>(element::f32, shape);
...@@ -335,7 +336,6 @@ TEST(${BACKEND_NAME}, abs) ...@@ -335,7 +336,6 @@ TEST(${BACKEND_NAME}, abs)
TEST(${BACKEND_NAME}, ceiling) TEST(${BACKEND_NAME}, ceiling)
{ {
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape{2, 2}; Shape shape{2, 2};
auto A = make_shared<op::Parameter>(element::f32, shape); auto A = make_shared<op::Parameter>(element::f32, shape);
auto f = make_shared<Function>(make_shared<op::Ceiling>(A), op::ParameterVector{A}); auto f = make_shared<Function>(make_shared<op::Ceiling>(A), op::ParameterVector{A});
...@@ -776,7 +776,6 @@ TEST(${BACKEND_NAME}, equal) ...@@ -776,7 +776,6 @@ TEST(${BACKEND_NAME}, equal)
TEST(${BACKEND_NAME}, floor) TEST(${BACKEND_NAME}, floor)
{ {
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape{2, 2}; Shape shape{2, 2};
auto A = make_shared<op::Parameter>(element::f32, shape); auto A = make_shared<op::Parameter>(element::f32, shape);
auto f = make_shared<Function>(make_shared<op::Floor>(A), op::ParameterVector{A}); auto f = make_shared<Function>(make_shared<op::Floor>(A), op::ParameterVector{A});
...@@ -1370,7 +1369,6 @@ TEST(${BACKEND_NAME}, lesseq_bool) ...@@ -1370,7 +1369,6 @@ TEST(${BACKEND_NAME}, lesseq_bool)
TEST(${BACKEND_NAME}, log) TEST(${BACKEND_NAME}, log)
{ {
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape{2, 2, 2}; Shape shape{2, 2, 2};
auto A = make_shared<op::Parameter>(element::f32, shape); auto A = make_shared<op::Parameter>(element::f32, shape);
auto f = make_shared<Function>(make_shared<op::Log>(A), op::ParameterVector{A}); auto f = make_shared<Function>(make_shared<op::Log>(A), op::ParameterVector{A});
...@@ -2673,7 +2671,6 @@ TEST(${BACKEND_NAME}, reshape_6d) ...@@ -2673,7 +2671,6 @@ TEST(${BACKEND_NAME}, reshape_6d)
TEST(${BACKEND_NAME}, sin) TEST(${BACKEND_NAME}, sin)
{ {
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape{6}; Shape shape{6};
auto A = make_shared<op::Parameter>(element::f32, shape); auto A = make_shared<op::Parameter>(element::f32, shape);
auto f = make_shared<Function>(make_shared<op::Sin>(A), op::ParameterVector{A}); auto f = make_shared<Function>(make_shared<op::Sin>(A), op::ParameterVector{A});
...@@ -2699,7 +2696,6 @@ TEST(${BACKEND_NAME}, sin) ...@@ -2699,7 +2696,6 @@ TEST(${BACKEND_NAME}, sin)
TEST(${BACKEND_NAME}, cos) TEST(${BACKEND_NAME}, cos)
{ {
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape{6}; Shape shape{6};
auto A = make_shared<op::Parameter>(element::f32, shape); auto A = make_shared<op::Parameter>(element::f32, shape);
auto f = make_shared<Function>(make_shared<op::Cos>(A), op::ParameterVector{A}); auto f = make_shared<Function>(make_shared<op::Cos>(A), op::ParameterVector{A});
...@@ -2725,7 +2721,6 @@ TEST(${BACKEND_NAME}, cos) ...@@ -2725,7 +2721,6 @@ TEST(${BACKEND_NAME}, cos)
TEST(${BACKEND_NAME}, tan) TEST(${BACKEND_NAME}, tan)
{ {
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape{6}; Shape shape{6};
auto A = make_shared<op::Parameter>(element::f32, shape); auto A = make_shared<op::Parameter>(element::f32, shape);
auto f = make_shared<Function>(make_shared<op::Tan>(A), op::ParameterVector{A}); auto f = make_shared<Function>(make_shared<op::Tan>(A), op::ParameterVector{A});
...@@ -2746,12 +2741,11 @@ TEST(${BACKEND_NAME}, tan) ...@@ -2746,12 +2741,11 @@ TEST(${BACKEND_NAME}, tan)
input.begin(), input.end(), input.begin(), [](float x) -> float { return tanf(x); }); input.begin(), input.end(), input.begin(), [](float x) -> float { return tanf(x); });
cf->call({a}, {result}); cf->call({a}, {result});
EXPECT_EQ(input, read_vector<float>(result)); EXPECT_TRUE(test::all_close(input, read_vector<float>(result)));
} }
TEST(${BACKEND_NAME}, asin) TEST(${BACKEND_NAME}, asin)
{ {
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape{6}; Shape shape{6};
auto A = make_shared<op::Parameter>(element::f32, shape); auto A = make_shared<op::Parameter>(element::f32, shape);
auto f = make_shared<Function>(make_shared<op::Asin>(A), op::ParameterVector{A}); auto f = make_shared<Function>(make_shared<op::Asin>(A), op::ParameterVector{A});
...@@ -2776,7 +2770,6 @@ TEST(${BACKEND_NAME}, asin) ...@@ -2776,7 +2770,6 @@ TEST(${BACKEND_NAME}, asin)
TEST(${BACKEND_NAME}, acos) TEST(${BACKEND_NAME}, acos)
{ {
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape{6}; Shape shape{6};
auto A = make_shared<op::Parameter>(element::f32, shape); auto A = make_shared<op::Parameter>(element::f32, shape);
auto f = make_shared<Function>(make_shared<op::Acos>(A), op::ParameterVector{A}); auto f = make_shared<Function>(make_shared<op::Acos>(A), op::ParameterVector{A});
...@@ -2801,7 +2794,6 @@ TEST(${BACKEND_NAME}, acos) ...@@ -2801,7 +2794,6 @@ TEST(${BACKEND_NAME}, acos)
TEST(${BACKEND_NAME}, atan) TEST(${BACKEND_NAME}, atan)
{ {
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape{6}; Shape shape{6};
auto A = make_shared<op::Parameter>(element::f32, shape); auto A = make_shared<op::Parameter>(element::f32, shape);
auto f = make_shared<Function>(make_shared<op::Atan>(A), op::ParameterVector{A}); auto f = make_shared<Function>(make_shared<op::Atan>(A), op::ParameterVector{A});
...@@ -2826,7 +2818,6 @@ TEST(${BACKEND_NAME}, atan) ...@@ -2826,7 +2818,6 @@ TEST(${BACKEND_NAME}, atan)
TEST(${BACKEND_NAME}, sinh) TEST(${BACKEND_NAME}, sinh)
{ {
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape{6}; Shape shape{6};
auto A = make_shared<op::Parameter>(element::f32, shape); auto A = make_shared<op::Parameter>(element::f32, shape);
auto f = make_shared<Function>(make_shared<op::Sinh>(A), op::ParameterVector{A}); auto f = make_shared<Function>(make_shared<op::Sinh>(A), op::ParameterVector{A});
...@@ -2851,7 +2842,6 @@ TEST(${BACKEND_NAME}, sinh) ...@@ -2851,7 +2842,6 @@ TEST(${BACKEND_NAME}, sinh)
TEST(${BACKEND_NAME}, cosh) TEST(${BACKEND_NAME}, cosh)
{ {
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape{6}; Shape shape{6};
auto A = make_shared<op::Parameter>(element::f32, shape); auto A = make_shared<op::Parameter>(element::f32, shape);
auto f = make_shared<Function>(make_shared<op::Cosh>(A), op::ParameterVector{A}); auto f = make_shared<Function>(make_shared<op::Cosh>(A), op::ParameterVector{A});
...@@ -2876,7 +2866,6 @@ TEST(${BACKEND_NAME}, cosh) ...@@ -2876,7 +2866,6 @@ TEST(${BACKEND_NAME}, cosh)
TEST(${BACKEND_NAME}, tanh) TEST(${BACKEND_NAME}, tanh)
{ {
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape{6}; Shape shape{6};
auto A = make_shared<op::Parameter>(element::f32, shape); auto A = make_shared<op::Parameter>(element::f32, shape);
auto f = make_shared<Function>(make_shared<op::Tanh>(A), op::ParameterVector{A}); auto f = make_shared<Function>(make_shared<op::Tanh>(A), op::ParameterVector{A});
...@@ -2896,12 +2885,11 @@ TEST(${BACKEND_NAME}, tanh) ...@@ -2896,12 +2885,11 @@ TEST(${BACKEND_NAME}, tanh)
input.begin(), input.end(), input.begin(), [](float x) -> float { return tanhf(x); }); input.begin(), input.end(), input.begin(), [](float x) -> float { return tanhf(x); });
cf->call({a}, {result}); cf->call({a}, {result});
EXPECT_EQ(input, read_vector<float>(result)); EXPECT_TRUE(test::all_close(input, read_vector<float>(result)));
} }
TEST(${BACKEND_NAME}, exp) TEST(${BACKEND_NAME}, exp)
{ {
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape{8}; Shape shape{8};
auto A = make_shared<op::Parameter>(element::f32, shape); auto A = make_shared<op::Parameter>(element::f32, shape);
auto f = make_shared<Function>(make_shared<op::Exp>(A), op::ParameterVector{A}); auto f = make_shared<Function>(make_shared<op::Exp>(A), op::ParameterVector{A});
...@@ -8441,6 +8429,7 @@ TEST(${BACKEND_NAME}, relu_4Dbackprop) ...@@ -8441,6 +8429,7 @@ TEST(${BACKEND_NAME}, relu_4Dbackprop)
TEST(${BACKEND_NAME}, softmax_all) TEST(${BACKEND_NAME}, softmax_all)
{ {
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape{2, 3}; Shape shape{2, 3};
auto A = make_shared<op::Parameter>(element::f32, shape); auto A = make_shared<op::Parameter>(element::f32, shape);
auto f = auto f =
...@@ -8473,6 +8462,7 @@ TEST(${BACKEND_NAME}, softmax_all) ...@@ -8473,6 +8462,7 @@ TEST(${BACKEND_NAME}, softmax_all)
TEST(${BACKEND_NAME}, softmax_axis) TEST(${BACKEND_NAME}, softmax_axis)
{ {
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape{2, 3}; Shape shape{2, 3};
auto A = make_shared<op::Parameter>(element::f32, shape); auto A = make_shared<op::Parameter>(element::f32, shape);
auto f = make_shared<Function>(make_shared<op::Softmax>(A, AxisSet{1}), op::ParameterVector{A}); auto f = make_shared<Function>(make_shared<op::Softmax>(A, AxisSet{1}), op::ParameterVector{A});
...@@ -8501,6 +8491,7 @@ TEST(${BACKEND_NAME}, softmax_axis) ...@@ -8501,6 +8491,7 @@ TEST(${BACKEND_NAME}, softmax_axis)
TEST(${BACKEND_NAME}, softmax_underflow) TEST(${BACKEND_NAME}, softmax_underflow)
{ {
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape{2, 3}; Shape shape{2, 3};
auto A = make_shared<op::Parameter>(element::f32, shape); auto A = make_shared<op::Parameter>(element::f32, shape);
auto f = make_shared<Function>(make_shared<op::Softmax>(A, AxisSet{0}), op::ParameterVector{A}); auto f = make_shared<Function>(make_shared<op::Softmax>(A, AxisSet{0}), op::ParameterVector{A});
......
...@@ -25,6 +25,7 @@ ...@@ -25,6 +25,7 @@
#include "ngraph/log.hpp" #include "ngraph/log.hpp"
#include "ngraph/ngraph.hpp" #include "ngraph/ngraph.hpp"
#include "ngraph/ops/batch_norm.hpp" #include "ngraph/ops/batch_norm.hpp"
#include "ngraph/ops/get_output_element.hpp"
#include "ngraph/ops/sum.hpp" #include "ngraph/ops/sum.hpp"
#include "ngraph/pass/graph_rewrite.hpp" #include "ngraph/pass/graph_rewrite.hpp"
#include "ngraph/pass/manager.hpp" #include "ngraph/pass/manager.hpp"
...@@ -254,18 +255,21 @@ TEST(cpu_fusion, batchnorm_fprop_b1c2h2w2) ...@@ -254,18 +255,21 @@ TEST(cpu_fusion, batchnorm_fprop_b1c2h2w2)
auto input_shape = Shape{1, 2, 2, 2}; auto input_shape = Shape{1, 2, 2, 2};
auto input = make_shared<op::Parameter>(element::f32, input_shape); auto input = make_shared<op::Parameter>(element::f32, input_shape);
auto mean_shape = Shape{2}; auto mean_shape = Shape{2};
auto mean = make_shared<op::Parameter>(element::f32, mean_shape);
auto var_shape = Shape{2}; auto var_shape = Shape{2};
auto var = make_shared<op::Parameter>(element::f32, var_shape);
auto gamma_shape = Shape{2}; auto gamma_shape = Shape{2};
auto gamma = make_shared<op::Parameter>(element::f32, gamma_shape); auto gamma = make_shared<op::Parameter>(element::f32, gamma_shape);
auto beta_shape = Shape{2}; auto beta_shape = Shape{2};
auto beta = make_shared<op::Parameter>(element::f32, beta_shape); auto beta = make_shared<op::Parameter>(element::f32, beta_shape);
double eps = 0.001; double eps = 0.001;
auto shape_r = Shape{1, 2, 2, 2}; auto shape_r = Shape{1, 2, 2, 2};
auto bn = make_shared<op::BatchNorm>(eps, gamma, beta, input, mean, var); auto bn = make_shared<op::BatchNorm>(eps, gamma, beta, input);
auto f = make_shared<Function>(bn, op::ParameterVector{mean, var, input, gamma, beta}); auto output_rt = std::make_shared<op::GetOutputElement>(bn, 0);
auto mean_rt = std::make_shared<op::GetOutputElement>(bn, 1);
auto variance_rt = std::make_shared<op::GetOutputElement>(bn, 2);
auto f = make_shared<Function>(NodeVector{output_rt, mean_rt, variance_rt},
op::ParameterVector{input, gamma, beta});
auto manager = runtime::Manager::get("CPU"); auto manager = runtime::Manager::get("CPU");
auto external = manager->compile(f); auto external = manager->compile(f);
auto backend = manager->allocate_backend(); auto backend = manager->allocate_backend();
...@@ -283,15 +287,13 @@ TEST(cpu_fusion, batchnorm_fprop_b1c2h2w2) ...@@ -283,15 +287,13 @@ TEST(cpu_fusion, batchnorm_fprop_b1c2h2w2)
0.64589411f, 0.64589411f,
0.4375872f, 0.4375872f,
0.89177299f}); 0.89177299f});
auto _mean = backend->make_primary_tensor_view(element::f32, mean_shape);
copy_data(_mean, vector<float>{0.60291237f, 0.59972727f});
auto _var = backend->make_primary_tensor_view(element::f32, var_shape);
copy_data(_var, vector<float>{0.00472505f, 0.03617825f});
auto _gamma = backend->make_primary_tensor_view(element::f32, gamma_shape); auto _gamma = backend->make_primary_tensor_view(element::f32, gamma_shape);
copy_data(_gamma, vector<float>{1.0f, 1.0f}); copy_data(_gamma, vector<float>{1.0f, 1.0f});
auto _beta = backend->make_primary_tensor_view(element::f32, beta_shape); auto _beta = backend->make_primary_tensor_view(element::f32, beta_shape);
copy_data(_beta, vector<float>{0.0f, 0.0f}); copy_data(_beta, vector<float>{0.0f, 0.0f});
auto result = backend->make_primary_tensor_view(element::f32, shape_r); auto bn_output = backend->make_primary_tensor_view(element::f32, shape_r);
auto result_mean = backend->make_primary_tensor_view(element::f32, mean_shape);
auto result_variance = backend->make_primary_tensor_view(element::f32, var_shape);
vector<float> expected_result{-0.71498716f, vector<float> expected_result{-0.71498716f,
1.48388731f, 1.48388731f,
...@@ -301,8 +303,14 @@ TEST(cpu_fusion, batchnorm_fprop_b1c2h2w2) ...@@ -301,8 +303,14 @@ TEST(cpu_fusion, batchnorm_fprop_b1c2h2w2)
0.23943391f, 0.23943391f,
-0.84090298f, -0.84090298f,
1.51462936f}; 1.51462936f};
cf->call({_mean, _var, _input, _gamma, _beta}, {result}); vector<float> expected_mean{0.602912f, 0.599727f};
EXPECT_TRUE(test::all_close(expected_result, read_vector<float>(result))); vector<float> expected_variance{0.00472505f, 0.0361782f};
cf->call({_input, _gamma, _beta}, {bn_output, result_mean, result_variance});
EXPECT_TRUE(test::all_close(expected_result, read_vector<float>(bn_output)));
EXPECT_TRUE(test::all_close(expected_mean, read_vector<float>(result_mean)));
EXPECT_TRUE(test::all_close(expected_variance, read_vector<float>(result_variance)));
} }
TEST(cpu_fusion, batchnorm_fprop_b2c2h2w1) TEST(cpu_fusion, batchnorm_fprop_b2c2h2w1)
...@@ -310,18 +318,21 @@ TEST(cpu_fusion, batchnorm_fprop_b2c2h2w1) ...@@ -310,18 +318,21 @@ TEST(cpu_fusion, batchnorm_fprop_b2c2h2w1)
auto input_shape = Shape{2, 2, 2, 1}; auto input_shape = Shape{2, 2, 2, 1};
auto input = make_shared<op::Parameter>(element::f32, input_shape); auto input = make_shared<op::Parameter>(element::f32, input_shape);
auto mean_shape = Shape{2}; auto mean_shape = Shape{2};
auto mean = make_shared<op::Parameter>(element::f32, mean_shape);
auto var_shape = Shape{2}; auto var_shape = Shape{2};
auto var = make_shared<op::Parameter>(element::f32, var_shape);
auto gamma_shape = Shape{2}; auto gamma_shape = Shape{2};
auto gamma = make_shared<op::Parameter>(element::f32, gamma_shape); auto gamma = make_shared<op::Parameter>(element::f32, gamma_shape);
auto beta_shape = Shape{2}; auto beta_shape = Shape{2};
auto beta = make_shared<op::Parameter>(element::f32, beta_shape); auto beta = make_shared<op::Parameter>(element::f32, beta_shape);
double eps = 0.001; double eps = 0.001;
auto shape_r = Shape{2, 2, 2, 1}; auto shape_r = Shape{2, 2, 2, 1};
auto bn = make_shared<op::BatchNorm>(eps, gamma, beta, input, mean, var); auto bn = make_shared<op::BatchNorm>(eps, gamma, beta, input);
auto f = make_shared<Function>(bn, op::ParameterVector{mean, var, input, gamma, beta}); auto output_rt = std::make_shared<op::GetOutputElement>(bn, 0);
auto mean_rt = std::make_shared<op::GetOutputElement>(bn, 1);
auto variance_rt = std::make_shared<op::GetOutputElement>(bn, 2);
auto f = make_shared<Function>(NodeVector{output_rt, mean_rt, variance_rt},
op::ParameterVector{input, gamma, beta});
auto manager = runtime::Manager::get("CPU"); auto manager = runtime::Manager::get("CPU");
auto external = manager->compile(f); auto external = manager->compile(f);
auto backend = manager->allocate_backend(); auto backend = manager->allocate_backend();
...@@ -337,20 +348,24 @@ TEST(cpu_fusion, batchnorm_fprop_b2c2h2w1) ...@@ -337,20 +348,24 @@ TEST(cpu_fusion, batchnorm_fprop_b2c2h2w1)
0.64589411f, 0.64589411f,
0.4375872f, 0.4375872f,
0.89177299f}); 0.89177299f});
auto _mean = backend->make_primary_tensor_view(element::f32, mean_shape);
copy_data(_mean, vector<float>{0.60291237f, 0.59972727f});
auto _var = backend->make_primary_tensor_view(element::f32, var_shape);
copy_data(_var, vector<float>{0.00472505f, 0.03617825f});
auto _gamma = backend->make_primary_tensor_view(element::f32, gamma_shape); auto _gamma = backend->make_primary_tensor_view(element::f32, gamma_shape);
copy_data(_gamma, vector<float>{1.0f, 1.0f}); copy_data(_gamma, vector<float>{1.0f, 1.0f});
auto _beta = backend->make_primary_tensor_view(element::f32, beta_shape); auto _beta = backend->make_primary_tensor_view(element::f32, beta_shape);
copy_data(_beta, vector<float>{0.0f, 0.0f}); copy_data(_beta, vector<float>{0.0f, 0.0f});
auto result = backend->make_primary_tensor_view(element::f32, shape_r); auto bn_output = backend->make_primary_tensor_view(element::f32, shape_r);
auto result_mean = backend->make_primary_tensor_view(element::f32, mean_shape);
auto result_variance = backend->make_primary_tensor_view(element::f32, var_shape);
vector<float> expected_result{ vector<float> expected_result{
-0.714987f, 1.48389f, 0.015746f, -0.284436f, -2.36912f, 0.56806f, -0.840903f, 1.51463f}; -0.30327f, 1.1561f, -0.0963782f, -0.434702f, -1.4011f, 0.548275f, -1.06187f, 1.59295f};
cf->call({_mean, _var, _input, _gamma, _beta}, {result}); vector<float> expected_mean{0.583388f, 0.619252f};
EXPECT_TRUE(test::all_close(expected_result, read_vector<float>(result))); vector<float> expected_variance{0.0119972f, 0.0282681f};
cf->call({_input, _gamma, _beta}, {bn_output, result_mean, result_variance});
EXPECT_TRUE(test::all_close(expected_result, read_vector<float>(bn_output)));
EXPECT_TRUE(test::all_close(expected_mean, read_vector<float>(result_mean)));
EXPECT_TRUE(test::all_close(expected_variance, read_vector<float>(result_variance)));
} }
TEST(cpu_fusion, fuse_fprop_bn) TEST(cpu_fusion, fuse_fprop_bn)
...@@ -404,7 +419,10 @@ TEST(cpu_fusion, bn_bprop_n4c3h2w2) ...@@ -404,7 +419,10 @@ TEST(cpu_fusion, bn_bprop_n4c3h2w2)
auto beta = make_shared<op::Parameter>(element::f32, beta_shape); auto beta = make_shared<op::Parameter>(element::f32, beta_shape);
double eps = 0.001; double eps = 0.001;
auto shape_r = Shape{4, 3, 2, 2}; auto shape_r = Shape{4, 3, 2, 2};
auto bn = make_shared<op::BatchNorm>(eps, gamma, beta, input, mean, var); auto bn = make_shared<op::BatchNorm>(eps, gamma, beta, input);
auto bn_dx = make_shared<op::GetOutputElement>(bn, 0);
auto bn_dgamma = make_shared<op::GetOutputElement>(bn, 1);
auto bn_dbeta = make_shared<op::GetOutputElement>(bn, 2);
auto manager = runtime::Manager::get("CPU"); auto manager = runtime::Manager::get("CPU");
auto backend = manager->allocate_backend(); auto backend = manager->allocate_backend();
...@@ -436,7 +454,8 @@ TEST(cpu_fusion, bn_bprop_n4c3h2w2) ...@@ -436,7 +454,8 @@ TEST(cpu_fusion, bn_bprop_n4c3h2w2)
vector<float> deltaData(shape_size(shape_r), 20.0f); vector<float> deltaData(shape_size(shape_r), 20.0f);
copy_data(_delta, deltaData); copy_data(_delta, deltaData);
auto f = make_shared<Function>(bn, op::ParameterVector{mean, var, input, gamma, beta}); auto f = make_shared<Function>(NodeVector{bn_dx, bn_dgamma, bn_dbeta},
op::ParameterVector{mean, var, input, gamma, beta});
auto C = std::make_shared<op::Parameter>(element::f32, shape_r); auto C = std::make_shared<op::Parameter>(element::f32, shape_r);
auto dinput = bn->backprop_node(input, C); auto dinput = bn->backprop_node(input, C);
......
...@@ -21,15 +21,8 @@ ...@@ -21,15 +21,8 @@
namespace ngraph namespace ngraph
{ {
class Node;
class Function; class Function;
namespace runtime
{
class Backend;
class Manager;
}
namespace autodiff namespace autodiff
{ {
/// @brief Returns a FunctionSpec for the backprop derivative of its argument. /// @brief Returns a FunctionSpec for the backprop derivative of its argument.
......