Merge remote-tracking branch 'origin/master' into jbobba/batchnorm-layouts

a94d46d4 · Jayaram Bobba · d6000754 · 9cca4073 · a94d46d4 · a94d46d4
Commit a94d46d4 authored Mar 08, 2018 by Jayaram Bobba
15 changed files
--- a/src/ngraph/runtime/cpu/cpu_emitter.cpp
+++ b/src/ngraph/runtime/cpu/cpu_emitter.cpp
@@ -272,13 +272,6 @@ namespace ngraph

                const char* cbeta = "0.0f";

-                if (args.size() > 2)
-                {
-                    writer << "memcpy(" << out[0].get_name() << ", " << args[2].get_name() << ", "
-                           << out[0].get_size() * out[0].get_element_type().size() << ");\n";
-                    cbeta = "1.0f";
-                }
-
                writer << "cblas::cblas_sgemm("
                       << "cblas::Layout::RowMajor, " << tranpose_a << tranpose_b << m << ", " << n
                       << ", " << k << ",\n"
@@ -287,6 +280,101 @@ namespace ngraph
                       << "        " << out[0].get_name() << ", " << max(1UL, arg2_shape[1])
                       << ");\n";

+                if (args.size() > 2)
+                {
+                    auto axes = cg->get_broadcast_axes();
+                    if (axes.size() == 1)
+                    {
+                        if (*(axes.begin()) == 0)
+                        {
+                            writer << "static " << out[0].get_element_type().c_type_string()
+                                   << " ones_row[" << arg2_shape[0] << "]"
+                                   << " = { 1.0f";
+                            for (size_t i = 1; i < arg2_shape[0]; ++i)
+                            {
+                                writer << ", 1.0f";
+                            }
+                            writer << "};\n";
+
+                            writer << "cblas::cblas_sgemm("
+                                   << "cblas::Layout::RowMajor, " << cnotranspose << cnotranspose
+                                   << arg2_shape[0] << ", " << arg2_shape[1] << ", 1"
+                                   << ",\n"
+                                   << "        1.0f, ones_row, "
+                                   << "1"
+                                   << ", " << args[2].get_name() << ", " << max(1UL, arg2_shape[1])
+                                   << ", "
+                                   << "1.0f"
+                                   << ",\n"
+                                   << "        " << out[0].get_name() << ", "
+                                   << max(1UL, arg2_shape[1]) << ");\n";
+                        }
+                        else
+                        {
+                            writer << "static " << out[0].get_element_type().c_type_string()
+                                   << " ones_col[" << arg2_shape[1] << "]"
+                                   << " = { 1.0f";
+                            for (size_t i = 1; i < arg2_shape[1]; ++i)
+                            {
+                                writer << ", 1.0f";
+                            }
+                            writer << "};\n";
+
+                            writer << "cblas::cblas_sgemm("
+                                   << "cblas::Layout::RowMajor, " << cnotranspose << ctranspose
+                                   << arg2_shape[0] << ", " << arg2_shape[1] << ", 1"
+                                   << ",\n"
+                                   << "        1.0f, ones_col," << max(1UL, arg2_shape[1]) << ", "
+                                   << args[2].get_name() << ", "
+                                   << "1"
+                                   << ", "
+                                   << "1.0f"
+                                   << ",\n"
+                                   << "        " << out[0].get_name() << ", "
+                                   << max(1UL, arg2_shape[1]) << ");\n";
+                        }
+                    }
+                    else
+                    {
+                        if (axes.size() != 2)
+                        {
+                            throw ngraph_error("unexpected broadcast rank");
+                        }
+
+                        writer << out[0].get_element_type().c_type_string() << " bias["
+                               << arg2_shape[1] << "]"
+                               << " = { " << args[2].get_name() << "[0]";
+                        for (size_t i = 1; i < arg2_shape[1]; ++i)
+                        {
+                            writer << "," << args[2].get_name() << "[0]";
+                        }
+                        writer << "};\n";
+
+                        writer << "static " << out[0].get_element_type().c_type_string()
+                               << " ones_scalar[" << arg2_shape[0] << "]"
+                               << " = { 1.0f";
+                        for (size_t i = 1; i < arg2_shape[0]; ++i)
+                        {
+                            writer << ", 1.0f";
+                        }
+                        writer << "};\n";
+
+                        writer << "cblas::cblas_sgemm("
+                               << "cblas::Layout::RowMajor, " << cnotranspose << cnotranspose
+                               << arg2_shape[0] << ", " << arg2_shape[1] << ", 1"
+                               << ",\n"
+                               << "        1.0f, ones_scalar, "
+                               << "1"
+                               << ", "
+                               << "bias"
+                               << ", " << max(1UL, arg2_shape[1]) << ", "
+                               << "1.0f"
+                               << ",\n"
+                               << "        " << out[0].get_name() << ", " << max(1UL, arg2_shape[1])
+                               << ");\n";
+                    }
+                }
+
                writer.indent--;
                writer << "}\n";
            }

--- a/src/ngraph/runtime/cpu/ops/matmul_bias.cpp
+++ b/src/ngraph/runtime/cpu/ops/matmul_bias.cpp
@@ -32,7 +32,8 @@ std::shared_ptr<ngraph::Node>
                                        m_shape_w,
                                        m_shape_x,
                                        m_transpose_w,
-                                        m_transpose_x);
+                                        m_transpose_x,
+                                        m_broadcast_axes);
 }

 ngraph::op::MatmulBias::MatmulBias(std::shared_ptr<ngraph::Node> W,
@@ -41,7 +42,8 @@ ngraph::op::MatmulBias::MatmulBias(std::shared_ptr<ngraph::Node> W,
                                   Shape shape_w,
                                   Shape shape_x,
                                   bool transpose_w,
-                                   bool transpose_x)
+                                   bool transpose_x,
+                                   AxisSet axes)
    : RequiresTensorViewArgs("MatMulBias",
                             b == nullptr ? std::vector<std::shared_ptr<Node>>{W, x}
                                          : std::vector<std::shared_ptr<Node>>{W, x, b})
@@ -49,8 +51,24 @@ ngraph::op::MatmulBias::MatmulBias(std::shared_ptr<ngraph::Node> W,
    , m_shape_x(shape_x)
    , m_transpose_w(transpose_w)
    , m_transpose_x(transpose_x)
+    , m_broadcast_axes(axes)

 {
+    if (axes.size() == 0 && b != nullptr)
+    {
+        throw ngraph_error("Bias but no broadcast axes");
+    }
+
+    if (b == nullptr && axes.size() != 0)
+    {
+        throw ngraph_error("Broadcast axes but no bias");
+    }
+
+    if (axes.size() > 2)
+    {
+        throw ngraph_error("Broadcasting to > 2D tensor");
+    }
+
    if (shape_w.size() != 2)
    {
        NGRAPH_DEBUG << "W shape = " << vector_to_string(shape_w);

--- a/src/ngraph/runtime/cpu/ops/matmul_bias.hpp
+++ b/src/ngraph/runtime/cpu/ops/matmul_bias.hpp
@@ -16,6 +16,7 @@

 #pragma once

+#include "ngraph/axis_set.hpp"
 #include "ngraph/ops/util/requires_tensor_view_args.hpp"

 namespace ngraph
@@ -31,12 +32,14 @@ namespace ngraph
                       Shape shape_w,
                       Shape shape_x,
                       bool transpose_w,
-                       bool transpose_x);
+                       bool transpose_x,
+                       AxisSet axes = AxisSet{});

            bool get_is_arg0_transposed() const { return m_transpose_w; }
            bool get_is_arg1_transposed() const { return m_transpose_x; }
            Shape get_arg0_shape() const { return m_shape_w; }
            Shape get_arg1_shape() const { return m_shape_x; }
+            const AxisSet& get_broadcast_axes() const { return m_broadcast_axes; }
            virtual std::shared_ptr<Node>
                copy_with_new_args(const NodeVector& new_args) const override;

@@ -45,6 +48,7 @@ namespace ngraph
            Shape m_shape_x;
            bool m_transpose_w;
            bool m_transpose_x;
+            AxisSet m_broadcast_axes;
        };
    }
 }
--- a/src/ngraph/runtime/cpu/pass/cpu_fusion.cpp
+++ b/src/ngraph/runtime/cpu/pass/cpu_fusion.cpp
@@ -134,12 +134,21 @@ void ngraph::runtime::cpu::pass::CPUFusion::construct_matmulbias_pattern()
                     << m.match_root()->get_name();

        auto mpattern = m.match_root(); //add
-        auto m_matmul = mpattern->get_input_op(0);
-        auto m_broadcast = mpattern->get_input_op(1);
+        auto m_matmul = std::dynamic_pointer_cast<op::MatmulBias>(mpattern->get_input_op(0));
+        auto m_broadcast = std::dynamic_pointer_cast<op::Broadcast>(mpattern->get_input_op(1));
+        auto m_bias = m_broadcast->get_input_op(0);
        auto pattern_map = m.get_pattern_map();

-        return m_matmul->copy_with_new_args(
-            NodeVector{pattern_map[W], pattern_map[x], m_broadcast});
+        auto mmb = std::make_shared<op::MatmulBias>(pattern_map[W],
+                                                    pattern_map[x],
+                                                    m_bias,
+                                                    m_matmul->get_arg0_shape(),
+                                                    m_matmul->get_arg1_shape(),
+                                                    m_matmul->get_is_arg0_transposed(),
+                                                    m_matmul->get_is_arg1_transposed(),
+                                                    m_broadcast->get_broadcast_axes());
+
+        return mmb;
    };

    auto m = std::make_shared<ngraph::pattern::Matcher>(padd, callback);

--- a/src/ngraph/runtime/gpu/gpu_cuda_kernel_builder.cpp
+++ b/src/ngraph/runtime/gpu/gpu_cuda_kernel_builder.cpp
@@ -13,7 +13,6 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *******************************************************************************/
-
 #include "ngraph/runtime/gpu/gpu_cuda_kernel_builder.hpp"

 namespace ngraph
@@ -22,10 +21,10 @@ namespace ngraph
    {
        namespace gpu
        {
-            void CudaKernelBuilder::get_1_element_op(const std::string& name,
-                                                     const std::string& data_type,
-                                                     const std::string& op,
-                                                     std::string& kernel)
+            void CudaKernelBuilder::get_unary_elementwise_op(const std::string& name,
+                                                             const std::string& data_type,
+                                                             const std::string& op,
+                                                             std::string& kernel)
            {
                kernel = R"(  
 extern "C" __global__
@@ -40,10 +39,10 @@ out[tid] =)" + op + "(in[tid]);\n" +
                return;
            }

-            void CudaKernelBuilder::get_2_element_op(const std::string& name,
-                                                     const std::string& data_type,
-                                                     const std::string& op,
-                                                     std::string& kernel)
+            void CudaKernelBuilder::get_binary_elementwise_op(const std::string& name,
+                                                              const std::string& data_type,
+                                                              const std::string& op,
+                                                              std::string& kernel)
            {
                kernel = R"(  
 extern "C" __global__
@@ -60,10 +59,11 @@ out[tid] = in1[tid] )" + op +
                return;
            }

-            void CudaKernelBuilder::get_n_element_op(const std::string& name,
-                                                     const std::string& data_type,
-                                                     const std::vector<std::string>& ops,
-                                                     std::string& kernel)
+            void
+                CudaKernelBuilder::get_arbitrary_elementwise_op(const std::string& name,
+                                                                const std::string& data_type,
+                                                                const std::vector<std::string>& ops,
+                                                                std::string& kernel)
            {
                kernel = "";
                return;

--- a/src/ngraph/runtime/gpu/gpu_cuda_kernel_builder.hpp
+++ b/src/ngraph/runtime/gpu/gpu_cuda_kernel_builder.hpp
@@ -28,20 +28,20 @@ namespace ngraph
            class CudaKernelBuilder
            {
            public:
-                static void get_1_element_op(const std::string& name,
-                                             const std::string& data_type,
-                                             const std::string& op,
-                                             std::string& kernel);
+                static void get_unary_elementwise_op(const std::string& name,
+                                                     const std::string& data_type,
+                                                     const std::string& op,
+                                                     std::string& kernel);

-                static void get_2_element_op(const std::string& name,
-                                             const std::string& data_type,
-                                             const std::string& op,
-                                             std::string& kernel);
+                static void get_binary_elementwise_op(const std::string& name,
+                                                      const std::string& data_type,
+                                                      const std::string& op,
+                                                      std::string& kernel);

-                static void get_n_element_op(const std::string& name,
-                                             const std::string& data_type,
-                                             const std::vector<std::string>& ops,
-                                             std::string& kernel);
+                static void get_arbitrary_elementwise_op(const std::string& name,
+                                                         const std::string& data_type,
+                                                         const std::vector<std::string>& ops,
+                                                         std::string& kernel);
            };
        }
    }

--- a/src/ngraph/runtime/gpu/gpu_cuda_kernel_emitters.cpp
+++ b/src/ngraph/runtime/gpu/gpu_cuda_kernel_emitters.cpp
@@ -17,10 +17,8 @@
 #include <algorithm>
 #include <map>

-#include "ngraph/runtime/gpu/gpu_cuda_function_builder.hpp"
-#include "ngraph/runtime/gpu/gpu_cuda_function_pool.hpp"
-#include "ngraph/runtime/gpu/gpu_cuda_kernel_builder.hpp"
 #include "ngraph/runtime/gpu/gpu_cuda_kernel_emitters.hpp"
+#include "ngraph/runtime/gpu/gpu_cuda_kernel_ops.hpp"

 namespace ngraph
 {
@@ -28,40 +26,6 @@ namespace ngraph
    {
        namespace gpu
        {
-            void emit_abs(void* in, void* out, size_t count)
-            {
-                std::string name = "abs";
-                // Create an instance of nvrtcProgram with the code string.
-                if (CudaFunctionPool::instance().get(name) == nullptr)
-                {
-                    const char* opts[] = {"--gpu-architecture=compute_35",
-                                          "--relocatable-device-code=true"};
-                    std::string kernel;
-                    CudaKernelBuilder::get_1_element_op(name, "float", "fabsf", kernel);
-                    CudaFunctionPool::instance().set(
-                        name, CudaFunctionBuilder::get("cuda_" + name, kernel, 2, opts));
-                }
-
-                //convert runtime ptr to driver api ptr
-                CUdeviceptr d_ptr_in, d_ptr_out;
-                d_ptr_in = (CUdeviceptr)in;
-                d_ptr_out = (CUdeviceptr)out;
-
-                void* args_list[] = {&d_ptr_in, &d_ptr_out, &count};
-                CUDA_SAFE_CALL(cuLaunchKernel(*CudaFunctionPool::instance().get(name).get(),
-                                              count,
-                                              1,
-                                              1, // grid dim
-                                              1,
-                                              1,
-                                              1, // block dim
-                                              0,
-                                              NULL, // shared mem and stream
-                                              args_list,
-                                              0));  // arguments
-                CUDA_SAFE_CALL(cuCtxSynchronize()); // Retrieve and print output.
-            }
-
            void emit_broadcast(
                void* in, void* out, size_t repeat_size, size_t repeat_times, size_t count)
            {

--- a/src/ngraph/runtime/gpu/gpu_cuda_kernel_emitters.hpp
+++ b/src/ngraph/runtime/gpu/gpu_cuda_kernel_emitters.hpp
@@ -18,6 +18,9 @@

 #include "ngraph/codegen/code_writer.hpp"
 #include "ngraph/coordinate.hpp"
+#include "ngraph/runtime/gpu/gpu_cuda_function_builder.hpp"
+#include "ngraph/runtime/gpu/gpu_cuda_function_pool.hpp"
+#include "ngraph/runtime/gpu/gpu_cuda_kernel_builder.hpp"
 #include "ngraph/strides.hpp"

 namespace ngraph
@@ -26,9 +29,46 @@ namespace ngraph
    {
        namespace gpu
        {
-            void emit_abs(void* in, void* out, size_t count);
+            template <typename T>
+            struct CudaOpMap;
+
            void emit_broadcast(
                void* in, void* out, size_t repeat_size, size_t repeat_times, size_t count);
+
+            template <typename T>
+            void emit_unary_elementwise_op(void* in, void* out, size_t count, std::string name)
+            {
+                // Create an instance of nvrtcProgram with the code string.
+                if (CudaFunctionPool::instance().get(name) == nullptr)
+                {
+                    const char* opts[] = {"--gpu-architecture=compute_35",
+                                          "--relocatable-device-code=true"};
+                    std::string kernel;
+                    CudaKernelBuilder::get_unary_elementwise_op(
+                        name, "float", CudaOpMap<T>::op, kernel);
+                    CudaFunctionPool::instance().set(
+                        name, CudaFunctionBuilder::get("cuda_" + name, kernel, 2, opts));
+                }
+
+                //convert runtime ptr to driver api ptr
+                CUdeviceptr d_ptr_in, d_ptr_out;
+                d_ptr_in = (CUdeviceptr)in;
+                d_ptr_out = (CUdeviceptr)out;
+
+                void* args_list[] = {&d_ptr_in, &d_ptr_out, &count};
+                CUDA_SAFE_CALL(cuLaunchKernel(*CudaFunctionPool::instance().get(name).get(),
+                                              count,
+                                              1,
+                                              1, // grid dim
+                                              1,
+                                              1,
+                                              1, // block dim
+                                              0,
+                                              NULL, // shared mem and stream
+                                              args_list,
+                                              0));  // arguments
+                CUDA_SAFE_CALL(cuCtxSynchronize()); // Retrieve and print output.
+            }
        }
    }
 }
--- a/src/ngraph/runtime/gpu/gpu_cuda_kernel_ops.hpp
+++ b/src/ngraph/runtime/gpu/gpu_cuda_kernel_ops.hpp
+/*******************************************************************************
+* Copyright 2017-2018 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#pragma once
+
+namespace ngraph
+{
+    namespace op
+    {
+        class Abs;
+        class Acos;
+        class Asin;
+        class Atan;
+        class Ceiling;
+        class Cos;
+        class Cosh;
+        class Exp;
+        class Floor;
+        class Log;
+        class Sin;
+        class Sinh;
+        class Tan;
+        class Tanh;
+
+        // Unimplemented or unused in favor of cuDNN impl.
+        class Max;
+        class Min;
+        class Negative;
+        class Not;
+        class Sign;
+        class Sqrt;
+    }
+    namespace runtime
+    {
+        namespace gpu
+        {
+            template <>
+            struct CudaOpMap<ngraph::op::Abs>
+            {
+                static constexpr const char* op = "fabsf";
+            };
+
+            template <>
+            struct CudaOpMap<ngraph::op::Acos>
+            {
+                static constexpr const char* op = "acosf";
+            };
+
+            template <>
+            struct CudaOpMap<ngraph::op::Asin>
+            {
+                static constexpr const char* op = "asinf";
+            };
+
+            template <>
+            struct CudaOpMap<ngraph::op::Atan>
+            {
+                static constexpr const char* op = "atanf";
+            };
+
+            template <>
+            struct CudaOpMap<ngraph::op::Ceiling>
+            {
+                static constexpr const char* op = "ceilf";
+            };
+
+            template <>
+            struct CudaOpMap<ngraph::op::Cos>
+            {
+                static constexpr const char* op = "cosf";
+            };
+
+            template <>
+            struct CudaOpMap<ngraph::op::Cosh>
+            {
+                static constexpr const char* op = "coshf";
+            };
+
+            template <>
+            struct CudaOpMap<ngraph::op::Exp>
+            {
+                static constexpr const char* op = "expf";
+            };
+
+            template <>
+            struct CudaOpMap<ngraph::op::Floor>
+            {
+                static constexpr const char* op = "floorf";
+            };
+
+            template <>
+            struct CudaOpMap<ngraph::op::Log>
+            {
+                static constexpr const char* op = "logf";
+            };
+
+            template <>
+            struct CudaOpMap<ngraph::op::Max>
+            {
+                static constexpr const char* op = "fmaxf";
+            };
+
+            template <>
+            struct CudaOpMap<ngraph::op::Min>
+            {
+                static constexpr const char* op = "fminf";
+            };
+
+            template <>
+            struct CudaOpMap<ngraph::op::Sin>
+            {
+                static constexpr const char* op = "sinf";
+            };
+
+            template <>
+            struct CudaOpMap<ngraph::op::Sinh>
+            {
+                static constexpr const char* op = "sinhf";
+            };
+
+            template <>
+            struct CudaOpMap<ngraph::op::Sqrt>
+            {
+                static constexpr const char* op = "sqrtf";
+            };
+
+            template <>
+            struct CudaOpMap<ngraph::op::Tan>
+            {
+                static constexpr const char* op = "tanf";
+            };
+
+            template <>
+            struct CudaOpMap<ngraph::op::Tanh>
+            {
+                static constexpr const char* op = "tanhf";
+            };
+        }
+    }
+}
--- a/src/ngraph/runtime/gpu/gpu_emitter.cpp
+++ b/src/ngraph/runtime/gpu/gpu_emitter.cpp
--- a/src/ngraph/runtime/gpu/gpu_emitter.hpp
+++ b/src/ngraph/runtime/gpu/gpu_emitter.hpp
@@ -45,8 +45,8 @@ namespace ngraph
                static void EMITTER_DECL(EmitMultiply);
                static void EMITTER_DECL(EmitGetOutputElement);
                static void EMITTER_DECL(EmitXLAGetTupleElement);
+                static void EMITTER_DECL(EmitUnaryElementwise);
                static void EMITTER_DECL(EmitTuple);
-                static void EMITTER_DECL(EmitAbs);
                static void EMITTER_DECL(EmitConcat);
                static void EMITTER_DECL(EmitDivide);
                static void EMITTER_DECL(EmitEqual);
@@ -54,7 +54,6 @@ namespace ngraph
                static void EMITTER_DECL(EmitGreaterEq);
                static void EMITTER_DECL(EmitLess);
                static void EMITTER_DECL(EmitLessEq);
-                static void EMITTER_DECL(EmitLog);
                static void EMITTER_DECL(EmitMaximum);
                static void EMITTER_DECL(EmitMinimum);
                static void EMITTER_DECL(EmitNegative);
@@ -67,31 +66,18 @@ namespace ngraph
                static void EMITTER_DECL(EmitReshape);
                static void EMITTER_DECL(EmitFunctionCall);
                static void EMITTER_DECL(EmitReduce);
-                static void EMITTER_DECL(EmitSign);
                static void EMITTER_DECL(EmitSlice);
                static void EMITTER_DECL(EmitSum);
-                static void EMITTER_DECL(EmitExp);
-                static void EMITTER_DECL(EmitSin);
-                static void EMITTER_DECL(EmitSinh);
-                static void EMITTER_DECL(EmitCos);
-                static void EMITTER_DECL(EmitCosh);
-                static void EMITTER_DECL(EmitTan);
-                static void EMITTER_DECL(EmitTanh);
-                static void EMITTER_DECL(EmitAsin);
-                static void EMITTER_DECL(EmitAcos);
-                static void EMITTER_DECL(EmitAtan);
                static void EMITTER_DECL(EmitPower);
                static void EMITTER_DECL(EmitReplaceSlice);
                static void EMITTER_DECL(EmitOneHot);
-                static void EMITTER_DECL(EmitFloor);
-                static void EMITTER_DECL(EmitCeiling);
                static void EMITTER_DECL(EmitSqrt);
                static void EMITTER_DECL(EmitConvolution);
-                static void EMITTER_DECL(EmitNot);
                static void EMITTER_DECL(EmitMaxPool);
                static void EMITTER_DECL(EmitReverse);
                static void EMITTER_DECL(EmitReduceWindow);
                static void EMITTER_DECL(EmitSelectAndScatter);
+                static void EMITTER_DECL(EmitResult);
            };
        }
    }

--- a/src/ngraph/runtime/gpu/gpu_external_function.cpp
+++ b/src/ngraph/runtime/gpu/gpu_external_function.cpp
@@ -151,7 +151,7 @@ static const runtime::gpu::OpMap dispatcher{
    {TI(ngraph::op::Dot), &runtime::gpu::GPU_Emitter::EmitDot},
    {TI(ngraph::op::Multiply), &runtime::gpu::GPU_Emitter::EmitMultiply},
    {TI(ngraph::op::Parameter), &runtime::gpu::GPU_Emitter::EmitNop},
-    {TI(ngraph::op::Abs), &runtime::gpu::GPU_Emitter::EmitAbs},
+    {TI(ngraph::op::Abs), &runtime::gpu::GPU_Emitter::EmitUnaryElementwise},
    {TI(ngraph::op::Concat), &runtime::gpu::GPU_Emitter::EmitConcat},
    {TI(ngraph::op::Divide), &runtime::gpu::GPU_Emitter::EmitDivide},
    {TI(ngraph::op::Equal), &runtime::gpu::GPU_Emitter::EmitEqual},
@@ -159,7 +159,7 @@ static const runtime::gpu::OpMap dispatcher{
    {TI(ngraph::op::GreaterEq), &runtime::gpu::GPU_Emitter::EmitGreaterEq},
    {TI(ngraph::op::Less), &runtime::gpu::GPU_Emitter::EmitLess},
    {TI(ngraph::op::LessEq), &runtime::gpu::GPU_Emitter::EmitLessEq},
-    {TI(ngraph::op::Log), &runtime::gpu::GPU_Emitter::EmitLog},
+    {TI(ngraph::op::Log), &runtime::gpu::GPU_Emitter::EmitUnaryElementwise},
    {TI(ngraph::op::Maximum), &runtime::gpu::GPU_Emitter::EmitMaximum},
    {TI(ngraph::op::Minimum), &runtime::gpu::GPU_Emitter::EmitMinimum},
    {TI(ngraph::op::Negative), &runtime::gpu::GPU_Emitter::EmitNegative},
@@ -173,30 +173,31 @@ static const runtime::gpu::OpMap dispatcher{
    {TI(ngraph::op::Reshape), &runtime::gpu::GPU_Emitter::EmitReshape},
    {TI(ngraph::op::FunctionCall), &runtime::gpu::GPU_Emitter::EmitFunctionCall},
    {TI(ngraph::op::Reduce), &runtime::gpu::GPU_Emitter::EmitReduce},
-    {TI(ngraph::op::Sign), &runtime::gpu::GPU_Emitter::EmitSign},
+    {TI(ngraph::op::Sign), &runtime::gpu::GPU_Emitter::EmitUnaryElementwise},
    {TI(ngraph::op::Slice), &runtime::gpu::GPU_Emitter::EmitSlice},
    {TI(ngraph::op::Sum), &runtime::gpu::GPU_Emitter::EmitSum},
-    {TI(ngraph::op::Exp), &runtime::gpu::GPU_Emitter::EmitExp},
-    {TI(ngraph::op::Sin), &runtime::gpu::GPU_Emitter::EmitSin},
-    {TI(ngraph::op::Sinh), &runtime::gpu::GPU_Emitter::EmitSinh},
-    {TI(ngraph::op::Cos), &runtime::gpu::GPU_Emitter::EmitCos},
-    {TI(ngraph::op::Cosh), &runtime::gpu::GPU_Emitter::EmitCosh},
-    {TI(ngraph::op::Tan), &runtime::gpu::GPU_Emitter::EmitTan},
-    {TI(ngraph::op::Tanh), &runtime::gpu::GPU_Emitter::EmitTanh},
-    {TI(ngraph::op::Asin), &runtime::gpu::GPU_Emitter::EmitAsin},
-    {TI(ngraph::op::Acos), &runtime::gpu::GPU_Emitter::EmitAcos},
-    {TI(ngraph::op::Atan), &runtime::gpu::GPU_Emitter::EmitAtan},
+    {TI(ngraph::op::Exp), &runtime::gpu::GPU_Emitter::EmitUnaryElementwise},
+    {TI(ngraph::op::Sin), &runtime::gpu::GPU_Emitter::EmitUnaryElementwise},
+    {TI(ngraph::op::Sinh), &runtime::gpu::GPU_Emitter::EmitUnaryElementwise},
+    {TI(ngraph::op::Cos), &runtime::gpu::GPU_Emitter::EmitUnaryElementwise},
+    {TI(ngraph::op::Cosh), &runtime::gpu::GPU_Emitter::EmitUnaryElementwise},
+    {TI(ngraph::op::Tan), &runtime::gpu::GPU_Emitter::EmitUnaryElementwise},
+    {TI(ngraph::op::Tanh), &runtime::gpu::GPU_Emitter::EmitUnaryElementwise},
+    {TI(ngraph::op::Asin), &runtime::gpu::GPU_Emitter::EmitUnaryElementwise},
+    {TI(ngraph::op::Acos), &runtime::gpu::GPU_Emitter::EmitUnaryElementwise},
+    {TI(ngraph::op::Atan), &runtime::gpu::GPU_Emitter::EmitUnaryElementwise},
    {TI(ngraph::op::ReplaceSlice), &runtime::gpu::GPU_Emitter::EmitReplaceSlice},
    {TI(ngraph::op::OneHot), &runtime::gpu::GPU_Emitter::EmitOneHot},
-    {TI(ngraph::op::Floor), &runtime::gpu::GPU_Emitter::EmitFloor},
-    {TI(ngraph::op::Ceiling), &runtime::gpu::GPU_Emitter::EmitCeiling},
+    {TI(ngraph::op::Floor), &runtime::gpu::GPU_Emitter::EmitUnaryElementwise},
+    {TI(ngraph::op::Ceiling), &runtime::gpu::GPU_Emitter::EmitUnaryElementwise},
    {TI(ngraph::op::Sqrt), &runtime::gpu::GPU_Emitter::EmitSqrt},
    {TI(ngraph::op::Convolution), &runtime::gpu::GPU_Emitter::EmitConvolution},
-    {TI(ngraph::op::Not), &runtime::gpu::GPU_Emitter::EmitNot},
+    {TI(ngraph::op::Not), &runtime::gpu::GPU_Emitter::EmitUnaryElementwise},
    {TI(ngraph::op::MaxPool), &runtime::gpu::GPU_Emitter::EmitMaxPool},
    {TI(ngraph::op::Reverse), &runtime::gpu::GPU_Emitter::EmitReverse},
    {TI(ngraph::op::ReduceWindow), &runtime::gpu::GPU_Emitter::EmitReduceWindow},
    {TI(ngraph::op::SelectAndScatter), &runtime::gpu::GPU_Emitter::EmitSelectAndScatter},
+    {TI(ngraph::op::Result), &runtime::gpu::GPU_Emitter::EmitResult},
 };

 runtime::gpu::GPU_ExternalFunction::GPU_ExternalFunction(
@@ -250,6 +251,7 @@ void runtime::gpu::GPU_ExternalFunction::compile()
    #include "ngraph/pass/memory_layout.hpp"
    #include "ngraph/runtime/aligned_buffer.hpp"
    #include "ngraph/runtime/gpu/gpu_cuda_kernel_emitters.hpp"
+    #include "ngraph/runtime/gpu/gpu_cuda_kernel_ops.hpp"
    #include "ngraph/runtime/gpu/gpu_util.hpp"
    #include "ngraph/util.hpp"
 )";
@@ -346,12 +348,15 @@ using namespace std;
                shared_ptr<descriptor::TensorView> tv = node->get_outputs()[0].get_tensor_view();
                auto c_value_strings = c->get_value_strings();
                writer << "static " << tv->get_tensor().get_element_type().c_type_string() << " "
-                       << tv->get_tensor().get_name() << "[" << c_value_strings.size() << "] =\n";
+                       << tv->get_tensor().get_name() << "_cpu[" << c_value_strings.size()
+                       << "] =\n";
                writer << "{\n";
                writer.indent++;
                writer << emit_string_array(c_value_strings, 100 - writer.indent * 4);
                writer.indent--;
                writer << "\n};\n\n";
+                writer << "static " << tv->get_tensor().get_element_type().c_type_string() << " *"
+                       << tv->get_tensor().get_name() << ";\n";
                m_variable_name_map[tv->get_tensor().get_name()] = tv->get_tensor().get_name();
            }
        }
@@ -485,6 +490,26 @@ using namespace std;
        writer << "{\n";
        writer.indent++;

+        for (shared_ptr<Function> current_function : pass_manager.get_state().get_functions())
+        {
+            for (shared_ptr<Node> node : current_function->get_ordered_ops())
+            {
+                const op::Constant* c = dynamic_cast<op::Constant*>(node.get());
+                if (c)
+                {
+                    shared_ptr<descriptor::TensorView> tv =
+                        node->get_outputs()[0].get_tensor_view();
+                    writer << "if(" << tv->get_tensor().get_name() << " == NULL)\n";
+                    writer << "{\n";
+                    writer.indent++;
+                    writer << "runtime::gpu::cuda_memcpyHtD(" << tv->get_tensor().get_name() << ", "
+                           << tv->get_tensor().get_name() << "_cpu, " << tv->get_tensor().size()
+                           << ");\n";
+                    writer.indent--;
+                    writer << "}\n";
+                }
+            }
+        }
        bool temporaries_used = false;
        size_t worst_case_tmp_size = 0;
        for (shared_ptr<Node> node : current_function->get_ordered_ops())
@@ -657,7 +682,6 @@ using namespace std;
            // Emit operation epilogue
            if (!node->is_parameter() && !node->is_constant())
            {
-                handle_output_alias(writer, *node, output_alias_map);
                if (m_emit_timing)
                {
                    emit_debug_function_exit(writer, node.get(), in, out);

--- a/test/autodiff.in.cpp
+++ b/test/autodiff.in.cpp
@@ -1305,6 +1305,7 @@ TEST(${BACKEND_NAME}, backwards_slice)

 TEST(${BACKEND_NAME}, backwards_softmax_all)
 {
+    SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
    auto manager = runtime::Manager::get("${BACKEND_NAME}");
    auto backend = manager->allocate_backend();

@@ -1322,6 +1323,7 @@ TEST(${BACKEND_NAME}, backwards_softmax_all)

 TEST(${BACKEND_NAME}, backwards_softmax_axis)
 {
+    SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
    auto manager = runtime::Manager::get("${BACKEND_NAME}");
    auto backend = manager->allocate_backend();

@@ -1339,6 +1341,7 @@ TEST(${BACKEND_NAME}, backwards_softmax_axis)

 TEST(${BACKEND_NAME}, backwards_softmax_underflow)
 {
+    SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
    auto manager = runtime::Manager::get("${BACKEND_NAME}");
    auto backend = manager->allocate_backend();

@@ -1358,6 +1361,7 @@ TEST(${BACKEND_NAME}, backwards_softmax_underflow)

 TEST(${BACKEND_NAME}, backwards_softmax_3d)
 {
+    SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
    auto manager = runtime::Manager::get("${BACKEND_NAME}");
    auto backend = manager->allocate_backend();


--- a/test/backend_test.in.cpp
+++ b/test/backend_test.in.cpp
@@ -120,6 +120,7 @@ TEST(${BACKEND_NAME}, component_cleanup)

 TEST(${BACKEND_NAME}, aliased_output)
 {
+    SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
    Shape shape{2, 2};
    auto A = make_shared<op::Parameter>(element::f32, shape);
    auto B = make_shared<op::Parameter>(element::f32, shape);
@@ -335,7 +336,6 @@ TEST(${BACKEND_NAME}, abs)

 TEST(${BACKEND_NAME}, ceiling)
 {
-    SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
    Shape shape{2, 2};
    auto A = make_shared<op::Parameter>(element::f32, shape);
    auto f = make_shared<Function>(make_shared<op::Ceiling>(A), op::ParameterVector{A});
@@ -776,7 +776,6 @@ TEST(${BACKEND_NAME}, equal)

 TEST(${BACKEND_NAME}, floor)
 {
-    SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
    Shape shape{2, 2};
    auto A = make_shared<op::Parameter>(element::f32, shape);
    auto f = make_shared<Function>(make_shared<op::Floor>(A), op::ParameterVector{A});
@@ -1370,7 +1369,6 @@ TEST(${BACKEND_NAME}, lesseq_bool)

 TEST(${BACKEND_NAME}, log)
 {
-    SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
    Shape shape{2, 2, 2};
    auto A = make_shared<op::Parameter>(element::f32, shape);
    auto f = make_shared<Function>(make_shared<op::Log>(A), op::ParameterVector{A});
@@ -2673,7 +2671,6 @@ TEST(${BACKEND_NAME}, reshape_6d)

 TEST(${BACKEND_NAME}, sin)
 {
-    SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
    Shape shape{6};
    auto A = make_shared<op::Parameter>(element::f32, shape);
    auto f = make_shared<Function>(make_shared<op::Sin>(A), op::ParameterVector{A});
@@ -2699,7 +2696,6 @@ TEST(${BACKEND_NAME}, sin)

 TEST(${BACKEND_NAME}, cos)
 {
-    SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
    Shape shape{6};
    auto A = make_shared<op::Parameter>(element::f32, shape);
    auto f = make_shared<Function>(make_shared<op::Cos>(A), op::ParameterVector{A});
@@ -2725,7 +2721,6 @@ TEST(${BACKEND_NAME}, cos)

 TEST(${BACKEND_NAME}, tan)
 {
-    SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
    Shape shape{6};
    auto A = make_shared<op::Parameter>(element::f32, shape);
    auto f = make_shared<Function>(make_shared<op::Tan>(A), op::ParameterVector{A});
@@ -2746,12 +2741,11 @@ TEST(${BACKEND_NAME}, tan)
        input.begin(), input.end(), input.begin(), [](float x) -> float { return tanf(x); });

    cf->call({a}, {result});
-    EXPECT_EQ(input, read_vector<float>(result));
+    EXPECT_TRUE(test::all_close(input, read_vector<float>(result)));
 }

 TEST(${BACKEND_NAME}, asin)
 {
-    SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
    Shape shape{6};
    auto A = make_shared<op::Parameter>(element::f32, shape);
    auto f = make_shared<Function>(make_shared<op::Asin>(A), op::ParameterVector{A});
@@ -2776,7 +2770,6 @@ TEST(${BACKEND_NAME}, asin)

 TEST(${BACKEND_NAME}, acos)
 {
-    SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
    Shape shape{6};
    auto A = make_shared<op::Parameter>(element::f32, shape);
    auto f = make_shared<Function>(make_shared<op::Acos>(A), op::ParameterVector{A});
@@ -2801,7 +2794,6 @@ TEST(${BACKEND_NAME}, acos)

 TEST(${BACKEND_NAME}, atan)
 {
-    SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
    Shape shape{6};
    auto A = make_shared<op::Parameter>(element::f32, shape);
    auto f = make_shared<Function>(make_shared<op::Atan>(A), op::ParameterVector{A});
@@ -2826,7 +2818,6 @@ TEST(${BACKEND_NAME}, atan)

 TEST(${BACKEND_NAME}, sinh)
 {
-    SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
    Shape shape{6};
    auto A = make_shared<op::Parameter>(element::f32, shape);
    auto f = make_shared<Function>(make_shared<op::Sinh>(A), op::ParameterVector{A});
@@ -2851,7 +2842,6 @@ TEST(${BACKEND_NAME}, sinh)

 TEST(${BACKEND_NAME}, cosh)
 {
-    SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
    Shape shape{6};
    auto A = make_shared<op::Parameter>(element::f32, shape);
    auto f = make_shared<Function>(make_shared<op::Cosh>(A), op::ParameterVector{A});
@@ -2876,7 +2866,6 @@ TEST(${BACKEND_NAME}, cosh)

 TEST(${BACKEND_NAME}, tanh)
 {
-    SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
    Shape shape{6};
    auto A = make_shared<op::Parameter>(element::f32, shape);
    auto f = make_shared<Function>(make_shared<op::Tanh>(A), op::ParameterVector{A});
@@ -2896,12 +2885,11 @@ TEST(${BACKEND_NAME}, tanh)
        input.begin(), input.end(), input.begin(), [](float x) -> float { return tanhf(x); });

    cf->call({a}, {result});
-    EXPECT_EQ(input, read_vector<float>(result));
+    EXPECT_TRUE(test::all_close(input, read_vector<float>(result)));
 }

 TEST(${BACKEND_NAME}, exp)
 {
-    SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
    Shape shape{8};
    auto A = make_shared<op::Parameter>(element::f32, shape);
    auto f = make_shared<Function>(make_shared<op::Exp>(A), op::ParameterVector{A});
@@ -8441,6 +8429,7 @@ TEST(${BACKEND_NAME}, relu_4Dbackprop)

 TEST(${BACKEND_NAME}, softmax_all)
 {
+    SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
    Shape shape{2, 3};
    auto A = make_shared<op::Parameter>(element::f32, shape);
    auto f =
@@ -8473,6 +8462,7 @@ TEST(${BACKEND_NAME}, softmax_all)

 TEST(${BACKEND_NAME}, softmax_axis)
 {
+    SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
    Shape shape{2, 3};
    auto A = make_shared<op::Parameter>(element::f32, shape);
    auto f = make_shared<Function>(make_shared<op::Softmax>(A, AxisSet{1}), op::ParameterVector{A});
@@ -8501,6 +8491,7 @@ TEST(${BACKEND_NAME}, softmax_axis)

 TEST(${BACKEND_NAME}, softmax_underflow)
 {
+    SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
    Shape shape{2, 3};
    auto A = make_shared<op::Parameter>(element::f32, shape);
    auto f = make_shared<Function>(make_shared<op::Softmax>(A, AxisSet{0}), op::ParameterVector{A});

--- a/test/cpu_fusion.cpp
+++ b/test/cpu_fusion.cpp
@@ -91,11 +91,89 @@ TEST(cpu_fusion, gemm_pattern)
    ASSERT_EQ(n.get_pattern_map()[x], B);
    ASSERT_EQ(n.get_pattern_map()[b], C);

-    auto cg =
-        make_shared<op::MatmulBias>(W, x, broadcast, W->get_shape(), x->get_shape(), false, false);
+    auto cg = make_shared<op::MatmulBias>(
+        W, x, C, W->get_shape(), x->get_shape(), false, false, AxisSet{0});
+}
+
+TEST(cpu_fusion, gemm_cpu_broadcast_row)
+{
+    Shape shapeA{3, 2};
+    Shape shapeB{2, 3};
+    Shape shapeC{2, 2};
+    auto A = make_shared<op::Parameter>(element::f32, shapeA);
+    auto B = make_shared<op::Parameter>(element::f32, shapeB);
+
+    auto reshape_w = make_shared<op::Reshape>(A, AxisVector{1, 0}, Shape{2, 3});
+    auto reshape_x = make_shared<op::Reshape>(B, AxisVector{1, 0}, Shape{3, 2});
+
+    auto one = op::Constant::create<float>(element::f32, Shape{2}, std::vector<float>{1.0f, 1.0f});
+
+    auto broadcast = make_shared<op::Broadcast>(one, shapeC, AxisSet{0});
+    auto cg = make_shared<op::MatmulBias>(
+        A, B, one, A->get_shape(), B->get_shape(), true, true, AxisSet{0});
+
+    auto f = make_shared<Function>(cg, op::ParameterVector{A, B});
+
+    auto manager = runtime::Manager::get("CPU");
+    auto external = manager->compile(f);
+    auto backend = manager->allocate_backend();
+    auto cf = backend->make_call_frame(external);
+
+    shared_ptr<runtime::TensorView> a = backend->make_primary_tensor_view(element::f32, shapeA);
+    shared_ptr<runtime::TensorView> b = backend->make_primary_tensor_view(element::f32, shapeB);
+    shared_ptr<runtime::TensorView> result =
+        backend->make_primary_tensor_view(element::f32, shapeC);
+
+    vector<float> dataA{1.0f, 4.0f, 1.0f, 4.0f, 1.0f, 4.0f};
+    vector<float> dataB{3.0f, 3.0f, 3.0f, 9.0f, 9.0f, 9.0f};
+    copy_data(a, dataA);
+    copy_data(b, dataB);
+
+    cf->call({a, b}, {result});
+    vector<float> expected{10, 28, 37, 109};
+    ASSERT_TRUE(read_vector<float>(result) == expected);
 }

-TEST(cpu_fusion, gemm_cpu)
+TEST(cpu_fusion, gemm_cpu_broadcast_column)
+{
+    Shape shapeA{3, 2};
+    Shape shapeB{2, 3};
+    Shape shapeC{2, 2};
+    auto A = make_shared<op::Parameter>(element::f32, shapeA);
+    auto B = make_shared<op::Parameter>(element::f32, shapeB);
+
+    auto reshape_w = make_shared<op::Reshape>(A, AxisVector{1, 0}, Shape{2, 3});
+    auto reshape_x = make_shared<op::Reshape>(B, AxisVector{1, 0}, Shape{3, 2});
+
+    auto one = op::Constant::create<float>(element::f32, Shape{2}, std::vector<float>{1.0f, 1.0f});
+
+    auto broadcast = make_shared<op::Broadcast>(one, shapeC, AxisSet{1});
+    auto cg = make_shared<op::MatmulBias>(
+        A, B, one, A->get_shape(), B->get_shape(), true, true, AxisSet{1});
+
+    auto f = make_shared<Function>(cg, op::ParameterVector{A, B});
+
+    auto manager = runtime::Manager::get("CPU");
+    auto external = manager->compile(f);
+    auto backend = manager->allocate_backend();
+    auto cf = backend->make_call_frame(external);
+
+    shared_ptr<runtime::TensorView> a = backend->make_primary_tensor_view(element::f32, shapeA);
+    shared_ptr<runtime::TensorView> b = backend->make_primary_tensor_view(element::f32, shapeB);
+    shared_ptr<runtime::TensorView> result =
+        backend->make_primary_tensor_view(element::f32, shapeC);
+
+    vector<float> dataA{1.0f, 4.0f, 1.0f, 4.0f, 1.0f, 4.0f};
+    vector<float> dataB{3.0f, 3.0f, 3.0f, 9.0f, 9.0f, 9.0f};
+    copy_data(a, dataA);
+    copy_data(b, dataB);
+
+    cf->call({a, b}, {result});
+    vector<float> expected{10, 28, 37, 109};
+    ASSERT_TRUE(read_vector<float>(result) == expected);
+}
+
+TEST(cpu_fusion, gemm_cpu_broadcast_matrix)
 {
    Shape shapeA{3, 2};
    Shape shapeB{2, 3};
@@ -109,8 +187,8 @@ TEST(cpu_fusion, gemm_cpu)
    auto one = op::Constant::create<float>(element::f32, Shape{}, std::vector<float>{1.0f});

    auto broadcast = make_shared<op::Broadcast>(one, shapeC, AxisSet{0, 1});
-    auto cg =
-        make_shared<op::MatmulBias>(A, B, broadcast, A->get_shape(), B->get_shape(), true, true);
+    auto cg = make_shared<op::MatmulBias>(
+        A, B, one, A->get_shape(), B->get_shape(), true, true, AxisSet{0, 1});

    auto f = make_shared<Function>(cg, op::ParameterVector{A, B});

@@ -212,7 +290,7 @@ TEST(cpu_fusion, cpu_fusion_pass_matmul_bias)
    pass_manager.run_passes(func);
    auto gmm = graph->get_input_op(0);
    ASSERT_TRUE(std::dynamic_pointer_cast<op::MatmulBias>(gmm));
-    ASSERT_EQ(gmm->get_input_op(2), broadcast);
+    ASSERT_EQ(gmm->get_input_op(2), b);
 }

 TEST(cpu_fusion, cpu_fusion_pass_matmul_no_bias)