Commit 8db26428 authored by Fenglei Tian

update gpu_emitter to use templates

parent ad58cb29
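For context, this commit replaces the long list of per-op `EmitXxx` member functions with a single `emit<OP>` function template plus explicit specializations, dispatched through a map keyed on `std::type_index`. Below is a minimal, self-contained sketch of that pattern; the type names (`Emitter`, `Node`, `Abs`, `Add`, `Tan`) are illustrative stand-ins, not the actual nGraph classes:

```cpp
#include <functional>
#include <iostream>
#include <stdexcept>
#include <string>
#include <typeindex>
#include <unordered_map>

// Stand-ins for ngraph::Node and the op classes (illustrative only).
struct Node
{
    virtual ~Node() = default;
    virtual std::string name() const = 0;
};
struct Abs : Node
{
    std::string name() const override { return "Abs_0"; }
};
struct Add : Node
{
    std::string name() const override { return "Add_0"; }
};
struct Tan : Node
{
    std::string name() const override { return "Tan_0"; }
};

struct Emitter
{
    // Primary template: compiles for every op, throws for ops that
    // have no specialization yet.
    template <typename OP>
    static void emit(const Node* node)
    {
        throw std::runtime_error(node->name() + " is not implemented.");
    }
};

// One explicit specialization per implemented op replaces the old
// EmitAbs/EmitAdd/... member functions.
template <>
void Emitter::emit<Abs>(const Node* node)
{
    std::cout << "{ // " << node->name() << "\n}\n";
}
template <>
void Emitter::emit<Add>(const Node* node)
{
    std::cout << "{ // " << node->name() << "\n}\n";
}

// Dispatch table: the node's runtime type selects the instantiation.
static const std::unordered_map<std::type_index, std::function<void(const Node*)>> dispatcher{
    {std::type_index(typeid(Abs)), &Emitter::emit<Abs>},
    {std::type_index(typeid(Add)), &Emitter::emit<Add>},
    {std::type_index(typeid(Tan)), &Emitter::emit<Tan>}, // unimplemented: throws if reached
};

int main()
{
    Abs node;
    dispatcher.at(std::type_index(typeid(node)))(&node); // prints "{ // Abs_0 ..."
    return 0;
}
```

With this shape, supporting a new op means adding one specialization and one dispatcher entry rather than editing the class definition and every call site.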
@@ -28,21 +28,69 @@
 #include <vector>
 #include "ngraph/node.hpp"
+#include "ngraph/ops/abs.hpp"
+#include "ngraph/ops/acos.hpp"
+#include "ngraph/ops/add.hpp"
+#include "ngraph/ops/allreduce.hpp"
+#include "ngraph/ops/asin.hpp"
+#include "ngraph/ops/atan.hpp"
+#include "ngraph/ops/avg_pool.hpp"
+#include "ngraph/ops/batch_norm.hpp"
 #include "ngraph/ops/broadcast.hpp"
+#include "ngraph/ops/ceiling.hpp"
 #include "ngraph/ops/concat.hpp"
 #include "ngraph/ops/constant.hpp"
+#include "ngraph/ops/convert.hpp"
 #include "ngraph/ops/convolution.hpp"
+#include "ngraph/ops/cos.hpp"
+#include "ngraph/ops/cosh.hpp"
+#include "ngraph/ops/divide.hpp"
 #include "ngraph/ops/dot.hpp"
+#include "ngraph/ops/equal.hpp"
+#include "ngraph/ops/exp.hpp"
+#include "ngraph/ops/floor.hpp"
 #include "ngraph/ops/function_call.hpp"
 #include "ngraph/ops/get_output_element.hpp"
+#include "ngraph/ops/greater.hpp"
+#include "ngraph/ops/greater_eq.hpp"
+#include "ngraph/ops/less.hpp"
+#include "ngraph/ops/less_eq.hpp"
+#include "ngraph/ops/log.hpp"
+#include "ngraph/ops/max.hpp"
 #include "ngraph/ops/max_pool.hpp"
+#include "ngraph/ops/maximum.hpp"
+#include "ngraph/ops/min.hpp"
+#include "ngraph/ops/minimum.hpp"
+#include "ngraph/ops/multiply.hpp"
+#include "ngraph/ops/negative.hpp"
+#include "ngraph/ops/not.hpp"
+#include "ngraph/ops/not_equal.hpp"
 #include "ngraph/ops/one_hot.hpp"
+#include "ngraph/ops/op.hpp"
+#include "ngraph/ops/pad.hpp"
+#include "ngraph/ops/parameter.hpp"
+#include "ngraph/ops/power.hpp"
+#include "ngraph/ops/product.hpp"
 #include "ngraph/ops/reduce.hpp"
+#include "ngraph/ops/reduce_window.hpp"
+#include "ngraph/ops/relu.hpp"
+#include "ngraph/ops/remainder.hpp"
 #include "ngraph/ops/replace_slice.hpp"
 #include "ngraph/ops/reshape.hpp"
+#include "ngraph/ops/result.hpp"
 #include "ngraph/ops/reverse.hpp"
+#include "ngraph/ops/select.hpp"
+#include "ngraph/ops/select_and_scatter.hpp"
+#include "ngraph/ops/sign.hpp"
+#include "ngraph/ops/sin.hpp"
+#include "ngraph/ops/sinh.hpp"
 #include "ngraph/ops/slice.hpp"
+#include "ngraph/ops/softmax.hpp"
+#include "ngraph/ops/sqrt.hpp"
+#include "ngraph/ops/subtract.hpp"
 #include "ngraph/ops/sum.hpp"
+#include "ngraph/ops/tan.hpp"
+#include "ngraph/ops/tanh.hpp"
 #include "ngraph/runtime/gpu/gpu_cuda_kernel_emitters.hpp"
 #include "ngraph/runtime/gpu/gpu_emitter.hpp"
 #include "ngraph/runtime/gpu/gpu_kernel_emitters.hpp"
@@ -50,20 +98,16 @@
 using namespace std;
 using namespace ngraph;
 
-void runtime::gpu::GPU_Emitter::EmitNop(codegen::CodeWriter& writer,
-                                        const ngraph::Node* n,
-                                        const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
-                                        const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
-{
-}
-
-void runtime::gpu::GPU_Emitter::EmitAbs(codegen::CodeWriter& writer,
-                                        const ngraph::Node* n,
-                                        const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
-                                        const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
-{
-    writer << "{ // " << n->get_name() << "\n";
+namespace ngraph
+{
+namespace runtime
+{
+namespace gpu
+{
+template <>
+void CPU_Emitter::EMITTER_DECL(ngraph::op::Abs)
+{
+    writer << "{ // " << node->get_name() << "\n";
     writer.indent++;
     writer << "int count = " << out[0].get_size() << ";\n";
     writer << "if(count == 0) return;\n";
@@ -73,12 +117,10 @@ void runtime::gpu::GPU_Emitter::EmitAbs(codegen::CodeWriter& writer,
     writer << "}\n";
 }
 
-void runtime::gpu::GPU_Emitter::EmitAdd(codegen::CodeWriter& writer,
-                                        const ngraph::Node* n,
-                                        const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
-                                        const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
+template <>
+void CPU_Emitter::EMITTER_DECL(ngraph::op::Add)
 {
-    writer << "{ // " << n->get_name() << "\n";
+    writer << "{ // " << node->get_name() << "\n";
     writer.indent++;
     writer << "int count = " << out[0].get_size() << ";\n";
     writer << "if(count == 0) return;\n";
@@ -114,17 +156,8 @@ cudnnSetOpTensorDescriptor(opTensorDesc,
     writer << "}\n";
 }
 
-void runtime::gpu::GPU_Emitter::EmitConcat(codegen::CodeWriter& writer,
-                                           const ngraph::Node* n,
-                                           const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
-                                           const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
-{
-}
-
-void runtime::gpu::GPU_Emitter::EmitDot(codegen::CodeWriter& writer,
-                                        const ngraph::Node* n,
-                                        const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
-                                        const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
+template <>
+void CPU_Emitter::EMITTER_DECL(ngraph::op::Dot)
 {
     const ngraph::op::Dot* dot = static_cast<const ngraph::op::Dot*>(n);
     const Shape& arg0_shape = args[0].get_shape();
@@ -133,7 +166,7 @@ void runtime::gpu::GPU_Emitter::EmitDot(codegen::CodeWriter& writer,
     {
         auto& first = (arg0_shape.empty() ? args[0] : args[1]);
         auto& second = (arg0_shape.empty() ? args[1] : args[0]);
-        writer << "{ // " << n->get_name() << "\n";
+        writer << "{ // " << node->get_name() << "\n";
         writer.indent++;
         writer << "int count = " << second.get_size() << ";\n";
         writer << "if(count == 0) return;\n";
@@ -152,7 +185,7 @@ void runtime::gpu::GPU_Emitter::EmitDot(codegen::CodeWriter& writer,
     //return if output size is 0;
     if (out[0].get_size() == 0)
     {
-        writer << "{ // " << n->get_name() << "\n";
+        writer << "{ // " << node->get_name() << "\n";
         writer.indent++;
         writer << "return;\n";
         writer.indent--;
@@ -163,7 +196,7 @@ void runtime::gpu::GPU_Emitter::EmitDot(codegen::CodeWriter& writer,
     //set output to 0 if input size is 0
     if (args[0].get_size() == 0 || args[1].get_size() == 0)
    {
-        writer << "{ // " << n->get_name() << "\n";
+        writer << "{ // " << node->get_name() << "\n";
         writer.indent++;
         writer << "runtime::gpu::cuda_memset(" << out[0].get_name() << ", 0, " << out[0].get_size()
                << " * sizeof(float));\n";
@@ -175,7 +208,7 @@ void runtime::gpu::GPU_Emitter::EmitDot(codegen::CodeWriter& writer,
     if ((arg0_shape.size() == 1) && (arg1_shape.size() == 1))
     {
-        writer << "{ // " << n->get_name() << "\n";
+        writer << "{ // " << node->get_name() << "\n";
         writer.indent++;
         writer << "cublasSdot("
               << "cublas_handle," << arg0_shape[0] << "," << args[0].get_name() << ","
@@ -186,7 +219,7 @@ void runtime::gpu::GPU_Emitter::EmitDot(codegen::CodeWriter& writer,
     }
     else if ((arg0_shape.size() == 2) && (arg1_shape.size() == 1))
     {
-        writer << "{ // " << n->get_name() << "\n";
+        writer << "{ // " << node->get_name() << "\n";
         writer.indent++;
         writer << "const float alpha = 1.0;\n";
         writer << "const float beta = 0;\n";
@@ -213,7 +246,7 @@ void runtime::gpu::GPU_Emitter::EmitDot(codegen::CodeWriter& writer,
         {
             throw std::runtime_error("input and output shape is not correct for dot;");
         }
-        writer << "{ // " << n->get_name() << "\n";
+        writer << "{ // " << node->get_name() << "\n";
         writer.indent++;
         writer << "const float alpha = 1.0;\n";
         writer << "const float beta = 0.0;\n";
@@ -241,73 +274,14 @@ void runtime::gpu::GPU_Emitter::EmitDot(codegen::CodeWriter& writer,
     }
     else
     {
-        throw std::runtime_error(n->get_name() + " with more then 2D is not implemented.");
+        throw std::runtime_error(node->get_name() + " with more then 2D is not implemented.");
     }
 }
 
-void runtime::gpu::GPU_Emitter::EmitDivide(codegen::CodeWriter& writer,
-                                           const ngraph::Node* n,
-                                           const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
-                                           const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
-{
-    throw std::runtime_error(n->get_name() + " is not implemented.");
-}
-
-void runtime::gpu::GPU_Emitter::EmitEqual(codegen::CodeWriter& writer,
-                                          const ngraph::Node* n,
-                                          const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
-                                          const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
-{
-    throw std::runtime_error(n->get_name() + " is not implemented.");
-}
-
-void runtime::gpu::GPU_Emitter::EmitGreater(codegen::CodeWriter& writer,
-                                            const ngraph::Node* n,
-                                            const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
-                                            const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
-{
-    throw std::runtime_error(n->get_name() + " is not implemented.");
-}
-
-void runtime::gpu::GPU_Emitter::EmitGreaterEq(
-    codegen::CodeWriter& writer,
-    const ngraph::Node* n,
-    const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
-    const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
-{
-    throw std::runtime_error(n->get_name() + " is not implemented.");
-}
-
-void runtime::gpu::GPU_Emitter::EmitLess(codegen::CodeWriter& writer,
-                                         const ngraph::Node* n,
-                                         const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
-                                         const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
-{
-    throw std::runtime_error(n->get_name() + " is not implemented.");
-}
-
-void runtime::gpu::GPU_Emitter::EmitLessEq(codegen::CodeWriter& writer,
-                                           const ngraph::Node* n,
-                                           const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
-                                           const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
-{
-    throw std::runtime_error(n->get_name() + " is not implemented.");
-}
-
-void runtime::gpu::GPU_Emitter::EmitLog(codegen::CodeWriter& writer,
-                                        const ngraph::Node* n,
-                                        const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
-                                        const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
-{
-    throw std::runtime_error(n->get_name() + " is not implemented.");
-}
-
-void runtime::gpu::GPU_Emitter::EmitMaximum(codegen::CodeWriter& writer,
-                                            const ngraph::Node* n,
-                                            const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
-                                            const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
+template <>
+void CPU_Emitter::EMITTER_DECL(ngraph::op::Maximum)
 {
-    writer << "{ // " << n->get_name() << "\n";
+    writer << "{ // " << node->get_name() << "\n";
     writer.indent++;
     writer << "int count = " << out[0].get_size() << ";\n";
     writer << "if(count == 0) return;\n";
@@ -343,12 +317,10 @@ cudnnSetOpTensorDescriptor(opTensorDesc,
     writer << "}\n";
 }
 
-void runtime::gpu::GPU_Emitter::EmitMinimum(codegen::CodeWriter& writer,
-                                            const ngraph::Node* n,
-                                            const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
-                                            const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
+template <>
+void CPU_Emitter::EMITTER_DECL(ngraph::op::Minimum)
 {
-    writer << "{ // " << n->get_name() << "\n";
+    writer << "{ // " << node->get_name() << "\n";
     writer.indent++;
     writer << "int count = " << out[0].get_size() << ";\n";
     writer << "if(count == 0) return;\n";
@@ -384,13 +356,10 @@ cudnnSetOpTensorDescriptor(opTensorDesc,
     writer << "}\n";
 }
 
-void runtime::gpu::GPU_Emitter::EmitNegative(
-    codegen::CodeWriter& writer,
-    const ngraph::Node* n,
-    const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
-    const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
+template <>
+void CPU_Emitter::EMITTER_DECL(ngraph::op::Negative)
 {
-    writer << "{ // " << n->get_name() << "\n";
+    writer << "{ // " << node->get_name() << "\n";
     writer.indent++;
     writer << "int count = " << out[0].get_size() << ";\n";
     writer << "if(count == 0) return;\n";
@@ -426,36 +395,8 @@ cudnnSetOpTensorDescriptor(opTensorDesc,
     writer << "}\n";
 }
 
-void runtime::gpu::GPU_Emitter::EmitNotEqual(
-    codegen::CodeWriter& writer,
-    const ngraph::Node* n,
-    const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
-    const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
-{
-    throw std::runtime_error(n->get_name() + " is not implemented.");
-}
-
-void runtime::gpu::GPU_Emitter::EmitSelect(codegen::CodeWriter& writer,
-                                           const ngraph::Node* n,
-                                           const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
-                                           const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
-{
-    throw std::runtime_error(n->get_name() + " is not implemented.");
-}
-
-void runtime::gpu::GPU_Emitter::EmitSubtract(
-    codegen::CodeWriter& writer,
-    const ngraph::Node* n,
-    const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
-    const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
-{
-    throw std::runtime_error(n->get_name() + " is not implemented.");
-}
-
-void runtime::gpu::GPU_Emitter::EmitBroadcast(
-    codegen::CodeWriter& writer,
-    const ngraph::Node* n,
-    const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
-    const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
+template <>
+void CPU_Emitter::EMITTER_DECL(ngraph::op::Broadcast)
 {
     auto broadcast = static_cast<const ngraph::op::Broadcast*>(n);
     auto arg_shape = args[0].get_shape();
@@ -465,7 +406,7 @@ void runtime::gpu::GPU_Emitter::EmitBroadcast(
     //broadcast axes is empty, do a copy
     if (axes.empty())
     {
-        writer << "{ // " << n->get_name() << " \n";
+        writer << "{ // " << node->get_name() << " \n";
         writer.indent++;
         writer << "runtime::gpu::cuda_memcpyDtD(" << out[0].get_name() << ", " << args[0].get_name()
                << ", " << out[0].get_size() << " * " << out[0].get_element_type().size() << ");\n";
@@ -504,7 +445,7 @@ void runtime::gpu::GPU_Emitter::EmitBroadcast(
             repeat_size *= result_shape[i];
         }
-        writer << "{ // " << n->get_name() << " \n";
+        writer << "{ // " << node->get_name() << " \n";
         writer.indent++;
         writer << "runtime::gpu::emit_broadcast(" << args[0].get_name() << ", " << out[0].get_name()
                << ", " << repeat_size << ", " << repeat_times << ", " << out[0].get_size()
@@ -514,33 +455,20 @@ void runtime::gpu::GPU_Emitter::EmitBroadcast(
     }
     else
     {
-        throw std::runtime_error(n->get_name() + " is not implemented.");
+        throw std::runtime_error(node->get_name() + " is not implemented.");
     }
 }
 
-void runtime::gpu::GPU_Emitter::EmitConvert(codegen::CodeWriter& writer,
-                                            const ngraph::Node* n,
-                                            const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
-                                            const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
+template <>
+void CPU_Emitter::EMITTER_DECL(ngraph::op::Constant)
 {
-    throw std::runtime_error(n->get_name() + " is not implemented.");
 }
 
-void runtime::gpu::GPU_Emitter::EmitConstant(
-    codegen::CodeWriter& writer,
-    const ngraph::Node* n,
-    const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
-    const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
-{
-}
-
-void runtime::gpu::GPU_Emitter::EmitReshape(codegen::CodeWriter& writer,
-                                            const ngraph::Node* n,
-                                            const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
-                                            const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
+template <>
+void CPU_Emitter::EMITTER_DECL(ngraph::op::Reshape)
 {
     auto reshape = static_cast<const op::Reshape*>(n);
-    writer << "{ // " << n->get_name() << "\n";
+    writer << "{ // " << node->get_name() << "\n";
     writer.indent++;
     auto arg_shape = args[0].get_shape();
     auto arg_rank = arg_shape.size();
@@ -561,7 +489,7 @@ void runtime::gpu::GPU_Emitter::EmitReshape(codegen::CodeWriter& writer,
     // we can just copy.
     if (same_layout || result_shape_product < 2)
     {
-        writer << "{ // " << n->get_name() << " 1\n";
+        writer << "{ // " << node->get_name() << " 1\n";
         writer.indent++;
         writer << "runtime::gpu::cuda_memcpyDtD(" << out[0].get_name() << ", " << args[0].get_name()
                << ", " << out[0].get_size() << " * " << out[0].get_element_type().size() << ");\n";
@@ -572,7 +500,7 @@ void runtime::gpu::GPU_Emitter::EmitReshape(codegen::CodeWriter& writer,
     else if (arg_rank == 2)
     {
         // TODO Assert arg0_shape[0] == arg1_shape[0]?
-        writer << "{ // " << n->get_name() << "\n";
+        writer << "{ // " << node->get_name() << "\n";
         writer.indent++;
         writer << "const float alpha = 1.0;\n";
         writer << "const float beta = 0;\n";
@@ -600,53 +528,15 @@ void runtime::gpu::GPU_Emitter::EmitReshape(codegen::CodeWriter& writer,
     writer << "}\n";
 }
 
-void runtime::gpu::GPU_Emitter::EmitFunctionCall(
-    codegen::CodeWriter& writer,
-    const ngraph::Node* n,
-    const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
-    const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
+template <>
+void CPU_Emitter::EMITTER_DECL(ngraph::op::FunctionCall)
 {
 }
 
-void runtime::gpu::GPU_Emitter::EmitReduce(codegen::CodeWriter& writer,
-                                           const ngraph::Node* n,
-                                           const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
-                                           const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
-{
-    throw std::runtime_error(n->get_name() + " is not implemented.");
-}
-
-void runtime::gpu::GPU_Emitter::EmitSign(codegen::CodeWriter& writer,
-                                         const ngraph::Node* n,
-                                         const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
-                                         const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
-{
-    throw std::runtime_error(n->get_name() + " is not implemented.");
-}
-
-void runtime::gpu::GPU_Emitter::EmitSlice(codegen::CodeWriter& writer,
-                                          const ngraph::Node* n,
-                                          const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
-                                          const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
-{
-    throw std::runtime_error(n->get_name() + " is not implemented.");
-}
-
-void runtime::gpu::GPU_Emitter::EmitSum(codegen::CodeWriter& writer,
-                                        const ngraph::Node* n,
-                                        const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
-                                        const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
-{
-    throw std::runtime_error(n->get_name() + " is not implemented.");
-}
-
-void runtime::gpu::GPU_Emitter::EmitMultiply(
-    codegen::CodeWriter& writer,
-    const ngraph::Node* n,
-    const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
-    const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
+template <>
+void CPU_Emitter::EMITTER_DECL(ngraph::op::Multiply)
 {
-    writer << "{ // " << n->get_name() << "\n";
+    writer << "{ // " << node->get_name() << "\n";
     writer.indent++;
     writer << "int count = " << out[0].get_size() << ";\n";
     writer << "if(count == 0) return;\n";
@@ -682,133 +572,10 @@ cudnnSetOpTensorDescriptor(opTensorDesc,
     writer << "}\n";
 }
 
-void runtime::gpu::GPU_Emitter::EmitExp(codegen::CodeWriter& writer,
-                                        const ngraph::Node* n,
-                                        const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
-                                        const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
-{
-    throw std::runtime_error(n->get_name() + " is not implemented.");
-}
-
-void runtime::gpu::GPU_Emitter::EmitSin(codegen::CodeWriter& writer,
-                                        const ngraph::Node* n,
-                                        const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
-                                        const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
-{
-    throw std::runtime_error(n->get_name() + " is not implemented.");
-}
-
-void runtime::gpu::GPU_Emitter::EmitSinh(codegen::CodeWriter& writer,
-                                         const ngraph::Node* n,
-                                         const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
-                                         const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
-{
-    throw std::runtime_error(n->get_name() + " is not implemented.");
-}
-
-void runtime::gpu::GPU_Emitter::EmitCos(codegen::CodeWriter& writer,
-                                        const ngraph::Node* n,
-                                        const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
-                                        const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
-{
-    throw std::runtime_error(n->get_name() + " is not implemented.");
-}
-
-void runtime::gpu::GPU_Emitter::EmitCosh(codegen::CodeWriter& writer,
-                                         const ngraph::Node* n,
-                                         const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
-                                         const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
-{
-    throw std::runtime_error(n->get_name() + " is not implemented.");
-}
-
-void runtime::gpu::GPU_Emitter::EmitTan(codegen::CodeWriter& writer,
-                                        const ngraph::Node* n,
-                                        const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
-                                        const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
-{
-    throw std::runtime_error(n->get_name() + " is not implemented.");
-}
-
-void runtime::gpu::GPU_Emitter::EmitTanh(codegen::CodeWriter& writer,
-                                         const ngraph::Node* n,
-                                         const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
-                                         const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
-{
-    throw std::runtime_error(n->get_name() + " is not implemented.");
-}
-
-void runtime::gpu::GPU_Emitter::EmitAsin(codegen::CodeWriter& writer,
-                                         const ngraph::Node* n,
-                                         const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
-                                         const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
-{
-    throw std::runtime_error(n->get_name() + " is not implemented.");
-}
-
-void runtime::gpu::GPU_Emitter::EmitAcos(codegen::CodeWriter& writer,
-                                         const ngraph::Node* n,
-                                         const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
-                                         const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
-{
-    throw std::runtime_error(n->get_name() + " is not implemented.");
-}
-
-void runtime::gpu::GPU_Emitter::EmitAtan(codegen::CodeWriter& writer,
-                                         const ngraph::Node* n,
-                                         const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
-                                         const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
-{
-    throw std::runtime_error(n->get_name() + " is not implemented.");
-}
-
-void runtime::gpu::GPU_Emitter::EmitPower(codegen::CodeWriter& writer,
-                                          const ngraph::Node* n,
-                                          const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
-                                          const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
-{
-    throw std::runtime_error(n->get_name() + " is not implemented.");
-}
-
-void runtime::gpu::GPU_Emitter::EmitReplaceSlice(
-    codegen::CodeWriter& writer,
-    const ngraph::Node* n,
-    const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
-    const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
-{
-    throw std::runtime_error(n->get_name() + " is not implemented.");
-}
-
-void runtime::gpu::GPU_Emitter::EmitOneHot(codegen::CodeWriter& writer,
-                                           const ngraph::Node* n,
-                                           const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
-                                           const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
-{
-    throw std::runtime_error(n->get_name() + " is not implemented.");
-}
-
-void runtime::gpu::GPU_Emitter::EmitCeiling(codegen::CodeWriter& writer,
-                                            const ngraph::Node* n,
-                                            const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
-                                            const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
-{
-    throw std::runtime_error(n->get_name() + " is not implemented.");
-}
-
-void runtime::gpu::GPU_Emitter::EmitFloor(codegen::CodeWriter& writer,
-                                          const ngraph::Node* n,
-                                          const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
-                                          const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
-{
-    throw std::runtime_error(n->get_name() + " is not implemented.");
-}
-
-void runtime::gpu::GPU_Emitter::EmitSqrt(codegen::CodeWriter& writer,
-                                         const ngraph::Node* n,
-                                         const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
-                                         const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
-{
-    writer << "{ // " << n->get_name() << "\n";
+template <>
+void CPU_Emitter::EMITTER_DECL(ngraph::op::Sqrt)
+{
+    writer << "{ // " << node->get_name() << "\n";
     writer.indent++;
     writer << "int count = " << out[0].get_size() << ";\n";
     writer << "if(count == 0) return;\n";
@@ -843,54 +610,6 @@ cudnnSetOpTensorDescriptor(opTensorDesc,
     writer.indent--;
     writer << "}\n";
 }
-
-void runtime::gpu::GPU_Emitter::EmitConvolution(
-    codegen::CodeWriter& writer,
-    const ngraph::Node* n,
-    const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
-    const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
-{
-    throw std::runtime_error(n->get_name() + " is not implemented.");
-}
-
-void runtime::gpu::GPU_Emitter::EmitNot(codegen::CodeWriter& writer,
-                                        const ngraph::Node* n,
-                                        const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
-                                        const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
-{
-    throw std::runtime_error(n->get_name() + " is not implemented.");
-}
-
-void runtime::gpu::GPU_Emitter::EmitMaxPool(codegen::CodeWriter& writer,
-                                            const ngraph::Node* n,
-                                            const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
-                                            const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
-{
-    throw std::runtime_error(n->get_name() + " is not implemented.");
-}
-
-void runtime::gpu::GPU_Emitter::EmitReverse(codegen::CodeWriter& writer,
-                                            const ngraph::Node* n,
-                                            const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
-                                            const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
-{
-    throw std::runtime_error(n->get_name() + " is not implemented.");
-}
-
-void runtime::gpu::GPU_Emitter::EmitReduceWindow(
-    codegen::CodeWriter& writer,
-    const ngraph::Node* n,
-    const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
-    const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
-{
-    throw std::runtime_error(n->get_name() + " is not implemented.");
-}
-
-void runtime::gpu::GPU_Emitter::EmitSelectAndScatter(
-    codegen::CodeWriter& writer,
-    const ngraph::Node* n,
-    const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
-    const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
-{
-    throw std::runtime_error(n->get_name() + " is not implemented.");
-}
+}
+}
 }
@@ -24,12 +24,12 @@
 #include "ngraph/runtime/gpu/gpu_external_function.hpp"
 #include "ngraph/runtime/gpu/gpu_tensor_view_wrapper.hpp"
 
-#define EMITTER_DECL(E)                                                                            \
-    E(codegen::CodeWriter& writer,                                                                 \
-      const ngraph::Node* n,                                                                      \
-      const std::vector<ngraph::runtime::gpu::GPU_TensorViewWrapper>& args,                       \
-      const std::vector<ngraph::runtime::gpu::GPU_TensorViewWrapper>& out)
+#define EMITTER_DECL(op_name)                                                                      \
+    emit<op_name>(GPU_ExternalFunction * external_function,                                        \
+                  codegen::CodeWriter & writer,                                                    \
+                  const ngraph::Node* node,                                                       \
+                  const std::vector<GPU_TensorViewWrapper>& args,                                 \
+                  const std::vector<GPU_TensorViewWrapper>& out)
 
 namespace ngraph
 {
     namespace runtime
@@ -39,59 +39,31 @@ namespace ngraph
             class GPU_Emitter
             {
             public:
-                static void EMITTER_DECL(EmitNop);
-                static void EMITTER_DECL(EmitAdd);
-                static void EMITTER_DECL(EmitDot);
-                static void EMITTER_DECL(EmitMultiply);
-                static void EMITTER_DECL(EmitGetOutputElement);
-                static void EMITTER_DECL(EmitXLAGetTupleElement);
-                static void EMITTER_DECL(EmitTuple);
-                static void EMITTER_DECL(EmitAbs);
-                static void EMITTER_DECL(EmitConcat);
-                static void EMITTER_DECL(EmitDivide);
-                static void EMITTER_DECL(EmitEqual);
-                static void EMITTER_DECL(EmitGreater);
-                static void EMITTER_DECL(EmitGreaterEq);
-                static void EMITTER_DECL(EmitLess);
-                static void EMITTER_DECL(EmitLessEq);
-                static void EMITTER_DECL(EmitLog);
-                static void EMITTER_DECL(EmitMaximum);
-                static void EMITTER_DECL(EmitMinimum);
-                static void EMITTER_DECL(EmitNegative);
-                static void EMITTER_DECL(EmitNotEqual);
-                static void EMITTER_DECL(EmitSelect);
-                static void EMITTER_DECL(EmitSubtract);
-                static void EMITTER_DECL(EmitBroadcast);
-                static void EMITTER_DECL(EmitConvert);
-                static void EMITTER_DECL(EmitConstant);
-                static void EMITTER_DECL(EmitReshape);
-                static void EMITTER_DECL(EmitFunctionCall);
-                static void EMITTER_DECL(EmitReduce);
-                static void EMITTER_DECL(EmitSign);
-                static void EMITTER_DECL(EmitSlice);
-                static void EMITTER_DECL(EmitSum);
-                static void EMITTER_DECL(EmitExp);
-                static void EMITTER_DECL(EmitSin);
-                static void EMITTER_DECL(EmitSinh);
-                static void EMITTER_DECL(EmitCos);
-                static void EMITTER_DECL(EmitCosh);
-                static void EMITTER_DECL(EmitTan);
-                static void EMITTER_DECL(EmitTanh);
-                static void EMITTER_DECL(EmitAsin);
-                static void EMITTER_DECL(EmitAcos);
-                static void EMITTER_DECL(EmitAtan);
-                static void EMITTER_DECL(EmitPower);
-                static void EMITTER_DECL(EmitReplaceSlice);
-                static void EMITTER_DECL(EmitOneHot);
-                static void EMITTER_DECL(EmitFloor);
-                static void EMITTER_DECL(EmitCeiling);
-                static void EMITTER_DECL(EmitSqrt);
-                static void EMITTER_DECL(EmitConvolution);
-                static void EMITTER_DECL(EmitNot);
-                static void EMITTER_DECL(EmitMaxPool);
-                static void EMITTER_DECL(EmitReverse);
-                static void EMITTER_DECL(EmitReduceWindow);
-                static void EMITTER_DECL(EmitSelectAndScatter);
+                template <typename OP>
+                static void emit(GPU_ExternalFunction* external_function,
+                                 codegen::CodeWriter& writer,
+                                 const ngraph::Node* node,
+                                 const std::vector<GPU_TensorViewWrapper>& args,
+                                 const std::vector<GPU_TensorViewWrapper>& out)
+                {
+                    throw std::runtime_error("Unimplemented op in GPU emitter");
+                }
+
+                static void nop(GPU_ExternalFunction* external_function,
+                                codegen::CodeWriter& writer,
+                                const ngraph::Node* node,
+                                const std::vector<GPU_TensorViewWrapper>& args,
+                                const std::vector<GPU_TensorViewWrapper>& out)
+                {
+                }
+
+            private:
+                static std::string emit_vector(const GPU_TensorViewWrapper&,
+                                               const std::string& name = "");
+                static std::string emit_array1d(const GPU_TensorViewWrapper&,
+                                                const std::string& name = "");
+                static std::string emit_matrix(const GPU_TensorViewWrapper&,
+                                               const std::string& name = "");
             };
         }
     }
 }
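In the reworked header above, `EMITTER_DECL(op_name)` supplies everything after the return type, including the `emit<op_name>` token, which is why the definitions in gpu_emitter.cpp read `template <> void CPU_Emitter::EMITTER_DECL(ngraph::op::Abs)`. A self-contained sketch of that macro-expansion trick, using simplified stand-in types rather than the real nGraph signature:

```cpp
#include <iostream>
#include <string>

// Stand-ins for the real nGraph types (illustrative only).
struct Node
{
    std::string name = "Abs_0";
};
struct AbsOp
{
};
struct SinOp
{
};

// The macro expands to the function name, its template argument, and
// the shared parameter list, so every specialization can be declared
// with one token.
#define EMITTER_DECL(op_name) emit<op_name>(const Node* node)

struct Emitter
{
    // Primary template: the default for ops nobody has implemented yet.
    template <typename OP>
    static void emit(const Node* node)
    {
        std::cout << node->name << " is not implemented.\n";
    }
};

// EMITTER_DECL(AbsOp) expands to emit<AbsOp>(const Node* node).
template <>
void Emitter::EMITTER_DECL(AbsOp)
{
    std::cout << "{ // " << node->name << "\n}\n";
}

int main()
{
    Node n;
    Emitter::emit<AbsOp>(&n); // uses the specialization
    Emitter::emit<SinOp>(&n); // falls back to the primary template
    return 0;
}
```

The design choice here is that the macro keeps every specialization's signature in sync with the primary template; changing the parameter list later means editing one macro rather than every per-op definition.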
@@ -41,8 +41,11 @@
 #include "ngraph/ops/abs.hpp"
 #include "ngraph/ops/acos.hpp"
 #include "ngraph/ops/add.hpp"
+#include "ngraph/ops/allreduce.hpp"
 #include "ngraph/ops/asin.hpp"
 #include "ngraph/ops/atan.hpp"
+#include "ngraph/ops/avg_pool.hpp"
+#include "ngraph/ops/batch_norm.hpp"
 #include "ngraph/ops/broadcast.hpp"
 #include "ngraph/ops/ceiling.hpp"
 #include "ngraph/ops/concat.hpp"
@@ -57,24 +60,34 @@
 #include "ngraph/ops/exp.hpp"
 #include "ngraph/ops/floor.hpp"
 #include "ngraph/ops/function_call.hpp"
+#include "ngraph/ops/get_output_element.hpp"
 #include "ngraph/ops/greater.hpp"
 #include "ngraph/ops/greater_eq.hpp"
 #include "ngraph/ops/less.hpp"
 #include "ngraph/ops/less_eq.hpp"
 #include "ngraph/ops/log.hpp"
+#include "ngraph/ops/max.hpp"
 #include "ngraph/ops/max_pool.hpp"
 #include "ngraph/ops/maximum.hpp"
+#include "ngraph/ops/min.hpp"
 #include "ngraph/ops/minimum.hpp"
 #include "ngraph/ops/multiply.hpp"
 #include "ngraph/ops/negative.hpp"
 #include "ngraph/ops/not.hpp"
 #include "ngraph/ops/not_equal.hpp"
 #include "ngraph/ops/one_hot.hpp"
+#include "ngraph/ops/op.hpp"
+#include "ngraph/ops/pad.hpp"
+#include "ngraph/ops/parameter.hpp"
 #include "ngraph/ops/power.hpp"
+#include "ngraph/ops/product.hpp"
 #include "ngraph/ops/reduce.hpp"
 #include "ngraph/ops/reduce_window.hpp"
+#include "ngraph/ops/relu.hpp"
+#include "ngraph/ops/remainder.hpp"
 #include "ngraph/ops/replace_slice.hpp"
 #include "ngraph/ops/reshape.hpp"
+#include "ngraph/ops/result.hpp"
 #include "ngraph/ops/reverse.hpp"
 #include "ngraph/ops/select.hpp"
 #include "ngraph/ops/select_and_scatter.hpp"
@@ -82,6 +95,7 @@
 #include "ngraph/ops/sin.hpp"
 #include "ngraph/ops/sinh.hpp"
 #include "ngraph/ops/slice.hpp"
+#include "ngraph/ops/softmax.hpp"
 #include "ngraph/ops/sqrt.hpp"
 #include "ngraph/ops/subtract.hpp"
 #include "ngraph/ops/sum.hpp"
@@ -147,56 +161,77 @@ static StaticInitializers s_static_initializers;
 #define TI(x) type_index(typeid(x))
 
 static const runtime::gpu::OpMap dispatcher{
-    {TI(ngraph::op::Add), &runtime::gpu::GPU_Emitter::EmitAdd},
-    {TI(ngraph::op::Dot), &runtime::gpu::GPU_Emitter::EmitDot},
-    {TI(ngraph::op::Multiply), &runtime::gpu::GPU_Emitter::EmitMultiply},
-    {TI(ngraph::op::Parameter), &runtime::gpu::GPU_Emitter::EmitNop},
-    {TI(ngraph::op::Abs), &runtime::gpu::GPU_Emitter::EmitAbs},
-    {TI(ngraph::op::Concat), &runtime::gpu::GPU_Emitter::EmitConcat},
-    {TI(ngraph::op::Divide), &runtime::gpu::GPU_Emitter::EmitDivide},
-    {TI(ngraph::op::Equal), &runtime::gpu::GPU_Emitter::EmitEqual},
-    {TI(ngraph::op::Greater), &runtime::gpu::GPU_Emitter::EmitGreater},
-    {TI(ngraph::op::GreaterEq), &runtime::gpu::GPU_Emitter::EmitGreaterEq},
-    {TI(ngraph::op::Less), &runtime::gpu::GPU_Emitter::EmitLess},
-    {TI(ngraph::op::LessEq), &runtime::gpu::GPU_Emitter::EmitLessEq},
-    {TI(ngraph::op::Log), &runtime::gpu::GPU_Emitter::EmitLog},
-    {TI(ngraph::op::Maximum), &runtime::gpu::GPU_Emitter::EmitMaximum},
-    {TI(ngraph::op::Minimum), &runtime::gpu::GPU_Emitter::EmitMinimum},
-    {TI(ngraph::op::Negative), &runtime::gpu::GPU_Emitter::EmitNegative},
-    {TI(ngraph::op::NotEqual), &runtime::gpu::GPU_Emitter::EmitNotEqual},
-    {TI(ngraph::op::Power), &runtime::gpu::GPU_Emitter::EmitPower},
-    {TI(ngraph::op::Select), &runtime::gpu::GPU_Emitter::EmitSelect},
-    {TI(ngraph::op::Subtract), &runtime::gpu::GPU_Emitter::EmitSubtract},
-    {TI(ngraph::op::Broadcast), &runtime::gpu::GPU_Emitter::EmitBroadcast},
-    {TI(ngraph::op::Convert), &runtime::gpu::GPU_Emitter::EmitConvert},
-    {TI(ngraph::op::Constant), &runtime::gpu::GPU_Emitter::EmitConstant},
-    {TI(ngraph::op::Reshape), &runtime::gpu::GPU_Emitter::EmitReshape},
-    {TI(ngraph::op::FunctionCall), &runtime::gpu::GPU_Emitter::EmitFunctionCall},
-    {TI(ngraph::op::Reduce), &runtime::gpu::GPU_Emitter::EmitReduce},
-    {TI(ngraph::op::Sign), &runtime::gpu::GPU_Emitter::EmitSign},
-    {TI(ngraph::op::Slice), &runtime::gpu::GPU_Emitter::EmitSlice},
-    {TI(ngraph::op::Sum), &runtime::gpu::GPU_Emitter::EmitSum},
-    {TI(ngraph::op::Exp), &runtime::gpu::GPU_Emitter::EmitExp},
-    {TI(ngraph::op::Sin), &runtime::gpu::GPU_Emitter::EmitSin},
-    {TI(ngraph::op::Sinh), &runtime::gpu::GPU_Emitter::EmitSinh},
-    {TI(ngraph::op::Cos), &runtime::gpu::GPU_Emitter::EmitCos},
-    {TI(ngraph::op::Cosh), &runtime::gpu::GPU_Emitter::EmitCosh},
-    {TI(ngraph::op::Tan), &runtime::gpu::GPU_Emitter::EmitTan},
-    {TI(ngraph::op::Tanh), &runtime::gpu::GPU_Emitter::EmitTanh},
-    {TI(ngraph::op::Asin), &runtime::gpu::GPU_Emitter::EmitAsin},
-    {TI(ngraph::op::Acos), &runtime::gpu::GPU_Emitter::EmitAcos},
-    {TI(ngraph::op::Atan), &runtime::gpu::GPU_Emitter::EmitAtan},
-    {TI(ngraph::op::ReplaceSlice), &runtime::gpu::GPU_Emitter::EmitReplaceSlice},
-    {TI(ngraph::op::OneHot), &runtime::gpu::GPU_Emitter::EmitOneHot},
-    {TI(ngraph::op::Floor), &runtime::gpu::GPU_Emitter::EmitFloor},
-    {TI(ngraph::op::Ceiling), &runtime::gpu::GPU_Emitter::EmitCeiling},
-    {TI(ngraph::op::Sqrt), &runtime::gpu::GPU_Emitter::EmitSqrt},
-    {TI(ngraph::op::Convolution), &runtime::gpu::GPU_Emitter::EmitConvolution},
-    {TI(ngraph::op::Not), &runtime::gpu::GPU_Emitter::EmitNot},
-    {TI(ngraph::op::MaxPool), &runtime::gpu::GPU_Emitter::EmitMaxPool},
-    {TI(ngraph::op::Reverse), &runtime::gpu::GPU_Emitter::EmitReverse},
-    {TI(ngraph::op::ReduceWindow), &runtime::gpu::GPU_Emitter::EmitReduceWindow},
-    {TI(ngraph::op::SelectAndScatter), &runtime::gpu::GPU_Emitter::EmitSelectAndScatter},
+    {TI(ngraph::op::Add), &runtime::cpu::GPU_Emitter::emit<op::Add>},
+    {TI(ngraph::op::MatmulBias), &runtime::cpu::GPU_Emitter::emit<op::MatmulBias>},
+    {TI(ngraph::op::Dot), &runtime::cpu::GPU_Emitter::emit<op::Dot>},
+    {TI(ngraph::op::Multiply), &runtime::cpu::GPU_Emitter::emit<op::Multiply>},
+    {TI(ngraph::op::Parameter), &runtime::cpu::GPU_Emitter::nop},
+    {TI(ngraph::op::Abs), &runtime::cpu::GPU_Emitter::emit<op::Abs>},
+    {TI(ngraph::op::Concat), &runtime::cpu::GPU_Emitter::emit<op::Concat>},
+    {TI(ngraph::op::Divide), &runtime::cpu::GPU_Emitter::emit<op::Divide>},
+    {TI(ngraph::op::Equal), &runtime::cpu::GPU_Emitter::emit<op::Equal>},
+    {TI(ngraph::op::GetOutputElement), &runtime::cpu::GPU_Emitter::emit<op::GetOutputElement>},
+    {TI(ngraph::op::Greater), &runtime::cpu::GPU_Emitter::emit<op::Greater>},
+    {TI(ngraph::op::GreaterEq), &runtime::cpu::GPU_Emitter::emit<op::GreaterEq>},
+    {TI(ngraph::op::Less), &runtime::cpu::GPU_Emitter::emit<op::Less>},
+    {TI(ngraph::op::LessEq), &runtime::cpu::GPU_Emitter::emit<op::LessEq>},
+    {TI(ngraph::op::Log), &runtime::cpu::GPU_Emitter::emit<op::Log>},
+    {TI(ngraph::op::Maximum), &runtime::cpu::GPU_Emitter::emit<op::Maximum>},
+    {TI(ngraph::op::Minimum), &runtime::cpu::GPU_Emitter::emit<op::Minimum>},
+    {TI(ngraph::op::Negative), &runtime::cpu::GPU_Emitter::emit<op::Negative>},
+    {TI(ngraph::op::NotEqual), &runtime::cpu::GPU_Emitter::emit<op::NotEqual>},
+    {TI(ngraph::op::Power), &runtime::cpu::GPU_Emitter::emit<op::Power>},
+    {TI(ngraph::op::Select), &runtime::cpu::GPU_Emitter::emit<op::Select>},
+    {TI(ngraph::op::Subtract), &runtime::cpu::GPU_Emitter::emit<op::Subtract>},
+    {TI(ngraph::op::Broadcast), &runtime::cpu::GPU_Emitter::emit<op::Broadcast>},
+    {TI(ngraph::op::Convert), &runtime::cpu::GPU_Emitter::emit<op::Convert>},
+    {TI(ngraph::op::Constant), &runtime::cpu::GPU_Emitter::emit<op::Constant>},
+    {TI(ngraph::op::Reshape), &runtime::cpu::GPU_Emitter::emit<op::Reshape>},
+    {TI(ngraph::op::FunctionCall), &runtime::cpu::GPU_Emitter::emit<op::FunctionCall>},
+    {TI(ngraph::op::Reduce), &runtime::cpu::GPU_Emitter::emit<op::Reduce>},
+    {TI(ngraph::op::Sign), &runtime::cpu::GPU_Emitter::emit<op::Sign>},
+    {TI(ngraph::op::Slice), &runtime::cpu::GPU_Emitter::emit<op::Slice>},
+    {TI(ngraph::op::Sum), &runtime::cpu::GPU_Emitter::emit<op::Sum>},
+    {TI(ngraph::op::Exp), &runtime::cpu::GPU_Emitter::emit<op::Exp>},
+    {TI(ngraph::op::Sin), &runtime::cpu::GPU_Emitter::emit<op::Sin>},
+    {TI(ngraph::op::Sinh), &runtime::cpu::GPU_Emitter::emit<op::Sinh>},
+    {TI(ngraph::op::Cos), &runtime::cpu::GPU_Emitter::emit<op::Cos>},
+    {TI(ngraph::op::Cosh), &runtime::cpu::GPU_Emitter::emit<op::Cosh>},
+    {TI(ngraph::op::Tan), &runtime::cpu::GPU_Emitter::emit<op::Tan>},
+    {TI(ngraph::op::Tanh), &runtime::cpu::GPU_Emitter::emit<op::Tanh>},
+    {TI(ngraph::op::Asin), &runtime::cpu::GPU_Emitter::emit<op::Asin>},
+    {TI(ngraph::op::Acos), &runtime::cpu::GPU_Emitter::emit<op::Acos>},
+    {TI(ngraph::op::Atan), &runtime::cpu::GPU_Emitter::emit<op::Atan>},
+    {TI(ngraph::op::ReplaceSlice), &runtime::cpu::GPU_Emitter::emit<op::ReplaceSlice>},
+    {TI(ngraph::op::OneHot), &runtime::cpu::GPU_Emitter::emit<op::OneHot>},
+    {TI(ngraph::op::Floor), &runtime::cpu::GPU_Emitter::emit<op::Floor>},
+    {TI(ngraph::op::Ceiling), &runtime::cpu::GPU_Emitter::emit<op::Ceiling>},
+    {TI(ngraph::op::Sqrt), &runtime::cpu::GPU_Emitter::emit<op::Sqrt>},
+    {TI(ngraph::op::Convolution), &runtime::cpu::GPU_Emitter::emit<op::Convolution>},
+    {TI(ngraph::op::ConvolutionBackpropFilters),
+     &runtime::cpu::GPU_Emitter::emit<op::ConvolutionBackpropFilters>},
+    {TI(ngraph::op::ConvolutionBackpropData),
+     &runtime::cpu::GPU_Emitter::emit<op::ConvolutionBackpropData>},
+    {TI(ngraph::runtime::cpu::op::ConvertLayout),
+     &runtime::cpu::GPU_Emitter::emit<runtime::cpu::op::ConvertLayout>},
+    {TI(ngraph::op::Not), &runtime::cpu::GPU_Emitter::emit<op::Not>},
+    {TI(ngraph::op::MaxPool), &runtime::cpu::GPU_Emitter::emit<op::MaxPool>},
+    {TI(ngraph::op::Reverse), &runtime::cpu::GPU_Emitter::emit<op::Reverse>},
+    {TI(ngraph::op::Result), &runtime::cpu::GPU_Emitter::emit<op::Result>},
+    {TI(ngraph::op::ReduceWindow), &runtime::cpu::GPU_Emitter::emit<op::ReduceWindow>},
+    {TI(ngraph::op::SelectAndScatter), &runtime::cpu::GPU_Emitter::emit<op::SelectAndScatter>},
+    {TI(ngraph::op::AvgPool), &runtime::cpu::GPU_Emitter::emit<op::AvgPool>},
+    {TI(ngraph::op::AvgPoolBackprop), &runtime::cpu::GPU_Emitter::emit<op::AvgPoolBackprop>},
+    {TI(ngraph::op::Pad), &runtime::cpu::GPU_Emitter::emit<op::Pad>},
+    {TI(ngraph::op::BatchNorm), &runtime::cpu::GPU_Emitter::emit<op::BatchNorm>},
+    {TI(ngraph::op::BatchNormBackprop), &runtime::cpu::GPU_Emitter::emit<op::BatchNormBackprop>},
+    {TI(ngraph::op::MaxPoolBackprop), &runtime::cpu::GPU_Emitter::emit<op::MaxPoolBackprop>},
+    {TI(ngraph::op::Product), &runtime::cpu::GPU_Emitter::emit<op::Product>},
+    {TI(ngraph::op::Max), &runtime::cpu::GPU_Emitter::emit<op::Max>},
+    {TI(ngraph::op::Min), &runtime::cpu::GPU_Emitter::emit<op::Min>},
+    {TI(ngraph::op::Relu), &runtime::cpu::GPU_Emitter::emit<op::Relu>},
+    {TI(ngraph::op::ReluBackprop), &runtime::cpu::GPU_Emitter::emit<op::ReluBackprop>},
+    {TI(ngraph::op::Softmax), &runtime::cpu::GPU_Emitter::emit<op::Softmax>},
 };
 
 runtime::gpu::GPU_ExternalFunction::GPU_ExternalFunction(
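One detail that makes the expanded dispatcher above workable: it can list ops that have no `emit<OP>` specialization yet (for example the backprop variants), because taking the address of `emit<OP>` merely instantiates the throwing primary template. The failure is deferred from compile/link time to the moment the op is actually emitted. A minimal runnable demo of that behavior (illustrative names, not nGraph code):

```cpp
#include <iostream>
#include <stdexcept>

struct Emitter
{
    // Primary template, mirroring the header's default emit<OP>.
    template <typename OP>
    static void emit()
    {
        throw std::runtime_error("Unimplemented op in GPU emitter");
    }
};

struct Softmax
{
}; // an op type with no emit<> specialization

int main()
{
    // Taking the address forces instantiation of the primary template,
    // so this compiles and links even though Softmax is unimplemented.
    void (*fn)() = &Emitter::emit<Softmax>;
    try
    {
        fn(); // the error surfaces only when the op is actually emitted
    }
    catch (const std::runtime_error& e)
    {
        std::cout << e.what() << "\n";
    }
    return 0;
}
```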