Commit 529362b5 authored by Chris Sullivan, committed by Robert Kimball

Abstraction for GPU unary elementwise ops (#587)

* Straightforward GPU.cos implementation following the previous patterns, prior to the refactor.

* Generalized the unary elementwise GPU op implementation. New unary elementwise ops can
be added to the type annotations in gpu_cuda_kernel_ops.hpp. The next step
is to refactor the LLVM interface in gpu_emitters.hpp for similar generality.
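As a concrete illustration, adding a new unary elementwise op (op::Erf here, which is hypothetical and not part of this commit) would only require a forward declaration plus a CudaOpMap specialization naming the corresponding CUDA intrinsic, and a dispatcher entry pointing the op at EmitUnaryElementwise:

// Hypothetical sketch: op::Erf is not in this commit; "erff" is CUDA's
// single-precision error-function intrinsic.
namespace ngraph
{
    namespace op
    {
        class Erf;
    }
    namespace runtime
    {
        namespace gpu
        {
            template <typename T>
            struct CudaOpMap;

            template <>
            struct CudaOpMap<ngraph::op::Erf>
            {
                static constexpr const char* op = "erff";
            };
        }
    }
}
// ...and in the gpu_external_function dispatcher:
// {TI(ngraph::op::Erf), &runtime::gpu::GPU_Emitter::EmitUnaryElementwise},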

* Added gpu_emitter.hpp:EmitUnaryElementwise.

The function adds a CUDA kernel based on ngraph::op::op_type::description.
This can service all unary elementwise ops run on the GPU.
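For an op::Cos node, for example, the emitter writes roughly the following into the compiled function; the tensor names and element count are illustrative, while the template argument and kernel name both come from the node's description:

// Illustrative output of EmitUnaryElementwise for a Cos node (arg0/out0 stand in
// for the wrapped tensor-view names; count is out[0].get_size()).
{
    int count = 6;
    if(count == 0) return;
    ngraph::runtime::gpu::emit_unary_elementwise_op<ngraph::op::Cos>(
        (void*)arg0, (void*)out0, count, "Cos");
}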

* The following elementwise unary ops now use the EmitUnaryElementwise emitter:
* GPU.abs
* GPU.acos
* GPU.asin
* GPU.atan
* GPU.ceiling
* GPU.cos
* GPU.cosh
* GPU.exp
* GPU.floor
* GPU.log
* GPU.not
* GPU.sign
* GPU.sin
* GPU.sinh
* GPU.tan
* GPU.tanh
Unary elementwise ops Sign and Not need extra consideration (one possible approach is sketched below).
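Sign and Not have no single-argument CUDA math intrinsic to map to, which is why they need extra consideration. One possible approach, sketched here purely as an assumption and not as what this commit does, is to point their CudaOpMap entries at small __device__ helpers that the kernel builder prepends to the generated source:

// Hypothetical helpers; the names and the prepend mechanism are assumptions.
static constexpr const char* sign_helper = R"(
__device__ float sign_impl(float x)
{
    return (float)((x > 0.0f) - (x < 0.0f));
}
)";

static constexpr const char* logical_not_helper = R"(
__device__ float not_impl(float x)
{
    return (x == 0.0f) ? 1.0f : 0.0f;
}
)";

// CudaOpMap<op::Sign>::op and CudaOpMap<op::Not>::op would then name "sign_impl"
// and "not_impl", so the generated out[tid] = op(in[tid]) line keeps working unchanged.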

* The tanh test now uses test::all_close for floating-point comparison (also done for tan in commit 65fa7c6de34c8277fe2a4801644f6bb64574f4ff).
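The change swaps an exact equality check for a tolerance-based comparison (a minimal sketch grounded in the test diff below; tolerances are whatever test::all_close defaults to):

// Before: exact equality, brittle for transcendental results computed on the GPU.
// EXPECT_EQ(input, read_vector<float>(result));

// After: element-wise comparison within floating-point tolerance.
EXPECT_TRUE(test::all_close(input, read_vector<float>(result)));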

* GPU backend skips added for the recent softmax test and for the updated aliased-output test that uses op::Constant.

* Code format update.

* Changed the CUDA builder interface names to unary/binary/arbitrary, added an implementation note to gpu_cuda_kernel_ops, and cleaned up code formatting.
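For reference, the unary builder produces a kernel along these lines for name="Abs", data_type="float", op="fabsf"; this is reconstructed from the fragments visible in the diff below, so details such as the exact index computation and bounds check are assumptions:

// Approximate shape of the generated source (assembled as a string by
// CudaKernelBuilder::get_unary_elementwise_op and compiled through NVRTC).
extern "C" __global__ void cuda_Abs(float* in, float* out, size_t n)
{
    size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < n)
    {
        out[tid] = fabsf(in[tid]);
    }
}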

* Updated the ngraph-cpp reference.

* Fixed an incorrect GitHub conflict resolution.

* Added a GPU emitter for op::Result.
For now it simply copies its input tensor to the output tensor (a sketch follows the failing-test list below).

All but 3 tests now pass. The remaining
failing tests are:
* GPU.dot_0_0
* GPU.dot_matrix_2x0_0x2
* GPU.dot_2x0_0
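A hedged sketch of what such a pass-through Result emitter could look like follows; it only reflects the copy behaviour described above, and the element type and copy call are assumptions rather than the commit's actual code:

// Hypothetical sketch of a Result emitter that forwards its input tensor to its
// output tensor. Assumes f32 tensors and a device-to-device cudaMemcpy.
void runtime::gpu::GPU_Emitter::EmitResult(codegen::CodeWriter& writer,
                                           const ngraph::Node* n,
                                           const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
                                           const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
    writer << "{   // " << n->get_name() << "\n";
    writer.indent++;
    writer << "cudaMemcpy(" << out[0].get_name() << ", " << args[0].get_name() << ", "
           << out[0].get_size() << " * sizeof(float), cudaMemcpyDeviceToDevice);\n";
    writer.indent--;
    writer << "}\n";
}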

* Removed call to handle memory aliasing in gpu_external_function.

* Fixed a GPU emitter bug that would return in the middle of a function.

* Merge pull request #609 from NervanaSystems/tfl/fix_return_bug

Fix GPU emitter bug that would return in the middle of a function.

* GPU backend skips added for the recent softmax test and for the updated aliased-output test that uses op::Constant.
parent a02aab01
......@@ -13,7 +13,6 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include "ngraph/runtime/gpu/gpu_cuda_kernel_builder.hpp"
namespace ngraph
......@@ -22,10 +21,10 @@ namespace ngraph
{
namespace gpu
{
void CudaKernelBuilder::get_1_element_op(const std::string& name,
const std::string& data_type,
const std::string& op,
std::string& kernel)
void CudaKernelBuilder::get_unary_elementwise_op(const std::string& name,
const std::string& data_type,
const std::string& op,
std::string& kernel)
{
kernel = R"(
extern "C" __global__
......@@ -40,10 +39,10 @@ out[tid] =)" + op + "(in[tid]);\n" +
return;
}
void CudaKernelBuilder::get_2_element_op(const std::string& name,
const std::string& data_type,
const std::string& op,
std::string& kernel)
void CudaKernelBuilder::get_binary_elementwise_op(const std::string& name,
const std::string& data_type,
const std::string& op,
std::string& kernel)
{
kernel = R"(
extern "C" __global__
......@@ -60,10 +59,11 @@ out[tid] = in1[tid] )" + op +
return;
}
void CudaKernelBuilder::get_n_element_op(const std::string& name,
const std::string& data_type,
const std::vector<std::string>& ops,
std::string& kernel)
void
CudaKernelBuilder::get_arbitrary_elementwise_op(const std::string& name,
const std::string& data_type,
const std::vector<std::string>& ops,
std::string& kernel)
{
kernel = "";
return;
......
......@@ -28,20 +28,20 @@ namespace ngraph
class CudaKernelBuilder
{
public:
static void get_1_element_op(const std::string& name,
const std::string& data_type,
const std::string& op,
std::string& kernel);
static void get_unary_elementwise_op(const std::string& name,
const std::string& data_type,
const std::string& op,
std::string& kernel);
static void get_2_element_op(const std::string& name,
const std::string& data_type,
const std::string& op,
std::string& kernel);
static void get_binary_elementwise_op(const std::string& name,
const std::string& data_type,
const std::string& op,
std::string& kernel);
static void get_n_element_op(const std::string& name,
const std::string& data_type,
const std::vector<std::string>& ops,
std::string& kernel);
static void get_arbitrary_elementwise_op(const std::string& name,
const std::string& data_type,
const std::vector<std::string>& ops,
std::string& kernel);
};
}
}
......
......@@ -17,10 +17,8 @@
#include <algorithm>
#include <map>
#include "ngraph/runtime/gpu/gpu_cuda_function_builder.hpp"
#include "ngraph/runtime/gpu/gpu_cuda_function_pool.hpp"
#include "ngraph/runtime/gpu/gpu_cuda_kernel_builder.hpp"
#include "ngraph/runtime/gpu/gpu_cuda_kernel_emitters.hpp"
#include "ngraph/runtime/gpu/gpu_cuda_kernel_ops.hpp"
namespace ngraph
{
......@@ -28,40 +26,6 @@ namespace ngraph
{
namespace gpu
{
void emit_abs(void* in, void* out, size_t count)
{
std::string name = "abs";
// Create an instance of nvrtcProgram with the code string.
if (CudaFunctionPool::instance().get(name) == nullptr)
{
const char* opts[] = {"--gpu-architecture=compute_35",
"--relocatable-device-code=true"};
std::string kernel;
CudaKernelBuilder::get_1_element_op(name, "float", "fabsf", kernel);
CudaFunctionPool::instance().set(
name, CudaFunctionBuilder::get("cuda_" + name, kernel, 2, opts));
}
//convert runtime ptr to driver api ptr
CUdeviceptr d_ptr_in, d_ptr_out;
d_ptr_in = (CUdeviceptr)in;
d_ptr_out = (CUdeviceptr)out;
void* args_list[] = {&d_ptr_in, &d_ptr_out, &count};
CUDA_SAFE_CALL(cuLaunchKernel(*CudaFunctionPool::instance().get(name).get(),
count,
1,
1, // grid dim
1,
1,
1, // block dim
0,
NULL, // shared mem and stream
args_list,
0)); // arguments
CUDA_SAFE_CALL(cuCtxSynchronize()); // Retrieve and print output.
}
void emit_broadcast(
void* in, void* out, size_t repeat_size, size_t repeat_times, size_t count)
{
......
......@@ -18,6 +18,9 @@
#include "ngraph/codegen/code_writer.hpp"
#include "ngraph/coordinate.hpp"
#include "ngraph/runtime/gpu/gpu_cuda_function_builder.hpp"
#include "ngraph/runtime/gpu/gpu_cuda_function_pool.hpp"
#include "ngraph/runtime/gpu/gpu_cuda_kernel_builder.hpp"
#include "ngraph/strides.hpp"
namespace ngraph
......@@ -26,9 +29,46 @@ namespace ngraph
{
namespace gpu
{
void emit_abs(void* in, void* out, size_t count);
template <typename T>
struct CudaOpMap;
void emit_broadcast(
void* in, void* out, size_t repeat_size, size_t repeat_times, size_t count);
template <typename T>
void emit_unary_elementwise_op(void* in, void* out, size_t count, std::string name)
{
// Create an instance of nvrtcProgram with the code string.
if (CudaFunctionPool::instance().get(name) == nullptr)
{
const char* opts[] = {"--gpu-architecture=compute_35",
"--relocatable-device-code=true"};
std::string kernel;
CudaKernelBuilder::get_unary_elementwise_op(
name, "float", CudaOpMap<T>::op, kernel);
CudaFunctionPool::instance().set(
name, CudaFunctionBuilder::get("cuda_" + name, kernel, 2, opts));
}
//convert runtime ptr to driver api ptr
CUdeviceptr d_ptr_in, d_ptr_out;
d_ptr_in = (CUdeviceptr)in;
d_ptr_out = (CUdeviceptr)out;
void* args_list[] = {&d_ptr_in, &d_ptr_out, &count};
CUDA_SAFE_CALL(cuLaunchKernel(*CudaFunctionPool::instance().get(name).get(),
count,
1,
1, // grid dim
1,
1,
1, // block dim
0,
NULL, // shared mem and stream
args_list,
0)); // arguments
CUDA_SAFE_CALL(cuCtxSynchronize()); // Retrieve and print output.
}
}
}
}
/*******************************************************************************
* Copyright 2017-2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#pragma once
namespace ngraph
{
namespace op
{
class Abs;
class Acos;
class Asin;
class Atan;
class Ceiling;
class Cos;
class Cosh;
class Exp;
class Floor;
class Log;
class Sin;
class Sinh;
class Tan;
class Tanh;
// Unimplemented or unused in favor of cuDNN impl.
class Max;
class Min;
class Negative;
class Not;
class Sign;
class Sqrt;
}
namespace runtime
{
namespace gpu
{
template <>
struct CudaOpMap<ngraph::op::Abs>
{
static constexpr const char* op = "fabsf";
};
template <>
struct CudaOpMap<ngraph::op::Acos>
{
static constexpr const char* op = "acosf";
};
template <>
struct CudaOpMap<ngraph::op::Asin>
{
static constexpr const char* op = "asinf";
};
template <>
struct CudaOpMap<ngraph::op::Atan>
{
static constexpr const char* op = "atanf";
};
template <>
struct CudaOpMap<ngraph::op::Ceiling>
{
static constexpr const char* op = "ceilf";
};
template <>
struct CudaOpMap<ngraph::op::Cos>
{
static constexpr const char* op = "cosf";
};
template <>
struct CudaOpMap<ngraph::op::Cosh>
{
static constexpr const char* op = "coshf";
};
template <>
struct CudaOpMap<ngraph::op::Exp>
{
static constexpr const char* op = "expf";
};
template <>
struct CudaOpMap<ngraph::op::Floor>
{
static constexpr const char* op = "floorf";
};
template <>
struct CudaOpMap<ngraph::op::Log>
{
static constexpr const char* op = "logf";
};
template <>
struct CudaOpMap<ngraph::op::Max>
{
static constexpr const char* op = "fmaxf";
};
template <>
struct CudaOpMap<ngraph::op::Min>
{
static constexpr const char* op = "fminf";
};
template <>
struct CudaOpMap<ngraph::op::Sin>
{
static constexpr const char* op = "sinf";
};
template <>
struct CudaOpMap<ngraph::op::Sinh>
{
static constexpr const char* op = "sinhf";
};
template <>
struct CudaOpMap<ngraph::op::Sqrt>
{
static constexpr const char* op = "sqrtf";
};
template <>
struct CudaOpMap<ngraph::op::Tan>
{
static constexpr const char* op = "tanf";
};
template <>
struct CudaOpMap<ngraph::op::Tanh>
{
static constexpr const char* op = "tanhf";
};
}
}
}
......@@ -58,10 +58,11 @@ void runtime::gpu::GPU_Emitter::EmitNop(codegen::CodeWriter& writer,
{
}
void runtime::gpu::GPU_Emitter::EmitAbs(codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
void runtime::gpu::GPU_Emitter::EmitUnaryElementwise(
codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
if (out[0].get_size() == 0)
{
......@@ -71,8 +72,9 @@ void runtime::gpu::GPU_Emitter::EmitAbs(codegen::CodeWriter& writer,
writer.indent++;
writer << "int count = " << out[0].get_size() << ";\n";
writer << "if(count == 0) return;\n";
writer << "ngraph::runtime::gpu::emit_abs((void*) " << args[0].get_name() << ", (void*) "
<< out[0].get_name() << ", count);\n";
writer << "ngraph::runtime::gpu::emit_unary_elementwise_op<ngraph::op::" << n->description()
<< ">((void*) " << args[0].get_name() << ", (void*) " << out[0].get_name()
<< ", count, \"" << n->description() << "\");\n";
writer.indent--;
writer << "}\n";
}
......@@ -293,14 +295,6 @@ void runtime::gpu::GPU_Emitter::EmitLessEq(codegen::CodeWriter& writer,
throw std::runtime_error(n->get_name() + " is not implemented.");
}
void runtime::gpu::GPU_Emitter::EmitLog(codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw std::runtime_error(n->get_name() + " is not implemented.");
}
void runtime::gpu::GPU_Emitter::EmitMaximum(codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
......@@ -632,14 +626,6 @@ void runtime::gpu::GPU_Emitter::EmitReduce(codegen::CodeWriter& writer,
throw std::runtime_error(n->get_name() + " is not implemented.");
}
void runtime::gpu::GPU_Emitter::EmitSign(codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw std::runtime_error(n->get_name() + " is not implemented.");
}
void runtime::gpu::GPU_Emitter::EmitSlice(codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
......@@ -701,86 +687,6 @@ cudnnSetOpTensorDescriptor(opTensorDesc,
writer << "}\n";
}
void runtime::gpu::GPU_Emitter::EmitExp(codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw std::runtime_error(n->get_name() + " is not implemented.");
}
void runtime::gpu::GPU_Emitter::EmitSin(codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw std::runtime_error(n->get_name() + " is not implemented.");
}
void runtime::gpu::GPU_Emitter::EmitSinh(codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw std::runtime_error(n->get_name() + " is not implemented.");
}
void runtime::gpu::GPU_Emitter::EmitCos(codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw std::runtime_error(n->get_name() + " is not implemented.");
}
void runtime::gpu::GPU_Emitter::EmitCosh(codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw std::runtime_error(n->get_name() + " is not implemented.");
}
void runtime::gpu::GPU_Emitter::EmitTan(codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw std::runtime_error(n->get_name() + " is not implemented.");
}
void runtime::gpu::GPU_Emitter::EmitTanh(codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw std::runtime_error(n->get_name() + " is not implemented.");
}
void runtime::gpu::GPU_Emitter::EmitAsin(codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw std::runtime_error(n->get_name() + " is not implemented.");
}
void runtime::gpu::GPU_Emitter::EmitAcos(codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw std::runtime_error(n->get_name() + " is not implemented.");
}
void runtime::gpu::GPU_Emitter::EmitAtan(codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw std::runtime_error(n->get_name() + " is not implemented.");
}
void runtime::gpu::GPU_Emitter::EmitPower(codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
......@@ -806,22 +712,6 @@ void runtime::gpu::GPU_Emitter::EmitOneHot(codegen::CodeWriter& writer,
throw std::runtime_error(n->get_name() + " is not implemented.");
}
void runtime::gpu::GPU_Emitter::EmitCeiling(codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw std::runtime_error(n->get_name() + " is not implemented.");
}
void runtime::gpu::GPU_Emitter::EmitFloor(codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw std::runtime_error(n->get_name() + " is not implemented.");
}
void runtime::gpu::GPU_Emitter::EmitSqrt(codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
......@@ -875,14 +765,6 @@ void runtime::gpu::GPU_Emitter::EmitConvolution(
throw std::runtime_error(n->get_name() + " is not implemented.");
}
void runtime::gpu::GPU_Emitter::EmitNot(codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw std::runtime_error(n->get_name() + " is not implemented.");
}
void runtime::gpu::GPU_Emitter::EmitMaxPool(codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
......
......@@ -45,8 +45,8 @@ namespace ngraph
static void EMITTER_DECL(EmitMultiply);
static void EMITTER_DECL(EmitGetOutputElement);
static void EMITTER_DECL(EmitXLAGetTupleElement);
static void EMITTER_DECL(EmitUnaryElementwise);
static void EMITTER_DECL(EmitTuple);
static void EMITTER_DECL(EmitAbs);
static void EMITTER_DECL(EmitConcat);
static void EMITTER_DECL(EmitDivide);
static void EMITTER_DECL(EmitEqual);
......@@ -54,7 +54,6 @@ namespace ngraph
static void EMITTER_DECL(EmitGreaterEq);
static void EMITTER_DECL(EmitLess);
static void EMITTER_DECL(EmitLessEq);
static void EMITTER_DECL(EmitLog);
static void EMITTER_DECL(EmitMaximum);
static void EMITTER_DECL(EmitMinimum);
static void EMITTER_DECL(EmitNegative);
......@@ -67,27 +66,13 @@ namespace ngraph
static void EMITTER_DECL(EmitReshape);
static void EMITTER_DECL(EmitFunctionCall);
static void EMITTER_DECL(EmitReduce);
static void EMITTER_DECL(EmitSign);
static void EMITTER_DECL(EmitSlice);
static void EMITTER_DECL(EmitSum);
static void EMITTER_DECL(EmitExp);
static void EMITTER_DECL(EmitSin);
static void EMITTER_DECL(EmitSinh);
static void EMITTER_DECL(EmitCos);
static void EMITTER_DECL(EmitCosh);
static void EMITTER_DECL(EmitTan);
static void EMITTER_DECL(EmitTanh);
static void EMITTER_DECL(EmitAsin);
static void EMITTER_DECL(EmitAcos);
static void EMITTER_DECL(EmitAtan);
static void EMITTER_DECL(EmitPower);
static void EMITTER_DECL(EmitReplaceSlice);
static void EMITTER_DECL(EmitOneHot);
static void EMITTER_DECL(EmitFloor);
static void EMITTER_DECL(EmitCeiling);
static void EMITTER_DECL(EmitSqrt);
static void EMITTER_DECL(EmitConvolution);
static void EMITTER_DECL(EmitNot);
static void EMITTER_DECL(EmitMaxPool);
static void EMITTER_DECL(EmitReverse);
static void EMITTER_DECL(EmitReduceWindow);
......
......@@ -151,7 +151,7 @@ static const runtime::gpu::OpMap dispatcher{
{TI(ngraph::op::Dot), &runtime::gpu::GPU_Emitter::EmitDot},
{TI(ngraph::op::Multiply), &runtime::gpu::GPU_Emitter::EmitMultiply},
{TI(ngraph::op::Parameter), &runtime::gpu::GPU_Emitter::EmitNop},
{TI(ngraph::op::Abs), &runtime::gpu::GPU_Emitter::EmitAbs},
{TI(ngraph::op::Abs), &runtime::gpu::GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::Concat), &runtime::gpu::GPU_Emitter::EmitConcat},
{TI(ngraph::op::Divide), &runtime::gpu::GPU_Emitter::EmitDivide},
{TI(ngraph::op::Equal), &runtime::gpu::GPU_Emitter::EmitEqual},
......@@ -159,7 +159,7 @@ static const runtime::gpu::OpMap dispatcher{
{TI(ngraph::op::GreaterEq), &runtime::gpu::GPU_Emitter::EmitGreaterEq},
{TI(ngraph::op::Less), &runtime::gpu::GPU_Emitter::EmitLess},
{TI(ngraph::op::LessEq), &runtime::gpu::GPU_Emitter::EmitLessEq},
{TI(ngraph::op::Log), &runtime::gpu::GPU_Emitter::EmitLog},
{TI(ngraph::op::Log), &runtime::gpu::GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::Maximum), &runtime::gpu::GPU_Emitter::EmitMaximum},
{TI(ngraph::op::Minimum), &runtime::gpu::GPU_Emitter::EmitMinimum},
{TI(ngraph::op::Negative), &runtime::gpu::GPU_Emitter::EmitNegative},
......@@ -173,26 +173,26 @@ static const runtime::gpu::OpMap dispatcher{
{TI(ngraph::op::Reshape), &runtime::gpu::GPU_Emitter::EmitReshape},
{TI(ngraph::op::FunctionCall), &runtime::gpu::GPU_Emitter::EmitFunctionCall},
{TI(ngraph::op::Reduce), &runtime::gpu::GPU_Emitter::EmitReduce},
{TI(ngraph::op::Sign), &runtime::gpu::GPU_Emitter::EmitSign},
{TI(ngraph::op::Sign), &runtime::gpu::GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::Slice), &runtime::gpu::GPU_Emitter::EmitSlice},
{TI(ngraph::op::Sum), &runtime::gpu::GPU_Emitter::EmitSum},
{TI(ngraph::op::Exp), &runtime::gpu::GPU_Emitter::EmitExp},
{TI(ngraph::op::Sin), &runtime::gpu::GPU_Emitter::EmitSin},
{TI(ngraph::op::Sinh), &runtime::gpu::GPU_Emitter::EmitSinh},
{TI(ngraph::op::Cos), &runtime::gpu::GPU_Emitter::EmitCos},
{TI(ngraph::op::Cosh), &runtime::gpu::GPU_Emitter::EmitCosh},
{TI(ngraph::op::Tan), &runtime::gpu::GPU_Emitter::EmitTan},
{TI(ngraph::op::Tanh), &runtime::gpu::GPU_Emitter::EmitTanh},
{TI(ngraph::op::Asin), &runtime::gpu::GPU_Emitter::EmitAsin},
{TI(ngraph::op::Acos), &runtime::gpu::GPU_Emitter::EmitAcos},
{TI(ngraph::op::Atan), &runtime::gpu::GPU_Emitter::EmitAtan},
{TI(ngraph::op::Exp), &runtime::gpu::GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::Sin), &runtime::gpu::GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::Sinh), &runtime::gpu::GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::Cos), &runtime::gpu::GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::Cosh), &runtime::gpu::GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::Tan), &runtime::gpu::GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::Tanh), &runtime::gpu::GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::Asin), &runtime::gpu::GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::Acos), &runtime::gpu::GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::Atan), &runtime::gpu::GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::ReplaceSlice), &runtime::gpu::GPU_Emitter::EmitReplaceSlice},
{TI(ngraph::op::OneHot), &runtime::gpu::GPU_Emitter::EmitOneHot},
{TI(ngraph::op::Floor), &runtime::gpu::GPU_Emitter::EmitFloor},
{TI(ngraph::op::Ceiling), &runtime::gpu::GPU_Emitter::EmitCeiling},
{TI(ngraph::op::Floor), &runtime::gpu::GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::Ceiling), &runtime::gpu::GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::Sqrt), &runtime::gpu::GPU_Emitter::EmitSqrt},
{TI(ngraph::op::Convolution), &runtime::gpu::GPU_Emitter::EmitConvolution},
{TI(ngraph::op::Not), &runtime::gpu::GPU_Emitter::EmitNot},
{TI(ngraph::op::Not), &runtime::gpu::GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::MaxPool), &runtime::gpu::GPU_Emitter::EmitMaxPool},
{TI(ngraph::op::Reverse), &runtime::gpu::GPU_Emitter::EmitReverse},
{TI(ngraph::op::ReduceWindow), &runtime::gpu::GPU_Emitter::EmitReduceWindow},
......@@ -251,6 +251,7 @@ void runtime::gpu::GPU_ExternalFunction::compile()
#include "ngraph/pass/memory_layout.hpp"
#include "ngraph/runtime/aligned_buffer.hpp"
#include "ngraph/runtime/gpu/gpu_cuda_kernel_emitters.hpp"
#include "ngraph/runtime/gpu/gpu_cuda_kernel_ops.hpp"
#include "ngraph/runtime/gpu/gpu_util.hpp"
#include "ngraph/util.hpp"
)";
......
......@@ -336,7 +336,6 @@ TEST(${BACKEND_NAME}, abs)
TEST(${BACKEND_NAME}, ceiling)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape{2, 2};
auto A = make_shared<op::Parameter>(element::f32, shape);
auto f = make_shared<Function>(make_shared<op::Ceiling>(A), op::ParameterVector{A});
......@@ -777,7 +776,6 @@ TEST(${BACKEND_NAME}, equal)
TEST(${BACKEND_NAME}, floor)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape{2, 2};
auto A = make_shared<op::Parameter>(element::f32, shape);
auto f = make_shared<Function>(make_shared<op::Floor>(A), op::ParameterVector{A});
......@@ -1371,7 +1369,6 @@ TEST(${BACKEND_NAME}, lesseq_bool)
TEST(${BACKEND_NAME}, log)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape{2, 2, 2};
auto A = make_shared<op::Parameter>(element::f32, shape);
auto f = make_shared<Function>(make_shared<op::Log>(A), op::ParameterVector{A});
......@@ -2674,7 +2671,6 @@ TEST(${BACKEND_NAME}, reshape_6d)
TEST(${BACKEND_NAME}, sin)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape{6};
auto A = make_shared<op::Parameter>(element::f32, shape);
auto f = make_shared<Function>(make_shared<op::Sin>(A), op::ParameterVector{A});
......@@ -2700,7 +2696,6 @@ TEST(${BACKEND_NAME}, sin)
TEST(${BACKEND_NAME}, cos)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape{6};
auto A = make_shared<op::Parameter>(element::f32, shape);
auto f = make_shared<Function>(make_shared<op::Cos>(A), op::ParameterVector{A});
......@@ -2726,7 +2721,6 @@ TEST(${BACKEND_NAME}, cos)
TEST(${BACKEND_NAME}, tan)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape{6};
auto A = make_shared<op::Parameter>(element::f32, shape);
auto f = make_shared<Function>(make_shared<op::Tan>(A), op::ParameterVector{A});
......@@ -2747,12 +2741,11 @@ TEST(${BACKEND_NAME}, tan)
input.begin(), input.end(), input.begin(), [](float x) -> float { return tanf(x); });
cf->call({a}, {result});
EXPECT_EQ(input, read_vector<float>(result));
EXPECT_TRUE(test::all_close(input, read_vector<float>(result)));
}
TEST(${BACKEND_NAME}, asin)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape{6};
auto A = make_shared<op::Parameter>(element::f32, shape);
auto f = make_shared<Function>(make_shared<op::Asin>(A), op::ParameterVector{A});
......@@ -2777,7 +2770,6 @@ TEST(${BACKEND_NAME}, asin)
TEST(${BACKEND_NAME}, acos)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape{6};
auto A = make_shared<op::Parameter>(element::f32, shape);
auto f = make_shared<Function>(make_shared<op::Acos>(A), op::ParameterVector{A});
......@@ -2802,7 +2794,6 @@ TEST(${BACKEND_NAME}, acos)
TEST(${BACKEND_NAME}, atan)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape{6};
auto A = make_shared<op::Parameter>(element::f32, shape);
auto f = make_shared<Function>(make_shared<op::Atan>(A), op::ParameterVector{A});
......@@ -2827,7 +2818,6 @@ TEST(${BACKEND_NAME}, atan)
TEST(${BACKEND_NAME}, sinh)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape{6};
auto A = make_shared<op::Parameter>(element::f32, shape);
auto f = make_shared<Function>(make_shared<op::Sinh>(A), op::ParameterVector{A});
......@@ -2852,7 +2842,6 @@ TEST(${BACKEND_NAME}, sinh)
TEST(${BACKEND_NAME}, cosh)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape{6};
auto A = make_shared<op::Parameter>(element::f32, shape);
auto f = make_shared<Function>(make_shared<op::Cosh>(A), op::ParameterVector{A});
......@@ -2877,7 +2866,6 @@ TEST(${BACKEND_NAME}, cosh)
TEST(${BACKEND_NAME}, tanh)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape{6};
auto A = make_shared<op::Parameter>(element::f32, shape);
auto f = make_shared<Function>(make_shared<op::Tanh>(A), op::ParameterVector{A});
......@@ -2897,12 +2885,11 @@ TEST(${BACKEND_NAME}, tanh)
input.begin(), input.end(), input.begin(), [](float x) -> float { return tanhf(x); });
cf->call({a}, {result});
EXPECT_EQ(input, read_vector<float>(result));
EXPECT_TRUE(test::all_close(input, read_vector<float>(result)));
}
TEST(${BACKEND_NAME}, exp)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape{8};
auto A = make_shared<op::Parameter>(element::f32, shape);
auto f = make_shared<Function>(make_shared<op::Exp>(A), op::ParameterVector{A});
......