Commit 30d24597 authored by Fenglei's avatar Fenglei Committed by Robert Kimball

gpu select (#919)

* add select op, pass data type for each operand

* fix bugs and apply clang format

* fix index bug
parent 2d02b23f
......@@ -20,19 +20,18 @@
using namespace ngraph;
void runtime::gpu::CudaKernelBuilder::get_elementwise_op(
codegen::CodeWriter& writer,
const std::string& name,
const std::string& op,
const std::array<std::string, 2>& data_types,
const size_t& num_inputs)
void runtime::gpu::CudaKernelBuilder::get_elementwise_op(codegen::CodeWriter& writer,
const std::string& name,
const std::string& op,
const std::vector<std::string>& data_types,
const size_t& num_inputs)
{
writer << "extern \"C\" __global__ void cuda_" << name << "(";
for (size_t i = 0; i < num_inputs; i++)
{
writer << data_types[0] << "* in" << i << ", ";
writer << data_types[i] << "* in" << i << ", ";
}
writer << data_types[1] << "* out,"
writer << data_types[num_inputs] << "* out, "
<< "size_t n)\n";
writer << "{\n";
writer.indent++;
......@@ -167,21 +166,20 @@ void runtime::gpu::CudaKernelBuilder::get_slice_op(codegen::CodeWriter& writer,
writer.block_end();
}
void runtime::gpu::CudaKernelBuilder::get_device_helper(
codegen::CodeWriter& writer,
const std::string& name,
const std::string& math_kernel,
const std::array<std::string, 2>& data_types,
const size_t& num_inputs)
void runtime::gpu::CudaKernelBuilder::get_device_helper(codegen::CodeWriter& writer,
const std::string& name,
const std::string& math_kernel,
const std::vector<std::string>& data_types,
const size_t& num_inputs)
{
if (math_kernel.size())
{
writer << "__device__ " << data_types[1] << " " << name << "(";
writer << "__device__ " << data_types[num_inputs] << " " << name << "(";
for (size_t i = 0; i < num_inputs - 1; i++)
{
writer << data_types[0] << " x" << i << ", ";
writer << data_types[i] << " x" << i << ", ";
}
writer << data_types[0] << " x" << num_inputs - 1;
writer << data_types[num_inputs - 1] << " x" << num_inputs - 1;
writer << ")\n";
writer << "{\n";
writer.indent++;
......
......@@ -36,7 +36,7 @@ namespace ngraph
static void get_elementwise_op(codegen::CodeWriter& writer,
const std::string& name,
const std::string& op,
const std::array<std::string, 2>& data_types,
const std::vector<std::string>& data_types,
const size_t& num_inputs);
static void get_broadcast_op(codegen::CodeWriter& writer,
......@@ -58,8 +58,9 @@ namespace ngraph
static void get_device_helper(codegen::CodeWriter& writer,
const std::string& name,
const std::string& math_kernel,
const std::array<std::string, 2>& data_types,
const std::vector<std::string>& data_types,
const size_t& num_inputs);
static void add_pod_typedefs(codegen::CodeWriter& writer);
};
}
......
......@@ -25,6 +25,7 @@
#include "ngraph/runtime/gpu/gpu_cuda_kernel_builder.hpp"
#include "ngraph/runtime/gpu/gpu_runtime_context.hpp"
#include "ngraph/strides.hpp"
#include "ngraph/util.hpp"
namespace ngraph
{
......@@ -77,13 +78,13 @@ namespace ngraph
template <typename T, typename... Inputs>
void emit_elementwise_op(const std::string& name,
const std::array<std::string, 2>& data_types,
const std::vector<std::string>& data_types,
GPURuntimeContext* ctx,
size_t count,
CUdeviceptr out,
Inputs&&... inputs)
{
std::string type_signature = "_" + data_types[0] + "_" + data_types[1];
std::string type_signature = "_" + join(data_types, "_");
std::replace(type_signature.begin(), type_signature.end(), ' ', '_');
auto compiled_kernel = ctx->compiled_kernel_pool->get(name + type_signature);
if (compiled_kernel == nullptr)
......
......@@ -53,6 +53,7 @@ namespace ngraph
class Negative;
class Not;
class Sqrt;
class Select;
}
namespace runtime
{
......@@ -268,6 +269,13 @@ namespace ngraph
static constexpr const char* math_kernel = "!x0";
};
template <>
struct CudaOpMap<ngraph::op::Select>
{
static constexpr const char* op = "select";
static constexpr const char* math_kernel = "(x0 == 0) ? x2 : x1";
};
template <>
struct CudaOpMap<ngraph::op::ReluBackprop>
{
......
......@@ -123,7 +123,12 @@ namespace ngraph
writer << "if(count == 0) return;\n";
writer << "ngraph::runtime::gpu::emit_elementwise_op<ngraph::op::"
<< node->description() << ">(\"" << node->description() << "\""
<< ", {\"" << args[0].get_type() << "\", \"" << out[0].get_type() << "\"}"
<< ", std::vector<std::string>{";
for (size_t i = 0; i < args.size(); i++)
{
writer << "\"" << args[i].get_type() << "\", ";
}
writer << "\"" << out[0].get_type() << "\"}"
<< ", ctx"
<< ", count"
<< ", CUdeviceptr(" << out[0].get_name() << ")";
......
......@@ -186,7 +186,7 @@ static const runtime::gpu::OpMap dispatcher{
{TI(ngraph::op::Negative), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Negative>},
{TI(ngraph::op::NotEqual), &runtime::gpu::GPU_Emitter::emit_elementwise},
{TI(ngraph::op::Power), &runtime::gpu::GPU_Emitter::emit_elementwise},
{TI(ngraph::op::Select), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Select>},
{TI(ngraph::op::Select), &runtime::gpu::GPU_Emitter::emit_elementwise},
{TI(ngraph::op::Subtract), &runtime::gpu::GPU_Emitter::emit_elementwise},
{TI(ngraph::op::Broadcast), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Broadcast>},
{TI(ngraph::op::Convert), &runtime::gpu::GPU_Emitter::emit_elementwise},
......
......@@ -1353,7 +1353,6 @@ TEST(${BACKEND_NAME}, notequal)
TEST(${BACKEND_NAME}, select)
{
SKIP_TEST_FOR("IE", "${BACKEND_NAME}");
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape{2, 2, 2};
auto A = make_shared<op::Parameter>(element::boolean, shape);
auto B = make_shared<op::Parameter>(element::f32, shape);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment