Commit dd5a6769 authored by fenglei.tian

add sign op, fix constant bug

parent b5414ba5
......@@ -74,6 +74,53 @@ void cuda_)" + name + "(" + data_type +
0)); // arguments
CUDA_SAFE_CALL(cuCtxSynchronize()); // Wait for the kernel to finish.
}
void emit_sign(void* in, void* out, size_t count)
{
std::string name = "sign";
// Compile the kernel with NVRTC on first use and cache it in the function pool.
if (CudaFunctionPool::instance().get(name) == nullptr)
{
const char* opts[] = {"--gpu-architecture=compute_35",
"--relocatable-device-code=true"};
std::string kernel;
std::string data_type("float");
kernel = R"(
extern "C" __global__
void cuda_)" + name + "(" + data_type +
"* in, " + data_type + "* out, size_t n)\n" + R"(
{
size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
if(tid < n)
{
        out[tid] = (in[tid] > 0) - (in[tid] < 0); // sign: -1, 0, or +1
}
})";
CudaFunctionPool::instance().set(
name, CudaFunctionBuilder::get("cuda_" + name, kernel, 2, opts));
}
// convert runtime pointers to driver API pointers
CUdeviceptr d_ptr_in, d_ptr_out;
d_ptr_in = (CUdeviceptr)in;
d_ptr_out = (CUdeviceptr)out;
void* args_list[] = {&d_ptr_in, &d_ptr_out, &count};
CUDA_SAFE_CALL(cuLaunchKernel(*CudaFunctionPool::instance().get(name).get(),
count,
1,
1, // grid dim
1,
1,
1, // block dim
0,
NULL, // shared mem and stream
args_list,
0)); // arguments
    CUDA_SAFE_CALL(cuCtxSynchronize()); // Wait for the kernel to finish.
}
}
}
}
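For orientation, here is a minimal host-side sketch of driving the new emit_sign entry point directly. The buffer names, sizes, and header paths are assumptions; a device-to-host copy helper mirroring the cuda_memcpyHtD used elsewhere in this backend is presumed to exist, and a CUDA context set up by the backend is assumed to be live.

#include "ngraph/runtime/gpu/gpu_cuda_kernel_emitters.hpp" // declares emit_sign (assumed path)
#include "ngraph/runtime/gpu/gpu_util.hpp"                 // buffer/copy helpers (assumed path)

int main()
{
    float h_in[4] = {-2.0f, -0.0f, 0.5f, 3.0f};
    float h_out[4];
    size_t bytes = 4 * sizeof(float);

    // Stage the input on the device, much as the generated code does for tensors.
    void* d_in = ngraph::runtime::gpu::create_gpu_buffer(bytes);
    void* d_out = ngraph::runtime::gpu::create_gpu_buffer(bytes);
    ngraph::runtime::gpu::cuda_memcpyHtD(d_in, h_in, bytes);

    // The first call compiles the kernel with NVRTC and caches it; expected output: -1, 0, 1, 1.
    ngraph::runtime::gpu::emit_sign(d_in, d_out, 4);

    // cuda_memcpyDtH is an assumed counterpart to cuda_memcpyHtD.
    ngraph::runtime::gpu::cuda_memcpyDtH(h_out, d_out, bytes);
    return 0;
}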
......@@ -35,6 +35,8 @@ namespace ngraph
void emit_broadcast(
void* in, void* out, size_t repeat_size, size_t repeat_times, size_t count);
void emit_sign(void* in, void* out, size_t count);
template <typename T>
void emit_unary_elementwise_op(void* in, void* out, size_t count, std::string name)
{
......
......@@ -134,7 +134,6 @@ namespace ngraph
writer << "{ // " << node->get_name() << "\n";
writer.indent++;
writer << "int count = " << out[0].get_size() << ";\n";
writer << "if(count == 0) return;\n";
writer << "ngraph::runtime::gpu::emit_unary_elementwise_op<ngraph::op::"
<< node->description() << ">((void*) " << args[0].get_name() << ", (void*) "
<< out[0].get_name() << ", count, \"" << node->description() << "\");\n";
......@@ -618,6 +617,22 @@ cudnnSetOpTensorDescriptor(opTensorDesc,
writer << "}\n";
}
template <>
void GPU_Emitter::EMITTER_DECL(ngraph::op::Sign)
{
if (out[0].get_size() == 0)
{
return;
}
writer << "{ // " << node->get_name() << "\n";
writer.indent++;
writer << "int count = " << out[0].get_size() << ";\n";
writer << "ngraph::runtime::gpu::emit_sign((void*) " << args[0].get_name()
<< ", (void*) " << out[0].get_name() << ", count);\n";
writer.indent--;
writer << "}\n";
}
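For reference, with a hypothetical node named Sign_0, a four-element output, and tensor names arg0/out0 (all illustrative, not taken from this commit), the specialization above writes roughly this fragment into the generated source:

{ // Sign_0
    int count = 4;
    ngraph::runtime::gpu::emit_sign((void*) arg0, (void*) out0, count);
}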
template <>
void GPU_Emitter::EMITTER_DECL(ngraph::op::Sqrt)
{
......
......@@ -194,7 +194,7 @@ namespace ngraph
{TI(ngraph::op::Reshape), &GPU_Emitter::emit<ngraph::op::Reshape>},
{TI(ngraph::op::FunctionCall), &GPU_Emitter::emit<ngraph::op::FunctionCall>},
{TI(ngraph::op::Reduce), &GPU_Emitter::emit<ngraph::op::Reduce>},
{TI(ngraph::op::Sign), &GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::Sign), &GPU_Emitter::emit<ngraph::op::Sign>},
{TI(ngraph::op::Slice), &GPU_Emitter::emit<ngraph::op::Slice>},
{TI(ngraph::op::Sum), &GPU_Emitter::emit<ngraph::op::Sum>},
{TI(ngraph::op::Exp), &GPU_Emitter::EmitUnaryElementwise},
......@@ -558,6 +558,10 @@ using namespace std;
writer << "if(" << tv->get_tensor().get_name() << " == NULL)\n";
writer << "{\n";
writer.indent++;
writer << tv->get_tensor().get_name() << " = ("
<< tv->get_tensor().get_element_type().c_type_string()
<< " *) ngraph::runtime::gpu::create_gpu_buffer("
<< tv->get_tensor().size() << ");\n";
writer << "runtime::gpu::cuda_memcpyHtD("
<< tv->get_tensor().get_name() << ", "
<< tv->get_tensor().get_name() << "_cpu, "
......@@ -853,4 +857,4 @@ using namespace std;
}
}
}
}
\ No newline at end of file
}
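The buffer-allocation hunk above appears to be the "fix constant bug" half of the commit: a tensor's device buffer is now allocated lazily and filled from its host-side *_cpu copy before first use. With a hypothetical constant named Constant_0 of 16 bytes (the copy's size argument is truncated in the hunk and assumed here to match the tensor size), the generated code would read approximately:

if (Constant_0 == NULL)
{
    Constant_0 = (float *) ngraph::runtime::gpu::create_gpu_buffer(16);
    runtime::gpu::cuda_memcpyHtD(Constant_0, Constant_0_cpu, 16);
}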
......@@ -388,7 +388,6 @@ TEST(${BACKEND_NAME}, backwards_avgpool_n2_c2_hw2x2_win_2x2_str_1x1_padding_nume
TEST(${BACKEND_NAME}, backwards_abs)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
auto manager = runtime::Manager::get("${BACKEND_NAME}");
auto backend = manager->allocate_backend();
......@@ -439,7 +438,6 @@ TEST(${BACKEND_NAME}, backwards_add)
TEST(${BACKEND_NAME}, backwards_add_nested)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
auto manager = runtime::Manager::get("${BACKEND_NAME}");
auto backend = manager->allocate_backend();
......@@ -577,7 +575,6 @@ TEST(${BACKEND_NAME}, backwards_concat_axis_1)
TEST(${BACKEND_NAME}, backwards_ceiling)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
SKIP_TEST_FOR("ARGON", "${BACKEND_NAME}");
auto manager = runtime::Manager::get("${BACKEND_NAME}");
......@@ -617,7 +614,6 @@ TEST(${BACKEND_NAME}, backwards_ceiling)
TEST(${BACKEND_NAME}, backwards_cos)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
auto manager = runtime::Manager::get("${BACKEND_NAME}");
auto backend = manager->allocate_backend();
......@@ -639,7 +635,6 @@ TEST(${BACKEND_NAME}, backwards_cos)
TEST(${BACKEND_NAME}, backwards_cosh)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
SKIP_TEST_FOR("ARGON", "${BACKEND_NAME}");
auto manager = runtime::Manager::get("${BACKEND_NAME}");
......@@ -663,7 +658,6 @@ TEST(${BACKEND_NAME}, backwards_cosh)
TEST(${BACKEND_NAME}, backwards_divide)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
SKIP_TEST_FOR("ARGON", "${BACKEND_NAME}");
auto manager = runtime::Manager::get("${BACKEND_NAME}");
......@@ -690,7 +684,6 @@ TEST(${BACKEND_NAME}, backwards_divide)
TEST(${BACKEND_NAME}, backwards_dot_scalar_scalar)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
auto manager = runtime::Manager::get("${BACKEND_NAME}");
auto backend = manager->allocate_backend();
......@@ -712,7 +705,6 @@ TEST(${BACKEND_NAME}, backwards_dot_scalar_scalar)
TEST(${BACKEND_NAME}, backwards_dot_scalar_tensor)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
auto manager = runtime::Manager::get("${BACKEND_NAME}");
auto backend = manager->allocate_backend();
......
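The removed SKIP_TEST_FOR("GPU", ...) lines enable these backwards (autodiff) tests on the GPU backend. Only the manager/backend setup is visible above; the remainder of each test follows the usual autodiff pattern, reconstructed below as a sketch. The graph construction, the test::Uniform helper, and the autodiff_numeric_compare signature are assumptions, not copied from this file.

TEST(${BACKEND_NAME}, backwards_abs)
{
    auto manager = runtime::Manager::get("${BACKEND_NAME}");
    auto backend = manager->allocate_backend();

    // Random input away from the non-differentiable point at zero (helper assumed).
    test::Uniform<float> rng(0.25f, 1.0f);
    auto shape = Shape{2, 3};
    auto x = rng.initialize(backend->make_primary_tensor_view(element::f32, shape));

    auto make_graph = [shape]() {
        auto X = make_shared<op::Parameter>(element::f32, shape);
        return make_shared<Function>(make_shared<op::Abs>(X),
                                     op::Parameters{X}); // parameter-list type name assumed
    };

    // Compare symbolic and numeric derivatives (helper signature assumed).
    EXPECT_TRUE(autodiff_numeric_compare<float>(manager, backend, make_graph, {x}, .01f, .01f));
}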