Commit 94429375 authored by fenglei.tian

refactor and fix bug

parent 28de1557
mkdir build
cd build
cmake .. -DNGRAPH_GPU_ENABLE=TRUE -DNGRAPH_CPU_ENABLE=TRUE -DCUDNN_ROOT_DIR=/usr/lib/x86_64-linux-gnu/ -DCUDNN_INCLUDE_DIR=/usr/include -DZLIB_LIBRARY=/usr/lib/x86_64-linux-gnu/libz.so -DZLIB_INCLUDE_DIR=/usr/include/ -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
make
make -j24 all
./build/test/unit-test --gtest_filter=GPU.*
./build/test/unit-test --gtest_filter=GPU.ab
#./build/test/unit-test --gtest_filter=GPU.dot*
@@ -17,7 +17,7 @@
#include <stdio.h>
#include <cuda_runtime.h>
#include "cublas_v2.h"
#include "cublas.h"
#include "ngraph/runtime/gpu/gpu_call_frame.hpp"
#include "ngraph/runtime/gpu/gpu_external_function.hpp"
@@ -32,21 +32,27 @@ runtime::gpu::GPU_CallFrame::GPU_CallFrame(std::shared_ptr<GPU_ExternalFunction>
, m_compiled_function(compiled_function)
{
cublasStatus_t stat = cublasCreate(&m_cublas_handle);
-if (stat != cudaSuccess)
+if (stat != CUBLAS_STATUS_SUCCESS)
{
throw runtime_error("cuBLAS create failed");
}
-cublasSetPointerMode(m_cublas_handle, CUBLAS_POINTER_MODE_HOST);
+// Pass scalars as reference on the device
+cublasSetPointerMode(m_cublas_handle, CUBLAS_POINTER_MODE_DEVICE);
}
+runtime::gpu::GPU_CallFrame::~GPU_CallFrame()
+{
+cublasDestroy(m_cublas_handle);
+}
void runtime::gpu::GPU_CallFrame::tensor_call(
const std::vector<std::shared_ptr<ngraph::runtime::TensorView>>& input_tvs,
const std::vector<std::shared_ptr<ngraph::runtime::TensorView>>& output_tvs)
{
// Host tensors
-vector<void**> inputs;
-vector<void**> outputs;
+vector<void*> inputs;
+vector<void*> outputs;
for (size_t i = 0; i < input_tvs.size(); i++)
{
......
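Two details make the call-frame changes above work: cublasCreate returns a cublasStatus_t, so the old comparison against cudaSuccess (a cudaError_t) checked the wrong enum, and the handle now defaults to CUBLAS_POINTER_MODE_DEVICE so generated code can keep scalars on the GPU. A minimal standalone sketch of that lifecycle, with illustrative names rather than the nGraph classes themselves:

```cpp
#include <cublas_v2.h>
#include <stdexcept>

class CublasContext
{
public:
    CublasContext()
    {
        // cublasCreate returns cublasStatus_t; compare against
        // CUBLAS_STATUS_SUCCESS, not cudaSuccess (a cudaError_t).
        cublasStatus_t stat = cublasCreate(&m_handle);
        if (stat != CUBLAS_STATUS_SUCCESS)
        {
            throw std::runtime_error("cuBLAS create failed");
        }
        // Default to device-side scalars; emitters that pass host scalars
        // (e.g. &alpha) switch to HOST mode around the individual call.
        cublasSetPointerMode(m_handle, CUBLAS_POINTER_MODE_DEVICE);
    }

    ~CublasContext() { cublasDestroy(m_handle); }

private:
    cublasHandle_t m_handle{};
};
```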
@@ -35,8 +35,8 @@ namespace ngraph
class GPU_CallFrame;
class GPU_ExternalFunction;
-using EntryPoint_t = void(void*** inputs,
-void*** outputs,
+using EntryPoint_t = void(void** inputs,
+void** outputs,
cublasHandle_t& cublas_handle);
using EntryPoint = std::function<EntryPoint_t>;
@@ -48,7 +48,7 @@ namespace ngraph
GPU_CallFrame(std::shared_ptr<GPU_ExternalFunction> external_function,
EntryPoint compiled_function);
-~GPU_CallFrame() override = default;
+~GPU_CallFrame() override;
/// @brief Invoke the function with values matching the signature of the function.
///
......
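The header change mirrors the call frame: each tensor is now represented by a single device pointer, so the generated entry point takes flat void** arrays instead of void***. A hypothetical caller showing how the vector<void*> built in tensor_call lines up with the new signature:

```cpp
#include <cublas_v2.h>
#include <vector>

using EntryPoint_t = void(void** inputs, void** outputs, cublasHandle_t& cublas_handle);

// vector<void*>::data() is exactly the void** the entry point expects,
// so no extra level of indirection is needed.
void invoke(EntryPoint_t* entry,
            std::vector<void*>& inputs,
            std::vector<void*>& outputs,
            cublasHandle_t& handle)
{
    entry(inputs.data(), outputs.data(), handle);
}
```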
@@ -63,55 +63,28 @@ void runtime::gpu::GPU_Emitter::EmitAdd(codegen::CodeWriter& writer,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
const Shape& arg0_shape = args[0].get_shape();
const Shape& arg1_shape = args[1].get_shape();
if (arg0_shape.empty() || arg1_shape.empty())
{
auto& first = (arg0_shape.empty() ? args[0] : args[1]);
auto& second = (arg0_shape.empty() ? args[1] : args[0]);
}
// clang-format off
else if ((arg0_shape.size() <= 2) && (arg1_shape.size() <= 2))
{
// TODO Assert arg0_shape[0] == arg1_shape[0]?
writer << "{ // " << n->get_name() << "\n";
writer.indent++;
writer << "static const float alpha = 1.0;\n";
writer << "cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_HOST);\n";;
writer << "cublasScopy("
<< "cublas_handle,"
<< out[0].get_size() << ","
writer << "const float alpha = 1.0;\n";
writer << "const float beta = 1.0;\n";
writer << "cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_HOST);\n";
writer << "cublasSgeam("
<< "cublas_handle, CUBLAS_OP_N, CUBLAS_OP_N,\n"
<< out[0].get_size() << ","
<< " 1, \n"
<< " &alpha, "
<< args[0].get_name() << ","
-// Todo handle striding?
-<< "1,"
+<< args[0].get_size() << ",\n"
+<< " &beta, "
<< args[1].get_name() << ","
+<< args[1].get_size() << ",\n"
<< out[0].get_name() << ","
<< "1);\n";
writer << "cublasSaxpy("
<< "cublas_handle,"
<< out[0].get_size() << ","
<< "&alpha," //alpha
<< args[1].get_name() << ","
// Todo handle striding?
<< "1,"
<< out[0].get_name() << ","
<< "1);\n";
<< out[0].get_size() << ");\n";
writer.indent--;
writer << "cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_DEVICE);\n";;
writer << "}\n";
}
// clang-format on
else if ((arg0_shape.size() == 2) && (arg1_shape.size() == 1))
{
}
else if ((arg0_shape.size() == 2) && (arg1_shape.size() == 2))
{
// GEMM Call
}
else
{
// General ND Call?
}
}
void runtime::gpu::GPU_Emitter::EmitConcat(codegen::CodeWriter& writer,
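The rewritten EmitAdd collapses the old cublasScopy + cublasSaxpy pair into a single cublasSgeam call: with alpha = beta = 1 and each n-element tensor treated as an n x 1 column matrix, C = alpha*A + beta*B is exactly an elementwise add. A sketch of the call the generated code performs, with illustrative buffer names rather than the emitter's exact output:

```cpp
#include <cublas_v2.h>

// Elementwise c = a + b on device buffers of n floats, done as one
// cublasSgeam on n x 1 matrices. alpha/beta live on the host, so the
// call runs in HOST pointer mode, matching the emitted code.
void gpu_add(cublasHandle_t handle, const float* d_a, const float* d_b, float* d_c, int n)
{
    const float alpha = 1.0f;
    const float beta = 1.0f;
    cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST);
    cublasSgeam(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                n, 1,           // m x n: the tensor as an n x 1 matrix
                &alpha, d_a, n, // lda >= m
                &beta, d_b, n,
                d_c, n);
}
```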
@@ -188,7 +161,6 @@ void runtime::gpu::GPU_Emitter::EmitDot(codegen::CodeWriter& writer,
<< out[0].get_name() << ","
<< "1);\n";
// clang-format on
writer << "cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_DEVICE);\n";
;
writer.indent--;
writer << "}\n";
@@ -224,7 +196,6 @@ void runtime::gpu::GPU_Emitter::EmitDot(codegen::CodeWriter& writer,
<< out[0].get_name() << ","
<< "n);\n";
// clang-format on
writer << "cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_DEVICE);\n";
writer.indent--;
writer << "}\n";
}
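The cublasSetPointerMode(..., CUBLAS_POINTER_MODE_DEVICE) resets deleted here and in the hunks below became redundant once the call frame sets DEVICE mode at handle creation. DEVICE mode is also what lets a reduction write its scalar result straight to GPU memory; a sketch of the 1-D dot case, with illustrative names rather than the emitter's exact output:

```cpp
#include <cublas_v2.h>

// With the handle in CUBLAS_POINTER_MODE_DEVICE, the result argument of
// cublasSdot is a device pointer, so the scalar never round-trips to host.
void gpu_dot(cublasHandle_t handle, const float* d_x, const float* d_y, float* d_result, int n)
{
    cublasSdot(handle, n, d_x, 1, d_y, 1, d_result);
}
```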
@@ -333,7 +304,6 @@ void runtime::gpu::GPU_Emitter::EmitMaximum(codegen::CodeWriter& writer,
writer += R"(
cudnnDestroy(cudnnHandle);
)";
writer << "cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_DEVICE);\n";;
writer.indent--;
writer << "}\n";
// clang-format on
@@ -458,7 +428,6 @@ void runtime::gpu::GPU_Emitter::EmitReshape(codegen::CodeWriter& writer,
<< arg_shape[1] << ","
<< out[0].get_name() << ","
<< out[0].get_shape()[1] << ");\n";
writer << "cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_DEVICE);\n";;
writer.indent--;
writer << "}\n";
//clang-format on
@@ -541,7 +510,6 @@ void runtime::gpu::GPU_Emitter::EmitMultiply(
<< "1" // Stride y
<< ");\n";
writer.indent--;
writer << "cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_DEVICE);\n";;
writer << "}\n";
// clang-format on
}
......
@@ -21,6 +21,11 @@
#include <typeinfo>
#include <unordered_map>
+#include <cuda_runtime.h>
+#include <cudnn_v7.h>
+#include "cublas_v2.h"
+#include "cuda.h"
#include "ngraph/codegen/code_writer.hpp"
#include "ngraph/codegen/compiler.hpp"
#include "ngraph/codegen/execution_engine.hpp"
@@ -223,19 +228,6 @@ void runtime::gpu::GPU_ExternalFunction::compile()
writer +=
R"(// Generated by the NGraph GPU backend
-#include <cassert>
-#include <cmath>
-#include <cstdlib>
-#include <fstream>
-#include <fstream>
-#include <iostream>
-#include <memory>
-#include <string>
-#include <tuple>
-#include <typeindex>
-#include <typeinfo>
-#include <unordered_map>
-#include <cuda_runtime.h>
#include <cudnn_v7.h>
#include "cublas_v2.h"
@@ -249,53 +241,6 @@ void runtime::gpu::GPU_ExternalFunction::compile()
#include "ngraph/function.hpp"
#include "ngraph/graph_util.hpp"
#include "ngraph/node.hpp"
#include "ngraph/ops/abs.hpp"
#include "ngraph/ops/acos.hpp"
#include "ngraph/ops/add.hpp"
#include "ngraph/ops/asin.hpp"
#include "ngraph/ops/atan.hpp"
#include "ngraph/ops/broadcast.hpp"
#include "ngraph/ops/ceiling.hpp"
#include "ngraph/ops/concatenate.hpp"
#include "ngraph/ops/constant.hpp"
#include "ngraph/ops/convert.hpp"
#include "ngraph/ops/convolution.hpp"
#include "ngraph/ops/cos.hpp"
#include "ngraph/ops/cosh.hpp"
#include "ngraph/ops/divide.hpp"
#include "ngraph/ops/dot.hpp"
#include "ngraph/ops/equal.hpp"
#include "ngraph/ops/exp.hpp"
#include "ngraph/ops/floor.hpp"
#include "ngraph/ops/function_call.hpp"
#include "ngraph/ops/greater.hpp"
#include "ngraph/ops/greater_eq.hpp"
#include "ngraph/ops/less.hpp"
#include "ngraph/ops/less_eq.hpp"
#include "ngraph/ops/log.hpp"
#include "ngraph/ops/max_pool.hpp"
#include "ngraph/ops/maximum.hpp"
#include "ngraph/ops/minimum.hpp"
#include "ngraph/ops/multiply.hpp"
#include "ngraph/ops/negative.hpp"
#include "ngraph/ops/not.hpp"
#include "ngraph/ops/not_equal.hpp"
#include "ngraph/ops/one_hot.hpp"
#include "ngraph/ops/power.hpp"
#include "ngraph/ops/reduce.hpp"
#include "ngraph/ops/replace_slice.hpp"
#include "ngraph/ops/reshape.hpp"
#include "ngraph/ops/reverse.hpp"
#include "ngraph/ops/select.hpp"
#include "ngraph/ops/sign.hpp"
#include "ngraph/ops/sin.hpp"
#include "ngraph/ops/sinh.hpp"
#include "ngraph/ops/slice.hpp"
#include "ngraph/ops/sqrt.hpp"
#include "ngraph/ops/subtract.hpp"
#include "ngraph/ops/sum.hpp"
#include "ngraph/ops/tan.hpp"
#include "ngraph/ops/tanh.hpp"
#include "ngraph/pass/assign_layout.hpp"
#include "ngraph/pass/dump_sorted.hpp"
#include "ngraph/pass/liveness.hpp"
@@ -413,7 +358,7 @@ void runtime::gpu::GPU_ExternalFunction::compile()
for (shared_ptr<Function> f : pass_manager.get_state().get_functions())
{
writer << "extern \"C\" void " << f->get_name()
<< "(void*** inputs, void*** outputs, cublasHandle_t& cublas_handle);\n";
<< "(void** inputs, void** outputs, cublasHandle_t& cublas_handle);\n";
}
writer << "\n";
@@ -450,7 +395,7 @@ void runtime::gpu::GPU_ExternalFunction::compile()
string match_function_name;
for (size_t j = i + 1; j < op_list.size(); j++)
{
-if (op_list[i]->is_functionally_identical(*op_list[j]))
+if (0) //op_list[i]->is_functionally_identical(*op_list[j]))
{
if (match_function_name.empty())
{
@@ -531,7 +476,7 @@ void runtime::gpu::GPU_ExternalFunction::compile()
}
writer << "extern \"C\" void " << current_function->get_name();
writer << "(void*** inputs, void*** outputs, cublasHandle_t& cublas_handle)\n";
writer << "(void** inputs, void** outputs, cublasHandle_t& cublas_handle)\n";
writer << "{\n";
writer.indent++;
......
@@ -39,7 +39,7 @@ runtime::gpu::GPU_TensorView::GPU_TensorView(const ngraph::element::Type& elemen
m_buffer_size = m_descriptor->get_tensor_view_layout()->get_size() * element_type.size();
if (m_buffer_size > 0)
{
-cudaMalloc(&m_allocated_buffer_pool, m_buffer_size);
+cudaMalloc((void**) &m_allocated_buffer_pool, m_buffer_size);
}
}
@@ -50,18 +50,10 @@ runtime::gpu::GPU_TensorView::~GPU_TensorView()
void runtime::gpu::GPU_TensorView::write(const void* source, size_t tensor_offset, size_t n)
{
-if (tensor_offset + n > m_buffer_size)
-{
-throw out_of_range("write access past end of tensor");
-}
cudaMemcpy(m_allocated_buffer_pool, source, n, cudaMemcpyHostToDevice);
}
void runtime::gpu::GPU_TensorView::read(void* target, size_t tensor_offset, size_t n) const
{
-if (tensor_offset + n > m_buffer_size)
-{
-throw out_of_range("read access past end of tensor");
-}
cudaMemcpy(target, m_allocated_buffer_pool, n, cudaMemcpyDeviceToHost);
}
@@ -49,6 +49,6 @@ public:
/// @param n Number of bytes to read, must be integral number of elements.
void read(void* p, size_t tensor_offset, size_t n) const override;
-void** m_allocated_buffer_pool;
+void* m_allocated_buffer_pool;
size_t m_buffer_size;
};
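With m_allocated_buffer_pool now a plain void*, &m_allocated_buffer_pool is already the void** that cudaMalloc expects (the explicit cast is merely defensive), and read/write reduce to plain cudaMemcpy calls; note that both copies still ignore tensor_offset. A compressed sketch of the buffer lifecycle under those assumptions, with illustrative names:

```cpp
#include <cuda_runtime.h>
#include <cstddef>

struct DeviceBuffer
{
    void* ptr = nullptr;
    size_t size = 0;

    explicit DeviceBuffer(size_t n) : size(n)
    {
        // &ptr is void** -- exactly the type cudaMalloc takes.
        cudaMalloc(&ptr, size);
    }
    ~DeviceBuffer() { cudaFree(ptr); }

    void write(const void* source, size_t n)
    {
        cudaMemcpy(ptr, source, n, cudaMemcpyHostToDevice);
    }
    void read(void* target, size_t n) const
    {
        cudaMemcpy(target, ptr, n, cudaMemcpyDeviceToHost);
    }
};
```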