Commit 94429375 authored by fenglei.tian's avatar fenglei.tian

refactor and fix bug

parent 28de1557
 mkdir build
 cd build
 cmake .. -DNGRAPH_GPU_ENABLE=TRUE -DNGRAPH_CPU_ENABLE=TRUE -DCUDNN_ROOT_DIR=/usr/lib/x86_64-linux-gnu/ -DCUDNN_INCLUDE_DIR=/usr/include -DZLIB_LIBRARY=/usr/lib/x86_64-linux-gnu/libz.so -DZLIB_INCLUDE_DIR=/usr/include/ -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
-make
+make -j24 all
-./build/test/unit-test --gtest_filter=GPU.*
+./build/test/unit-test --gtest_filter=GPU.ab
+#./build/test/unit-test --gtest_filter=GPU.dot*
@@ -17,7 +17,7 @@
 #include <stdio.h>
 #include <cuda_runtime.h>
-#include "cublas_v2.h"
+#include "cublas.h"
 #include "ngraph/runtime/gpu/gpu_call_frame.hpp"
 #include "ngraph/runtime/gpu/gpu_external_function.hpp"
@@ -32,21 +32,27 @@ runtime::gpu::GPU_CallFrame::GPU_CallFrame(std::shared_ptr<GPU_ExternalFunction>
     , m_compiled_function(compiled_function)
 {
     cublasStatus_t stat = cublasCreate(&m_cublas_handle);
-    if (stat != cudaSuccess)
+    if (stat != CUBLAS_STATUS_SUCCESS)
     {
         throw runtime_error("cuBLAS create failed");
     }
+    cublasSetPointerMode(m_cublas_handle, CUBLAS_POINTER_MODE_HOST);
     // Pass scalars as reference on the device
     cublasSetPointerMode(m_cublas_handle, CUBLAS_POINTER_MODE_DEVICE);
 }

+runtime::gpu::GPU_CallFrame::~GPU_CallFrame()
+{
+    cublasDestroy(m_cublas_handle);
+}
+
 void runtime::gpu::GPU_CallFrame::tensor_call(
     const std::vector<std::shared_ptr<ngraph::runtime::TensorView>>& input_tvs,
     const std::vector<std::shared_ptr<ngraph::runtime::TensorView>>& output_tvs)
 {
     // Host tensors
-    vector<void**> inputs;
-    vector<void**> outputs;
+    vector<void*> inputs;
+    vector<void*> outputs;
     for (size_t i = 0; i < input_tvs.size(); i++)
     {
...
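Review note: pairing cublasCreate with cublasDestroy in the new destructor, and comparing against CUBLAS_STATUS_SUCCESS rather than cudaSuccess, is the correct handle lifecycle. A minimal standalone sketch of that lifecycle (the guard class is illustrative, not part of this change):

```cpp
#include <stdexcept>
#include "cublas_v2.h"

// Standalone sketch of the handle lifecycle the call frame now follows.
class CublasHandleGuard
{
public:
    CublasHandleGuard()
    {
        // cublasCreate returns a cublasStatus_t, not a cudaError_t, so the
        // success value to compare against is CUBLAS_STATUS_SUCCESS.
        if (cublasCreate(&m_handle) != CUBLAS_STATUS_SUCCESS)
        {
            throw std::runtime_error("cuBLAS create failed");
        }
    }
    ~CublasHandleGuard() { cublasDestroy(m_handle); }
    cublasHandle_t get() const { return m_handle; }

private:
    cublasHandle_t m_handle;
};
```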
@@ -35,8 +35,8 @@ namespace ngraph
             class GPU_CallFrame;
             class GPU_ExternalFunction;

-            using EntryPoint_t = void(void*** inputs,
-                                      void*** outputs,
+            using EntryPoint_t = void(void** inputs,
+                                      void** outputs,
                                       cublasHandle_t& cublas_handle);
             using EntryPoint = std::function<EntryPoint_t>;
@@ -48,7 +48,7 @@ namespace ngraph
                 GPU_CallFrame(std::shared_ptr<GPU_ExternalFunction> external_function,
                               EntryPoint compiled_function);

-                ~GPU_CallFrame() override = default;
+                ~GPU_CallFrame() override;

                 /// @brief Invoke the function with values matching the signature of the function.
                 ///
...
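Review note: dropping one level of indirection (void*** to void**) lines up with the tensor_call change above, where the device pointers are now gathered into vector<void*>. A hedged sketch of the call site this signature implies (invoke() and its parameter names are hypothetical):

```cpp
#include <functional>
#include <vector>
#include "cublas_v2.h"

// Entry-point type after this change, plus a hypothetical call site.
using EntryPoint_t = void(void** inputs, void** outputs, cublasHandle_t& cublas_handle);
using EntryPoint = std::function<EntryPoint_t>;

void invoke(const EntryPoint& f,
            std::vector<void*>& inputs, // device buffers gathered from the tensor views
            std::vector<void*>& outputs,
            cublasHandle_t& handle)
{
    // vector<void*>::data() yields exactly the void** the generated function expects.
    f(inputs.data(), outputs.data(), handle);
}
```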
@@ -63,55 +63,28 @@ void runtime::gpu::GPU_Emitter::EmitAdd(codegen::CodeWriter& writer,
                                         const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
                                         const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
 {
-    const Shape& arg0_shape = args[0].get_shape();
-    const Shape& arg1_shape = args[1].get_shape();
-    if (arg0_shape.empty() || arg1_shape.empty())
-    {
-        auto& first = (arg0_shape.empty() ? args[0] : args[1]);
-        auto& second = (arg0_shape.empty() ? args[1] : args[0]);
-    }
-    // clang-format off
-    else if ((arg0_shape.size() <= 2) && (arg1_shape.size() <= 2))
     {
         // TODO Assert arg0_shape[0] == arg1_shape[0]?
         writer << "{ // " << n->get_name() << "\n";
         writer.indent++;
-        writer << "static const float alpha = 1.0;\n";
-        writer << "cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_HOST);\n";;
-        writer << "cublasScopy("
-               << "cublas_handle,"
+        writer << "const float alpha = 1.0;\n";
+        writer << "const float beta = 1.0;\n";
+        writer << "cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_HOST);\n";
+        writer << "cublasSgeam("
+               << "cublas_handle, CUBLAS_OP_N, CUBLAS_OP_N,\n"
                << out[0].get_size() << ","
+               << " 1, \n"
+               << " &alpha, "
                << args[0].get_name() << ","
-               // Todo handle striding?
-               << "1,"
-               << out[0].get_name() << ","
-               << "1);\n";
-        writer << "cublasSaxpy("
-               << "cublas_handle,"
-               << out[0].get_size() << ","
-               << "&alpha," //alpha
+               << args[0].get_size() << ",\n"
+               << " &beta, "
                << args[1].get_name() << ","
-               // Todo handle striding?
-               << "1,"
+               << args[1].get_size() << ",\n"
                << out[0].get_name() << ","
-               << "1);\n";
+               << out[0].get_size() << ");\n";
         writer.indent--;
-        writer << "cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_DEVICE);\n";;
         writer << "}\n";
     }
-    // clang-format on
-    else if ((arg0_shape.size() == 2) && (arg1_shape.size() == 1))
-    {
-    }
-    else if ((arg0_shape.size() == 2) && (arg1_shape.size() == 2))
-    {
-        // GEMM Call
-    }
-    else
-    {
-        // General ND Call?
-    }
 }

 void runtime::gpu::GPU_Emitter::EmitConcat(codegen::CodeWriter& writer,
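Review note: cublasSgeam computes C = alpha * op(A) + beta * op(B), so with alpha = beta = 1 and each buffer viewed as a count x 1 column matrix, one GEAM call replaces the old Scopy + Saxpy pair. A standalone sketch of the operation the emitted code performs (add_via_geam is illustrative):

```cpp
#include "cublas_v2.h"

// What the generated code does: C = alpha * A + beta * B, elementwise,
// by treating the flat buffers as count x 1 column matrices.
void add_via_geam(cublasHandle_t handle, const float* a, const float* b, float* c, int count)
{
    const float alpha = 1.0f;
    const float beta = 1.0f;
    cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST); // scalars live on the host
    cublasSgeam(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                count, 1,         // m x n: view the buffers as count x 1 matrices
                &alpha, a, count, // lda >= m
                &beta, b, count,  // ldb >= m
                c, count);        // ldc >= m
}
```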
@@ -188,7 +161,6 @@ void runtime::gpu::GPU_Emitter::EmitDot(codegen::CodeWriter& writer,
                << out[0].get_name() << ","
                << "1);\n";
     // clang-format on
-    writer << "cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_DEVICE);\n";
     ;
     writer.indent--;
     writer << "}\n";
@@ -224,7 +196,6 @@ void runtime::gpu::GPU_Emitter::EmitDot(codegen::CodeWriter& writer,
                << out[0].get_name() << ","
                << "n);\n";
     // clang-format on
-    writer << "cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_DEVICE);\n";
     writer.indent--;
     writer << "}\n";
 }
@@ -333,7 +304,6 @@ void runtime::gpu::GPU_Emitter::EmitMaximum(codegen::CodeWriter& writer,
     writer += R"(
 cudnnDestroy(cudnnHandle);
 )";
-    writer << "cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_DEVICE);\n";;
     writer.indent--;
     writer << "}\n";
     // clang-format on
@@ -458,7 +428,6 @@ void runtime::gpu::GPU_Emitter::EmitReshape(codegen::CodeWriter& writer,
                << arg_shape[1] << ","
                << out[0].get_name() << ","
                << out[0].get_shape()[1] << ");\n";
-    writer << "cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_DEVICE);\n";;
     writer.indent--;
     writer << "}\n";
     //clang-format on
@@ -541,7 +510,6 @@ void runtime::gpu::GPU_Emitter::EmitMultiply(
                << "1" // Stride y
                << ");\n";
     writer.indent--;
-    writer << "cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_DEVICE);\n";;
     writer << "}\n";
     // clang-format on
 }
...
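Review note: all the per-op resets to CUBLAS_POINTER_MODE_DEVICE are removed above. Since each emitted op now selects CUBLAS_POINTER_MODE_HOST itself and keeps its scalars on the host, the resets were dead weight. A minimal sketch of that convention, assuming host-resident scalars throughout (scaled_axpy is illustrative):

```cpp
#include "cublas_v2.h"

// Convention after this commit: set host pointer mode once per op and do not
// reset it afterwards; the next op sets whatever mode it needs itself.
void scaled_axpy(cublasHandle_t handle, const float* x, float* y, int n)
{
    const float alpha = 1.0f; // host-resident scalar, hence host pointer mode
    cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST);
    cublasSaxpy(handle, n, &alpha, x, 1, y, 1);
    // No reset back to CUBLAS_POINTER_MODE_DEVICE here.
}
```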
@@ -21,6 +21,11 @@
 #include <typeinfo>
 #include <unordered_map>

+#include <cuda_runtime.h>
+#include <cudnn_v7.h>
+#include "cublas_v2.h"
+#include "cuda.h"
+
 #include "ngraph/codegen/code_writer.hpp"
 #include "ngraph/codegen/compiler.hpp"
 #include "ngraph/codegen/execution_engine.hpp"
@@ -223,19 +228,6 @@ void runtime::gpu::GPU_ExternalFunction::compile()
     writer +=
         R"(// Generated by the NGraph GPU backend
-#include <cassert>
-#include <cmath>
-#include <cstdlib>
-#include <fstream>
-#include <fstream>
-#include <iostream>
-#include <memory>
-#include <string>
-#include <tuple>
-#include <typeindex>
-#include <typeinfo>
-#include <unordered_map>
 #include <cuda_runtime.h>
 #include <cudnn_v7.h>
 #include "cublas_v2.h"
@@ -249,53 +241,6 @@ void runtime::gpu::GPU_ExternalFunction::compile()
 #include "ngraph/function.hpp"
 #include "ngraph/graph_util.hpp"
 #include "ngraph/node.hpp"
-#include "ngraph/ops/abs.hpp"
-#include "ngraph/ops/acos.hpp"
-#include "ngraph/ops/add.hpp"
-#include "ngraph/ops/asin.hpp"
-#include "ngraph/ops/atan.hpp"
-#include "ngraph/ops/broadcast.hpp"
-#include "ngraph/ops/ceiling.hpp"
-#include "ngraph/ops/concatenate.hpp"
-#include "ngraph/ops/constant.hpp"
-#include "ngraph/ops/convert.hpp"
-#include "ngraph/ops/convolution.hpp"
-#include "ngraph/ops/cos.hpp"
-#include "ngraph/ops/cosh.hpp"
-#include "ngraph/ops/divide.hpp"
-#include "ngraph/ops/dot.hpp"
-#include "ngraph/ops/equal.hpp"
-#include "ngraph/ops/exp.hpp"
-#include "ngraph/ops/floor.hpp"
-#include "ngraph/ops/function_call.hpp"
-#include "ngraph/ops/greater.hpp"
-#include "ngraph/ops/greater_eq.hpp"
-#include "ngraph/ops/less.hpp"
-#include "ngraph/ops/less_eq.hpp"
-#include "ngraph/ops/log.hpp"
-#include "ngraph/ops/max_pool.hpp"
-#include "ngraph/ops/maximum.hpp"
-#include "ngraph/ops/minimum.hpp"
-#include "ngraph/ops/multiply.hpp"
-#include "ngraph/ops/negative.hpp"
-#include "ngraph/ops/not.hpp"
-#include "ngraph/ops/not_equal.hpp"
-#include "ngraph/ops/one_hot.hpp"
-#include "ngraph/ops/power.hpp"
-#include "ngraph/ops/reduce.hpp"
-#include "ngraph/ops/replace_slice.hpp"
-#include "ngraph/ops/reshape.hpp"
-#include "ngraph/ops/reverse.hpp"
-#include "ngraph/ops/select.hpp"
-#include "ngraph/ops/sign.hpp"
-#include "ngraph/ops/sin.hpp"
-#include "ngraph/ops/sinh.hpp"
-#include "ngraph/ops/slice.hpp"
-#include "ngraph/ops/sqrt.hpp"
-#include "ngraph/ops/subtract.hpp"
-#include "ngraph/ops/sum.hpp"
-#include "ngraph/ops/tan.hpp"
-#include "ngraph/ops/tanh.hpp"
 #include "ngraph/pass/assign_layout.hpp"
 #include "ngraph/pass/dump_sorted.hpp"
 #include "ngraph/pass/liveness.hpp"
@@ -413,7 +358,7 @@ void runtime::gpu::GPU_ExternalFunction::compile()
     for (shared_ptr<Function> f : pass_manager.get_state().get_functions())
     {
         writer << "extern \"C\" void " << f->get_name()
-               << "(void*** inputs, void*** outputs, cublasHandle_t& cublas_handle);\n";
+               << "(void** inputs, void** outputs, cublasHandle_t& cublas_handle);\n";
     }
     writer << "\n";
@@ -450,7 +395,7 @@ void runtime::gpu::GPU_ExternalFunction::compile()
             string match_function_name;
             for (size_t j = i + 1; j < op_list.size(); j++)
             {
-                if (op_list[i]->is_functionally_identical(*op_list[j]))
+                if (0) //op_list[i]->is_functionally_identical(*op_list[j]))
                 {
                     if (match_function_name.empty())
                     {
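Review note: the `if (0)` short-circuits function deduplication, so every op now gets its own emitted function even when an identical one exists, presumably a workaround until is_functionally_identical can be trusted for the GPU ops. An illustrative analogue of what the disabled pass would do (the names and signature strings are stand-ins, not nGraph API):

```cpp
#include <cstddef>
#include <string>
#include <unordered_map>
#include <vector>

// Illustrative analogue of the disabled reuse pass: identical op "signatures"
// map to the function emitted for their first occurrence, so later duplicates
// call it instead of getting a fresh body.
std::vector<std::string> assign_emitted_functions(const std::vector<std::string>& signatures)
{
    std::unordered_map<std::string, std::string> emitted; // signature -> function name
    std::vector<std::string> names;
    for (std::size_t i = 0; i < signatures.size(); i++)
    {
        auto it = emitted.find(signatures[i]);
        if (it == emitted.end())
        {
            std::string name = "func_" + std::to_string(i); // first occurrence: new function
            emitted[signatures[i]] = name;
            names.push_back(name);
        }
        else
        {
            names.push_back(it->second); // duplicate: reuse the emitted function
        }
    }
    return names;
}
```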
@@ -531,7 +476,7 @@ void runtime::gpu::GPU_ExternalFunction::compile()
         }
         writer << "extern \"C\" void " << current_function->get_name();
-        writer << "(void*** inputs, void*** outputs, cublasHandle_t& cublas_handle)\n";
+        writer << "(void** inputs, void** outputs, cublasHandle_t& cublas_handle)\n";
         writer << "{\n";
         writer.indent++;
...
@@ -39,7 +39,7 @@ runtime::gpu::GPU_TensorView::GPU_TensorView(const ngraph::element::Type& elemen
     m_buffer_size = m_descriptor->get_tensor_view_layout()->get_size() * element_type.size();
     if (m_buffer_size > 0)
     {
-        cudaMalloc(&m_allocated_buffer_pool, m_buffer_size);
+        cudaMalloc((void**) &m_allocated_buffer_pool, m_buffer_size);
     }
 }
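Review note: with the member now declared void* (see the header hunk below), &m_allocated_buffer_pool is already a void**, and the cudaMalloc return code is still unchecked. A hedged sketch with the error check added (gpu_alloc is illustrative; the check is an assumption, not in the diff):

```cpp
#include <cstddef>
#include <stdexcept>
#include <cuda_runtime.h>

// Same allocation with the cudaError_t checked; the committed code ignores it.
void* gpu_alloc(std::size_t bytes)
{
    void* buffer = nullptr;
    cudaError_t err = cudaMalloc(&buffer, bytes);
    if (err != cudaSuccess)
    {
        throw std::runtime_error(cudaGetErrorString(err));
    }
    return buffer;
}
```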
@@ -50,18 +50,10 @@ runtime::gpu::GPU_TensorView::~GPU_TensorView()

 void runtime::gpu::GPU_TensorView::write(const void* source, size_t tensor_offset, size_t n)
 {
-    if (tensor_offset + n > m_buffer_size)
-    {
-        throw out_of_range("write access past end of tensor");
-    }
     cudaMemcpy(m_allocated_buffer_pool, source, n, cudaMemcpyHostToDevice);
 }

 void runtime::gpu::GPU_TensorView::read(void* target, size_t tensor_offset, size_t n) const
 {
-    if (tensor_offset + n > m_buffer_size)
-    {
-        throw out_of_range("read access past end of tensor");
-    }
     cudaMemcpy(target, m_allocated_buffer_pool, n, cudaMemcpyDeviceToHost);
 }
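Review note: the out-of-range guards are removed and tensor_offset is ignored by both copies. If partial reads and writes are still expected, offset-aware copies would look like the sketch below (the byte-offset interpretation is an assumption; these helpers are illustrative, not part of the change):

```cpp
#include <cstddef>
#include <cuda_runtime.h>

// Assumption: tensor_offset is a byte offset into the device buffer. The
// committed code drops both the bounds checks and the offset entirely.
void device_write(void* device_base, const void* source, std::size_t tensor_offset, std::size_t n)
{
    cudaMemcpy(static_cast<char*>(device_base) + tensor_offset, source, n,
               cudaMemcpyHostToDevice);
}

void device_read(const void* device_base, void* target, std::size_t tensor_offset, std::size_t n)
{
    cudaMemcpy(target, static_cast<const char*>(device_base) + tensor_offset, n,
               cudaMemcpyDeviceToHost);
}
```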
@@ -49,6 +49,6 @@ public:
     /// @param n Number of bytes to read, must be integral number of elements.
     void read(void* p, size_t tensor_offset, size_t n) const override;

-    void** m_allocated_buffer_pool;
+    void* m_allocated_buffer_pool;
     size_t m_buffer_size;
 };