Commit 94429375 authored by fenglei.tian

refactor and fix bug

parent 28de1557
mkdir build
cd build
cmake .. -DNGRAPH_GPU_ENABLE=TRUE -DNGRAPH_CPU_ENABLE=TRUE -DCUDNN_ROOT_DIR=/usr/lib/x86_64-linux-gnu/ -DCUDNN_INCLUDE_DIR=/usr/include -DZLIB_LIBRARY=/usr/lib/x86_64-linux-gnu/libz.so -DZLIB_INCLUDE_DIR=/usr/include/ -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
make
make -j24 all
./build/test/unit-test --gtest_filter=GPU.*
./build/test/unit-test --gtest_filter=GPU.ab
#./build/test/unit-test --gtest_filter=GPU.dot*
@@ -17,7 +17,7 @@
#include <stdio.h>
#include <cuda_runtime.h>
#include "cublas_v2.h"
#include "cublas.h"
#include "ngraph/runtime/gpu/gpu_call_frame.hpp"
#include "ngraph/runtime/gpu/gpu_external_function.hpp"
@@ -32,21 +32,27 @@ runtime::gpu::GPU_CallFrame::GPU_CallFrame(std::shared_ptr<GPU_ExternalFunction>
, m_compiled_function(compiled_function)
{
cublasStatus_t stat = cublasCreate(&m_cublas_handle);
-if (stat != cudaSuccess)
+if (stat != CUBLAS_STATUS_SUCCESS)
{
throw runtime_error("cuBLAS create failed");
}
-cublasSetPointerMode(m_cublas_handle, CUBLAS_POINTER_MODE_HOST);
+// Pass scalars as reference on the device
+cublasSetPointerMode(m_cublas_handle, CUBLAS_POINTER_MODE_DEVICE);
}
+runtime::gpu::GPU_CallFrame::~GPU_CallFrame()
+{
+cublasDestroy(m_cublas_handle);
+}
void runtime::gpu::GPU_CallFrame::tensor_call(
const std::vector<std::shared_ptr<ngraph::runtime::TensorView>>& input_tvs,
const std::vector<std::shared_ptr<ngraph::runtime::TensorView>>& output_tvs)
{
// Host tensors
-vector<void**> inputs;
-vector<void**> outputs;
+vector<void*> inputs;
+vector<void*> outputs;
for (size_t i = 0; i < input_tvs.size(); i++)
{
......
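Two details make the call-frame changes above work: cublasCreate returns a cublasStatus_t, so the old comparison against cudaSuccess (a cudaError_t) checked the wrong enum, and the handle now defaults to CUBLAS_POINTER_MODE_DEVICE so generated code can keep scalars on the GPU. A minimal standalone sketch of that lifecycle, with illustrative names rather than the nGraph classes themselves:

```cpp
#include <cublas_v2.h>
#include <stdexcept>

class CublasContext
{
public:
    CublasContext()
    {
        // cublasCreate returns cublasStatus_t; compare against
        // CUBLAS_STATUS_SUCCESS, not cudaSuccess (a cudaError_t).
        cublasStatus_t stat = cublasCreate(&m_handle);
        if (stat != CUBLAS_STATUS_SUCCESS)
        {
            throw std::runtime_error("cuBLAS create failed");
        }
        // Default to device-side scalars; emitters that pass host scalars
        // (e.g. &alpha) switch to HOST mode around the individual call.
        cublasSetPointerMode(m_handle, CUBLAS_POINTER_MODE_DEVICE);
    }

    ~CublasContext() { cublasDestroy(m_handle); }

private:
    cublasHandle_t m_handle{};
};
```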
@@ -35,8 +35,8 @@ namespace ngraph
class GPU_CallFrame;
class GPU_ExternalFunction;
-using EntryPoint_t = void(void*** inputs,
-void*** outputs,
+using EntryPoint_t = void(void** inputs,
+void** outputs,
cublasHandle_t& cublas_handle);
using EntryPoint = std::function<EntryPoint_t>;
@@ -48,7 +48,7 @@ namespace ngraph
GPU_CallFrame(std::shared_ptr<GPU_ExternalFunction> external_function,
EntryPoint compiled_function);
-~GPU_CallFrame() override = default;
+~GPU_CallFrame() override;
/// @brief Invoke the function with values matching the signature of the function.
///
......
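The header change mirrors the call frame: each tensor is now represented by a single device pointer, so the generated entry point takes flat void** arrays instead of void***. A hypothetical caller showing how the vector<void*> built in tensor_call lines up with the new signature:

```cpp
#include <cublas_v2.h>
#include <vector>

using EntryPoint_t = void(void** inputs, void** outputs, cublasHandle_t& cublas_handle);

// vector<void*>::data() is exactly the void** the entry point expects,
// so no extra level of indirection is needed.
void invoke(EntryPoint_t* entry,
            std::vector<void*>& inputs,
            std::vector<void*>& outputs,
            cublasHandle_t& handle)
{
    entry(inputs.data(), outputs.data(), handle);
}
```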
@@ -63,55 +63,28 @@ void runtime::gpu::GPU_Emitter::EmitAdd(codegen::CodeWriter& writer,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
const Shape& arg0_shape = args[0].get_shape();
const Shape& arg1_shape = args[1].get_shape();
if (arg0_shape.empty() || arg1_shape.empty())
{
auto& first = (arg0_shape.empty() ? args[0] : args[1]);
auto& second = (arg0_shape.empty() ? args[1] : args[0]);
}
// clang-format off
else if ((arg0_shape.size() <= 2) && (arg1_shape.size() <= 2))
{
// TODO Assert arg0_shape[0] == arg1_shape[0]?
writer << "{ // " << n->get_name() << "\n";
writer.indent++;
writer << "static const float alpha = 1.0;\n";
writer << "cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_HOST);\n";;
writer << "cublasScopy("
<< "cublas_handle,"
<< out[0].get_size() << ","
writer << "const float alpha = 1.0;\n";
writer << "const float beta = 1.0;\n";
writer << "cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_HOST);\n";
writer << "cublasSgeam("
<< "cublas_handle, CUBLAS_OP_N, CUBLAS_OP_N,\n"
<< out[0].get_size() << ","
<< " 1, \n"
<< " &alpha, "
<< args[0].get_name() << ","
-// Todo handle striding?
-<< "1,"
+<< args[0].get_size() << ",\n"
+<< " &beta, "
<< args[1].get_name() << ","
+<< args[1].get_size() << ",\n"
<< out[0].get_name() << ","
<< "1);\n";
writer << "cublasSaxpy("
<< "cublas_handle,"
<< out[0].get_size() << ","
<< "&alpha," //alpha
<< args[1].get_name() << ","
// Todo handle striding?
<< "1,"
<< out[0].get_name() << ","
<< "1);\n";
<< out[0].get_size() << ");\n";
writer.indent--;
writer << "cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_DEVICE);\n";;
writer << "}\n";
}
// clang-format on
else if ((arg0_shape.size() == 2) && (arg1_shape.size() == 1))
{
}
else if ((arg0_shape.size() == 2) && (arg1_shape.size() == 2))
{
// GEMM Call
}
else
{
// General ND Call?
}
}
void runtime::gpu::GPU_Emitter::EmitConcat(codegen::CodeWriter& writer,
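The rewritten EmitAdd collapses the old cublasScopy + cublasSaxpy pair into a single cublasSgeam call: with alpha = beta = 1 and each n-element tensor treated as an n x 1 column matrix, C = alpha*A + beta*B is exactly an elementwise add. A sketch of the call the generated code performs, with illustrative buffer names rather than the emitter's exact output:

```cpp
#include <cublas_v2.h>

// Elementwise c = a + b on device buffers of n floats, done as one
// cublasSgeam on n x 1 matrices. alpha/beta live on the host, so the
// call runs in HOST pointer mode, matching the emitted code.
void gpu_add(cublasHandle_t handle, const float* d_a, const float* d_b, float* d_c, int n)
{
    const float alpha = 1.0f;
    const float beta = 1.0f;
    cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST);
    cublasSgeam(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                n, 1,           // m x n: the tensor as an n x 1 matrix
                &alpha, d_a, n, // lda >= m
                &beta, d_b, n,
                d_c, n);
}
```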
@@ -188,7 +161,6 @@ void runtime::gpu::GPU_Emitter::EmitDot(codegen::CodeWriter& writer,
<< out[0].get_name() << ","
<< "1);\n";
// clang-format on
writer << "cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_DEVICE);\n";
;
writer.indent--;
writer << "}\n";
@@ -224,7 +196,6 @@ void runtime::gpu::GPU_Emitter::EmitDot(codegen::CodeWriter& writer,
<< out[0].get_name() << ","
<< "n);\n";
// clang-format on
writer << "cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_DEVICE);\n";
writer.indent--;
writer << "}\n";
}
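The cublasSetPointerMode(..., CUBLAS_POINTER_MODE_DEVICE) resets deleted here and in the hunks below became redundant once the call frame sets DEVICE mode at handle creation. DEVICE mode is also what lets a reduction write its scalar result straight to GPU memory; a sketch of the 1-D dot case, with illustrative names rather than the emitter's exact output:

```cpp
#include <cublas_v2.h>

// With the handle in CUBLAS_POINTER_MODE_DEVICE, the result argument of
// cublasSdot is a device pointer, so the scalar never round-trips to host.
void gpu_dot(cublasHandle_t handle, const float* d_x, const float* d_y, float* d_result, int n)
{
    cublasSdot(handle, n, d_x, 1, d_y, 1, d_result);
}
```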
@@ -333,7 +304,6 @@ void runtime::gpu::GPU_Emitter::EmitMaximum(codegen::CodeWriter& writer,
writer += R"(
cudnnDestroy(cudnnHandle);
)";
writer << "cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_DEVICE);\n";;
writer.indent--;
writer << "}\n";
// clang-format on
@@ -458,7 +428,6 @@ void runtime::gpu::GPU_Emitter::EmitReshape(codegen::CodeWriter& writer,
<< arg_shape[1] << ","
<< out[0].get_name() << ","
<< out[0].get_shape()[1] << ");\n";
writer << "cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_DEVICE);\n";;
writer.indent--;
writer << "}\n";
//clang-format on
@@ -541,7 +510,6 @@ void runtime::gpu::GPU_Emitter::EmitMultiply(
<< "1" // Stride y
<< ");\n";
writer.indent--;
writer << "cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_DEVICE);\n";;
writer << "}\n";
// clang-format on
}
......
@@ -21,6 +21,11 @@
#include <typeinfo>
#include <unordered_map>
+#include <cuda_runtime.h>
+#include <cudnn_v7.h>
+#include "cublas_v2.h"
+#include "cuda.h"
#include "ngraph/codegen/code_writer.hpp"
#include "ngraph/codegen/compiler.hpp"
#include "ngraph/codegen/execution_engine.hpp"
@@ -223,19 +228,6 @@ void runtime::gpu::GPU_ExternalFunction::compile()
writer +=
R"(// Generated by the NGraph GPU backend
-#include <cassert>
-#include <cmath>
-#include <cstdlib>
-#include <fstream>
-#include <fstream>
-#include <iostream>
-#include <memory>
-#include <string>
-#include <tuple>
-#include <typeindex>
-#include <typeinfo>
-#include <unordered_map>
-#include <cuda_runtime.h>
#include <cudnn_v7.h>
#include "cublas_v2.h"
@@ -249,53 +241,6 @@ void runtime::gpu::GPU_ExternalFunction::compile()
#include "ngraph/function.hpp"
#include "ngraph/graph_util.hpp"
#include "ngraph/node.hpp"
#include "ngraph/ops/abs.hpp"
#include "ngraph/ops/acos.hpp"
#include "ngraph/ops/add.hpp"
#include "ngraph/ops/asin.hpp"
#include "ngraph/ops/atan.hpp"
#include "ngraph/ops/broadcast.hpp"
#include "ngraph/ops/ceiling.hpp"
#include "ngraph/ops/concatenate.hpp"
#include "ngraph/ops/constant.hpp"
#include "ngraph/ops/convert.hpp"
#include "ngraph/ops/convolution.hpp"
#include "ngraph/ops/cos.hpp"
#include "ngraph/ops/cosh.hpp"
#include "ngraph/ops/divide.hpp"
#include "ngraph/ops/dot.hpp"
#include "ngraph/ops/equal.hpp"
#include "ngraph/ops/exp.hpp"
#include "ngraph/ops/floor.hpp"
#include "ngraph/ops/function_call.hpp"
#include "ngraph/ops/greater.hpp"
#include "ngraph/ops/greater_eq.hpp"
#include "ngraph/ops/less.hpp"
#include "ngraph/ops/less_eq.hpp"
#include "ngraph/ops/log.hpp"
#include "ngraph/ops/max_pool.hpp"
#include "ngraph/ops/maximum.hpp"
#include "ngraph/ops/minimum.hpp"
#include "ngraph/ops/multiply.hpp"
#include "ngraph/ops/negative.hpp"
#include "ngraph/ops/not.hpp"
#include "ngraph/ops/not_equal.hpp"
#include "ngraph/ops/one_hot.hpp"
#include "ngraph/ops/power.hpp"
#include "ngraph/ops/reduce.hpp"
#include "ngraph/ops/replace_slice.hpp"
#include "ngraph/ops/reshape.hpp"
#include "ngraph/ops/reverse.hpp"
#include "ngraph/ops/select.hpp"
#include "ngraph/ops/sign.hpp"
#include "ngraph/ops/sin.hpp"
#include "ngraph/ops/sinh.hpp"
#include "ngraph/ops/slice.hpp"
#include "ngraph/ops/sqrt.hpp"
#include "ngraph/ops/subtract.hpp"
#include "ngraph/ops/sum.hpp"
#include "ngraph/ops/tan.hpp"
#include "ngraph/ops/tanh.hpp"
#include "ngraph/pass/assign_layout.hpp"
#include "ngraph/pass/dump_sorted.hpp"
#include "ngraph/pass/liveness.hpp"
@@ -413,7 +358,7 @@ void runtime::gpu::GPU_ExternalFunction::compile()
for (shared_ptr<Function> f : pass_manager.get_state().get_functions())
{
writer << "extern \"C\" void " << f->get_name()
<< "(void*** inputs, void*** outputs, cublasHandle_t& cublas_handle);\n";
<< "(void** inputs, void** outputs, cublasHandle_t& cublas_handle);\n";
}
writer << "\n";
@@ -450,7 +395,7 @@ void runtime::gpu::GPU_ExternalFunction::compile()
string match_function_name;
for (size_t j = i + 1; j < op_list.size(); j++)
{
-if (op_list[i]->is_functionally_identical(*op_list[j]))
+if (0) //op_list[i]->is_functionally_identical(*op_list[j]))
{
if (match_function_name.empty())
{
@@ -531,7 +476,7 @@ void runtime::gpu::GPU_ExternalFunction::compile()
}
writer << "extern \"C\" void " << current_function->get_name();
writer << "(void*** inputs, void*** outputs, cublasHandle_t& cublas_handle)\n";
writer << "(void** inputs, void** outputs, cublasHandle_t& cublas_handle)\n";
writer << "{\n";
writer.indent++;
......
@@ -39,7 +39,7 @@ runtime::gpu::GPU_TensorView::GPU_TensorView(const ngraph::element::Type& elemen
m_buffer_size = m_descriptor->get_tensor_view_layout()->get_size() * element_type.size();
if (m_buffer_size > 0)
{
-cudaMalloc(&m_allocated_buffer_pool, m_buffer_size);
+cudaMalloc((void**) &m_allocated_buffer_pool, m_buffer_size);
}
}
@@ -50,18 +50,10 @@ runtime::gpu::GPU_TensorView::~GPU_TensorView()
void runtime::gpu::GPU_TensorView::write(const void* source, size_t tensor_offset, size_t n)
{
-if (tensor_offset + n > m_buffer_size)
-{
-throw out_of_range("write access past end of tensor");
-}
cudaMemcpy(m_allocated_buffer_pool, source, n, cudaMemcpyHostToDevice);
}
void runtime::gpu::GPU_TensorView::read(void* target, size_t tensor_offset, size_t n) const
{
-if (tensor_offset + n > m_buffer_size)
-{
-throw out_of_range("read access past end of tensor");
-}
cudaMemcpy(target, m_allocated_buffer_pool, n, cudaMemcpyDeviceToHost);
}
@@ -49,6 +49,6 @@ public:
/// @param n Number of bytes to read, must be integral number of elements.
void read(void* p, size_t tensor_offset, size_t n) const override;
-void** m_allocated_buffer_pool;
+void* m_allocated_buffer_pool;
size_t m_buffer_size;
};
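With m_allocated_buffer_pool now a plain void*, &m_allocated_buffer_pool is already the void** that cudaMalloc expects (the explicit cast is merely defensive), and read/write reduce to plain cudaMemcpy calls; note that both copies still ignore tensor_offset. A compressed sketch of the buffer lifecycle under those assumptions, with illustrative names:

```cpp
#include <cuda_runtime.h>
#include <cstddef>

struct DeviceBuffer
{
    void* ptr = nullptr;
    size_t size = 0;

    explicit DeviceBuffer(size_t n) : size(n)
    {
        // &ptr is void** -- exactly the type cudaMalloc takes.
        cudaMalloc(&ptr, size);
    }
    ~DeviceBuffer() { cudaFree(ptr); }

    void write(const void* source, size_t n)
    {
        cudaMemcpy(ptr, source, n, cudaMemcpyHostToDevice);
    }
    void read(void* target, size_t n) const
    {
        cudaMemcpy(target, ptr, n, cudaMemcpyDeviceToHost);
    }
};
```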