Unverified Commit c5549682 authored by Tristan Webb, committed by GitHub

Drwebb/gpu external function (#367)

* Initial GPU_ExternalFunction implementation

Other changes:

- Add the GPU runtime to the same cmake block as the GPU, and include CUDA headers if the GPU is enabled
- Initial passing (a+b)*c test
- Properly link CUDA libraries
- Simple GPUTensorView implementation
- Initial GPU emitter
- GPU codegen: initial function generation, no kernels yet
- Rename GPU emitter and tensor_view_wrapper to match the naming convention

* GPU external function based on BASE

* Fix stray base -> gpu

* TensorViewWrapper -> GPU_TensorViewWrapper

* Copy over emitter from base transformer

* Fix for naming dense layout

* Copy kernel emitters from base -> gpu and strip out kernel_utils

* Add aliases to GPU_TensorViewWrappers

* More fixes for naming descriptor::TensorViews

* Move in call_frame implementation from base -> gpu

* apply code format

* GPU codegen running (A+B)*C

- GPU emitters
- GPU context setup and cuda_module kernels
- Remove GPU_CF perf counters
- Use GPU kernels in the external function
- Add a GPU 1d dot test

Review changes:
* Remove CPU-specific kernel-emitting method bodies

* Use copy_data from test/util.cpp, uncomment compileTest

* Use the test_utils copy_data function

* Grab the function name from the pass manager for the function definition, clean up indentation
parent e433e55a
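Note: the (a+b)*c test itself is not part of this excerpt. As a rough sketch of how the new backend is exercised (the Manager/compile/make_call_frame/call sequence mirrors the code in this diff; copy_data comes from test/util.cpp as mentioned above, while read_vector, element::f32 and the parameter-list type are assumptions based on the CPU backend tests of the same period):

// Hypothetical sketch of the (a+b)*c GPU test; helper names are assumptions.
#include "gtest/gtest.h"
#include "ngraph/ngraph.hpp"
#include "util/test_tools.hpp" // assumed home of copy_data / read_vector

using namespace std;
using namespace ngraph;

TEST(gpu, abc)
{
    auto shape = Shape{2, 2};
    auto A = make_shared<op::Parameter>(element::f32, shape);
    auto B = make_shared<op::Parameter>(element::f32, shape);
    auto C = make_shared<op::Parameter>(element::f32, shape);
    auto f = make_shared<Function>(make_shared<op::Multiply>(make_shared<op::Add>(A, B), C),
                                   op::Parameters{A, B, C});

    auto manager = runtime::Manager::get("GPU");
    auto external = manager->compile(f);
    auto backend = manager->allocate_backend();
    auto cf = backend->make_call_frame(external);

    // Tensor views are host-resident; the generated code copies them to and
    // from device buffers (see gpu_external_function.cpp below).
    auto a = backend->make_primary_tensor_view(element::f32, shape);
    auto b = backend->make_primary_tensor_view(element::f32, shape);
    auto c = backend->make_primary_tensor_view(element::f32, shape);
    auto result = backend->make_primary_tensor_view(element::f32, shape);

    copy_data(a, vector<float>{1, 2, 3, 4});
    copy_data(b, vector<float>{5, 6, 7, 8});
    copy_data(c, vector<float>{9, 10, 11, 12});

    cf->call({a, b, c}, {result});
    EXPECT_EQ((vector<float>{54, 80, 110, 144}), read_vector<float>(result));
}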
@@ -190,9 +190,12 @@ endif()
set(SRC ${SRC}
    runtime/gpu/gpu_call_frame.cpp
    runtime/gpu/gpu_backend.cpp
-   runtime/gpu/gpu_manager.cpp
+   runtime/gpu/gpu_emitter.cpp
    runtime/gpu/gpu_external_function.cpp
+   runtime/gpu/gpu_kernel_emitters.cpp
+   runtime/gpu/gpu_manager.cpp
    runtime/gpu/gpu_tensor_view.cpp
+   runtime/gpu/gpu_tensor_view_wrapper.cpp
)
set_property(SOURCE codegen/compiler.cpp APPEND_STRING PROPERTY COMPILE_DEFINITIONS
    "CUDA_HEADER_PATHS=\"${CUDA_INCLUDE_DIRS}\";")
@@ -265,7 +268,7 @@ if(NGRAPH_CPU_ENABLE)
endif()
if(NGRAPH_GPU_ENABLE AND CUDA_LIBRARIES)
-   target_link_libraries(ngraph PRIVATE ${CUDA_LIBRARIES} ${CUDNN_LIBRARIES})
+   target_link_libraries(ngraph PRIVATE cuda)
endif()
# Argon
...
@@ -13,6 +13,7 @@
// ----------------------------------------------------------------------------
#include "ngraph/runtime/gpu/gpu_backend.hpp"
+#include "ngraph/runtime/cpu/cpu_tensor_view.hpp"
#include "ngraph/runtime/external_function.hpp"
#include "ngraph/runtime/gpu/gpu_tensor_view.hpp"
@@ -29,6 +30,6 @@ std::shared_ptr<ngraph::runtime::TensorView>
runtime::gpu::GPU_Backend::make_primary_tensor_view(const ngraph::element::Type& element_type,
                                                     const Shape& shape)
{
-   auto rc = make_shared<runtime::gpu::GPU_TensorView>(element_type, shape);
+   auto rc = make_shared<runtime::cpu::CPU_TensorView>(element_type, shape);
    return dynamic_pointer_cast<runtime::TensorView>(rc);
}
@@ -12,25 +12,66 @@
// see the license for the specific language governing permissions and
// ----------------------------------------------------------------------------
+#include <cstdlib>
+#include <fstream>
+#include "ngraph/runtime/cpu/cpu_tensor_view.hpp"
#include "ngraph/runtime/gpu/gpu_call_frame.hpp"
+#include "ngraph/runtime/gpu/gpu_external_function.hpp"
+#include "ngraph/runtime/gpu/gpu_tensor_view.hpp"
using namespace std;
-using namespace ngraph::runtime::gpu;
+using namespace ngraph;
-GPU_CallFrame::GPU_CallFrame(shared_ptr<GPU_ExternalFunction> external_function,
-                             shared_ptr<Function> func)
+runtime::gpu::GPU_CallFrame::GPU_CallFrame(std::shared_ptr<GPU_ExternalFunction> external_function,
+                                           EntryPoint compiled_function)
    : m_external_function(external_function)
-   , m_function(func)
+   , m_compiled_function(compiled_function)
{
}
-void GPU_CallFrame::call(const vector<shared_ptr<Value>>& input_tvs,
-                         const vector<shared_ptr<Value>>& output_tvs)
+void runtime::gpu::GPU_CallFrame::tensor_call(
+    const std::vector<std::shared_ptr<ngraph::runtime::TensorView>>& input_tvs,
+    const std::vector<std::shared_ptr<ngraph::runtime::TensorView>>& output_tvs)
{
+   // Host tensors
+   vector<void*> inputs;
+   vector<void*> outputs;
+   for (size_t i = 0; i < input_tvs.size(); i++)
+   {
+       shared_ptr<runtime::cpu::CPU_TensorView> tv =
+           static_pointer_cast<runtime::cpu::CPU_TensorView>(input_tvs[i]);
+       inputs.push_back(tv->get_data_ptr());
+   }
+   for (size_t i = 0; i < output_tvs.size(); i++)
+   {
+       shared_ptr<runtime::cpu::CPU_TensorView> tv =
+           static_pointer_cast<runtime::cpu::CPU_TensorView>(output_tvs[i]);
+       outputs.push_back(tv->get_data_ptr());
+   }
+   // Invoke compiled computation
+   m_compiled_function(inputs.data(), outputs.data());
}
-void GPU_CallFrame::tensor_call(const std::vector<std::shared_ptr<TensorView>>& inputs,
-                                const std::vector<std::shared_ptr<TensorView>>& outputs)
+void runtime::gpu::GPU_CallFrame::call(
+    const std::vector<std::shared_ptr<runtime::TensorView>>& arguments,
+    const std::vector<std::shared_ptr<runtime::TensorView>>& results)
{
+   // TODO: Check types of args and result
+   vector<shared_ptr<runtime::TensorView>> inputs;
+   for (shared_ptr<runtime::TensorView> argument : arguments)
+   {
+       argument->collect_tensor_views(inputs, argument);
+   }
+   vector<shared_ptr<runtime::TensorView>> outputs;
+   for (shared_ptr<runtime::TensorView> result : results)
+   {
+       result->collect_tensor_views(outputs, result);
+   }
+   tensor_call(inputs, outputs);
}
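The EntryPoint type stored in m_compiled_function above is not defined in this hunk; judging from how it is invoked here (raw arrays of host pointers) and how the generated function is declared later (extern "C" void name(void** inputs, void** outputs)), it is presumably an alias along these lines (a sketch, not the actual typedef):

// Assumed reconstruction of the EntryPoint alias; the real definition lives
// in a header that is not part of this diff.
#include <functional>

using EntryPoint_t = void(void** inputs, void** outputs);
using EntryPoint = std::function<EntryPoint_t>;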
@@ -26,6 +26,8 @@ namespace ngraph
{
    namespace runtime
    {
+       class PrimaryTensorView;
        namespace gpu
        {
            class GPU_CallFrame;
@@ -40,22 +42,23 @@
            {
            public:
                GPU_CallFrame(std::shared_ptr<GPU_ExternalFunction> external_function,
-                             std::shared_ptr<Function> func);
+                             EntryPoint compiled_function);
                /// @brief Invoke the function with values matching the signature of the function.
                ///
                /// Tuples will be expanded into their tensor views to build the call frame.
-               void call(const std::vector<std::shared_ptr<ngraph::runtime::Value>>& inputs,
-                         const std::vector<std::shared_ptr<ngraph::runtime::Value>>& outputs);
+               void
+                   call(const std::vector<std::shared_ptr<runtime::TensorView>>& inputs,
+                        const std::vector<std::shared_ptr<runtime::TensorView>>& outputs) override;
                /// @brief Invoke the function with tuples pre-expanded to their underlying
                /// tensor views.
                void tensor_call(const std::vector<std::shared_ptr<TensorView>>& inputs,
-                                const std::vector<std::shared_ptr<TensorView>>& outputs);
+                                const std::vector<std::shared_ptr<TensorView>>& outputs) override;
            protected:
                std::shared_ptr<GPU_ExternalFunction> m_external_function;
-               std::shared_ptr<Function> m_function;
+               EntryPoint m_compiled_function;
            };
        }
    }
...
// ----------------------------------------------------------------------------
// Copyright 2017 Nervana Systems Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// ----------------------------------------------------------------------------
#include <algorithm>
#include <cmath>
#include <iostream>
#include <string>
#include <typeindex>
#include <unordered_map>
#include <vector>
#include "ngraph/node.hpp"
#include "ngraph/ops/broadcast.hpp"
#include "ngraph/ops/concatenate.hpp"
#include "ngraph/ops/constant.hpp"
#include "ngraph/ops/convolution.hpp"
#include "ngraph/ops/dot.hpp"
#include "ngraph/ops/function_call.hpp"
#include "ngraph/ops/get_output_element.hpp"
#include "ngraph/ops/max_pool.hpp"
#include "ngraph/ops/one_hot.hpp"
#include "ngraph/ops/reduce.hpp"
#include "ngraph/ops/replace_slice.hpp"
#include "ngraph/ops/reshape.hpp"
#include "ngraph/ops/reverse.hpp"
#include "ngraph/ops/slice.hpp"
#include "ngraph/ops/sum.hpp"
#include "ngraph/runtime/gpu/gpu_emitter.hpp"
#include "ngraph/runtime/gpu/gpu_kernel_emitters.hpp"
#include "ngraph/util.hpp"
using namespace std;
using namespace ngraph;
void runtime::gpu::GPU_Emitter::EmitNop(const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
}
void runtime::gpu::GPU_Emitter::EmitAdd(const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
}
void runtime::gpu::GPU_Emitter::EmitDot(const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
}
void runtime::gpu::GPU_Emitter::EmitDivide(const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
}
void runtime::gpu::GPU_Emitter::EmitEqual(const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
}
void runtime::gpu::GPU_Emitter::EmitGreater(const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
}
void runtime::gpu::GPU_Emitter::EmitGreaterEq(
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
}
void runtime::gpu::GPU_Emitter::EmitLess(const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
}
void runtime::gpu::GPU_Emitter::EmitLessEq(const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
}
void runtime::gpu::GPU_Emitter::EmitLog(const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
}
void runtime::gpu::GPU_Emitter::EmitMaximum(const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
}
void runtime::gpu::GPU_Emitter::EmitMinimum(const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
}
void runtime::gpu::GPU_Emitter::EmitNegative(
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
}
void runtime::gpu::GPU_Emitter::EmitNotEqual(
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
}
void runtime::gpu::GPU_Emitter::EmitSelect(const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
}
void runtime::gpu::GPU_Emitter::EmitSubtract(
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
}
void runtime::gpu::GPU_Emitter::EmitBroadcast(
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
}
void runtime::gpu::GPU_Emitter::EmitConvert(const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
}
void runtime::gpu::GPU_Emitter::EmitConstant(
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
}
void runtime::gpu::GPU_Emitter::EmitReshape(const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
}
void runtime::gpu::GPU_Emitter::EmitFunctionCall(
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
}
void runtime::gpu::GPU_Emitter::EmitReduce(const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
}
void runtime::gpu::GPU_Emitter::EmitSign(const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
}
void runtime::gpu::GPU_Emitter::EmitSlice(const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
}
void runtime::gpu::GPU_Emitter::EmitSum(const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
}
void runtime::gpu::GPU_Emitter::EmitExp(const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
}
void runtime::gpu::GPU_Emitter::EmitSin(const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
}
void runtime::gpu::GPU_Emitter::EmitSinh(const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
}
void runtime::gpu::GPU_Emitter::EmitCos(const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
}
void runtime::gpu::GPU_Emitter::EmitCosh(const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
}
void runtime::gpu::GPU_Emitter::EmitTan(const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
}
void runtime::gpu::GPU_Emitter::EmitTanh(const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
}
void runtime::gpu::GPU_Emitter::EmitAsin(const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
}
void runtime::gpu::GPU_Emitter::EmitAcos(const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
}
void runtime::gpu::GPU_Emitter::EmitAtan(const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
}
void runtime::gpu::GPU_Emitter::EmitPower(const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
}
void runtime::gpu::GPU_Emitter::EmitReplaceSlice(
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
}
void runtime::gpu::GPU_Emitter::EmitOneHot(const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
}
void runtime::gpu::GPU_Emitter::EmitCeiling(const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
}
void runtime::gpu::GPU_Emitter::EmitFloor(const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
}
void runtime::gpu::GPU_Emitter::EmitSqrt(const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
}
void runtime::gpu::GPU_Emitter::EmitConvolution(
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
}
void runtime::gpu::GPU_Emitter::EmitNot(const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
}
void runtime::gpu::GPU_Emitter::EmitMaxPool(const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
}
void runtime::gpu::GPU_Emitter::EmitReverse(const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
}
//------------------------------------------------------------------------------------------------
// Utility methods
//------------------------------------------------------------------------------------------------
void runtime::gpu::GPU_Emitter::generate_call(
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out,
shared_ptr<Function> function)
{
vector<string> input_names;
vector<string> output_names;
for (const runtime::gpu::GPU_TensorViewWrapper& input : args)
{
input_names.push_back(input.get_name());
}
for (const runtime::gpu::GPU_TensorViewWrapper& output : out)
{
output_names.push_back(output.get_name());
}
m_out << "void* args[] =\n{";
m_out.indent++;
m_out << "\n" << join(input_names, ",\n");
m_out.indent--;
m_out << "\n};\n";
m_out << "void* out[] =\n{";
m_out.indent++;
m_out << "\n" << join(output_names, ",\n");
m_out.indent--;
m_out << "\n};\n";
m_out << "\n";
m_out << function->get_name() << "(args, out);\n";
}
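For a FunctionCall with, say, two inputs named arg0 and arg1 and one output named out0, generate_call above writes a block of this shape into the generated source; the stand-alone program below illustrates the same calling convention with hypothetical names:

// Minimal illustration of the emitted calling convention: tensor pointers are
// packed into void* arrays and the callee is invoked as name(args, out).
#include <cstdio>

extern "C" void some_function(void** inputs, void** outputs)
{
    // Toy body: add the two float inputs into the single float output.
    float* a = static_cast<float*>(inputs[0]);
    float* b = static_cast<float*>(inputs[1]);
    float* r = static_cast<float*>(outputs[0]);
    *r = *a + *b;
}

int main()
{
    float x = 1.5f, y = 2.5f, result = 0.0f;
    float* arg0 = &x;
    float* arg1 = &y;
    float* out0 = &result;

    // This mirrors what generate_call emits for input names {arg0, arg1}
    // and output names {out0}.
    void* args[] =
    {
        arg0,
        arg1
    };
    void* out[] =
    {
        out0
    };

    some_function(args, out);
    std::printf("%g\n", result); // prints 4
}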
static string format_name(const string& name)
{
string rc;
if (!name.empty())
{
rc = " " + name;
}
return rc;
}
void runtime::gpu::GPU_Emitter::EmitAbs(const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
}
void runtime::gpu::GPU_Emitter::EmitConcat(const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
}
void runtime::gpu::GPU_Emitter::EmitMultiply(
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
}
// ----------------------------------------------------------------------------
// Copyright 2017 Nervana Systems Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// ----------------------------------------------------------------------------
#pragma once
#include <string>
#include <vector>
#include "ngraph/codegen/code_writer.hpp"
#include "ngraph/node.hpp"
#include "ngraph/runtime/gpu/gpu_external_function.hpp"
#include "ngraph/runtime/gpu/gpu_tensor_view_wrapper.hpp"
#define EMITTER_DECL(E) \
E(const ngraph::Node* n, \
const std::vector<ngraph::runtime::gpu::GPU_TensorViewWrapper>& args, \
const std::vector<ngraph::runtime::gpu::GPU_TensorViewWrapper>& out)
namespace ngraph
{
namespace runtime
{
namespace gpu
{
class GPU_Emitter
{
protected:
codegen::CodeWriter m_out;
bool m_use_ref_kernels;
public:
GPU_Emitter()
: m_out()
, m_use_ref_kernels(std::getenv("NGRAPH_GPU_USE_REF_KERNELS") != nullptr)
{
}
std::string get_code() { return m_out.get_code(); }
codegen::CodeWriter& get_code_writer() { return m_out; }
void EMITTER_DECL(EmitNop);
void EMITTER_DECL(EmitAdd);
void EMITTER_DECL(EmitDot);
void EMITTER_DECL(EmitMultiply);
void EMITTER_DECL(EmitGetOutputElement);
void EMITTER_DECL(EmitXLAGetTupleElement);
void EMITTER_DECL(EmitTuple);
void EMITTER_DECL(EmitAbs);
void EMITTER_DECL(EmitConcat);
void EMITTER_DECL(EmitDivide);
void EMITTER_DECL(EmitEqual);
void EMITTER_DECL(EmitGreater);
void EMITTER_DECL(EmitGreaterEq);
void EMITTER_DECL(EmitLess);
void EMITTER_DECL(EmitLessEq);
void EMITTER_DECL(EmitLog);
void EMITTER_DECL(EmitMaximum);
void EMITTER_DECL(EmitMinimum);
void EMITTER_DECL(EmitNegative);
void EMITTER_DECL(EmitNotEqual);
void EMITTER_DECL(EmitSelect);
void EMITTER_DECL(EmitSubtract);
void EMITTER_DECL(EmitBroadcast);
void EMITTER_DECL(EmitConvert);
void EMITTER_DECL(EmitConstant);
void EMITTER_DECL(EmitReshape);
void EMITTER_DECL(EmitFunctionCall);
void EMITTER_DECL(EmitReduce);
void EMITTER_DECL(EmitSign);
void EMITTER_DECL(EmitSlice);
void EMITTER_DECL(EmitSum);
void EMITTER_DECL(EmitExp);
void EMITTER_DECL(EmitSin);
void EMITTER_DECL(EmitSinh);
void EMITTER_DECL(EmitCos);
void EMITTER_DECL(EmitCosh);
void EMITTER_DECL(EmitTan);
void EMITTER_DECL(EmitTanh);
void EMITTER_DECL(EmitAsin);
void EMITTER_DECL(EmitAcos);
void EMITTER_DECL(EmitAtan);
void EMITTER_DECL(EmitPower);
void EMITTER_DECL(EmitReplaceSlice);
void EMITTER_DECL(EmitOneHot);
void EMITTER_DECL(EmitFloor);
void EMITTER_DECL(EmitCeiling);
void EMITTER_DECL(EmitSqrt);
void EMITTER_DECL(EmitConvolution);
void EMITTER_DECL(EmitNot);
void EMITTER_DECL(EmitMaxPool);
void EMITTER_DECL(EmitReverse);
private:
void generate_call(const std::vector<GPU_TensorViewWrapper>& args,
const std::vector<GPU_TensorViewWrapper>& out,
std::shared_ptr<Function> function);
std::string emit_vector(const GPU_TensorViewWrapper&, const std::string& name = "");
std::string emit_array1d(const GPU_TensorViewWrapper&,
const std::string& name = "");
std::string emit_matrix(const GPU_TensorViewWrapper&, const std::string& name = "");
};
}
}
}
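For reference, each declaration such as void EMITTER_DECL(EmitAdd); above expands to:

void EmitAdd(const ngraph::Node* n,
             const std::vector<ngraph::runtime::gpu::GPU_TensorViewWrapper>& args,
             const std::vector<ngraph::runtime::gpu::GPU_TensorViewWrapper>& out);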
@@ -12,35 +12,1012 @@
// See the License for the specific language governing permissions and
// ----------------------------------------------------------------------------
#include <cstdlib>
#include <fstream>
#include <memory>
#include <string>
#include <tuple>
#include <typeindex>
#include <typeinfo>
#include <unordered_map>
#include "ngraph/codegen/code_writer.hpp"
#include "ngraph/codegen/compiler.hpp"
#include "ngraph/codegen/execution_engine.hpp"
#include "ngraph/descriptor/input.hpp"
#include "ngraph/descriptor/layout/dense_tensor_view_layout.hpp"
#include "ngraph/descriptor/output.hpp"
#include "ngraph/descriptor/primary_tensor_view.hpp"
#include "ngraph/file_util.hpp"
#include "ngraph/function.hpp" #include "ngraph/function.hpp"
#include "ngraph/graph_util.hpp"
#include "ngraph/node.hpp"
#include "ngraph/ops/abs.hpp"
#include "ngraph/ops/acos.hpp"
#include "ngraph/ops/add.hpp"
#include "ngraph/ops/asin.hpp"
#include "ngraph/ops/atan.hpp"
#include "ngraph/ops/broadcast.hpp"
#include "ngraph/ops/ceiling.hpp"
#include "ngraph/ops/concatenate.hpp"
#include "ngraph/ops/constant.hpp"
#include "ngraph/ops/convert.hpp"
#include "ngraph/ops/convolution.hpp"
#include "ngraph/ops/cos.hpp"
#include "ngraph/ops/cosh.hpp"
#include "ngraph/ops/divide.hpp"
#include "ngraph/ops/dot.hpp"
#include "ngraph/ops/equal.hpp"
#include "ngraph/ops/exp.hpp"
#include "ngraph/ops/floor.hpp"
#include "ngraph/ops/function_call.hpp"
#include "ngraph/ops/greater.hpp"
#include "ngraph/ops/greater_eq.hpp"
#include "ngraph/ops/less.hpp"
#include "ngraph/ops/less_eq.hpp"
#include "ngraph/ops/log.hpp"
#include "ngraph/ops/max_pool.hpp"
#include "ngraph/ops/maximum.hpp"
#include "ngraph/ops/minimum.hpp"
#include "ngraph/ops/multiply.hpp"
#include "ngraph/ops/negative.hpp"
#include "ngraph/ops/not.hpp"
#include "ngraph/ops/not_equal.hpp"
#include "ngraph/ops/one_hot.hpp"
#include "ngraph/ops/power.hpp"
#include "ngraph/ops/reduce.hpp"
#include "ngraph/ops/replace_slice.hpp"
#include "ngraph/ops/reshape.hpp"
#include "ngraph/ops/reverse.hpp"
#include "ngraph/ops/select.hpp"
#include "ngraph/ops/sign.hpp"
#include "ngraph/ops/sin.hpp"
#include "ngraph/ops/sinh.hpp"
#include "ngraph/ops/slice.hpp"
#include "ngraph/ops/sqrt.hpp"
#include "ngraph/ops/subtract.hpp"
#include "ngraph/ops/sum.hpp"
#include "ngraph/ops/tan.hpp"
#include "ngraph/ops/tanh.hpp"
#include "ngraph/pass/assign_layout.hpp"
#include "ngraph/pass/dump_sorted.hpp"
#include "ngraph/pass/liveness.hpp"
#include "ngraph/pass/manager.hpp"
#include "ngraph/pass/memory_layout.hpp"
#include "ngraph/runtime/gpu/gpu_backend.hpp"
#include "ngraph/runtime/gpu/gpu_call_frame.hpp" #include "ngraph/runtime/gpu/gpu_call_frame.hpp"
#include "ngraph/runtime/gpu/gpu_emitter.hpp"
#include "ngraph/runtime/gpu/gpu_external_function.hpp" #include "ngraph/runtime/gpu/gpu_external_function.hpp"
#include "ngraph/runtime/gpu/gpu_kernel_emitters.hpp"
using namespace std;
-using namespace ngraph::runtime::gpu;
using namespace ngraph;
-ngraph::runtime::gpu::GPU_ExternalFunction::GPU_ExternalFunction(
-    const std::shared_ptr<ngraph::Function>& function, bool release_function)
-    : runtime::ExternalFunction(function, release_function)
-    , m_function(function)
+static const string s_output_dir = "gpu_codegen";
+class StaticInitializers
+{
+public:
+    StaticInitializers() { ngraph::file_util::remove_directory(s_output_dir); }
+};
static string emit_string_array(const vector<string>& s, size_t max_line_length)
{
stringstream ss;
stringstream line;
for (size_t i = 0; i < s.size(); i++)
{
if (i != 0)
{
line << ",";
}
stringstream value;
value << s[i];
string value_string = value.str();
if (static_cast<size_t>(line.tellp()) + value_string.size() + 1 <= max_line_length)
{
if (i > 0)
{
line << " ";
}
line << value_string;
}
else
{
ss << line.str() << "\n";
line.str("");
line << value_string;
}
}
ss << line.str();
return ss.str();
}
static StaticInitializers s_static_initializers;
#define TI(x) type_index(typeid(x))
static const runtime::gpu::OpMap dispatcher{
{TI(ngraph::op::Add), &runtime::gpu::GPU_Emitter::EmitAdd},
{TI(ngraph::op::Dot), &runtime::gpu::GPU_Emitter::EmitDot},
{TI(ngraph::op::Multiply), &runtime::gpu::GPU_Emitter::EmitMultiply},
{TI(ngraph::op::Parameter), &runtime::gpu::GPU_Emitter::EmitNop},
{TI(ngraph::op::Abs), &runtime::gpu::GPU_Emitter::EmitAbs},
{TI(ngraph::op::Concat), &runtime::gpu::GPU_Emitter::EmitConcat},
{TI(ngraph::op::Divide), &runtime::gpu::GPU_Emitter::EmitDivide},
{TI(ngraph::op::Equal), &runtime::gpu::GPU_Emitter::EmitEqual},
{TI(ngraph::op::Greater), &runtime::gpu::GPU_Emitter::EmitGreater},
{TI(ngraph::op::GreaterEq), &runtime::gpu::GPU_Emitter::EmitGreaterEq},
{TI(ngraph::op::Less), &runtime::gpu::GPU_Emitter::EmitLess},
{TI(ngraph::op::LessEq), &runtime::gpu::GPU_Emitter::EmitLessEq},
{TI(ngraph::op::Log), &runtime::gpu::GPU_Emitter::EmitLog},
{TI(ngraph::op::Maximum), &runtime::gpu::GPU_Emitter::EmitMaximum},
{TI(ngraph::op::Minimum), &runtime::gpu::GPU_Emitter::EmitMinimum},
{TI(ngraph::op::Negative), &runtime::gpu::GPU_Emitter::EmitNegative},
{TI(ngraph::op::NotEqual), &runtime::gpu::GPU_Emitter::EmitNotEqual},
{TI(ngraph::op::Power), &runtime::gpu::GPU_Emitter::EmitPower},
{TI(ngraph::op::Select), &runtime::gpu::GPU_Emitter::EmitSelect},
{TI(ngraph::op::Subtract), &runtime::gpu::GPU_Emitter::EmitSubtract},
{TI(ngraph::op::Broadcast), &runtime::gpu::GPU_Emitter::EmitBroadcast},
{TI(ngraph::op::Convert), &runtime::gpu::GPU_Emitter::EmitConvert},
{TI(ngraph::op::Constant), &runtime::gpu::GPU_Emitter::EmitConstant},
{TI(ngraph::op::Reshape), &runtime::gpu::GPU_Emitter::EmitReshape},
{TI(ngraph::op::FunctionCall), &runtime::gpu::GPU_Emitter::EmitFunctionCall},
{TI(ngraph::op::Reduce), &runtime::gpu::GPU_Emitter::EmitReduce},
{TI(ngraph::op::Sign), &runtime::gpu::GPU_Emitter::EmitSign},
{TI(ngraph::op::Slice), &runtime::gpu::GPU_Emitter::EmitSlice},
{TI(ngraph::op::Sum), &runtime::gpu::GPU_Emitter::EmitSum},
{TI(ngraph::op::Exp), &runtime::gpu::GPU_Emitter::EmitExp},
{TI(ngraph::op::Sin), &runtime::gpu::GPU_Emitter::EmitSin},
{TI(ngraph::op::Sinh), &runtime::gpu::GPU_Emitter::EmitSinh},
{TI(ngraph::op::Cos), &runtime::gpu::GPU_Emitter::EmitCos},
{TI(ngraph::op::Cosh), &runtime::gpu::GPU_Emitter::EmitCosh},
{TI(ngraph::op::Tan), &runtime::gpu::GPU_Emitter::EmitTan},
{TI(ngraph::op::Tanh), &runtime::gpu::GPU_Emitter::EmitTanh},
{TI(ngraph::op::Asin), &runtime::gpu::GPU_Emitter::EmitAsin},
{TI(ngraph::op::Acos), &runtime::gpu::GPU_Emitter::EmitAcos},
{TI(ngraph::op::Atan), &runtime::gpu::GPU_Emitter::EmitAtan},
{TI(ngraph::op::ReplaceSlice), &runtime::gpu::GPU_Emitter::EmitReplaceSlice},
{TI(ngraph::op::OneHot), &runtime::gpu::GPU_Emitter::EmitOneHot},
{TI(ngraph::op::Floor), &runtime::gpu::GPU_Emitter::EmitFloor},
{TI(ngraph::op::Ceiling), &runtime::gpu::GPU_Emitter::EmitCeiling},
{TI(ngraph::op::Sqrt), &runtime::gpu::GPU_Emitter::EmitSqrt},
{TI(ngraph::op::Convolution), &runtime::gpu::GPU_Emitter::EmitConvolution},
{TI(ngraph::op::Not), &runtime::gpu::GPU_Emitter::EmitNot},
{TI(ngraph::op::MaxPool), &runtime::gpu::GPU_Emitter::EmitMaxPool},
{TI(ngraph::op::Reverse), &runtime::gpu::GPU_Emitter::EmitReverse},
};
runtime::gpu::GPU_ExternalFunction::GPU_ExternalFunction(
const shared_ptr<ngraph::Function>& function, bool release_function)
: ngraph::runtime::ExternalFunction(function, release_function)
, m_compiled_function(nullptr)
, m_emit_timing(std::getenv("NGRAPH_GPU_EMIT_TIMING") != nullptr)
, m_use_tbb(std::getenv("NGRAPH_GPU_USE_TBB") != nullptr)
{
}
void runtime::gpu::GPU_ExternalFunction::compile()
{
if (m_is_compiled)
{
return;
}
string function_name = m_function->get_name();
string dump_filename = file_util::path_join(s_output_dir, function_name + "_ops.txt");
pass::Manager pass_manager;
// pass_manager.register_pass<pass::TopologicalSort>();
// For now, just make everyone row-major.
pass_manager.register_pass<pass::AssignLayout<descriptor::layout::DenseTensorViewLayout>>();
pass_manager.register_pass<pass::Liveness>();
pass_manager.register_pass<pass::MemoryLayout>(64);
pass_manager.register_pass<pass::DumpSorted>(dump_filename);
pass_manager.run_passes(m_function);
GPU_Emitter emitter;
codegen::CodeWriter& writer = emitter.get_code_writer();
writer +=
R"(// Generated by the NGraph GPU backend
#include <cassert>
#include <cmath>
#include <cstdlib>
#include <fstream>
#include <fstream>
#include <iostream>
#include <memory>
#include <string>
#include <tuple>
#include <typeindex>
#include <typeinfo>
#include <unordered_map>
#include "cuda.h"
#include "ngraph/codegen/code_writer.hpp"
#include "ngraph/codegen/compiler.hpp"
#include "ngraph/codegen/execution_engine.hpp"
#include "ngraph/descriptor/input.hpp"
#include "ngraph/descriptor/layout/dense_tensor_view_layout.hpp"
#include "ngraph/descriptor/output.hpp"
#include "ngraph/descriptor/primary_tensor_view.hpp"
#include "ngraph/file_util.hpp"
#include "ngraph/function.hpp"
#include "ngraph/graph_util.hpp"
#include "ngraph/node.hpp"
#include "ngraph/ops/abs.hpp"
#include "ngraph/ops/acos.hpp"
#include "ngraph/ops/add.hpp"
#include "ngraph/ops/asin.hpp"
#include "ngraph/ops/atan.hpp"
#include "ngraph/ops/broadcast.hpp"
#include "ngraph/ops/ceiling.hpp"
#include "ngraph/ops/concatenate.hpp"
#include "ngraph/ops/constant.hpp"
#include "ngraph/ops/convert.hpp"
#include "ngraph/ops/convolution.hpp"
#include "ngraph/ops/cos.hpp"
#include "ngraph/ops/cosh.hpp"
#include "ngraph/ops/divide.hpp"
#include "ngraph/ops/dot.hpp"
#include "ngraph/ops/equal.hpp"
#include "ngraph/ops/exp.hpp"
#include "ngraph/ops/floor.hpp"
#include "ngraph/ops/function_call.hpp"
#include "ngraph/ops/greater.hpp"
#include "ngraph/ops/greater_eq.hpp"
#include "ngraph/ops/less.hpp"
#include "ngraph/ops/less_eq.hpp"
#include "ngraph/ops/log.hpp"
#include "ngraph/ops/max_pool.hpp"
#include "ngraph/ops/maximum.hpp"
#include "ngraph/ops/minimum.hpp"
#include "ngraph/ops/multiply.hpp"
#include "ngraph/ops/negative.hpp"
#include "ngraph/ops/not.hpp"
#include "ngraph/ops/not_equal.hpp"
#include "ngraph/ops/one_hot.hpp"
#include "ngraph/ops/power.hpp"
#include "ngraph/ops/reduce.hpp"
#include "ngraph/ops/replace_slice.hpp"
#include "ngraph/ops/reshape.hpp"
#include "ngraph/ops/reverse.hpp"
#include "ngraph/ops/select.hpp"
#include "ngraph/ops/sign.hpp"
#include "ngraph/ops/sin.hpp"
#include "ngraph/ops/sinh.hpp"
#include "ngraph/ops/slice.hpp"
#include "ngraph/ops/sqrt.hpp"
#include "ngraph/ops/subtract.hpp"
#include "ngraph/ops/sum.hpp"
#include "ngraph/ops/tan.hpp"
#include "ngraph/ops/tanh.hpp"
#include "ngraph/pass/assign_layout.hpp"
#include "ngraph/pass/dump_sorted.hpp"
#include "ngraph/pass/liveness.hpp"
#include "ngraph/pass/manager.hpp"
#include "ngraph/pass/memory_layout.hpp"
#include "ngraph/runtime/aligned_buffer.hpp"
#include "ngraph/util.hpp"
)";
string pch_header_source = writer.get_code();
writer += R"(
using namespace ngraph::runtime;
using namespace std;
void check_cuda_errors(CUresult err) {
assert(err == CUDA_SUCCESS);
// assert(err == err);
}
)";
// // The "dso_handle" symbol is required by __cxa_atexit()
// // which is enabled because the JIT uses it as the default mechanism
// // to register cleanup handlers. We use it, and not atexit(), because
// // atexit() happens too late, when the JIT is no longer alive
writer << "void *__dso_handle = 0;\n\n";
writer << "// Declare all constants\n";
for (shared_ptr<Function> current_function : pass_manager.get_state().get_functions())
{
for (shared_ptr<Node> node : current_function->get_ordered_ops())
{
const op::Constant* c = dynamic_cast<op::Constant*>(node.get());
if (c)
{
shared_ptr<descriptor::TensorView> tv = node->get_outputs()[0].get_tensor_view();
auto c_value_strings = c->get_value_strings();
writer << "static " << tv->get_tensor().get_element_type().c_type_string() << " "
<< tv->get_tensor().get_name() << "[" << c_value_strings.size() << "] =\n";
writer << "{\n";
writer.indent++;
writer << emit_string_array(c_value_strings, 100 - writer.indent * 4);
writer.indent--;
writer << "\n};\n\n";
m_variable_name_map[tv->get_tensor().get_name()] = tv->get_tensor().get_name();
}
}
}
writer << "// Declare all functions\n";
for (shared_ptr<Function> f : pass_manager.get_state().get_functions())
{
writer << "extern \"C\" void " << f->get_name() << "(void** inputs, void** outputs);\n";
}
writer << "\n";
writer << "extern \"C\" void " << pass_manager.get_state().get_functions()[0]->get_name()
<< "(void** inputs, void** outputs){\n";
writer += R"(
CUdevice device;
CUmodule cuda_module;
CUcontext context;
CUfunction add_function;
CUfunction mult_function;
CUlinkState linker;
int dev_count;
check_cuda_errors(cuInit(0));
check_cuda_errors(cuDeviceGetCount(&dev_count));
check_cuda_errors(cuDeviceGet(&device, 0));
// char name[128];
// check_cuda_errors(cuDeviceGetName(name, 128, device));
// std::cout << "Using CUDA Device [0]: " << name << "\n";
// int dev_major, dev_minor;
// check_cuda_errors(cuDeviceComputeCapability(&dev_major, &dev_minor, device));
// std::cout << "Device Compute Capability: "
// << dev_major << "." << dev_minor << "\n";
// if (dev_major < 2) {
// std::cerr << "ERROR: Device 0 is not SM 2.0 or greater\n";
// }
const auto kernels = R"#(
.version 5.0
.target sm_60
.address_size 64
// .globl _Z7ew_multPfS_S_ // -- Begin function _Z7ew_multPfS_S_
.global .align 1 .b8 threadIdx[1];
// @_Z7ew_multPfS_S_
.visible .entry _Z7ew_multPfS_S_(
.param .u64 _Z7ew_multPfS_S__param_0,
.param .u64 _Z7ew_multPfS_S__param_1,
.param .u64 _Z7ew_multPfS_S__param_2
)
{
.local .align 8 .b8 __local_depot0[24];
.reg .b64 %SP;
.reg .b64 %SPL;
.reg .f32 %f<4>;
.reg .b32 %r<2>;
.reg .b64 %rd<17>;
// BB#0:
mov.u64 %SPL, __local_depot0;
cvta.local.u64 %SP, %SPL;
ld.param.u64 %rd3, [_Z7ew_multPfS_S__param_2];
ld.param.u64 %rd2, [_Z7ew_multPfS_S__param_1];
ld.param.u64 %rd1, [_Z7ew_multPfS_S__param_0];
cvta.to.global.u64 %rd4, %rd3;
cvta.global.u64 %rd5, %rd4;
cvta.to.global.u64 %rd6, %rd2;
cvta.global.u64 %rd7, %rd6;
cvta.to.global.u64 %rd8, %rd1;
cvta.global.u64 %rd9, %rd8;
st.u64 [%SP+0], %rd9;
st.u64 [%SP+8], %rd7;
st.u64 [%SP+16], %rd5;
ld.u64 %rd10, [%SP+0];
mov.u32 %r1, %tid.x;
mul.wide.u32 %rd11, %r1, 4;
add.s64 %rd12, %rd10, %rd11;
ld.f32 %f1, [%rd12];
ld.u64 %rd13, [%SP+8];
add.s64 %rd14, %rd13, %rd11;
ld.f32 %f2, [%rd14];
mul.rn.f32 %f3, %f1, %f2;
ld.u64 %rd15, [%SP+16];
add.s64 %rd16, %rd15, %rd11;
st.f32 [%rd16], %f3;
ret;
}
// -- End function
// .globl _Z6ew_addPfS_S_ // -- Begin function _Z6ew_addPfS_S_
.visible .entry _Z6ew_addPfS_S_(
.param .u64 _Z6ew_addPfS_S__param_0,
.param .u64 _Z6ew_addPfS_S__param_1,
.param .u64 _Z6ew_addPfS_S__param_2
) // @_Z6ew_addPfS_S_
{
.local .align 8 .b8 __local_depot1[24];
.reg .b64 %SP;
.reg .b64 %SPL;
.reg .f32 %f<4>;
.reg .b32 %r<2>;
.reg .b64 %rd<17>;
// BB#0:
mov.u64 %SPL, __local_depot1;
cvta.local.u64 %SP, %SPL;
ld.param.u64 %rd3, [_Z6ew_addPfS_S__param_2];
ld.param.u64 %rd2, [_Z6ew_addPfS_S__param_1];
ld.param.u64 %rd1, [_Z6ew_addPfS_S__param_0];
cvta.to.global.u64 %rd4, %rd3;
cvta.global.u64 %rd5, %rd4;
cvta.to.global.u64 %rd6, %rd2;
cvta.global.u64 %rd7, %rd6;
cvta.to.global.u64 %rd8, %rd1;
cvta.global.u64 %rd9, %rd8;
st.u64 [%SP+0], %rd9;
st.u64 [%SP+8], %rd7;
st.u64 [%SP+16], %rd5;
ld.u64 %rd10, [%SP+0];
mov.u32 %r1, %tid.x;
mul.wide.u32 %rd11, %r1, 4;
add.s64 %rd12, %rd10, %rd11;
ld.f32 %f1, [%rd12];
ld.u64 %rd13, [%SP+8];
add.s64 %rd14, %rd13, %rd11;
ld.f32 %f2, [%rd14];
add.rn.f32 %f3, %f1, %f2;
ld.u64 %rd15, [%SP+16];
add.s64 %rd16, %rd15, %rd11;
st.f32 [%rd16], %f3;
ret;
}
// -- End function
)#";
// Create driver context
check_cuda_errors(cuCtxCreate(&context, 0, device));
// Create module for object
check_cuda_errors(cuModuleLoadDataEx(&cuda_module, kernels, 0, 0, 0));
// Get kernel function
check_cuda_errors(cuModuleGetFunction(&add_function, cuda_module, "_Z6ew_addPfS_S_"));
check_cuda_errors(cuModuleGetFunction(&mult_function, cuda_module, "_Z7ew_multPfS_S_"));
// Device data
CUdeviceptr dev_bufferA;
CUdeviceptr dev_bufferB;
CUdeviceptr dev_bufferC;
check_cuda_errors(cuMemAlloc(&dev_bufferA, sizeof(float) * 4));
check_cuda_errors(cuMemAlloc(&dev_bufferB, sizeof(float) * 4));
check_cuda_errors(cuMemAlloc(&dev_bufferC, sizeof(float) * 4));
float* host_A = new float[4];
float* host_B = new float[4];
float* host_C = new float[4];
// Populate input
memcpy(host_A, (float*)(inputs[0]), sizeof(float) * 4);
memcpy(host_B, (float*)(inputs[1]), sizeof(float) * 4);
memcpy(host_C, (float*)(inputs[2]), sizeof(float) * 4);
check_cuda_errors(cuMemcpyHtoD(dev_bufferA, &host_A[0], sizeof(float) * 4));
check_cuda_errors(cuMemcpyHtoD(dev_bufferB, &host_B[0], sizeof(float) * 4));
// check_cuda_errors(cuMemcpyHtoD(dev_bufferC, &host_C[0], sizeof(float) * 4));
unsigned block_size_X = 4;
unsigned block_size_Y = 1;
unsigned block_size_Z = 1;
unsigned grid_size_X = 1;
unsigned grid_size_Y = 1;
unsigned grid_size_Z = 1;
// Kernel parameters
void* kernel_params[] = {&dev_bufferA, &dev_bufferB, &dev_bufferC};
// Add Kernel launch
check_cuda_errors(cuLaunchKernel(add_function,
grid_size_X,
grid_size_Y,
grid_size_Z,
block_size_X,
block_size_Y,
block_size_Z,
0,
NULL,
kernel_params,
NULL));
check_cuda_errors(cuMemcpyDtoH(&host_A[0], dev_bufferC, sizeof(float) * 4));
host_B = &host_C[0];
check_cuda_errors(cuMemcpyHtoD(dev_bufferA, &host_A[0], sizeof(float) * 4));
check_cuda_errors(cuMemcpyHtoD(dev_bufferB, &host_B[0], sizeof(float) * 4));
// Mult Kernel launch
check_cuda_errors(cuLaunchKernel(mult_function,
grid_size_X,
grid_size_Y,
grid_size_Z,
block_size_X,
block_size_Y,
block_size_Z,
0,
NULL,
kernel_params,
NULL));
// Write final output
check_cuda_errors(cuMemcpyDtoH(&((float*)(outputs[0]))[0], dev_bufferC, sizeof(float) * 4));
// Clean up after ourselves
// // Clean-up must do this in tensor view!!!
check_cuda_errors(cuMemFree(dev_bufferA));
check_cuda_errors(cuMemFree(dev_bufferB));
check_cuda_errors(cuMemFree(dev_bufferC));
check_cuda_errors(cuModuleUnload(cuda_module));
check_cuda_errors(cuCtxDestroy(context));})";
if (m_emit_timing)
{
writer << "// Declare debug timers\n";
vector<string> names;
for (shared_ptr<Function> current_function : pass_manager.get_state().get_functions())
{
for (shared_ptr<Node> node : current_function->get_ordered_ops())
{
if (!node->is_parameter() && !node->is_constant())
{
names.push_back(node->get_name());
}
}
}
for (const string& s : names)
{
writer << "ngraph::stopwatch timer_" << s << ";\n";
}
writer << "extern \"C\" size_t get_debug_timer_count() { return " << names.size()
<< "; }\n";
writer << "extern \"C\" const char* get_debug_timer_name(size_t index)\n";
writer << "{\n";
writer.indent++;
writer << "const char* rc;\n";
writer << "switch(index)\n";
writer << "{\n";
for (size_t i = 0; i < names.size(); i++)
{
writer << "case " << i << ": rc = \"" << names[i] << "\"; break;\n";
}
writer << "default: rc = \"\";\n";
writer << "}\n";
writer << "return rc;\n";
writer.indent--;
writer << "}\n";
writer << "extern \"C\" const size_t get_debug_timer_microseconds(size_t index)\n";
writer << "{\n";
writer.indent++;
writer << "size_t rc;\n";
writer << "switch(index)\n";
writer << "{\n";
for (size_t i = 0; i < names.size(); i++)
{
writer << "case " << i << ": rc = timer_" << names[i]
<< ".get_total_microseconds(); break;\n";
}
writer << "default: rc = 0;\n";
writer << "}\n";
writer << "return rc;\n";
writer.indent--;
writer << "}\n";
writer << "extern \"C\" const size_t get_debug_timer_call_count(size_t index)\n";
writer << "{\n";
writer.indent++;
writer << "size_t rc;\n";
writer << "switch(index)\n";
writer << "{\n";
for (size_t i = 0; i < names.size(); i++)
{
writer << "case " << i << ": rc = timer_" << names[i] << ".get_call_count(); break;\n";
}
writer << "default: rc = 0;\n";
writer << "}\n";
writer << "return rc;\n";
writer.indent--;
writer << "}\n";
writer << "\n";
}
// // This for loop creates a collection of functions that are called more than once
// // and emitting them as globally callable functions.
// // ops implement the is_functionally_identical method
// unordered_map<Node*, string> match_functions;
// for (shared_ptr<Function> current_function : pass_manager.get_state().get_functions())
// {
// const list<shared_ptr<Node>>& tmp = current_function->get_ordered_ops();
// vector<shared_ptr<Node>> op_list{tmp.begin(), tmp.end()};
// for (size_t i = 0; i < op_list.size() - 1; i++)
// {
// if (op_list[i]->is_constant() || op_list[i]->is_parameter())
// {
// continue;
// }
// if (contains_key(match_functions, op_list[i].get()))
// {
// continue;
// }
// string match_function_name;
// for (size_t j = i + 1; j < op_list.size(); j++)
// {
// if (op_list[i]->is_functionally_identical(*op_list[j]))
// {
// if (match_function_name.empty())
// {
// match_function_name = "func_" + op_list[i]->get_name();
// match_functions.insert({op_list[i].get(), match_function_name});
// }
// match_functions.insert({op_list[j].get(), match_function_name});
// }
// }
// if (!match_function_name.empty())
// {
// writer << "static void " << match_function_name << "(";
// writer.indent++;
// // Work around a compiler warning (*node inside typeid may have effects
// // with shared pointers, which is fine here but clang doesn't like it.)
// auto& n = *op_list[i];
// auto handler = dispatcher.find(type_index(typeid(n)));
// vector<GPU_TensorViewWrapper> in;
// size_t arg_index = 0;
// set<string> arg_names;
// for (const descriptor::Input& input : n.get_inputs())
// {
// const descriptor::Output& output = input.get_output();
// shared_ptr<descriptor::TensorView> tv = output.get_tensor_view();
// GPU_TensorViewWrapper tvw{tv, "_arg" + to_string(arg_index)};
// if (!contains(arg_names, tvw.get_name()))
// {
// arg_names.insert(tvw.get_name());
// if (arg_index++ > 0)
// {
// writer << ",";
// }
// writer << "\n";
// writer << tvw.get_type() << "* " << tvw.get_name();
// }
// in.push_back(tvw);
// }
// vector<GPU_TensorViewWrapper> out;
// for (const descriptor::Output& output : n.get_outputs())
// {
// shared_ptr<descriptor::TensorView> tv = output.get_tensor_view();
// GPU_TensorViewWrapper tvw{tv, "_out" + to_string(arg_index)};
// if (arg_index++ > 0)
// {
// writer << ",";
// }
// writer << "\n";
// writer << tvw.get_type() << "* " << tvw.get_name();
// out.push_back(tvw);
// }
// writer.indent--;
// if (node->liveness_new_list.size() > 0)
// {
// temporaries_used = true;
// for (descriptor::Tensor* tensor : node->liveness_new_list)
// {
// worst_case_tmp_size += tensor->size();
// }
// }
// }
// if (temporaries_used)
// {
// size_t temp_pool_size = current_function->get_temporary_pool_size();
// writer << "// Allocate the memory pool\n";
// writer << "// Memory pool size is " << temp_pool_size << " bytes\n";
// writer << "// Worst case size is " << worst_case_tmp_size << " bytes\n";
// writer << "ngraph::runtime::AlignedBuffer memory_handler(" << temp_pool_size << ", "
// << ngraph::runtime::gpu::alignment << ");\n";
// writer << "size_t pool_gpu_ptr = (size_t)memory_handler.get_ptr();\n";
// writer << "\n";
// // Add temporaries to the variable name map
// for (shared_ptr<Node> node : current_function->get_ordered_ops())
// {
// for (descriptor::Tensor* tensor : node->liveness_new_list)
// {
// stringstream ss;
// ss << "((" << tensor->get_element_type().c_type_string() << "*)(pool_gpu_ptr + "
// << tensor->get_pool_offset() << "))";
// m_variable_name_map[tensor->get_name()] = ss.str();
// }
// }
// }
// // Add inputs to the variable name map
// size_t arg_index = 0;
// for (shared_ptr<op::Parameter> param : current_function->get_parameters())
// {
// for (size_t i = 0; i < param->get_output_size(); ++i)
// {
// shared_ptr<descriptor::TensorView> tv = param->get_output_tensor_view(i);
// const element::Type& et = tv->get_tensor_view_type()->get_element_type();
// string type = et.c_type_string();
// stringstream ss;
// ss << "((" << type << "*)(inputs[" << arg_index << "]))";
// m_variable_name_map[tv->get_tensor().get_name()] = ss.str();
// arg_index++;
// }
// }
// // create output alias map
// size_t output_index = 0;
// unordered_map<descriptor::TensorView*, vector<size_t>> output_alias_map;
// vector<size_t> aliases;
// for (size_t i = 0; i < current_function->get_output_size(); ++i)
// {
// shared_ptr<Node> op = current_function->get_output_op(i);
// shared_ptr<descriptor::TensorView> otv = op->get_output_tensor_view();
// vector<size_t>& al = output_alias_map[otv.get()];
// al.push_back(output_index);
// if (al.size() > 1)
// {
// aliases.push_back(output_index);
// }
// output_index++;
// }
// // Add outputs to the variable name map
// output_index = 0;
// for (size_t i = 0; i < current_function->get_output_size(); ++i)
// {
// shared_ptr<Node> op = current_function->get_output_op(i);
// shared_ptr<descriptor::TensorView> tv = op->get_output_tensor_view();
// const element::Type& et = tv->get_tensor_view_type()->get_element_type();
// bool parameter_as_output = false;
// for (shared_ptr<op::Parameter> param : current_function->get_parameters())
// {
// for (const descriptor::Output& pout : param->get_outputs())
// {
// shared_ptr<descriptor::TensorView> ptv = pout.get_tensor_view();
// if (tv == ptv)
// {
// parameter_as_output = true;
// writer << "memcpy(static_cast<" << et.c_type_string() << "*>(outputs["
// << output_index << "]), "
// << m_variable_name_map[ptv->get_tensor().get_name()] << ", "
// << ptv->get_tensor().size() << ");\n";
// break;
// }
// }
// }
// if (!parameter_as_output && !contains(aliases, output_index))
// {
// if (contains(constants, tv.get()))
// {
// writer << "memcpy(outputs[" << output_index << "], " << tv->get_tensor().get_name()
// << ", " << tv->get_tensor().size() << ");\n";
// }
// else
// {
// string type = et.c_type_string();
// stringstream ss;
// ss << "((" << type << "*)(outputs[" << output_index << "]))";
// m_variable_name_map[tv->get_tensor().get_name()] = ss.str();
// }
// }
// output_index++;
// }
// for (shared_ptr<Node> node : current_function->get_ordered_ops())
// {
// auto& n = *node; // Work around a compiler warning (*node inside typeid may have effects
// // with shared pointers, which is fine here but clang doesn't like it.)
// auto handler = dispatcher.find(type_index(typeid(n)));
// if (handler == dispatcher.end())
// {
// throw ngraph_error("Unhandled op during code generation : " + node->description());
// }
// vector<GPU_TensorViewWrapper> in;
// for (const descriptor::Input& input : node->get_inputs())
// {
// const descriptor::Output& output = input.get_output();
// shared_ptr<descriptor::TensorView> tv = output.get_tensor_view();
// in.push_back(GPU_TensorViewWrapper(tv, m_variable_name_map[tv->get_tensor().get_name()]));
// }
// vector<GPU_TensorViewWrapper> out;
// for (const descriptor::Output& output : node->get_outputs())
// {
// shared_ptr<descriptor::TensorView> tv = output.get_tensor_view();
// out.push_back(GPU_TensorViewWrapper(tv, m_variable_name_map[tv->get_tensor().get_name()]));
// }
// // Emit operation prologue
// if (!node->is_parameter() && !node->is_constant())
// {
// if (m_use_tbb)
// {
// writer << "tbb::flow::continue_node<tbb::flow::continue_msg> "
// "flowgraph_node_"
// << node->get_name() << "(G, [&](const tbb::flow::continue_msg &msg)\n{\n";
// writer.indent++;
// }
// if (m_emit_timing)
// {
// emit_debug_function_entry(writer, node.get(), in, out);
// }
// }
// // Emit operation body
// string func_name;
// auto it = match_functions.find(node.get());
// if (it != match_functions.end())
// {
// func_name = it->second;
// }
// if (func_name.empty())
// {
// handler->second(&emitter, node.get(), in, out);
// }
// else
// {
// vector<string> names;
// for (const GPU_TensorViewWrapper& tv : in)
// {
// names.push_back(tv.get_name());
// }
// for (const GPU_TensorViewWrapper& tv : out)
// {
// names.push_back(tv.get_name());
// }
// writer << func_name << "(" << join(names) << ");\n";
// }
// // Emit operation epilogue
// if (!node->is_parameter() && !node->is_constant())
// {
// handle_output_alias(writer, *node, output_alias_map);
// if (m_emit_timing)
// {
// emit_debug_function_exit(writer, node.get(), in, out);
// }
// if (m_use_tbb)
// {
// writer.indent--;
// writer << "});\n";
// }
// }
// }
// if (m_use_tbb)
// {
// writer << "\n";
// // Build the flow graph
// vector<Node*> dependence_graph_heads;
// traverse_nodes(current_function, [&writer, &dependence_graph_heads](shared_ptr<Node> n) {
// if (!n->is_parameter() && !n->is_constant())
// {
// bool is_head = true;
// for (auto arg : n->get_input_ops())
// {
// if (!arg->is_parameter() && !arg->is_constant())
// {
// is_head = false;
// writer << "tbb::flow::make_edge(flowgraph_node_" << arg->get_name()
// << ", flowgraph_node_" << n->get_name() << ");\n";
// }
// }
// if (is_head)
// {
// dependence_graph_heads.emplace_back(n.get());
// }
// }
// });
// writer << "\n";
// // Execute the flow graph
// if (!dependence_graph_heads.empty())
// {
// for (Node* n : dependence_graph_heads)
// {
// writer << "flowgraph_node_" << n->get_name()
// << ".try_put(tbb::flow::continue_msg());\n";
// }
// writer << "try { G.wait_for_all(); } catch(...) { throw; }\n";
// }
// }
// writer.indent--;
// // End generated function
// writer += "}\n\n";
// }
// TODO: Cleanup and make this a utility function
file_util::make_directory(s_output_dir);
string filename = file_util::path_join(s_output_dir, function_name + "_codegen.cpp");
ofstream out(filename);
string code = writer.get_code();
out << code;
out.close();
m_compiler.reset(new codegen::Compiler());
m_execution_engine.reset(new codegen::ExecutionEngine());
m_compiler->set_precompiled_header_source(pch_header_source);
auto codegen_module = m_compiler->compile(code);
if (codegen_module == nullptr)
{
throw runtime_error("function failed to compile");
}
m_execution_engine->add_module(codegen_module);
m_execution_engine->finalize();
m_compiled_function = m_execution_engine->find_function<EntryPoint_t>(function_name);
assert(m_compiled_function);
m_is_compiled = true;
if (m_release_function)
{
release_function();
}
}
-shared_ptr<runtime::CallFrame> runtime::gpu::GPU_ExternalFunction::make_call_frame()
+void runtime::gpu::GPU_ExternalFunction::handle_output_alias(
codegen::CodeWriter& writer,
const Node& node,
const unordered_map<descriptor::TensorView*, vector<size_t>>& output_alias_map)
{
for (const descriptor::Output& output : node.get_outputs())
{
shared_ptr<descriptor::TensorView> otv = output.get_tensor_view();
auto it = output_alias_map.find(otv.get());
if (it != output_alias_map.end())
{
const vector<size_t>& outputs = it->second;
if (outputs.size() > 1)
{
writer << "{ // handle output alias for previous op\n";
writer.indent++;
for (size_t i = 1; i < outputs.size(); i++)
{
writer << "memcpy(static_cast<void*>(outputs[" << outputs[i]
<< "]), static_cast<void*>(outputs[" << outputs[0] << "]), "
<< otv->get_tensor().size() << ");\n";
}
writer.indent--;
writer << "}\n";
}
}
}
}
shared_ptr<ngraph::runtime::CallFrame> runtime::gpu::GPU_ExternalFunction::make_call_frame()
{
if (!m_is_compiled)
{
compile();
}
-return make_shared<runtime::gpu::GPU_CallFrame>(shared_from_this(), m_function);
+return make_shared<ngraph::runtime::gpu::GPU_CallFrame>(shared_from_this(),
+                                                         m_compiled_function);
}
void runtime::gpu::GPU_ExternalFunction::emit_debug_function_entry(
codegen::CodeWriter& writer,
Node* node,
const std::vector<GPU_TensorViewWrapper>& in,
const std::vector<GPU_TensorViewWrapper>& out)
{
writer << "timer_" << node->get_name() << ".start();\n";
}
void runtime::gpu::GPU_ExternalFunction::emit_debug_function_exit(
codegen::CodeWriter& writer,
Node* node,
const std::vector<GPU_TensorViewWrapper>& in,
const std::vector<GPU_TensorViewWrapper>& out)
{
writer << "timer_" << node->get_name() << ".stop();\n";
}
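The PTX embedded in compile() above provides two element-wise kernels, mangled as _Z6ew_addPfS_S_ and _Z7ew_multPfS_S_, i.e. ew_add(float*, float*, float*) and ew_mult(float*, float*, float*). The CUDA source they were compiled from is not part of this change; a hypothetical equivalent, matching the single-block, threadIdx.x-indexed launch used above, would be:

// Hypothetical CUDA source corresponding to the embedded PTX; one element per
// thread, indexed by threadIdx.x only (the launch uses a single 4-thread block).
__global__ void ew_add(float* in0, float* in1, float* out)
{
    unsigned int i = threadIdx.x;
    out[i] = in0[i] + in1[i];
}

__global__ void ew_mult(float* in0, float* in1, float* out)
{
    unsigned int i = threadIdx.x;
    out[i] = in0[i] * in1[i];
}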
@@ -20,9 +20,13 @@
#include <typeinfo>
#include <unordered_map>
#include "ngraph/codegen/code_writer.hpp"
#include "ngraph/codegen/compiler.hpp"
#include "ngraph/codegen/execution_engine.hpp"
#include "ngraph/function.hpp" #include "ngraph/function.hpp"
#include "ngraph/runtime/external_function.hpp" #include "ngraph/runtime/external_function.hpp"
#include "ngraph/runtime/gpu/gpu_call_frame.hpp" #include "ngraph/runtime/gpu/gpu_call_frame.hpp"
#include "ngraph/runtime/gpu/gpu_tensor_view_wrapper.hpp"
namespace ngraph
{
@@ -30,9 +34,23 @@ namespace ngraph
{
namespace gpu
{
class GPU_ExternalFunction;
class GPU_Emitter;
class GPU_CallFrame;
using OpFunction =
std::function<void(GPU_Emitter*,
const ngraph::Node*,
const std::vector<GPU_TensorViewWrapper>& inputs,
const std::vector<GPU_TensorViewWrapper>& outputs)>;
using OpMap = std::unordered_map<std::type_index, OpFunction>;
class GPU_ExternalFunction : public ngraph::runtime::ExternalFunction,
                             public std::enable_shared_from_this<GPU_ExternalFunction>
{
friend class GPU_CallFrame;
public:
GPU_ExternalFunction(const std::shared_ptr<ngraph::Function>& function,
                     bool release_function = true);
@@ -41,7 +59,27 @@
protected:
void compile();
-std::shared_ptr<ngraph::Function> m_function;
+EntryPoint m_compiled_function;
private:
void emit_debug_function_entry(codegen::CodeWriter& writer,
Node* node,
const std::vector<GPU_TensorViewWrapper>& in,
const std::vector<GPU_TensorViewWrapper>& out);
void emit_debug_function_exit(codegen::CodeWriter& writer,
Node* node,
const std::vector<GPU_TensorViewWrapper>& in,
const std::vector<GPU_TensorViewWrapper>& out);
void handle_output_alias(
codegen::CodeWriter& writer,
const Node&,
const std::unordered_map<descriptor::TensorView*, std::vector<size_t>>&);
std::unique_ptr<codegen::Compiler> m_compiler;
std::unique_ptr<codegen::ExecutionEngine> m_execution_engine;
bool m_emit_timing;
bool m_use_tbb;
std::unordered_map<std::string, std::string> m_variable_name_map;
}; };
} }
} }
......
// ----------------------------------------------------------------------------
// Copyright 2017 Nervana Systems Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// ----------------------------------------------------------------------------
#include <algorithm>
#include <map>
#include "ngraph/codegen/code_writer.hpp"
#include "ngraph/runtime/gpu/gpu_kernel_emitters.hpp"
using namespace ngraph;
using namespace ngraph::runtime::gpu::kernel;
void ngraph::runtime::gpu::kernel::emit_broadcast(codegen::CodeWriter& writer,
const std::string& element_type,
const std::string& arg0, // replacement context
const std::string& out,
const Shape& arg0_shape,
const Shape& out_shape,
const AxisSet& broadcast_axes)
{
}
//
// For the reference kernel this is based on, see ngraph/runtime/kernel/concat.hpp.
//
void ngraph::runtime::gpu::kernel::emit_concat(codegen::CodeWriter& writer,
const std::string& element_type,
const std::vector<std::string>& args,
const std::string& out,
const std::vector<Shape>& in_shapes,
const Shape& out_shape,
size_t concatenation_axis)
{
}
void ngraph::runtime::gpu::kernel::emit_replace_slice(
codegen::CodeWriter& writer,
const std::string& element_type,
const std::string& arg0, // replacement context
const std::string& arg1, // replacement value
const std::string& out,
const Shape& arg1_shape,
const Shape& out_shape,
const Coordinate& lower_bounds,
const Coordinate& upper_bounds,
const Strides& strides)
{
}
void ngraph::runtime::gpu::kernel::emit_slice(codegen::CodeWriter& writer,
const std::string& element_type,
const std::string& arg0, // replacement context
const std::string& out,
const Shape& arg0_shape,
const Shape& out_shape,
const Coordinate& lower_bounds,
const Coordinate& upper_bounds,
const Strides& strides)
{
}
void ngraph::runtime::gpu::kernel::emit_reshape(codegen::CodeWriter& writer,
const std::string& element_type,
const std::string& arg0, // replacement context
const std::string& out,
const Shape& arg0_shape,
const Shape& out_shape,
const AxisVector& arg0_axis_order)
{
}
void ngraph::runtime::gpu::kernel::emit_sum(codegen::CodeWriter& writer,
const std::string& element_type,
const std::string& arg0, // replacement context
const std::string& out,
const Shape& arg0_shape,
const Shape& out_shape,
const AxisSet& reduction_axes)
{
}
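
The emitter bodies above are left empty in this commit. As a rough sketch of the direction one of them could take (a hypothetical helper, not part of this change), a broadcast emitter might simply write a loop into the generated source, reusing only the CodeWriter operator<< and indent facilities already used elsewhere in this commit; it covers just the scalar-to-tensor case and assumes the generated code indexes arg0/out as plain C arrays of the element type.

// Hypothetical sketch; names and structure are illustrative only.
#include <string>

#include "ngraph/codegen/code_writer.hpp"
#include "ngraph/common.hpp"

namespace sketch
{
    void emit_scalar_broadcast(ngraph::codegen::CodeWriter& writer,
                               const std::string& element_type,
                               const std::string& arg0,
                               const std::string& out,
                               const ngraph::Shape& out_shape)
    {
        // Total number of output elements.
        size_t count = 1;
        for (size_t d : out_shape)
        {
            count *= d;
        }

        writer << "{ // broadcast scalar " << arg0 << " into " << out << "\n";
        writer.indent++;
        writer << "for (size_t i = 0; i < " << count << "; i++)\n";
        writer << "{\n";
        writer.indent++;
        writer << out << "[i] = " << arg0 << "[0];\n";
        writer.indent--;
        writer << "}\n";
        writer.indent--;
        writer << "}\n";
    }
}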
// ----------------------------------------------------------------------------
// Copyright 2017 Nervana Systems Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// ----------------------------------------------------------------------------
#pragma once
#include "ngraph/codegen/code_writer.hpp"
#include "ngraph/common.hpp"
namespace ngraph
{
namespace runtime
{
namespace gpu
{
namespace kernel
{
void emit_broadcast(codegen::CodeWriter& writer,
const std::string& element_type,
const std::string& arg0, // replacement context
const std::string& out,
const Shape& arg0_shape,
const Shape& out_shape,
const AxisSet& broadcast_axes);
void emit_concat(codegen::CodeWriter& writer,
const std::string& element_type,
const std::vector<std::string>& args,
const std::string& out,
const std::vector<Shape>& in_shapes,
const Shape& out_shape,
const size_t concatenation_axis);
void emit_replace_slice(codegen::CodeWriter& writer,
const std::string& element_type,
const std::string& arg0, // replacement context
const std::string& arg1, // replacement value
const std::string& out,
const Shape& arg1_shape,
const Shape& out_shape,
const Coordinate& lower_bounds,
const Coordinate& upper_bounds,
const Strides& strides);
void emit_slice(codegen::CodeWriter& writer,
const std::string& element_type,
const std::string& arg0, // replacement context
const std::string& out,
const Shape& arg0_shape,
const Shape& out_shape,
const Coordinate& lower_bounds,
const Coordinate& upper_bounds,
const Strides& strides);
void emit_reshape(codegen::CodeWriter& writer,
const std::string& element_type,
const std::string& arg0, // replacement context
const std::string& out,
const Shape& arg0_shape,
const Shape& out_shape,
const AxisVector& arg0_axis_order);
void emit_sum(codegen::CodeWriter& writer,
const std::string& element_type,
const std::string& arg0, // replacement context
const std::string& out,
const Shape& arg0_shape,
const Shape& out_shape,
const AxisSet& reduction_axes);
}
}
}
}
...@@ -14,6 +14,8 @@
#include <memory>

#include <cuda.h>

#include "ngraph/descriptor/layout/dense_tensor_view_layout.hpp"
#include "ngraph/descriptor/primary_tensor_view.hpp"
#include "ngraph/runtime/gpu/gpu_backend.hpp"
...@@ -30,59 +32,26 @@ runtime::gpu::GPU_TensorView::GPU_TensorView(const ngraph::element::Type& elemen
                                             true,
                                             true,
                                             false))
    , m_allocated_buffer_pool(nullptr)
    , m_aligned_buffer_pool(nullptr)
{
    // Need to check type and have host/device tensors
    m_descriptor->set_tensor_view_layout(
        std::make_shared<ngraph::descriptor::layout::DenseTensorViewLayout>(*m_descriptor));
    m_buffer_size = m_descriptor->get_tensor_view_layout()->get_size() * element_type.size();
    if (m_buffer_size > 0)
    {
        size_t allocation_size = m_buffer_size + runtime::gpu::alignment;
        m_allocated_buffer_pool = static_cast<char*>(malloc(allocation_size));
        m_aligned_buffer_pool = m_allocated_buffer_pool;
        size_t mod = size_t(m_aligned_buffer_pool) % alignment;
        if (mod != 0)
        {
            m_aligned_buffer_pool += (alignment - mod);
        }
    }
    // cuMemAlloc(&dev_buffer, m_buffer_size);
}

runtime::gpu::GPU_TensorView::~GPU_TensorView()
{
    if (m_allocated_buffer_pool != nullptr)
    {
        free(m_allocated_buffer_pool);
    }
    // cuMemFree(dev_buffer);
}

char* runtime::gpu::GPU_TensorView::get_data_ptr()
{
    return m_aligned_buffer_pool;
}

const char* runtime::gpu::GPU_TensorView::get_data_ptr() const
{
    return m_aligned_buffer_pool;
}

void runtime::gpu::GPU_TensorView::write(const void* source, size_t tensor_offset, size_t n)
{
    if (tensor_offset + n > m_buffer_size)
    {
        throw out_of_range("write access past end of tensor");
    }
    char* target = get_data_ptr();
    // cuMemcpyHtoD(dev_buffer, source, n);
}

void runtime::gpu::GPU_TensorView::read(void* target, size_t tensor_offset, size_t n) const
{
    if (tensor_offset + n > m_buffer_size)
    {
        throw out_of_range("read access past end of tensor");
    }
    const char* source = get_data_ptr();
    // cuMemcpyDtoH(target, dev_buffer, n);
}
...@@ -14,6 +14,7 @@
#pragma once

#include <cuda.h>
#include <memory>

#include "ngraph/runtime/tensor_view.hpp"
...@@ -36,9 +37,6 @@ public:
    GPU_TensorView(const ngraph::element::Type& element_type, const Shape& shape);
    virtual ~GPU_TensorView();

    char* get_data_ptr();
    const char* get_data_ptr() const;

    /// @brief Write bytes directly into the tensor
    /// @param p Pointer to source of data
    /// @param tensor_offset Offset into tensor storage to begin writing. Must be element-aligned.
...@@ -51,8 +49,12 @@
    /// @param n Number of bytes to read, must be integral number of elements.
    void read(void* p, size_t tensor_offset, size_t n) const override;

    // const char* get_data_ptr();
    // const char* get_data_ptr() const;

private:
    char* m_allocated_buffer_pool;
    char* m_aligned_buffer_pool;
    CUdeviceptr dev_buffer;
    // At some point need to deal with alignment
    size_t m_buffer_size;
};
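
The CUDA calls in GPU_TensorView are still commented out in this commit. Below is a minimal standalone sketch of how that allocation and transfer path could be wired up with the CUDA driver API, assuming the usual cuInit/cuCtxCreate context setup; buffer names and sizes are illustrative, and this is not the tensor view implementation itself.

// Standalone driver-API sketch: allocate, write, read back, free.
#include <cassert>
#include <cstring>
#include <vector>

#include <cuda.h>

static void check_cuda_errors(CUresult err)
{
    assert(err == CUDA_SUCCESS);
}

int main()
{
    check_cuda_errors(cuInit(0));
    CUdevice device;
    check_cuda_errors(cuDeviceGet(&device, 0));
    CUcontext context;
    check_cuda_errors(cuCtxCreate(&context, 0, device));

    std::vector<float> host_in = {1, 2, 3, 4};
    std::vector<float> host_out(4, 0);
    size_t buffer_size = host_in.size() * sizeof(float);

    CUdeviceptr dev_buffer;
    check_cuda_errors(cuMemAlloc(&dev_buffer, buffer_size));                   // constructor
    check_cuda_errors(cuMemcpyHtoD(dev_buffer, host_in.data(), buffer_size));  // write()
    check_cuda_errors(cuMemcpyDtoH(host_out.data(), dev_buffer, buffer_size)); // read()
    check_cuda_errors(cuMemFree(dev_buffer));                                  // destructor

    assert(std::memcmp(host_in.data(), host_out.data(), buffer_size) == 0);

    check_cuda_errors(cuCtxDestroy(context));
    return 0;
}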
// ----------------------------------------------------------------------------
// Copyright 2017 Nervana Systems Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// ----------------------------------------------------------------------------
#include "ngraph/runtime/gpu/gpu_tensor_view_wrapper.hpp"
#include "ngraph/descriptor/layout/tensor_view_layout.hpp"
#include "ngraph/descriptor/tensor.hpp"
using namespace std;
using namespace ngraph;
runtime::gpu::GPU_TensorViewWrapper::GPU_TensorViewWrapper(
const shared_ptr<descriptor::TensorView>& tv, const string& alias)
: m_tensor_view(tv)
, m_alias(alias)
{
}
size_t runtime::gpu::GPU_TensorViewWrapper::get_size() const
{
return m_tensor_view->get_tensor_view_layout()->get_size();
}
const vector<size_t>& runtime::gpu::GPU_TensorViewWrapper::get_shape() const
{
return m_tensor_view->get_tensor_view_layout()->get_shape();
}
const vector<size_t>& runtime::gpu::GPU_TensorViewWrapper::get_strides() const
{
return m_tensor_view->get_tensor_view_layout()->get_strides();
}
const element::Type& runtime::gpu::GPU_TensorViewWrapper::get_element_type() const
{
return m_tensor_view->get_tensor_view_layout()->get_element_type();
}
const std::string& runtime::gpu::GPU_TensorViewWrapper::get_name() const
{
if (m_alias.empty())
{
return m_tensor_view->get_tensor().get_name();
}
else
{
return m_alias;
}
}
const std::string& runtime::gpu::GPU_TensorViewWrapper::get_type() const
{
return get_element_type().c_type_string();
}
bool runtime::gpu::GPU_TensorViewWrapper::is_output() const
{
return m_tensor_view->get_tensor().is_output();
}
// ----------------------------------------------------------------------------
// Copyright 2017 Nervana Systems Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// ----------------------------------------------------------------------------
#pragma once
#include <memory>
#include "ngraph/descriptor/tensor_view.hpp"
#include "ngraph/types/element_type.hpp"
namespace ngraph
{
namespace runtime
{
namespace gpu
{
class GPU_TensorViewWrapper;
}
}
}
class ngraph::runtime::gpu::GPU_TensorViewWrapper
{
public:
GPU_TensorViewWrapper(const std::shared_ptr<descriptor::TensorView>&,
const std::string& alias = "");
size_t get_size() const;
const std::vector<size_t>& get_shape() const;
const std::vector<size_t>& get_strides() const;
const element::Type& get_element_type() const;
const std::string& get_name() const;
const std::string& get_type() const;
bool is_output() const;
private:
std::shared_ptr<descriptor::TensorView> m_tensor_view;
std::string m_alias;
};
...@@ -22,7 +22,11 @@
#include <cudnn.h>

#include "ngraph/codegen/compiler.hpp"
#include "ngraph/runtime/gpu/gpu_external_function.hpp"
#include "ngraph/ngraph.hpp"
#include "util/ndarray.hpp"
#include "util/test_tools.hpp"

using namespace ngraph;
using namespace std;
...@@ -41,7 +45,6 @@ TEST(cudnn, compileTest)
#include <iostream>
#include "cuda.h"

void check_cuda_errors(CUresult err) {
    assert(err == CUDA_SUCCESS);
}
...@@ -202,7 +205,6 @@ const auto str = R"(
    check_cuda_errors(cuMemcpyHtoD(dev_bufferA, &host_A[0], sizeof(float)*16));
    check_cuda_errors(cuMemcpyHtoD(dev_bufferB, &host_B[0], sizeof(float)*16));

    unsigned block_size_X = 16;
    unsigned block_size_Y = 1;
    unsigned block_size_Z = 1;
...@@ -223,13 +225,11 @@ const auto str = R"(
    // Retrieve device data
    check_cuda_errors(cuMemcpyDtoH(&host_C[0], dev_bufferC, sizeof(float)*16));

    std::cout << "Results:\n";
    for (unsigned i = 0; i != 16; ++i) {
        std::cout << host_A[i] << " + " << host_B[i] << " = " << host_C[i] << "\n";
    }

    // Clean up after ourselves
    delete [] host_A;
    delete [] host_B;
...@@ -261,4 +261,50 @@ TEST(cudnn, abc)
    auto external = manager->compile(f);
    auto backend = manager->allocate_backend();
    auto cf = backend->make_call_frame(external);

    // Create some tensors for input/output
    shared_ptr<runtime::TensorView> a = backend->make_primary_tensor_view(element::f32, shape);
    shared_ptr<runtime::TensorView> b = backend->make_primary_tensor_view(element::f32, shape);
    shared_ptr<runtime::TensorView> c = backend->make_primary_tensor_view(element::f32, shape);
    shared_ptr<runtime::TensorView> result = backend->make_primary_tensor_view(element::f32, shape);

    copy_data(a, test::NDArray<float, 2>({{1, 2}, {3, 4}}).get_vector());
    copy_data(b, test::NDArray<float, 2>({{5, 6}, {7, 8}}).get_vector());
    copy_data(c, test::NDArray<float, 2>({{9, 10}, {11, 12}}).get_vector());

    cf->call({a, b, c}, {result});
    EXPECT_EQ(result->get_vector<float>(),
              (test::NDArray<float, 2>({{54, 80}, {110, 144}})).get_vector());

    cf->call({b, a, c}, {result});
    EXPECT_EQ(result->get_vector<float>(),
              (test::NDArray<float, 2>({{54, 80}, {110, 144}})).get_vector());

    cf->call({a, c, b}, {result});
    EXPECT_EQ(result->get_vector<float>(),
              (test::NDArray<float, 2>({{50, 72}, {98, 128}})).get_vector());
}

TEST(cudnn, dot1d)
{
    auto shape = Shape{4};
    auto A = make_shared<op::Parameter>(element::f32, shape);
    auto B = make_shared<op::Parameter>(element::f32, shape);
    auto shape_r = Shape{};
    auto f = make_shared<Function>(make_shared<op::Dot>(A, B), op::Parameters{A, B});

    auto manager = runtime::Manager::get("GPU");
    auto external = manager->compile(f);
    auto backend = manager->allocate_backend();
    auto cf = backend->make_call_frame(external);

    // Create some tensors for input/output
    auto a = backend->make_primary_tensor_view(element::f32, shape);
    copy_data(a, vector<float>{2, 4, 8, 16});
    auto b = backend->make_primary_tensor_view(element::f32, shape);
    copy_data(b, vector<float>{1, 2, 4, 8});
    auto result = backend->make_primary_tensor_view(element::f32, shape_r);

    cf->call({a, b}, {result});
    EXPECT_EQ((vector<float>{170}), result->get_vector<float>());
}