Unverified Commit da50410b authored by Tristan Webb's avatar Tristan Webb Committed by GitHub

GPU kernels for reshape, GEMM, EW ADD/Mult, Maximum (#440)

* GPU kernels for reshape, GEMM, EW ADD/Mult, Maximum

(A + B) * C test now with cuBLAS
Additional gemm and gemv calls
cmake updates for cuDNN calls
memcpy wrappers in gpu_util

Additional passing tests:
aliased outputs, parameter, constant tensor memcopy
parent 27fee946
......@@ -192,7 +192,7 @@ if (NGRAPH_CPU_ENABLE AND LLVM_INCLUDE_DIR AND
# GPU backend currently requires CPU because they share compiler.cpp,
# and compiler.cpp requires MKLDNN
if(NGRAPH_GPU_ENABLE)
include_directories(SYSTEM ${CUDA_INCLUDE_DIRS} ${CUDNN_INCLUDE_DIR})
include_directories(SYSTEM ${CUDA_INCLUDE_DIRS} ${CUDNN_INCLUDE_DIRS})
# Add sources for the GPU backend
# and all its dependencies
......@@ -275,7 +275,7 @@ endif()
# Nvidia
if(NGRAPH_GPU_ENABLE AND CUDA_LIBRARIES)
target_link_libraries(ngraph PRIVATE ${CUDA_LIBRARIES} ${CUDA_CUBLAS_LIBRARIES})
target_link_libraries(ngraph PRIVATE ${CUDA_LIBRARIES} ${CUDA_CUBLAS_LIBRARIES} ${CUDNN_LIBRARIES})
endif()
# Argon
......
This diff is collapsed.
......@@ -80,7 +80,6 @@ namespace ngraph
std::unique_ptr<codegen::Compiler> m_compiler;
std::unique_ptr<codegen::ExecutionEngine> m_execution_engine;
bool m_emit_timing;
bool m_use_tbb;
std::unordered_map<std::string, std::string> m_variable_name_map;
};
}
......
......@@ -17,8 +17,10 @@
#include <cassert>
#include <cstdlib>
#include <iostream>
#include <sstream>
#include <stddef.h>
#include <stdio.h>
#include <string>
#include "cuda.h"
#include "cuda_runtime.h"
......@@ -44,3 +46,21 @@ void runtime::gpu::check_cuda_errors(CUresult err)
{
assert(err == CUDA_SUCCESS);
}
void** runtime::gpu::create_gpu_buffer(size_t buffer_size)
{
void** allocated_buffer_pool;
cudaMalloc(&allocated_buffer_pool, buffer_size);
return allocated_buffer_pool;
}
void runtime::gpu::cuda_memcpyDtD(void* d, void* s, size_t element_count, size_t element_size)
{
size_t size_in_bytes = element_size * element_count;
cudaMemcpy(d, s, size_in_bytes, cudaMemcpyDeviceToDevice);
}
void runtime::gpu::cuda_memcpyHtD(void* d, void* s, size_t buffer_size)
{
cudaMemcpy(d, s, buffer_size, cudaMemcpyHostToDevice);
}
......@@ -22,6 +22,9 @@ namespace ngraph
{
void print_gpu_f32_tensor(void* p, size_t element_count, size_t element_size);
void check_cuda_errors(CUresult err);
void** create_gpu_buffer(size_t buffer_size);
void cuda_memcpyDtD(void* d, void* s, size_t element_count, size_t element_size);
void cuda_memcpyHtD(void* d, void* s, size_t buffer_size);
}
}
}
......@@ -83,11 +83,12 @@ if(NGRAPH_GPU_ENABLE AND LLVM_INCLUDE_DIR)
link_directories(${LLVM_LIB_DIR})
link_directories(${CUDA_LIBRARIES})
link_directories(${CUDA_CUBLAS_LIBRARIES})
link_directories(${CUDNN_LIBRARIES})
set(SRC
${SRC}
cudnn.cpp)
# Disabled for testing
# set(BACKEND_NAMES ${BACKEND_NAMES} "GPU")
set(BACKEND_NAMES ${BACKEND_NAMES} "GPU")
endif()
if(NGRAPH_ARGON_ENABLE)
......
......@@ -263,63 +263,3 @@ const auto str = R"(
auto module = compiler.compile(source);
}
// TEST(cudnn, abc)
// {
// auto shape = Shape{2, 2};
// auto A = make_shared<op::Parameter>(element::f32, shape);
// auto B = make_shared<op::Parameter>(element::f32, shape);
// auto C = make_shared<op::Parameter>(element::f32, shape);
// auto f = make_shared<Function>((A + B) * C, op::Parameters{A, B, C});
// auto manager = runtime::Manager::get("GPU");
// auto external = manager->compile(f);
// auto backend = manager->allocate_backend();
// auto cf = backend->make_call_frame(external);
// // Create some tensors for input/output
// shared_ptr<runtime::TensorView> a = backend->make_primary_tensor_view(element::f32, shape);
// shared_ptr<runtime::TensorView> b = backend->make_primary_tensor_view(element::f32, shape);
// shared_ptr<runtime::TensorView> c = backend->make_primary_tensor_view(element::f32, shape);
// shared_ptr<runtime::TensorView> result = backend->make_primary_tensor_view(element::f32, shape);
// copy_data(a, test::NDArray<float, 2>({{1, 2}, {3, 4}}).get_vector());
// copy_data(b, test::NDArray<float, 2>({{5, 6}, {7, 8}}).get_vector());
// copy_data(c, test::NDArray<float, 2>({{9, 10}, {11, 12}}).get_vector());
// cf->call({a, b, c}, {result});
// EXPECT_EQ(result->read_vector<float>(),
// (test::NDArray<float, 2>({{54, 80}, {110, 144}})).get_vector());
// cf->call({b, a, c}, {result});
// EXPECT_EQ(result->read_vector<float>(),
// (test::NDArray<float, 2>({{54, 80}, {110, 144}})).get_vector());
// cf->call({a, c, b}, {result});
// EXPECT_EQ(result->read_vector<float>(),
// (test::NDArray<float, 2>({{50, 72}, {98, 128}})).get_vector());
// }
TEST(cudnn, dot1d)
{
    // 1-D dot product of two length-4 vectors on the GPU backend;
    // the result is a single-element tensor.
    auto vec_shape = Shape{4};
    auto result_shape = Shape{1};
    auto A = make_shared<op::Parameter>(element::f32, vec_shape);
    auto B = make_shared<op::Parameter>(element::f32, vec_shape);
    auto f = make_shared<Function>(make_shared<op::Dot>(A, B), op::Parameters{A, B});

    // Compile the function for the GPU backend and build a call frame.
    auto manager = runtime::Manager::get("GPU");
    auto external = manager->compile(f);
    auto backend = manager->allocate_backend();
    auto cf = backend->make_call_frame(external);

    // Allocate input/output tensors and load the operands.
    auto lhs = backend->make_primary_tensor_view(element::f32, vec_shape);
    auto rhs = backend->make_primary_tensor_view(element::f32, vec_shape);
    auto result = backend->make_primary_tensor_view(element::f32, result_shape);
    copy_data(lhs, vector<float>{2, 4, 8, 16});
    copy_data(rhs, vector<float>{1, 2, 4, 8});

    cf->call({lhs, rhs}, {result});
    // 2*1 + 4*2 + 8*4 + 16*8 = 170
    EXPECT_EQ((vector<float>{170}), read_vector<float>(result));
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment