Unverified Commit da50410b authored by Tristan Webb's avatar Tristan Webb Committed by GitHub

GPU kernels for reshape, GEMM, EW ADD/Mult, Maximum (#440)

* GPU kernels for reshape, GEMM, EW ADD/Mult, Maximum

(A + B) * C test now with cuBLAS
Additional gemm and gemv calls
cmake updates for cuDNN calls
memcpy wrappers in gpu_util

Additional passing tests:
aliased outputs, parameter, constant tensor memcopy
parent 27fee946
......@@ -192,7 +192,7 @@ if (NGRAPH_CPU_ENABLE AND LLVM_INCLUDE_DIR AND
# GPU backend currently requires CPU because they share compiler.cpp,
# and compiler.cpp requires MKLDNN
if(NGRAPH_GPU_ENABLE)
include_directories(SYSTEM ${CUDA_INCLUDE_DIRS} ${CUDNN_INCLUDE_DIR})
include_directories(SYSTEM ${CUDA_INCLUDE_DIRS} ${CUDNN_INCLUDE_DIRS})
# Add sources for the GPU backend
# and all its dependencies
......@@ -275,7 +275,7 @@ endif()
# Nvidia
if(NGRAPH_GPU_ENABLE AND CUDA_LIBRARIES)
target_link_libraries(ngraph PRIVATE ${CUDA_LIBRARIES} ${CUDA_CUBLAS_LIBRARIES})
target_link_libraries(ngraph PRIVATE ${CUDA_LIBRARIES} ${CUDA_CUBLAS_LIBRARIES} ${CUDNN_LIBRARIES})
endif()
# Argon
......
This diff is collapsed.
......@@ -80,7 +80,6 @@ namespace ngraph
std::unique_ptr<codegen::Compiler> m_compiler;
std::unique_ptr<codegen::ExecutionEngine> m_execution_engine;
bool m_emit_timing;
bool m_use_tbb;
std::unordered_map<std::string, std::string> m_variable_name_map;
};
}
......
......@@ -17,8 +17,10 @@
#include <cassert>
#include <cstdlib>
#include <iostream>
#include <sstream>
#include <stddef.h>
#include <stdio.h>
#include <string>
#include "cuda.h"
#include "cuda_runtime.h"
......@@ -44,3 +46,21 @@ void runtime::gpu::check_cuda_errors(CUresult err)
{
assert(err == CUDA_SUCCESS);
}
void** runtime::gpu::create_gpu_buffer(size_t buffer_size)
{
void** allocated_buffer_pool;
cudaMalloc(&allocated_buffer_pool, buffer_size);
return allocated_buffer_pool;
}
void runtime::gpu::cuda_memcpyDtD(void* d, void* s, size_t element_count, size_t element_size)
{
size_t size_in_bytes = element_size * element_count;
cudaMemcpy(d, s, size_in_bytes, cudaMemcpyDeviceToDevice);
}
void runtime::gpu::cuda_memcpyHtD(void* d, void* s, size_t buffer_size)
{
cudaMemcpy(d, s, buffer_size, cudaMemcpyHostToDevice);
}
......@@ -22,6 +22,9 @@ namespace ngraph
{
void print_gpu_f32_tensor(void* p, size_t element_count, size_t element_size);
void check_cuda_errors(CUresult err);
void** create_gpu_buffer(size_t buffer_size);
void cuda_memcpyDtD(void* d, void* s, size_t element_count, size_t element_size);
void cuda_memcpyHtD(void* d, void* s, size_t buffer_size);
}
}
}
......@@ -83,11 +83,12 @@ if(NGRAPH_GPU_ENABLE AND LLVM_INCLUDE_DIR)
link_directories(${LLVM_LIB_DIR})
link_directories(${CUDA_LIBRARIES})
link_directories(${CUDA_CUBLAS_LIBRARIES})
link_directories(${CUDNN_LIBRARIES})
set(SRC
${SRC}
cudnn.cpp)
# Disabled for testing
# set(BACKEND_NAMES ${BACKEND_NAMES} "GPU")
set(BACKEND_NAMES ${BACKEND_NAMES} "GPU")
endif()
if(NGRAPH_ARGON_ENABLE)
......
......@@ -263,63 +263,3 @@ const auto str = R"(
auto module = compiler.compile(source);
}
// TEST(cudnn, abc)
// {
// auto shape = Shape{2, 2};
// auto A = make_shared<op::Parameter>(element::f32, shape);
// auto B = make_shared<op::Parameter>(element::f32, shape);
// auto C = make_shared<op::Parameter>(element::f32, shape);
// auto f = make_shared<Function>((A + B) * C, op::Parameters{A, B, C});
// auto manager = runtime::Manager::get("GPU");
// auto external = manager->compile(f);
// auto backend = manager->allocate_backend();
// auto cf = backend->make_call_frame(external);
// // Create some tensors for input/output
// shared_ptr<runtime::TensorView> a = backend->make_primary_tensor_view(element::f32, shape);
// shared_ptr<runtime::TensorView> b = backend->make_primary_tensor_view(element::f32, shape);
// shared_ptr<runtime::TensorView> c = backend->make_primary_tensor_view(element::f32, shape);
// shared_ptr<runtime::TensorView> result = backend->make_primary_tensor_view(element::f32, shape);
// copy_data(a, test::NDArray<float, 2>({{1, 2}, {3, 4}}).get_vector());
// copy_data(b, test::NDArray<float, 2>({{5, 6}, {7, 8}}).get_vector());
// copy_data(c, test::NDArray<float, 2>({{9, 10}, {11, 12}}).get_vector());
// cf->call({a, b, c}, {result});
// EXPECT_EQ(result->read_vector<float>(),
// (test::NDArray<float, 2>({{54, 80}, {110, 144}})).get_vector());
// cf->call({b, a, c}, {result});
// EXPECT_EQ(result->read_vector<float>(),
// (test::NDArray<float, 2>({{54, 80}, {110, 144}})).get_vector());
// cf->call({a, c, b}, {result});
// EXPECT_EQ(result->read_vector<float>(),
// (test::NDArray<float, 2>({{50, 72}, {98, 128}})).get_vector());
// }
TEST(cudnn, dot1d)
{
    // 1-D dot product of two length-4 vectors on the GPU backend;
    // the result is a single-element tensor.
    auto vec_shape = Shape{4};
    auto result_shape = Shape{1};
    auto A = make_shared<op::Parameter>(element::f32, vec_shape);
    auto B = make_shared<op::Parameter>(element::f32, vec_shape);
    auto f = make_shared<Function>(make_shared<op::Dot>(A, B), op::Parameters{A, B});

    // Compile the function for the GPU backend and build a call frame.
    auto manager = runtime::Manager::get("GPU");
    auto external = manager->compile(f);
    auto backend = manager->allocate_backend();
    auto cf = backend->make_call_frame(external);

    // Allocate input/output tensors and load the operands.
    auto lhs = backend->make_primary_tensor_view(element::f32, vec_shape);
    auto rhs = backend->make_primary_tensor_view(element::f32, vec_shape);
    auto result = backend->make_primary_tensor_view(element::f32, result_shape);
    copy_data(lhs, vector<float>{2, 4, 8, 16});
    copy_data(rhs, vector<float>{1, 2, 4, 8});

    cf->call({lhs, rhs}, {result});
    // 2*1 + 4*2 + 8*4 + 16*8 = 170
    EXPECT_EQ((vector<float>{170}), read_vector<float>(result));
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment