/*******************************************************************************
* Copyright 2017-2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#include <sstream>
#include <string>
#include <vector>

#include <gtest/gtest.h>

#include <cuda.h>
#include <cuda_runtime.h>
#include <cudnn.h>

#include "ngraph/codegen/compiler.hpp"
#include "ngraph/runtime/gpu/gpu_external_function.hpp"
#include "ngraph/ngraph.hpp"
#include "util/ndarray.hpp"
#include "util/test_tools.hpp"

using namespace ngraph;
using namespace std;

TEST(cudnn, loadTest)
{
    auto cudnn_version = cudnnGetVersion();
    EXPECT_EQ(cudnn_version, CUDNN_VERSION);
}

TEST(cudnn, compileTest)
{
    const auto source = R"###(
// Example developed from LLVM documentation https://llvm.org/docs/NVPTXUsage.html

#include <cassert>
#include <fstream>
#include <iostream>
#include "cublas_v2.h"
#include "cuda.h"

void check_cuda_errors(CUresult err) { assert(err == CUDA_SUCCESS); }

/// main - Program entry point
int main(int argc, char** argv)
{
    CUdevice device;
    CUmodule cuda_module;
    CUcontext context;
    CUfunction function;
    CUlinkState linker;
    int dev_count;

    // Cublas init
    cudaError_t cudaStat;
    cublasStatus_t stat;
    cublasHandle_t handle;
    stat = cublasCreate(&handle);
    cublasDestroy(handle);

    // CUDA initialization
    check_cuda_errors(cuInit(0));
    check_cuda_errors(cuDeviceGetCount(&dev_count));
    check_cuda_errors(cuDeviceGet(&device, 0));

    char name[128];
    check_cuda_errors(cuDeviceGetName(name, 128, device));
    std::cout << "Using CUDA Device [0]: " << name << "\n";

    int dev_major, dev_minor;
    check_cuda_errors(cuDeviceComputeCapability(&dev_major, &dev_minor, device));
    std::cout << "Device Compute Capability: " << dev_major << "." << dev_minor << "\n";
    if (dev_major < 2)
    {
        std::cerr << "ERROR: Device 0 is not SM 2.0 or greater\n";
        return 1;
    }

    const auto str = R"(
.version 5.0
.target sm_60
.address_size 64

// .globl _Z7ew_multPfS_S_        // -- Begin function _Z7ew_multPfS_S_
.global .align 1 .b8 threadIdx[1];
// @_Z7ew_multPfS_S_
.visible .entry _Z7ew_multPfS_S_(
    .param .u64 _Z7ew_multPfS_S__param_0,
    .param .u64 _Z7ew_multPfS_S__param_1,
    .param .u64 _Z7ew_multPfS_S__param_2
)
{
    .local .align 8 .b8 __local_depot0[24];
    .reg .b64 %SP;
    .reg .b64 %SPL;
    .reg .f32 %f<4>;
    .reg .b32 %r<2>;
    .reg .b64 %rd<17>;
// BB#0:
    mov.u64 %SPL, __local_depot0;
    cvta.local.u64 %SP, %SPL;
    ld.param.u64 %rd3, [_Z7ew_multPfS_S__param_2];
    ld.param.u64 %rd2, [_Z7ew_multPfS_S__param_1];
    ld.param.u64 %rd1, [_Z7ew_multPfS_S__param_0];
    cvta.to.global.u64 %rd4, %rd3;
    cvta.global.u64 %rd5, %rd4;
    cvta.to.global.u64 %rd6, %rd2;
    cvta.global.u64 %rd7, %rd6;
    cvta.to.global.u64 %rd8, %rd1;
    cvta.global.u64 %rd9, %rd8;
    st.u64 [%SP+0], %rd9;
    st.u64 [%SP+8], %rd7;
    st.u64 [%SP+16], %rd5;
    ld.u64 %rd10, [%SP+0];
    mov.u32 %r1, %tid.x;
    mul.wide.u32 %rd11, %r1, 4;
    add.s64 %rd12, %rd10, %rd11;
    ld.f32 %f1, [%rd12];
    ld.u64 %rd13, [%SP+8];
    add.s64 %rd14, %rd13, %rd11;
    ld.f32 %f2, [%rd14];
    mul.rn.f32 %f3, %f1, %f2;
    ld.u64 %rd15, [%SP+16];
    add.s64 %rd16, %rd15, %rd11;
    st.f32 [%rd16], %f3;
    ret;
}
// -- End function

// .globl _Z6ew_addPfS_S_         // -- Begin function _Z6ew_addPfS_S_
.visible .entry _Z6ew_addPfS_S_(
    .param .u64 _Z6ew_addPfS_S__param_0,
    .param .u64 _Z6ew_addPfS_S__param_1,
    .param .u64 _Z6ew_addPfS_S__param_2
)
// @_Z6ew_addPfS_S_
{
    .local .align 8 .b8 __local_depot1[24];
    .reg .b64 %SP;
    .reg .b64 %SPL;
    .reg .f32 %f<4>;
    .reg .b32 %r<2>;
    .reg .b64 %rd<17>;
// BB#0:
    mov.u64 %SPL, __local_depot1;
    cvta.local.u64 %SP, %SPL;
    ld.param.u64 %rd3, [_Z6ew_addPfS_S__param_2];
    ld.param.u64 %rd2, [_Z6ew_addPfS_S__param_1];
    ld.param.u64 %rd1, [_Z6ew_addPfS_S__param_0];
    cvta.to.global.u64 %rd4, %rd3;
    cvta.global.u64 %rd5, %rd4;
    cvta.to.global.u64 %rd6, %rd2;
    cvta.global.u64 %rd7, %rd6;
    cvta.to.global.u64 %rd8, %rd1;
    cvta.global.u64 %rd9, %rd8;
    st.u64 [%SP+0], %rd9;
    st.u64 [%SP+8], %rd7;
    st.u64 [%SP+16], %rd5;
    ld.u64 %rd10, [%SP+0];
    mov.u32 %r1, %tid.x;
    mul.wide.u32 %rd11, %r1, 4;
    add.s64 %rd12, %rd10, %rd11;
    ld.f32 %f1, [%rd12];
    ld.u64 %rd13, [%SP+8];
    add.s64 %rd14, %rd13, %rd11;
    ld.f32 %f2, [%rd14];
    add.rn.f32 %f3, %f1, %f2;
    ld.u64 %rd15, [%SP+16];
    add.s64 %rd16, %rd15, %rd11;
    st.f32 [%rd16], %f3;
    ret;
}
// -- End function
)";

    // Create driver context
    check_cuda_errors(cuCtxCreate(&context, 0, device));

    // Create module for object
    check_cuda_errors(cuModuleLoadDataEx(&cuda_module, str, 0, 0, 0));

    // Get kernel function
    check_cuda_errors(cuModuleGetFunction(&function, cuda_module, "_Z7ew_multPfS_S_"));

    // Device data
    CUdeviceptr dev_bufferA;
    CUdeviceptr dev_bufferB;
    CUdeviceptr dev_bufferC;

    check_cuda_errors(cuMemAlloc(&dev_bufferA, sizeof(float) * 16));
    check_cuda_errors(cuMemAlloc(&dev_bufferB, sizeof(float) * 16));
    check_cuda_errors(cuMemAlloc(&dev_bufferC, sizeof(float) * 16));

    float* host_A = new float[16];
    float* host_B = new float[16];
    float* host_C = new float[16];

    // Populate input
    for (unsigned i = 0; i != 16; ++i)
    {
        host_A[i] = (float)i;
        host_B[i] = (float)(2 * i);
        host_C[i] = 0.0f;
    }

    check_cuda_errors(cuMemcpyHtoD(dev_bufferA, &host_A[0], sizeof(float) * 16));
    check_cuda_errors(cuMemcpyHtoD(dev_bufferB, &host_B[0], sizeof(float) * 16));

    unsigned block_size_X = 16;
    unsigned block_size_Y = 1;
    unsigned block_size_Z = 1;
    unsigned grid_size_X = 1;
    unsigned grid_size_Y = 1;
    unsigned grid_size_Z = 1;
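    // Launch configuration (added note): the buffers hold 16 floats, so a single
    // 16-thread block covers the whole array; each thread handles one element,
    // indexed by %tid.x in the PTX above.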

    // Kernel parameters
    void* kernel_params[] = {&dev_bufferA, &dev_bufferB, &dev_bufferC};

    std::cout << "Launching kernel\n";

    // Kernel launch
    check_cuda_errors(cuLaunchKernel(function,
                                     grid_size_X,
                                     grid_size_Y,
                                     grid_size_Z,
                                     block_size_X,
                                     block_size_Y,
                                     block_size_Z,
                                     0,
                                     NULL,
                                     kernel_params,
                                     NULL));

    // Retrieve device data
    check_cuda_errors(cuMemcpyDtoH(&host_C[0], dev_bufferC, sizeof(float) * 16));

    std::cout << "Results:\n";
    for (unsigned i = 0; i != 16; ++i)
    {
        std::cout << host_A[i] << " * " << host_B[i] << " = " << host_C[i] << "\n";
    }

    // Clean up after ourselves
    delete[] host_A;
    delete[] host_B;
    delete[] host_C;

    // Clean-up
    check_cuda_errors(cuMemFree(dev_bufferA));
    check_cuda_errors(cuMemFree(dev_bufferB));
    check_cuda_errors(cuMemFree(dev_bufferC));
    check_cuda_errors(cuModuleUnload(cuda_module));
    check_cuda_errors(cuCtxDestroy(context));

    return 0;
})###";

    codegen::Compiler compiler;
    auto module = compiler.compile(source);
}
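
// Illustrative sketch (not part of the original test file): a minimal check that a
// cuDNN handle can be created and destroyed. It uses only cudnnCreate/cudnnDestroy
// from <cudnn.h>, which is already included above.
TEST(cudnn, handleCreateDestroy)
{
    cudnnHandle_t handle;
    EXPECT_EQ(CUDNN_STATUS_SUCCESS, cudnnCreate(&handle));
    EXPECT_EQ(CUDNN_STATUS_SUCCESS, cudnnDestroy(handle));
}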