Unverified commit da50410b authored by Tristan Webb, committed by GitHub

GPU kernels for reshape, GEMM, EW ADD/Mult, Maximum (#440)

* GPU kernels for reshape, GEMM, EW ADD/Mult, Maximum

(A + B) * C test now with cuBLAS
Additional gemm and gemv calls
CMake updates for cuDNN
memcpy wrappers in gpu_util

Additional passing tests:
aliased outputs, parameter and constant tensor memcpy
parent 27fee946
......@@ -192,7 +192,7 @@ if (NGRAPH_CPU_ENABLE AND LLVM_INCLUDE_DIR AND
# GPU backend currently requires CPU because they share compiler.cpp,
# and compiler.cpp requires MKLDNN
if(NGRAPH_GPU_ENABLE)
include_directories(SYSTEM ${CUDA_INCLUDE_DIRS} ${CUDNN_INCLUDE_DIR})
include_directories(SYSTEM ${CUDA_INCLUDE_DIRS} ${CUDNN_INCLUDE_DIRS})
# Add sources for the GPU backend
# and all its dependencies
......@@ -275,7 +275,7 @@ endif()
# Nvidia
if(NGRAPH_GPU_ENABLE AND CUDA_LIBRARIES)
target_link_libraries(ngraph PRIVATE ${CUDA_LIBRARIES} ${CUDA_CUBLAS_LIBRARIES})
target_link_libraries(ngraph PRIVATE ${CUDA_LIBRARIES} ${CUDA_CUBLAS_LIBRARIES} ${CUDNN_LIBRARIES})
endif()
# Argon
......
......@@ -50,6 +50,7 @@ void runtime::gpu::GPU_Emitter::EmitNop(codegen::CodeWriter& writer,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw ngraph_error("Op not supported in GPU Backend");
}
void runtime::gpu::GPU_Emitter::EmitAbs(codegen::CodeWriter& writer,
......@@ -57,6 +58,7 @@ void runtime::gpu::GPU_Emitter::EmitAbs(codegen::CodeWriter& writer,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw ngraph_error("Op not supported in GPU Backend");
}
void runtime::gpu::GPU_Emitter::EmitAdd(codegen::CodeWriter& writer,
......@@ -64,6 +66,49 @@ void runtime::gpu::GPU_Emitter::EmitAdd(codegen::CodeWriter& writer,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
const Shape& arg0_shape = args[0].get_shape();
const Shape& arg1_shape = args[1].get_shape();
if ((arg0_shape.size() <= 2) && (arg1_shape.size() <= 2))
{
writer << "{ // " << n->get_name() << "\n";
writer.indent++;
writer << "static const float alpha = 1.0;\n";
writer << "cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_HOST);\n";
// clang-format off
writer << "cublasScopy("
<< "cublas_handle,"
<< out[0].get_size() << ","
<< args[0].get_name() << ","
<< "1,"
<< out[0].get_name() << ","
<< "1);\n";
writer << "cublasSaxpy("
<< "cublas_handle,"
<< out[0].get_size() << ","
<< "&alpha,"
<< args[1].get_name() << ","
<< "1,"
<< out[0].get_name() << ","
<< "1);\n";
// clang-format on
writer << "cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_DEVICE);\n";
writer.indent--;
writer << "}\n";
}
else if ((arg0_shape.size() == 2) && (arg1_shape.size() == 1))
{
throw ngraph_error("Argument shape not supported");
}
else if ((arg0_shape.size() == 2) && (arg1_shape.size() == 2))
{
// GEMM Call
throw ngraph_error("Argument shape not supported");
}
else
{
// General ND Call?
throw ngraph_error("Argument shape not supported");
}
}
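For reference, the emitted code for this rank <= 2 case reduces to the following host-side pattern; a minimal sketch with illustrative names, assuming float tensors of equal size already resident on the device and a live cuBLAS handle:

#include <cublas_v2.h>

// out = in0 + in1 over n device-resident floats: copy in0 into out, then
// accumulate in1 with axpy (alpha = 1).
void gpu_add(cublasHandle_t handle, const float* in0, const float* in1, float* out, int n)
{
    static const float alpha = 1.0f;
    // alpha is a host pointer, so switch the handle to host pointer mode.
    cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST);
    cublasScopy(handle, n, in0, 1, out, 1);          // out = in0
    cublasSaxpy(handle, n, &alpha, in1, 1, out, 1);  // out = alpha * in1 + out
    cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE);
}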
void runtime::gpu::GPU_Emitter::EmitConcat(codegen::CodeWriter& writer,
......@@ -84,33 +129,96 @@ void runtime::gpu::GPU_Emitter::EmitDot(codegen::CodeWriter& writer,
{
auto& first = (arg0_shape.empty() ? args[0] : args[1]);
auto& second = (arg0_shape.empty() ? args[1] : args[0]);
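// Note: cublasSdot writes its result through the final pointer argument, which
// here is the output tensor on the device; this relies on the handle being in
// CUBLAS_POINTER_MODE_DEVICE, the mode the other emitters restore after use.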
writer << "{ // " << n->get_name() << "\n";
writer.indent++;
// clang-format off
writer << "cublasSdot("
<< "cublas_handle,"
<< second.get_size() << ","
<< first.get_name() << ","
<< "1,"
<< second.get_name() << ","
<< "1,"
<< out[0].get_name() << ");\n";
// clang-format on
writer.indent--;
writer << "}\n";
}
else if ((arg0_shape.size() == 1) && (arg1_shape.size() == 1))
{
// TODO Assert arg0_shape[0] == arg1_shape[0]?
writer << "{ // " << n->get_name() << "\n";
writer.indent++;
// clang-format off
writer << "cublasSdot("
<< "cublas_handle,"
<< arg0_shape[0] << ","
<< args[0].get_name() << ","
<< "1,"
<< args[1].get_name() << ","
<< "1,"
<< out[0].get_name() << ");\n";
// clang-format on
writer.indent--;
writer << "}\n";
}
else if ((arg0_shape.size() == 2) && (arg1_shape.size() == 1))
{
writer << "{ // " << n->get_name() << "\n";
writer.indent++;
writer << "static const float alpha = 1.0;\n";
writer << "static const float beta = 1.0;\n";
writer << "cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_HOST);\n";
// clang-format off
writer << "cublasSgemv("
<< "cublas_handle,"
<< "CUBLAS_OP_T,"
<< arg0_shape[1] << ","
<< arg0_shape[0] << ","
<< "&alpha,"
<< args[0].get_name() << ","
<< arg0_shape[1] << ","
<< args[1].get_name() << ","
<< "1,"
<< "&beta,"
<< out[0].get_name() << ","
<< "1);\n";
// clang-format on
writer << "cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_DEVICE);\n";
writer.indent--;
writer << "}\n";
}
else if ((arg0_shape.size() == 2) && (arg1_shape.size() == 2))
{
// GEMM Call
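// Row-major trick: cuBLAS is column-major, so rather than transposing we
// compute out^T = arg1^T * arg0^T; in column-major storage that is exactly
// the call below, with the operands swapped and no explicit transposes.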
writer << "{ // " << n->get_name() << "\n";
writer.indent++;
writer << "static const float alpha = 1.0;\n";
writer << "static const float beta = 0.0;\n";
writer << "int m = " << arg0_shape[0] << ";\n";
writer << "int n = " << arg1_shape[1] << ";\n";
writer << "int k = " << arg0_shape[0] << ";\n";
writer << "cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_HOST);\n";
// clang-format off
writer << "cublasSgemm("
<< "cublas_handle,"
<< "CUBLAS_OP_N,"
<< "CUBLAS_OP_N,"
<< "n,"
<< "m,"
<< "k,"
<< "&alpha,"
<< args[1].get_name() << ","
<< "n,"
<< args[0].get_name() << ","
<< "k,"
<< "&beta,"
<< out[0].get_name() << ","
<< "n);\n";
// clang-format on
writer << "cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_DEVICE);\n";
writer.indent--;
writer << "}\n";
}
else
{
......@@ -123,6 +231,7 @@ void runtime::gpu::GPU_Emitter::EmitDivide(codegen::CodeWriter& writer,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw ngraph_error("Op not supported in GPU Backend");
}
void runtime::gpu::GPU_Emitter::EmitEqual(codegen::CodeWriter& writer,
......@@ -130,6 +239,7 @@ void runtime::gpu::GPU_Emitter::EmitEqual(codegen::CodeWriter& writer,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw ngraph_error("Op not supported in GPU Backend");
}
void runtime::gpu::GPU_Emitter::EmitGreater(codegen::CodeWriter& writer,
......@@ -137,6 +247,7 @@ void runtime::gpu::GPU_Emitter::EmitGreater(codegen::CodeWriter& writer,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw ngraph_error("Op not supported in GPU Backend");
}
void runtime::gpu::GPU_Emitter::EmitGreaterEq(
......@@ -145,6 +256,7 @@ void runtime::gpu::GPU_Emitter::EmitGreaterEq(
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw ngraph_error("Op not supported in GPU Backend");
}
void runtime::gpu::GPU_Emitter::EmitLess(codegen::CodeWriter& writer,
......@@ -152,6 +264,7 @@ void runtime::gpu::GPU_Emitter::EmitLess(codegen::CodeWriter& writer,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw ngraph_error("Op not supported in GPU Backend");
}
void runtime::gpu::GPU_Emitter::EmitLessEq(codegen::CodeWriter& writer,
......@@ -159,6 +272,7 @@ void runtime::gpu::GPU_Emitter::EmitLessEq(codegen::CodeWriter& writer,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw ngraph_error("Op not supported in GPU Backend");
}
void runtime::gpu::GPU_Emitter::EmitLog(codegen::CodeWriter& writer,
......@@ -166,6 +280,7 @@ void runtime::gpu::GPU_Emitter::EmitLog(codegen::CodeWriter& writer,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw ngraph_error("Op not supported in GPU Backend");
}
void runtime::gpu::GPU_Emitter::EmitMaximum(codegen::CodeWriter& writer,
......@@ -173,6 +288,48 @@ void runtime::gpu::GPU_Emitter::EmitMaximum(codegen::CodeWriter& writer,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
const Shape& arg0_shape = args[0].get_shape();
const Shape& arg1_shape = args[1].get_shape();
writer << "{ // " << n->get_name() << "\n";
writer.indent++;
writer << "static const int count = " << out[0].get_size() << ";\n";
writer << "static const float alpha1 = 1.0, alpha2 = 1.0, beta = 0;\n";
// TODO Move cudnn creation to backend initialization
writer += R"(
cudnnHandle_t cudnnHandle;
(cudnnCreate(&cudnnHandle));
cudnnTensorDescriptor_t descriptor;
(cudnnCreateTensorDescriptor(&descriptor));
(cudnnSetTensor4dDescriptor(descriptor,
/*format=*/CUDNN_TENSOR_NHWC,
/*dataType=*/CUDNN_DATA_FLOAT,
/*batch_size=*/1,
/*channels=*/1,
/*image_height=*/1,
/*image_width=*/count));
cudnnOpTensorDescriptor_t opTensorDesc;
(cudnnCreateOpTensorDescriptor(&opTensorDesc));
(cudnnSetOpTensorDescriptor(opTensorDesc,
CUDNN_OP_TENSOR_MAX,
CUDNN_DATA_FLOAT,
CUDNN_NOT_PROPAGATE_NAN));
)";
// clang-format off
writer << "cudnnOpTensor(cudnnHandle,"
<< "opTensorDesc,"
<< "&alpha1,"
<< "descriptor," << args[0].get_name() << ","
<< "&alpha2,"
<< "descriptor," << args[1].get_name() << ","
<< "&beta,"
<< "descriptor," << out[0].get_name() << ");\n";
// clang-format on
writer << "(cudnnDestroy(cudnnHandle));\n";
writer.indent--;
writer << "}\n";
}
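A minimal standalone sketch of the cudnnOpTensor pattern generated above (illustrative names; assumes device-resident float buffers, and does the handle and descriptor setup inline that the TODO suggests moving into backend initialization):

#include <cudnn.h>

// out[i] = max(in0[i], in1[i]) over n device-resident floats.
void gpu_maximum(const float* in0, const float* in1, float* out, int n)
{
    const float alpha1 = 1.0f, alpha2 = 1.0f, beta = 0.0f;
    cudnnHandle_t cudnn;
    cudnnCreate(&cudnn);

    // Describe the flat buffer as a 1 x 1 x 1 x n NHWC tensor.
    cudnnTensorDescriptor_t desc;
    cudnnCreateTensorDescriptor(&desc);
    cudnnSetTensor4dDescriptor(desc, CUDNN_TENSOR_NHWC, CUDNN_DATA_FLOAT, 1, 1, 1, n);

    cudnnOpTensorDescriptor_t op_desc;
    cudnnCreateOpTensorDescriptor(&op_desc);
    cudnnSetOpTensorDescriptor(op_desc, CUDNN_OP_TENSOR_MAX, CUDNN_DATA_FLOAT,
                               CUDNN_NOT_PROPAGATE_NAN);

    cudnnOpTensor(cudnn, op_desc, &alpha1, desc, in0, &alpha2, desc, in1,
                  &beta, desc, out);

    cudnnDestroyOpTensorDescriptor(op_desc);
    cudnnDestroyTensorDescriptor(desc);
    cudnnDestroy(cudnn);
}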
void runtime::gpu::GPU_Emitter::EmitMinimum(codegen::CodeWriter& writer,
......@@ -180,6 +337,7 @@ void runtime::gpu::GPU_Emitter::EmitMinimum(codegen::CodeWriter& writer,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw ngraph_error("Op not supported in GPU Backend");
}
void runtime::gpu::GPU_Emitter::EmitNegative(
......@@ -188,6 +346,7 @@ void runtime::gpu::GPU_Emitter::EmitNegative(
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw ngraph_error("Op not supported in GPU Backend");
}
void runtime::gpu::GPU_Emitter::EmitNotEqual(
......@@ -196,12 +355,14 @@ void runtime::gpu::GPU_Emitter::EmitNotEqual(
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw ngraph_error("Op not supported in GPU Backend");
}
void runtime::gpu::GPU_Emitter::EmitSelect(codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw ngraph_error("Op not supported in GPU Backend");
}
void runtime::gpu::GPU_Emitter::EmitSubtract(
......@@ -210,6 +371,7 @@ void runtime::gpu::GPU_Emitter::EmitSubtract(
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw ngraph_error("Op not supported in GPU Backend");
}
void runtime::gpu::GPU_Emitter::EmitBroadcast(
......@@ -218,6 +380,7 @@ void runtime::gpu::GPU_Emitter::EmitBroadcast(
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw ngraph_error("Op not supported in GPU Backend");
}
void runtime::gpu::GPU_Emitter::EmitConvert(codegen::CodeWriter& writer,
......@@ -225,6 +388,7 @@ void runtime::gpu::GPU_Emitter::EmitConvert(codegen::CodeWriter& writer,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw ngraph_error("Op not supported in GPU Backend");
}
void runtime::gpu::GPU_Emitter::EmitConstant(
......@@ -233,6 +397,7 @@ void runtime::gpu::GPU_Emitter::EmitConstant(
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw ngraph_error("Op not supported in GPU Backend");
}
void runtime::gpu::GPU_Emitter::EmitReshape(codegen::CodeWriter& writer,
......@@ -240,6 +405,72 @@ void runtime::gpu::GPU_Emitter::EmitReshape(codegen::CodeWriter& writer,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
auto reshape = static_cast<const op::Reshape*>(n);
writer << "{ // " << n->get_name() << "\n";
writer.indent++;
auto arg_shape = args[0].get_shape();
auto arg_rank = arg_shape.size();
auto result_shape = out[0].get_shape();
auto& result_element_type = out[0].get_element_type();
auto input_order = reshape->get_input_order();
bool same_layout = is_sorted(input_order.begin(), input_order.end());
size_t result_shape_product = 1;
for (auto i : result_shape)
{
result_shape_product *= i;
}
// If there is no layout change or we are just going from 1^n to 1^m or a zero-size tensor,
// we can just copy.
if (same_layout || result_shape_product < 2)
{
writer << "{ // " << n->get_name() << " 1\n";
writer.indent++;
writer << "runtime::gpu::cuda_memcpyDtD(" << out[0].get_name() << ", " << args[0].get_name()
<< ", " << out[0].get_size() << "," << out[0].get_element_type().size() << ");\n";
writer.indent--;
writer << "}\n";
}
// If there *is* a layout change in the 2D case, we transpose the input.
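// cublasSgeam computes C = alpha*op(A) + beta*op(B); with beta = 0 it serves
// as an out-of-place transpose: the row-major input is read as a column-major
// matrix and written back, transposed, into the output buffer.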
else if (arg_rank == 2)
{
writer << "{ // " << n->get_name() << "\n";
writer.indent++;
writer << "static const float alpha = 1.0;\n";
writer << "static const float beta = 0.0;\n";
writer << "cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_HOST);\n";
// clang-format off
writer << "cublasSgeam("
<< "cublas_handle,"
<< "CUBLAS_OP_T,"
<< "CUBLAS_OP_T,"
<< arg_shape[0] << ","
<< arg_shape[1] << ","
<< "&alpha,"
<< args[0].get_name() << ","
<< arg_shape[1] << ","
<< "&beta,"
<< args[0].get_name() << ","
<< arg_shape[1] << ","
<< out[0].get_name() << ","
<< out[0].get_shape()[1] << ");\n";
// clang-format on
writer << "cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_DEVICE);\n";
writer.indent--;
writer << "}\n";
}
// Other cases (reordering of axes for tensors with rank>2) are not handled yet.
else
{
throw ngraph_error(
"Axis permutation in reshape is not implemented yet for tensors with rank>2");
}
writer.indent--;
writer << "}\n";
}
void runtime::gpu::GPU_Emitter::EmitFunctionCall(
......@@ -248,6 +479,7 @@ void runtime::gpu::GPU_Emitter::EmitFunctionCall(
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw ngraph_error("Op not supported in GPU Backend");
}
void runtime::gpu::GPU_Emitter::EmitReduce(codegen::CodeWriter& writer,
......@@ -255,6 +487,7 @@ void runtime::gpu::GPU_Emitter::EmitReduce(codegen::CodeWriter& writer,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw ngraph_error("Op not supported in GPU Backend");
}
void runtime::gpu::GPU_Emitter::EmitSign(codegen::CodeWriter& writer,
......@@ -262,6 +495,7 @@ void runtime::gpu::GPU_Emitter::EmitSign(codegen::CodeWriter& writer,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw ngraph_error("Op not supported in GPU Backend");
}
void runtime::gpu::GPU_Emitter::EmitSlice(codegen::CodeWriter& writer,
......@@ -269,6 +503,7 @@ void runtime::gpu::GPU_Emitter::EmitSlice(codegen::CodeWriter& writer,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw ngraph_error("Op not supported in GPU Backend");
}
void runtime::gpu::GPU_Emitter::EmitSum(codegen::CodeWriter& writer,
......@@ -276,6 +511,7 @@ void runtime::gpu::GPU_Emitter::EmitSum(codegen::CodeWriter& writer,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw ngraph_error("Op not supported in GPU Backend");
}
void runtime::gpu::GPU_Emitter::EmitMultiply(
......@@ -284,6 +520,36 @@ void runtime::gpu::GPU_Emitter::EmitMultiply(
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
const Shape& arg0_shape = args[0].get_shape();
const Shape& arg1_shape = args[1].get_shape();
// Until we have EW kernel gen, use cuBLAS
// From https://stackoverflow.com/questions/7621520/element-wise-vector-vector-multiplication-in-blas/7634831
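// Treating args[0] as the diagonal (bandwidth k = 0) of a symmetric banded
// matrix, Ssbmv computes out = diag(args[0]) * args[1], i.e. the element-wise
// product of the two tensors.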
writer << "{ // " << n->get_name() << "\n";
writer.indent++;
writer << "static const float alpha = 1.0;\n";
writer << "static const float beta = 0.0;\n";
writer << "cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_HOST);\n";;
// clang-format off
writer << "cublasSsbmv("
<< "cublas_handle,"
<< "CUBLAS_FILL_MODE_LOWER," // Corresponds to FORTRAN "L"
<< out[0].get_size() << "," // N = input size
<< "0," // k = super-diagonal i.e. just use the diagonal of A
<< "&alpha," // alpha
<< args[0].get_name() << "," // vec A (broadcast to a matrix)
<< "1," // LDA = 1
<< args[1].get_name() << "," // vector x
<< "1," // Stride x
<< "&beta," // beta
<< out[0].get_name() << "," // y
<< "1" // Stride y
<< ");\n";
// clang-format on
writer << "cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_DEVICE);\n";
writer.indent--;
writer << "}\n";
}
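The same trick outside the emitter, as a minimal sketch (illustrative names; assumes device-resident float buffers and a live cuBLAS handle):

#include <cublas_v2.h>

// out[i] = in0[i] * in1[i]: in0 acts as the diagonal of a zero-bandwidth
// symmetric banded matrix, so Ssbmv multiplies it element-wise against in1.
void gpu_multiply(cublasHandle_t handle, const float* in0, const float* in1, float* out, int n)
{
    static const float alpha = 1.0f;
    static const float beta = 0.0f;
    cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST);
    cublasSsbmv(handle, CUBLAS_FILL_MODE_LOWER, n, /*k=*/0, &alpha,
                in0, /*lda=*/1, in1, 1, &beta, out, 1);
    cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE);
}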
void runtime::gpu::GPU_Emitter::EmitExp(codegen::CodeWriter& writer,
......@@ -291,6 +557,7 @@ void runtime::gpu::GPU_Emitter::EmitExp(codegen::CodeWriter& writer,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw ngraph_error("Op not supported in GPU Backend");
}
void runtime::gpu::GPU_Emitter::EmitSin(codegen::CodeWriter& writer,
......@@ -298,6 +565,7 @@ void runtime::gpu::GPU_Emitter::EmitSin(codegen::CodeWriter& writer,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw ngraph_error("Op not supported in GPU Backend");
}
void runtime::gpu::GPU_Emitter::EmitSinh(codegen::CodeWriter& writer,
......@@ -305,6 +573,7 @@ void runtime::gpu::GPU_Emitter::EmitSinh(codegen::CodeWriter& writer,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw ngraph_error("Op not supported in GPU Backend");
}
void runtime::gpu::GPU_Emitter::EmitCos(codegen::CodeWriter& writer,
......@@ -312,6 +581,7 @@ void runtime::gpu::GPU_Emitter::EmitCos(codegen::CodeWriter& writer,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw ngraph_error("Op not supported in GPU Backend");
}
void runtime::gpu::GPU_Emitter::EmitCosh(codegen::CodeWriter& writer,
......@@ -319,6 +589,7 @@ void runtime::gpu::GPU_Emitter::EmitCosh(codegen::CodeWriter& writer,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw ngraph_error("Op not supported in GPU Backend");
}
void runtime::gpu::GPU_Emitter::EmitTan(codegen::CodeWriter& writer,
......@@ -326,6 +597,7 @@ void runtime::gpu::GPU_Emitter::EmitTan(codegen::CodeWriter& writer,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw ngraph_error("Op not supported in GPU Backend");
}
void runtime::gpu::GPU_Emitter::EmitTanh(codegen::CodeWriter& writer,
......@@ -333,6 +605,7 @@ void runtime::gpu::GPU_Emitter::EmitTanh(codegen::CodeWriter& writer,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw ngraph_error("Op not supported in GPU Backend");
}
void runtime::gpu::GPU_Emitter::EmitAsin(codegen::CodeWriter& writer,
......@@ -340,6 +613,7 @@ void runtime::gpu::GPU_Emitter::EmitAsin(codegen::CodeWriter& writer,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw ngraph_error("Op not supported in GPU Backend");
}
void runtime::gpu::GPU_Emitter::EmitAcos(codegen::CodeWriter& writer,
......@@ -347,6 +621,7 @@ void runtime::gpu::GPU_Emitter::EmitAcos(codegen::CodeWriter& writer,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw ngraph_error("Op not supported in GPU Backend");
}
void runtime::gpu::GPU_Emitter::EmitAtan(codegen::CodeWriter& writer,
......@@ -354,6 +629,7 @@ void runtime::gpu::GPU_Emitter::EmitAtan(codegen::CodeWriter& writer,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw ngraph_error("Op not supported in GPU Backend");
}
void runtime::gpu::GPU_Emitter::EmitPower(codegen::CodeWriter& writer,
......@@ -361,6 +637,7 @@ void runtime::gpu::GPU_Emitter::EmitPower(codegen::CodeWriter& writer,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw ngraph_error("Op not supported in GPU Backend");
}
void runtime::gpu::GPU_Emitter::EmitReplaceSlice(
......@@ -369,6 +646,7 @@ void runtime::gpu::GPU_Emitter::EmitReplaceSlice(
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw ngraph_error("Op not supported in GPU Backend");
}
void runtime::gpu::GPU_Emitter::EmitOneHot(codegen::CodeWriter& writer,
......@@ -376,6 +654,7 @@ void runtime::gpu::GPU_Emitter::EmitOneHot(codegen::CodeWriter& writer,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw ngraph_error("Op not supported in GPU Backend");
}
void runtime::gpu::GPU_Emitter::EmitCeiling(codegen::CodeWriter& writer,
......@@ -383,6 +662,7 @@ void runtime::gpu::GPU_Emitter::EmitCeiling(codegen::CodeWriter& writer,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw ngraph_error("Op not supported in GPU Backend");
}
void runtime::gpu::GPU_Emitter::EmitFloor(codegen::CodeWriter& writer,
......@@ -390,6 +670,7 @@ void runtime::gpu::GPU_Emitter::EmitFloor(codegen::CodeWriter& writer,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw ngraph_error("Op not supported in GPU Backend");
}
void runtime::gpu::GPU_Emitter::EmitSqrt(codegen::CodeWriter& writer,
......@@ -397,6 +678,7 @@ void runtime::gpu::GPU_Emitter::EmitSqrt(codegen::CodeWriter& writer,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw ngraph_error("Op not supported in GPU Backend");
}
void runtime::gpu::GPU_Emitter::EmitConvolution(
......@@ -405,6 +687,7 @@ void runtime::gpu::GPU_Emitter::EmitConvolution(
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw ngraph_error("Op not supported in GPU Backend");
}
void runtime::gpu::GPU_Emitter::EmitNot(codegen::CodeWriter& writer,
......@@ -412,6 +695,7 @@ void runtime::gpu::GPU_Emitter::EmitNot(codegen::CodeWriter& writer,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw ngraph_error("Op not supported in GPU Backend");
}
void runtime::gpu::GPU_Emitter::EmitMaxPool(codegen::CodeWriter& writer,
......@@ -419,6 +703,7 @@ void runtime::gpu::GPU_Emitter::EmitMaxPool(codegen::CodeWriter& writer,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw ngraph_error("Op not supported in GPU Backend");
}
void runtime::gpu::GPU_Emitter::EmitReverse(codegen::CodeWriter& writer,
......@@ -426,6 +711,7 @@ void runtime::gpu::GPU_Emitter::EmitReverse(codegen::CodeWriter& writer,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw ngraph_error("Op not supported in GPU Backend");
}
void runtime::gpu::GPU_Emitter::EmitReduceWindow(
......@@ -434,6 +720,7 @@ void runtime::gpu::GPU_Emitter::EmitReduceWindow(
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw ngraph_error("Op not supported in GPU Backend");
}
void runtime::gpu::GPU_Emitter::EmitSelectAndScatter(
......@@ -442,4 +729,5 @@ void runtime::gpu::GPU_Emitter::EmitSelectAndScatter(
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw ngraph_error("Op not supported in GPU Backend");
}
......@@ -199,7 +199,6 @@ runtime::gpu::GPU_ExternalFunction::GPU_ExternalFunction(
: ngraph::runtime::ExternalFunction(function, release_function)
, m_compiled_function(nullptr)
, m_emit_timing(std::getenv("NGRAPH_GPU_EMIT_TIMING") != nullptr)
, m_use_tbb(std::getenv("NGRAPH_GPU_USE_TBB") != nullptr)
{
}
......@@ -226,8 +225,6 @@ void runtime::gpu::GPU_ExternalFunction::compile()
writer +=
R"(// Generated by the NGraph GPU backend
#define IDX2F(i,j,ld) ((((j)-1)*(ld))+((i)-1))
#include <cassert>
#include <cmath>
#include <cstdlib>
......@@ -242,6 +239,7 @@ void runtime::gpu::GPU_ExternalFunction::compile()
#include <unordered_map>
#include <cuda_runtime.h>
#include <cudnn_v7.h>
#include "cublas_v2.h"
#include "cuda.h"
......@@ -317,6 +315,74 @@ void runtime::gpu::GPU_ExternalFunction::compile()
using namespace std;
)";
if (m_emit_timing)
{
writer << "// Declare debug timers\n";
vector<string> names;
for (shared_ptr<Function> current_function : pass_manager.get_state().get_functions())
{
for (shared_ptr<Node> node : current_function->get_ordered_ops())
{
if (!node->is_parameter() && !node->is_constant())
{
names.push_back(node->get_name());
}
}
}
for (const string& s : names)
{
writer << "ngraph::stopwatch timer_" << s << ";\n";
}
writer << "extern \"C\" size_t get_debug_timer_count() { return " << names.size()
<< "; }\n";
writer << "extern \"C\" const char* get_debug_timer_name(size_t index)\n";
writer << "{\n";
writer.indent++;
writer << "const char* rc;\n";
writer << "switch(index)\n";
writer << "{\n";
for (size_t i = 0; i < names.size(); i++)
{
writer << "case " << i << ": rc = \"" << names[i] << "\"; break;\n";
}
writer << "default: rc = \"\";\n";
writer << "}\n";
writer << "return rc;\n";
writer.indent--;
writer << "}\n";
writer << "extern \"C\" const size_t get_debug_timer_microseconds(size_t index)\n";
writer << "{\n";
writer.indent++;
writer << "size_t rc;\n";
writer << "switch(index)\n";
writer << "{\n";
for (size_t i = 0; i < names.size(); i++)
{
writer << "case " << i << ": rc = timer_" << names[i]
<< ".get_total_microseconds(); break;\n";
}
writer << "default: rc = 0;\n";
writer << "}\n";
writer << "return rc;\n";
writer.indent--;
writer << "}\n";
writer << "extern \"C\" const size_t get_debug_timer_call_count(size_t index)\n";
writer << "{\n";
writer.indent++;
writer << "size_t rc;\n";
writer << "switch(index)\n";
writer << "{\n";
for (size_t i = 0; i < names.size(); i++)
{
writer << "case " << i << ": rc = timer_" << names[i] << ".get_call_count(); break;\n";
}
writer << "default: rc = 0;\n";
writer << "}\n";
writer << "return rc;\n";
writer.indent--;
writer << "}\n";
writer << "\n";
}
// // The "dso_handle" symbol is required by __cxa_atexit()
// // which is enabled because the JIT uses it as the default mechanism
// // to register cleanup handlers. We use it, and not atexit(), because
......@@ -349,17 +415,13 @@ void runtime::gpu::GPU_ExternalFunction::compile()
for (shared_ptr<Function> f : pass_manager.get_state().get_functions())
{
writer << "extern \"C\" void " << f->get_name()
<< "(void** inputs, void** outputs, cublasHandle_t& cublas_handle);\n";
<< "(void*** inputs, void*** outputs, cublasHandle_t& cublas_handle);\n";
}
writer << "\n";
unordered_map<Node*, string> match_functions;
for (shared_ptr<Function> current_function : pass_manager.get_state().get_functions())
{
bool temporaries_used = false;
size_t worst_case_tmp_size = 0;
set<string> output_names;
for (shared_ptr<Node> op : current_function->get_results())
{
......@@ -377,80 +439,12 @@ void runtime::gpu::GPU_ExternalFunction::compile()
}
writer << "extern \"C\" void " << current_function->get_name();
writer << "(void** inputs, void** outputs, cublasHandle_t& cublas_handle)\n";
writer << "(void*** inputs, void*** outputs, cublasHandle_t& cublas_handle)\n";
writer << "{\n";
writer.indent++;
if (m_emit_timing)
{
writer << "// Declare debug timers\n";
vector<string> names;
for (shared_ptr<Function> current_function : pass_manager.get_state().get_functions())
{
for (shared_ptr<Node> node : current_function->get_ordered_ops())
{
if (!node->is_parameter() && !node->is_constant())
{
names.push_back(node->get_name());
}
}
}
for (const string& s : names)
{
writer << "ngraph::stopwatch timer_" << s << ";\n";
}
writer << "extern \"C\" size_t get_debug_timer_count() { return " << names.size()
<< "; }\n";
writer << "extern \"C\" const char* get_debug_timer_name(size_t index)\n";
writer << "{\n";
writer.indent++;
writer << "const char* rc;\n";
writer << "switch(index)\n";
writer << "{\n";
for (size_t i = 0; i < names.size(); i++)
{
writer << "case " << i << ": rc = \"" << names[i] << "\"; break;\n";
}
writer << "default: rc = \"\";\n";
writer << "}\n";
writer << "return rc;\n";
writer.indent--;
writer << "}\n";
writer << "extern \"C\" const size_t get_debug_timer_microseconds(size_t index)\n";
writer << "{\n";
writer.indent++;
writer << "size_t rc;\n";
writer << "switch(index)\n";
writer << "{\n";
for (size_t i = 0; i < names.size(); i++)
{
writer << "case " << i << ": rc = timer_" << names[i]
<< ".get_total_microseconds(); break;\n";
}
writer << "default: rc = 0;\n";
writer << "}\n";
writer << "return rc;\n";
writer.indent--;
writer << "}\n";
writer << "extern \"C\" const size_t get_debug_timer_call_count(size_t index)\n";
writer << "{\n";
writer.indent++;
writer << "size_t rc;\n";
writer << "switch(index)\n";
writer << "{\n";
for (size_t i = 0; i < names.size(); i++)
{
writer << "case " << i << ": rc = timer_" << names[i]
<< ".get_call_count(); break;\n";
}
writer << "default: rc = 0;\n";
writer << "}\n";
writer << "return rc;\n";
writer.indent--;
writer << "}\n";
writer << "\n";
}
bool temporaries_used = false;
size_t worst_case_tmp_size = 0;
for (shared_ptr<Node> node : current_function->get_ordered_ops())
{
if (node->liveness_new_list.size() > 0)
......@@ -462,10 +456,25 @@ void runtime::gpu::GPU_ExternalFunction::compile()
}
}
}
if (temporaries_used)
{
// TODO use temporary variables
size_t temp_pool_size = current_function->get_temporary_pool_size();
writer << "// Allocate the memory pool\n";
// TODO memory pool malloc.
writer << "void** pool_base_ptr = runtime::gpu::create_gpu_buffer(" << temp_pool_size
<< ");\n";
// Add temporaries to the variable name map
for (shared_ptr<Node> node : current_function->get_ordered_ops())
{
for (descriptor::Tensor* tensor : node->liveness_new_list)
{
stringstream ss;
ss << "((" << tensor->get_element_type().c_type_string()
<< "*)(pool_base_ptr + " << tensor->get_pool_offset() << "))";
m_variable_name_map[tensor->get_name()] = ss.str();
}
}
}
// Add inputs to the variable name map
......@@ -517,10 +526,10 @@ void runtime::gpu::GPU_ExternalFunction::compile()
if (tv == ptv)
{
parameter_as_output = true;
writer << "memcpy(static_cast<" << et.c_type_string() << "*>(outputs["
<< output_index << "]), "
writer << "runtime::gpu::cuda_memcpyDtD(reinterpret_cast<"
<< et.c_type_string() << "*>(outputs[" << output_index << "]), "
<< m_variable_name_map[ptv->get_tensor().get_name()] << ", "
<< ptv->get_tensor().size() << ");\n";
<< ptv->get_tensor().size() << ",1);\n";
break;
}
}
......@@ -529,7 +538,7 @@ void runtime::gpu::GPU_ExternalFunction::compile()
{
if (contains(constants, tv.get()))
{
writer << "memcpy(outputs[" << output_index << "], "
writer << "runtime::gpu::cuda_memcpyHtD(outputs[" << output_index << "], "
<< tv->get_tensor().get_name() << ", " << tv->get_tensor().size()
<< ");\n";
}
......@@ -618,7 +627,6 @@ void runtime::gpu::GPU_ExternalFunction::compile()
// End generated function
writer += "}\n\n";
}
// TODO: Cleanup and make this a utility function
file_util::make_directory(s_output_dir);
......@@ -669,9 +677,9 @@ void runtime::gpu::GPU_ExternalFunction::handle_output_alias(
writer.indent++;
for (size_t i = 1; i < outputs.size(); i++)
{
writer << "memcpy(static_cast<void*>(outputs[" << outputs[i]
<< "]), static_cast<void*>(outputs[" << outputs[0] << "]), "
<< otv->get_tensor().size() << ");\n";
writer << "runtime::gpu::cuda_memcpyDtD(static_cast<void*>(outputs["
<< outputs[i] << "]), static_cast<void*>(outputs[" << outputs[0]
<< "]), " << otv->get_tensor().size() << ",1);\n";
}
writer.indent--;
writer << "}\n";
......
......@@ -80,7 +80,6 @@ namespace ngraph
std::unique_ptr<codegen::Compiler> m_compiler;
std::unique_ptr<codegen::ExecutionEngine> m_execution_engine;
bool m_emit_timing;
bool m_use_tbb;
std::unordered_map<std::string, std::string> m_variable_name_map;
};
}
......
......@@ -17,8 +17,10 @@
#include <cassert>
#include <cstdlib>
#include <iostream>
#include <sstream>
#include <stddef.h>
#include <stdio.h>
#include <string>
#include "cuda.h"
#include "cuda_runtime.h"
......@@ -44,3 +46,21 @@ void runtime::gpu::check_cuda_errors(CUresult err)
{
assert(err == CUDA_SUCCESS);
}
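// Allocate buffer_size bytes of device memory and hand back the raw device
// pointer; note the cudaMalloc return status is not checked here.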
void** runtime::gpu::create_gpu_buffer(size_t buffer_size)
{
void** allocated_buffer_pool;
cudaMalloc(&allocated_buffer_pool, buffer_size);
return allocated_buffer_pool;
}
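// Device-to-device copy; the size is passed as an element count plus the size
// of each element, matching how the generated code invokes it.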
void runtime::gpu::cuda_memcpyDtD(void* d, void* s, size_t element_count, size_t element_size)
{
size_t size_in_bytes = element_size * element_count;
cudaMemcpy(d, s, size_in_bytes, cudaMemcpyDeviceToDevice);
}
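// Host-to-device copy of buffer_size bytes.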
void runtime::gpu::cuda_memcpyHtD(void* d, void* s, size_t buffer_size)
{
cudaMemcpy(d, s, buffer_size, cudaMemcpyHostToDevice);
}
......@@ -22,6 +22,9 @@ namespace ngraph
{
void print_gpu_f32_tensor(void* p, size_t element_count, size_t element_size);
void check_cuda_errors(CUresult err);
void** create_gpu_buffer(size_t buffer_size);
void cuda_memcpyDtD(void* d, void* s, size_t element_count, size_t element_size);
void cuda_memcpyHtD(void* d, void* s, size_t buffer_size);
}
}
}
......@@ -83,11 +83,12 @@ if(NGRAPH_GPU_ENABLE AND LLVM_INCLUDE_DIR)
link_directories(${LLVM_LIB_DIR})
link_directories(${CUDA_LIBRARIES})
link_directories(${CUDA_CUBLAS_LIBRARIES})
link_directories(${CUDNN_LIBRARIES})
set(SRC
${SRC}
cudnn.cpp)
# Disabled for testing
# set(BACKEND_NAMES ${BACKEND_NAMES} "GPU")
set(BACKEND_NAMES ${BACKEND_NAMES} "GPU")
endif()
if(NGRAPH_ARGON_ENABLE)
......
......@@ -263,63 +263,3 @@ const auto str = R"(
auto module = compiler.compile(source);
}
// TEST(cudnn, abc)
// {
// auto shape = Shape{2, 2};
// auto A = make_shared<op::Parameter>(element::f32, shape);
// auto B = make_shared<op::Parameter>(element::f32, shape);
// auto C = make_shared<op::Parameter>(element::f32, shape);
// auto f = make_shared<Function>((A + B) * C, op::Parameters{A, B, C});
// auto manager = runtime::Manager::get("GPU");
// auto external = manager->compile(f);
// auto backend = manager->allocate_backend();
// auto cf = backend->make_call_frame(external);
// // Create some tensors for input/output
// shared_ptr<runtime::TensorView> a = backend->make_primary_tensor_view(element::f32, shape);
// shared_ptr<runtime::TensorView> b = backend->make_primary_tensor_view(element::f32, shape);
// shared_ptr<runtime::TensorView> c = backend->make_primary_tensor_view(element::f32, shape);
// shared_ptr<runtime::TensorView> result = backend->make_primary_tensor_view(element::f32, shape);
// copy_data(a, test::NDArray<float, 2>({{1, 2}, {3, 4}}).get_vector());
// copy_data(b, test::NDArray<float, 2>({{5, 6}, {7, 8}}).get_vector());
// copy_data(c, test::NDArray<float, 2>({{9, 10}, {11, 12}}).get_vector());
// cf->call({a, b, c}, {result});
// EXPECT_EQ(result->read_vector<float>(),
// (test::NDArray<float, 2>({{54, 80}, {110, 144}})).get_vector());
// cf->call({b, a, c}, {result});
// EXPECT_EQ(result->read_vector<float>(),
// (test::NDArray<float, 2>({{54, 80}, {110, 144}})).get_vector());
// cf->call({a, c, b}, {result});
// EXPECT_EQ(result->read_vector<float>(),
// (test::NDArray<float, 2>({{50, 72}, {98, 128}})).get_vector());
// }
TEST(cudnn, dot1d)
{
auto shape = Shape{4};
auto A = make_shared<op::Parameter>(element::f32, shape);
auto B = make_shared<op::Parameter>(element::f32, shape);
auto shape_r = Shape{1};
auto f = make_shared<Function>(make_shared<op::Dot>(A, B), op::Parameters{A, B});
auto manager = runtime::Manager::get("GPU");
auto external = manager->compile(f);
auto backend = manager->allocate_backend();
auto cf = backend->make_call_frame(external);
// Create some tensors for input/output
auto a = backend->make_primary_tensor_view(element::f32, shape);
copy_data(a, vector<float>{2, 4, 8, 16});
auto b = backend->make_primary_tensor_view(element::f32, shape);
copy_data(b, vector<float>{1, 2, 4, 8});
auto result = backend->make_primary_tensor_view(element::f32, shape_r);
cf->call({a, b}, {result});
EXPECT_EQ((vector<float>{170}), read_vector<float>(result));
}