Commit 144d790b authored by fenglei.tian

add cuda kernel

parent e63322d9
// ----------------------------------------------------------------------------
// Copyright 2017 Nervana Systems Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// ----------------------------------------------------------------------------
#include <algorithm>
#include <map>
#include "ngraph/runtime/gpu/gpu_kernel_emitters.hpp"
namespace ngraph
{
namespace runtime
{
namespace gpu
{
namespace cuda
{
namespace kernel
{
void emit_abs(float* in, float* out, size_t count)
{
}
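// Editorial note: emit_abs is left as an empty stub in this commit. A minimal
// elementwise abs kernel of the kind EmitAbs compiles with NVRTC further down
// might look like the following sketch (an assumption, not the committed code):
//
//   extern "C" __global__ void cuda_op_abs(float* in, float* out, size_t n)
//   {
//       size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
//       if (tid < n)
//       {
//           out[tid] = fabsf(in[tid]);
//       }
//   }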
void emit_broadcast(codegen::CodeWriter& writer,
const std::string& element_type,
const std::string& arg0, // input tensor
const std::string& out,
const Shape& arg0_shape,
const Shape& out_shape,
const AxisSet& broadcast_axes)
{
}
//
// For the reference kernel this is based on, see ngraph/runtime/kernel/concat.hpp.
//
void emit_concat(codegen::CodeWriter& writer,
const std::string& element_type,
const std::vector<std::string>& args,
const std::string& out,
const std::vector<Shape>& in_shapes,
const Shape& out_shape,
size_t concatenation_axis)
{
}
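// Editorial note: emit_concat is also a stub. The reference kernel in
// ngraph/runtime/kernel/concat.hpp cited above effectively does the following
// for row-major tensors (a host-side sketch; the variable names are
// illustrative, not from this commit):
//
//   size_t outer = 1;
//   for (size_t d = 0; d < concatenation_axis; d++)
//       outer *= out_shape[d];
//   size_t out_block = 1;
//   for (size_t d = concatenation_axis; d < out_shape.size(); d++)
//       out_block *= out_shape[d];
//   size_t offset = 0;
//   for (size_t i = 0; i < in_shapes.size(); i++)
//   {
//       size_t in_block = 1;
//       for (size_t d = concatenation_axis; d < in_shapes[i].size(); d++)
//           in_block *= in_shapes[i][d];
//       for (size_t o = 0; o < outer; o++)
//           std::copy(in[i] + o * in_block,
//                     in[i] + (o + 1) * in_block,
//                     out + o * out_block + offset);
//       offset += in_block;
//   }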
void emit_replace_slice(
codegen::CodeWriter& writer,
const std::string& element_type,
const std::string& arg0, // replacement context
const std::string& arg1, // replacement value
const std::string& out,
const Shape& arg1_shape,
const Shape& out_shape,
const Coordinate& lower_bounds,
const Coordinate& upper_bounds,
const Strides& strides)
{
}
void emit_slice(codegen::CodeWriter& writer,
const std::string& element_type,
const std::string& arg0, // input tensor
const std::string& out,
const Shape& arg0_shape,
const Shape& out_shape,
const Coordinate& lower_bounds,
const Coordinate& upper_bounds,
const Strides& strides)
{
}
void emit_reshape(codegen::CodeWriter& writer,
const std::string& element_type,
const std::string& arg0, // input tensor
const std::string& out,
const Shape& arg0_shape,
const Shape& out_shape,
const AxisVector& arg0_axis_order)
{
}
void emit_sum(codegen::CodeWriter& writer,
const std::string& element_type,
const std::string& arg0, // input tensor
const std::string& out,
const Shape& arg0_shape,
const Shape& out_shape,
const AxisSet& reduction_axes)
{
}
}
}
}
}
}
// ----------------------------------------------------------------------------
// Copyright 2017 Nervana Systems Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// ----------------------------------------------------------------------------
#pragma once
#include "ngraph/codegen/code_writer.hpp"
#include "ngraph/common.hpp"
namespace ngraph
{
namespace runtime
{
namespace gpu
{
namespace kernel
{
void emit_broadcast(codegen::CodeWriter& writer,
const std::string& element_type,
const std::string& arg0, // input tensor
const std::string& out,
const Shape& arg0_shape,
const Shape& out_shape,
const AxisSet& broadcast_axes);
void emit_concat(codegen::CodeWriter& writer,
const std::string& element_type,
const std::vector<std::string>& args,
const std::string& out,
const std::vector<Shape>& in_shapes,
const Shape& out_shape,
const size_t concatenation_axis);
void emit_replace_slice(codegen::CodeWriter& writer,
const std::string& element_type,
const std::string& arg0, // replacement context
const std::string& arg1, // replacement value
const std::string& out,
const Shape& arg1_shape,
const Shape& out_shape,
const Coordinate& lower_bounds,
const Coordinate& upper_bounds,
const Strides& strides);
void emit_slice(codegen::CodeWriter& writer,
const std::string& element_type,
const std::string& arg0, // input tensor
const std::string& out,
const Shape& arg0_shape,
const Shape& out_shape,
const Coordinate& lower_bounds,
const Coordinate& upper_bounds,
const Strides& strides);
void emit_reshape(codegen::CodeWriter& writer,
const std::string& element_type,
const std::string& arg0, // input tensor
const std::string& out,
const Shape& arg0_shape,
const Shape& out_shape,
const AxisVector& arg0_axis_order);
void emit_sum(codegen::CodeWriter& writer,
const std::string& element_type,
const std::string& arg0, // input tensor
const std::string& out,
const Shape& arg0_shape,
const Shape& out_shape,
const AxisSet& reduction_axes);
}
}
}
}
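// Editorial note: none of the helpers declared above is wired up yet in this
// commit (their definitions in gpu_kernel_emitters.cpp are empty stubs). When
// they are, an emitter would presumably call them along these lines (a sketch;
// the accessor names are assumptions modeled on the CPU emitter, not verified
// against this code base):
//
//   kernel::emit_broadcast(writer,
//                          out[0].get_type(),   // element type as a string
//                          args[0].get_name(),  // input buffer
//                          out[0].get_name(),   // output buffer
//                          args[0].get_shape(),
//                          out[0].get_shape(),
//                          broadcast->get_broadcast_axes());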
@@ -52,40 +52,40 @@ using namespace ngraph;
#define NVRTC_SAFE_CALL(x) \
do { \
nvrtcResult result = x; \
if (result != NVRTC_SUCCESS) { \
std::cerr << "\nerror: " #x " failed with error " \
<< nvrtcGetErrorString(result) << '\n'; \
exit(1); \
} \
} while(0)
#define CUDA_SAFE_CALL(x) \
do { \
CUresult result = x; \
if (result != CUDA_SUCCESS) { \
const char *msg; \
cuGetErrorName(result, &msg); \
std::cerr << "\nerror: " #x " failed with error " \
<< msg << '\n'; \
exit(1); \
} \
} while(0)
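// ---------------------------------------------------------------------------
// Editorial sketch (not part of this commit): the NVRTC compile -> PTX ->
// module-load sequence that EmitAbs performs inline below could be factored
// into one helper roughly like this. It assumes <nvrtc.h>, <cuda.h>, <vector>
// and <iostream> are included above (the include block is outside this hunk)
// and that a CUDA context has already been made current. Like the inline code
// below, it deliberately leaks the loaded module.
// ---------------------------------------------------------------------------
static CUfunction compile_and_load_kernel_sketch(const char* source,
                                                 const char* program_name,
                                                 const char* kernel_name)
{
    // Compile the CUDA C source to PTX with NVRTC.
    nvrtcProgram prog;
    NVRTC_SAFE_CALL(nvrtcCreateProgram(&prog, source, program_name, 0, NULL, NULL));
    const char* opts[] = {"--gpu-architecture=compute_35",
                          "--relocatable-device-code=true"};
    nvrtcResult compile_result = nvrtcCompileProgram(prog, 2, opts);

    // Fetch the compile log and bail out on failure.
    size_t log_size;
    NVRTC_SAFE_CALL(nvrtcGetProgramLogSize(prog, &log_size));
    std::vector<char> log(log_size);
    NVRTC_SAFE_CALL(nvrtcGetProgramLog(prog, log.data()));
    if (compile_result != NVRTC_SUCCESS)
    {
        std::cerr << log.data() << '\n';
        exit(1);
    }

    // Retrieve the PTX and release the NVRTC program.
    size_t ptx_size;
    NVRTC_SAFE_CALL(nvrtcGetPTXSize(prog, &ptx_size));
    std::vector<char> ptx(ptx_size);
    NVRTC_SAFE_CALL(nvrtcGetPTX(prog, ptx.data()));
    NVRTC_SAFE_CALL(nvrtcDestroyProgram(&prog));

    // Load the PTX with the driver API and look up the kernel entry point.
    CUmodule module;
    CUfunction kernel;
    CUDA_SAFE_CALL(cuModuleLoadDataEx(&module, ptx.data(), 0, 0, 0));
    CUDA_SAFE_CALL(cuModuleGetFunction(&kernel, module, kernel_name));
    return kernel;
}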
void runtime::gpu::GPU_Emitter::EmitNop(codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
}
void runtime::gpu::GPU_Emitter::EmitAbs(codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
const char *op_abs = R"(
extern "C" __global__
void cuda_op_abs(float* in, float* out, size_t n)
{
@@ -96,59 +96,59 @@ void runtime::gpu::GPU_Emitter::EmitAbs(codegen::CodeWriter& writer,
}
})";
size_t numBlocks = 4; size_t numThreads = 4;
// Create an instance of nvrtcProgram with the code string.
nvrtcProgram prog;
NVRTC_SAFE_CALL(nvrtcCreateProgram(&prog, // prog
op_abs, // buffer
"op_abs.cu", // name
0, // numHeaders
NULL, // headers
NULL)); // includeNames
const char *opts[] = {"--gpu-architecture=compute_35",
"--relocatable-device-code=true"};
nvrtcResult compileResult = nvrtcCompileProgram(prog, // prog
2, // numOptions
opts); // options
// Obtain compilation log from the program.
size_t logSize;
NVRTC_SAFE_CALL(nvrtcGetProgramLogSize(prog, &logSize));
char *log = new char[logSize];
NVRTC_SAFE_CALL(nvrtcGetProgramLog(prog, log));
std::cout << log << '\n';
delete[] log;
if (compileResult != NVRTC_SUCCESS) {
exit(1);
}
size_t ptxSize;
NVRTC_SAFE_CALL(nvrtcGetPTXSize(prog, &ptxSize));
char *ptx = new char[ptxSize];
NVRTC_SAFE_CALL(nvrtcGetPTX(prog, ptx)); // Destroy the program.
NVRTC_SAFE_CALL(nvrtcDestroyProgram(&prog)); // Load the generated PTX and get a handle to the parent kernel.
CUdevice cuDevice;
CUcontext context;
CUmodule module;
CUfunction cuda_op_abs_kernel;
CUDA_SAFE_CALL( cuInit(0));
CUDA_SAFE_CALL(cuDeviceGet(&cuDevice, 0));
CUDA_SAFE_CALL(cuCtxCreate(&context, 0, cuDevice));
// CUDA_SAFE_CALL(cuLinkCreate(0, 0 , 0, &linkState));
//CUDA_SAFE_CALL(cuLinkAddFile(linkState, CU_JIT_INPUT_LIBRARY, ' ', 0, 0, 0));
//CUDA_SAFE_CALL(cuLinkAddData(linkState, CU_JIT_INPUT_PTX, (void *)ptx, ptxSize, "dynamic_parallelism.ptx", 0, 0, 0));
//size_t cubinSize;
//void *cubin;
//CUDA_SAFE_CALL(cuLinkComplete(linkState, &cubin, &cubinSize));
CUDA_SAFE_CALL(cuModuleLoadDataEx(&module, ptx, 0, 0, 0));
CUDA_SAFE_CALL(cuModuleGetFunction(&cuda_op_abs_kernel, module, "cuda_op_abs"));
writer << "{ // " << n->get_name() << "\n";
@@ -157,38 +157,39 @@ CUDA_SAFE_CALL(cuModuleGetFunction(&cuda_op_abs_kernel, module, "cuda_op_abs"));
writer << "if(count == 0) return;\n";
writer << "void *argsList[] = {(void *)" << args[0].get_name() << ", (void *)" << out[0].get_name() << ", &count};\n";
writer << "//cuLaunchKernel(cuda_op_abs_kernel, count, 1, 1, 1, 1, 1, 0, NULL, argsList, 0);\n";
writer.indent--;
writer << "}\n";
// Generate input for execution, and create output buffers.
//size_t nt = numBlocks * numThreads;
//size_t bufferSize = nt * sizeof(float);
//float *hOut = new float[nt];
//float *hIn = new float[nt];
//for(int i = 0; i< nt; i++) hIn[i] = -i;
//
//CUdeviceptr dOut, dIn;
//cuMemAlloc(&dOut, bufferSize); // Execute parent kernel.
//cuMemAlloc(&dIn, bufferSize); // Execute parent kernel.
//cuMemcpyHtoD(dIn, hIn, bufferSize);
//
//void *argst[] = {&dIn, &dOut, &nt};
// CUDA_SAFE_CALL(
// cuLaunchKernel(kernel,
// numBlocks , 1, 1, // grid dim
// numThreads, 1, 1, // block dim
// 0, NULL, // shared mem and stream
// argst, 0)); // arguments
//CUDA_SAFE_CALL(cuCtxSynchronize()); // Retrieve and print output.
//cuMemcpyDtoH(hOut, dOut, bufferSize);
//for (size_t i = 0; i < nt; ++i) { std::cout << hOut[i] << '\n'; } // Release resources.
//cuMemFree(dOut);
//cuModuleUnload(module);
}
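// Editorial sketch (not part of this commit): the launch that EmitAbs above
// only writes out as a comment would, once enabled, look roughly like this in
// the generated code. Grid sizing and the synchronize are assumptions, and
// input_ptr/output_ptr stand for the kernel's device pointers; note that
// cuLaunchKernel's kernelParams expects the address of each argument.
//
//   unsigned int block = 256;
//   unsigned int grid = static_cast<unsigned int>((count + block - 1) / block);
//   void* args_list[] = {&input_ptr, &output_ptr, &count};
//   CUDA_SAFE_CALL(cuLaunchKernel(cuda_op_abs_kernel,
//                                 grid, 1, 1,   // grid dimensions
//                                 block, 1, 1,  // block dimensions
//                                 0, NULL,      // shared memory bytes, stream
//                                 args_list, 0));
//   CUDA_SAFE_CALL(cuCtxSynchronize());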
void runtime::gpu::GPU_Emitter::EmitAdd(codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
writer << "{ // " << n->get_name() << "\n";
writer.indent++;
@@ -214,200 +215,200 @@ cudnnSetOpTensorDescriptor(opTensorDesc,
CUDNN_NOT_PROPAGATE_NAN);
)";
writer << "cudnnOpTensor(cudnn_handle,"
<< "opTensorDesc,"
<< "&alpha1,"
<< "descriptor,"
<< args[0].get_name() << ","
<< "&alpha2,"
<< "descriptor,"
<< args[1].get_name() << ","
<< "&beta,"
<< "descriptor,"
<< out[0].get_name() << ");\n";
writer.indent--;
writer << "}\n";
}
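// Editorial note: EmitAdd above and EmitMaximum, EmitMinimum and EmitMultiply
// below all emit the same cuDNN pattern: a tensor descriptor for the operands,
// a cudnnSetOpTensorDescriptor call selecting the operation (the specific
// CUDNN_OP_TENSOR_* constant sits in the elided hunks), and a single
// cudnnOpTensor call, which computes C = op(alpha1 * A, alpha2 * B) + beta * C.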
void runtime::gpu::GPU_Emitter::EmitConcat(codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
}
void runtime::gpu::GPU_Emitter::EmitDot(codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
writer << " // " << n->get_name() << "\n return;\n";
/*
const Shape& arg0_shape = args[0].get_shape();
const Shape& arg1_shape = args[1].get_shape();
if (arg0_shape.empty() || arg1_shape.empty())
{
auto& first = (arg0_shape.empty() ? args[0] : args[1]);
auto& second = (arg0_shape.empty() ? args[1] : args[0]);
writer << "{ // " << n->get_name() << "\n";
writer.indent++;
// clang-format off
writer << "cublasSdot("
<< "cublas_handle,"
<< second.get_size() << ","
<< first.get_name() << ","
<< "1,"
<< second.get_name() << ","
<< "1,"
<< out[0].get_name() << ");\n";
// clang-format on
writer.indent--;
writer << "}\n";
}
else if ((arg0_shape.size() == 1) && (arg1_shape.size() == 1))
{
writer << "{ // " << n->get_name() << "\n";
writer.indent++;
// clang-format off
writer << "cublasSdot("
<< "cublas_handle,"
<< arg0_shape[0] << ","
<< args[0].get_name() << ","
<< "1,"
<< args[1].get_name() << ","
<< "1,"
<< out[0].get_name() << ");\n";
// clang-format on
writer.indent--;
writer << "}\n";
}
else if ((arg0_shape.size() == 2) && (arg1_shape.size() == 1))
{
writer << "{ // " << n->get_name() << "\n";
writer.indent++;
writer << "static const float alpha = 1.0;\n";
writer << "static const float beta = 1.0;\n";
writer << "cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_HOST);\n";
;
// clang-format off
writer << "cublasSgemv("
<< "cublas_handle,"
<< "CUBLAS_OP_T,"
<< arg0_shape[0] << ","
<< arg0_shape[1] << ","
<< "&alpha," // Alpha
<< args[0].get_name() << ","
<< arg0_shape[1] << ","
<< args[1].get_name() << ","
<< "1,"
<< "&beta," // beta
<< out[0].get_name() << ","
<< "1);\n";
// clang-format on
;
writer.indent--;
writer << "}\n";
}
else if ((arg0_shape.size() == 2) && (arg1_shape.size() == 2))
{
// GEMM Call
assert(arg0_shape[0] == out[0].get_shape()[0]); // m
assert(arg1_shape[1] == out[0].get_shape()[1]); // n
assert(arg0_shape[1] == arg1_shape[0]); // k
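// Editorial note: cuBLAS is column-major, so the cublasSgemm call below treats
// the row-major matrices as their column-major transposes and computes
// C^T = B^T * A^T; that is why the operand order and the m/n arguments are
// swapped while both ops stay CUBLAS_OP_N.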
writer << "{ // " << n->get_name() << "\n";
writer.indent++;
writer << "static const float alpha = 1.0;\n";
writer << "static const float beta = 0.0;\n";
writer << "int m = " << arg0_shape[0] << ";\n";
writer << "int n = " << arg1_shape[1] << ";\n";
writer << "int k = " << arg0_shape[1] << ";\n";
writer << "cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_HOST);\n";
// clang-format off
writer << "cublasSgemm("
<< "cublas_handle,"
<< "CUBLAS_OP_N,"
<< "CUBLAS_OP_N,"
<< "n,"
<< "m,"
<< "k,"
<< "&alpha," // Alpha
<< args[1].get_name() << ","
<< "n,"
<< args[0].get_name() << ","
<< "k,"
<< "&beta," // beta
<< out[0].get_name() << ","
<< "n);\n";
// clang-format on
writer.indent--;
writer << "}\n";
}
else
{
// General ND Call?
}
*/
}
void runtime::gpu::GPU_Emitter::EmitDivide(codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
}
void runtime::gpu::GPU_Emitter::EmitEqual(codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
}
void runtime::gpu::GPU_Emitter::EmitGreater(codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
}
void runtime::gpu::GPU_Emitter::EmitGreaterEq(
codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
writer << " // " << n->get_name() << "\n return;\n";
}
void runtime::gpu::GPU_Emitter::EmitLess(codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
writer << " // " << n->get_name() << "\n return;\n";
}
void runtime::gpu::GPU_Emitter::EmitLessEq(codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
writer << " // " << n->get_name() << "\n return;\n";
}
void runtime::gpu::GPU_Emitter::EmitLog(codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
writer << " // " << n->get_name() << "\n return;\n";
}
void runtime::gpu::GPU_Emitter::EmitMaximum(codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
writer << "{ // " << n->get_name() << "\n";
writer.indent++;
@@ -433,25 +434,25 @@ cudnnSetOpTensorDescriptor(opTensorDesc,
CUDNN_NOT_PROPAGATE_NAN);
)";
writer << "cudnnOpTensor(cudnn_handle,"
<< "opTensorDesc,"
<< "&alpha1,"
<< "descriptor,"
<< args[0].get_name() << ","
<< "&alpha2,"
<< "descriptor,"
<< args[1].get_name() << ","
<< "&beta,"
<< "descriptor,"
<< out[0].get_name() << ");\n";
writer.indent--;
writer << "}\n";
}
void runtime::gpu::GPU_Emitter::EmitMinimum(codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
writer << "{ // " << n->get_name() << "\n";
writer.indent++;
@@ -477,27 +478,27 @@ cudnnSetOpTensorDescriptor(opTensorDesc,
CUDNN_NOT_PROPAGATE_NAN);
)";
writer << "cudnnOpTensor(cudnn_handle,"
<< "opTensorDesc,"
<< "&alpha1,"
<< "descriptor,"
<< args[0].get_name() << ","
<< "&alpha2,"
<< "descriptor,"
<< args[1].get_name() << ","
<< "&beta,"
<< "descriptor,"
<< out[0].get_name() << ");\n";
writer.indent--;
writer << "}\n";
}
void runtime::gpu::GPU_Emitter::EmitNegative(
codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
writer << "{ // " << n->get_name() << "\n";
writer.indent++;
@@ -523,76 +524,76 @@ cudnnSetOpTensorDescriptor(opTensorDesc,
CUDNN_NOT_PROPAGATE_NAN);
)";
writer << "cudnnOpTensor(cudnn_handle,"
<< "opTensorDesc,"
<< "&alpha1,"
<< "descriptor,"
<< args[0].get_name() << ","
<< "&alpha2,"
<< "descriptor,"
<< args[0].get_name() << ","
<< "&beta,"
<< "descriptor,"
<< out[0].get_name() << ");\n";
writer.indent--;
writer << "}\n";
}
void runtime::gpu::GPU_Emitter::EmitNotEqual(
codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
writer << " // " << n->get_name() << "\n return;\n";
}
void runtime::gpu::GPU_Emitter::EmitSelect(codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
writer << " // " << n->get_name() << "\n return;\n";
}
void runtime::gpu::GPU_Emitter::EmitSubtract(
codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
writer << " // " << n->get_name() << "\n return;\n";
}
void runtime::gpu::GPU_Emitter::EmitBroadcast(
codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
writer << " // " << n->get_name() << "\n return;\n";
}
void runtime::gpu::GPU_Emitter::EmitConvert(codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
writer << " // " << n->get_name() << "\n return;\n";
}
void runtime::gpu::GPU_Emitter::EmitConstant(
codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
writer << " // " << n->get_name() << "\n return;\n";
}
void runtime::gpu::GPU_Emitter::EmitReshape(codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
auto reshape = static_cast<const op::Reshape*>(n);
writer << "{ // " << n->get_name() << "\n";
@@ -620,7 +621,7 @@ void runtime::gpu::GPU_Emitter::EmitReshape(codegen::CodeWriter& writer,
writer << "{ // " << n->get_name() << " 1\n";
writer.indent++;
writer << "runtime::gpu::cuda_memcpyDtD(" << out[0].get_name() << ", " << args[0].get_name()
<< ", " << out[0].get_size() << "," << out[0].get_element_type().size() << ");\n";
writer.indent--;
writer << "}\n";
}
@@ -628,85 +629,85 @@ void runtime::gpu::GPU_Emitter::EmitReshape(codegen::CodeWriter& writer,
else if (arg_rank == 2)
{
// clang-format off
// TODO Assert arg0_shape[0] == arg1_shape[0]?
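// Editorial note: at rank 2 the only non-trivial axis permutation is a
// transpose, so with alpha = 1 and beta = 0 the cublasSgeam call below reduces
// to out = transpose(arg0).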
writer << "{ // " << n->get_name() << "\n";
writer.indent++;
writer << "static const float alpha = 1.0;\n";
writer << "static const float beta = 0.0;\n";
writer << "cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_HOST);\n";
writer << "cublasSgeam("
<< "cublas_handle,"
<< "CUBLAS_OP_T,"
<< "CUBLAS_OP_T,"
<< arg_shape[0] << ","
<< arg_shape[1] << ","
<< "&alpha," // Alpha
<< args[0].get_name() << ","
<< arg_shape[1] << ","
<< "&beta," // beta
<< args[0].get_name() << ","
<< arg_shape[1] << ","
<< out[0].get_name() << ","
<< out[0].get_shape()[1] << ");\n";
writer.indent--;
writer << "}\n";
//clang-format on
}
// Other cases (reordering of axes for tensors with rank>2) are not handled yet.
else
{
throw ngraph_error(
"Axis permutation in reshape is not implemented yet for tensors with rank>2");
}
writer.indent--;
writer << "}\n";
}
void runtime::gpu::GPU_Emitter::EmitFunctionCall(
codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
}
void runtime::gpu::GPU_Emitter::EmitReduce(codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
writer << " // " << n->get_name() << "\n return;\n";
}
void runtime::gpu::GPU_Emitter::EmitSign(codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
writer << " // " << n->get_name() << "\n return;\n";
}
void runtime::gpu::GPU_Emitter::EmitSlice(codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
writer << " // " << n->get_name() << "\n return;\n";
}
void runtime::gpu::GPU_Emitter::EmitSum(codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
writer << " // " << n->get_name() << "\n return;\n";
}
void runtime::gpu::GPU_Emitter::EmitMultiply(
codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
writer << "{ // " << n->get_name() << "\n";
writer.indent++;
@@ -732,147 +733,147 @@ cudnnSetOpTensorDescriptor(opTensorDesc,
CUDNN_NOT_PROPAGATE_NAN);
)";
writer << "cudnnOpTensor(cudnn_handle,"
<< "opTensorDesc,"
<< "&alpha1,"
<< "descriptor,"
<< args[0].get_name() << ","
<< "&alpha2,"
<< "descriptor,"
<< args[1].get_name() << ","
<< "&beta,"
<< "descriptor,"
<< out[0].get_name() << ");\n";
writer.indent--;
writer << "}\n";
}
void runtime::gpu::GPU_Emitter::EmitExp(codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
writer << " // " << n->get_name() << "\n return;\n";
}
void runtime::gpu::GPU_Emitter::EmitSin(codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
writer << " // " << n->get_name() << "\n return;\n";
}
void runtime::gpu::GPU_Emitter::EmitSinh(codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
writer << " // " << n->get_name() << "\n return;\n";
}
void runtime::gpu::GPU_Emitter::EmitCos(codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
writer << " // " << n->get_name() << "\n return;\n";
}
void runtime::gpu::GPU_Emitter::EmitCosh(codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
writer << " // " << n->get_name() << "\n return;\n";
}
void runtime::gpu::GPU_Emitter::EmitTan(codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
writer << " // " << n->get_name() << "\n return;\n";
}
void runtime::gpu::GPU_Emitter::EmitTanh(codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
writer << " // " << n->get_name() << "\n return;\n";
}
void runtime::gpu::GPU_Emitter::EmitAsin(codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
writer << " // " << n->get_name() << "\n return;\n";
}
void runtime::gpu::GPU_Emitter::EmitAcos(codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
writer << " // " << n->get_name() << "\n return;\n";
}
void runtime::gpu::GPU_Emitter::EmitAtan(codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
writer << " // " << n->get_name() << "\n return;\n";
}
void runtime::gpu::GPU_Emitter::EmitPower(codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
writer << " // " << n->get_name() << "\n return;\n";
}
void runtime::gpu::GPU_Emitter::EmitReplaceSlice(
codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
writer << " // " << n->get_name() << "\n return;\n";
}
void runtime::gpu::GPU_Emitter::EmitOneHot(codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
writer << " // " << n->get_name() << "\n return;\n";
}
void runtime::gpu::GPU_Emitter::EmitCeiling(codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
writer << " // " << n->get_name() << "\n return;\n";
}
void runtime::gpu::GPU_Emitter::EmitFloor(codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
writer << " // " << n->get_name() << "\n return;\n";
}
void runtime::gpu::GPU_Emitter::EmitSqrt(codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
writer << "{ // " << n->get_name() << "\n";
writer.indent++;
@@ -898,68 +899,68 @@ cudnnSetOpTensorDescriptor(opTensorDesc,
CUDNN_NOT_PROPAGATE_NAN);
)";
writer << "cudnnOpTensor(cudnn_handle,"
<< "opTensorDesc,"
<< "&alpha1,"
<< "descriptor,"
<< args[0].get_name() << ","
<< "&alpha2,"
<< "descriptor,"
<< args[0].get_name() << ","
<< "&beta,"
<< "descriptor,"
<< out[0].get_name() << ");\n";
writer.indent--;
writer << "}\n";
}
void runtime::gpu::GPU_Emitter::EmitConvolution(
codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
writer << " // " << n->get_name() << "\n return;\n";
}
void runtime::gpu::GPU_Emitter::EmitNot(codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
writer << " // " << n->get_name() << "\n return;\n";
}
void runtime::gpu::GPU_Emitter::EmitMaxPool(codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
writer << " // " << n->get_name() << "\n return;\n";
}
void runtime::gpu::GPU_Emitter::EmitReverse(codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
writer << " // " << n->get_name() << "\n return;\n";
}
void runtime::gpu::GPU_Emitter::EmitReduceWindow(
codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
writer << " // " << n->get_name() << "\n return;\n";
}
void runtime::gpu::GPU_Emitter::EmitSelectAndScatter(
codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
writer << " // " << n->get_name() << "\n return;\n";
}