Updated gpu cpp files with consistent use of namespaces (cosmetic) (#629)

* Updated namespace use in cpp files.

Updated gpu cpp files with consistent use of namespaces (cosmetic) (#629)
* Updated namespace use in cpp files.
b5467550 · Chris Sullivan · GitHub · a32fdab5 · b5467550 · b5467550
Unverified Commit b5467550 authored Mar 13, 2018 by Chris Sullivan Committed by GitHub Mar 13, 2018
6 changed files
--- a/src/ngraph/runtime/gpu/gpu_cuda_context_manager.cpp
+++ b/src/ngraph/runtime/gpu/gpu_cuda_context_manager.cpp
@@ -19,25 +19,18 @@
 #include "ngraph/runtime/gpu/gpu_cuda_context_manager.hpp"
-namespace ngraph
+using namespace ngraph;
+runtime::gpu::CudaContextManager& runtime::gpu::CudaContextManager::instance()
 {
-    namespace runtime
-    {
-        namespace gpu
-        {
-            CudaContextManager& CudaContextManager::instance()
-            {
    static CudaContextManager manager;
    return manager;
-            }
+}
-            CudaContextManager::CudaContextManager()
+runtime::gpu::CudaContextManager::CudaContextManager()
-            {
+{
    CUDA_SAFE_CALL(cuInit(0));
    CUDA_SAFE_CALL(cuDeviceGet(&m_device, 0));
    CUDA_SAFE_CALL(cuCtxCreate(&m_context, 0, m_device));
    m_context_ptr = std::make_shared<CUcontext>(m_context);
-            }
-        }
-    }
 }
--- a/src/ngraph/runtime/gpu/gpu_cuda_function_builder.cpp
+++ b/src/ngraph/runtime/gpu/gpu_cuda_function_builder.cpp
@@ -20,17 +20,13 @@
 #include "ngraph/runtime/gpu/gpu_cuda_function_builder.hpp"
 #include "ngraph/runtime/gpu/gpu_util.hpp"
-namespace ngraph
+using namespace ngraph;
-{
-    namespace runtime
+std::shared_ptr<CUfunction> runtime::gpu::CudaFunctionBuilder::get(const std::string& name,
-    {
-        namespace gpu
-        {
-            std::shared_ptr<CUfunction> CudaFunctionBuilder::get(const std::string& name,
                                                                   const std::string& kernel,
                                                                   int number_of_options,
                                                                   const char** options)
-            {
+{
    nvrtcProgram prog;
    NVRTC_SAFE_CALL(nvrtcCreateProgram(&prog,
                                       kernel.c_str(),
@@ -49,8 +45,8 @@ namespace ngraph
    size_t ptx_size;
    NVRTC_SAFE_CALL(nvrtcGetPTXSize(prog, &ptx_size));
    char* ptx = new char[ptx_size];
-                NVRTC_SAFE_CALL(nvrtcGetPTX(
+    NVRTC_SAFE_CALL(
-                    prog,
+        nvrtcGetPTX(prog,
                    ptx)); // Load the generated PTX and get a handle to the parent kernel.
    NVRTC_SAFE_CALL(nvrtcDestroyProgram(&prog)); // Destroy the program.
@@ -59,7 +55,4 @@ namespace ngraph
    CUDA_SAFE_CALL(cuModuleLoadDataEx(&module, ptx, 0, 0, 0));
    CUDA_SAFE_CALL(cuModuleGetFunction(&function, module, name.c_str()));
    return std::make_shared<CUfunction>(function);
-            }
-        }
-    }
 }
--- a/src/ngraph/runtime/gpu/gpu_cuda_function_pool.cpp
+++ b/src/ngraph/runtime/gpu/gpu_cuda_function_pool.cpp
@@ -26,40 +26,31 @@
 static const std::string s_output_dir = "gpu_codegen";
-namespace ngraph
+using namespace ngraph;
+runtime::gpu::CudaFunctionPool& runtime::gpu::CudaFunctionPool::instance()
 {
-    namespace runtime
-    {
-        namespace gpu
-        {
-            CudaFunctionPool& CudaFunctionPool::instance()
-            {
    static CudaFunctionPool pool;
    return pool;
-            }
+}
-            void CudaFunctionPool::set(const std::string& name, const std::string& kernel)
+void runtime::gpu::CudaFunctionPool::set(const std::string& name, const std::string& kernel)
-            {
+{
-                const char* opts[] = {"--gpu-architecture=compute_35",
+    const char* opts[] = {"--gpu-architecture=compute_35", "--relocatable-device-code=true"};
-                                      "--relocatable-device-code=true"};
    std::string filename =
        file_util::path_join(s_output_dir, "cuda_kernel_" + name + "_codegen.cu");
    std::ofstream out(filename);
    out << kernel;
    out.close();
-                m_function_map.insert(
+    m_function_map.insert({name, CudaFunctionBuilder::get("cuda_" + name, kernel, 2, opts)});
-                    {name, CudaFunctionBuilder::get("cuda_" + name, kernel, 2, opts)});
+}
-            }
-            std::shared_ptr<CUfunction> CudaFunctionPool::get(const std::string& name)
+std::shared_ptr<CUfunction> runtime::gpu::CudaFunctionPool::get(const std::string& name)
-            {
+{
    auto it = m_function_map.find(name);
    if (it != m_function_map.end())
    {
        return (*it).second;
    }
    return nullptr;
-            }
-        }
-    }
 }
--- a/src/ngraph/runtime/gpu/gpu_cuda_kernel_builder.cpp
+++ b/src/ngraph/runtime/gpu/gpu_cuda_kernel_builder.cpp
@@ -16,18 +16,14 @@
 #include "ngraph/runtime/gpu/gpu_cuda_kernel_builder.hpp"
 #include "ngraph/codegen/code_writer.hpp"
-namespace ngraph
+using namespace ngraph;
-{
-    namespace runtime
+void runtime::gpu::CudaKernelBuilder::get_elementwise_op(codegen::CodeWriter& writer,
-    {
-        namespace gpu
-        {
-            void CudaKernelBuilder::get_elementwise_op(codegen::CodeWriter& writer,
                                                         const std::string& name,
                                                         const std::string& data_type,
                                                         const std::string& op,
                                                         const size_t& num_inputs)
-            {
+{
    writer << "extern \"C\" __global__ void cuda_" << name << "(";
    for (size_t i = 0; i < num_inputs; i++)
    {
@@ -57,14 +53,14 @@ namespace ngraph
    writer << "}\n";
    return;
-            }
+}
-            void CudaKernelBuilder::get_device_helper(codegen::CodeWriter& writer,
+void runtime::gpu::CudaKernelBuilder::get_device_helper(codegen::CodeWriter& writer,
                                                        const std::string& name,
                                                        const std::string& data_type,
                                                        const std::string& math_kernel,
                                                        const size_t& num_inputs)
-            {
+{
    if (math_kernel.size())
    {
        writer << "__device__ " << data_type << " " << name << "(";
@@ -83,7 +79,4 @@ namespace ngraph
        writer << "}\n";
    }
    return;
-            }
-        }
-    }
 }
--- a/src/ngraph/runtime/gpu/gpu_cuda_kernel_emitters.cpp
+++ b/src/ngraph/runtime/gpu/gpu_cuda_kernel_emitters.cpp
@@ -20,15 +20,10 @@
 #include "ngraph/runtime/gpu/gpu_cuda_kernel_emitters.hpp"
 #include "ngraph/runtime/gpu/gpu_cuda_kernel_ops.hpp"
-namespace ngraph
+using namespace ngraph;
-{
+void runtime::gpu::emit_broadcast(
-    namespace runtime
-    {
-        namespace gpu
-        {
-            void emit_broadcast(
    void* in, void* out, size_t repeat_size, size_t repeat_times, size_t count)
-            {
+{
    std::string name = "broadcast";
    // Create an instance of nvrtcProgram with the code string.
    if (CudaFunctionPool::instance().get(name) == nullptr)
@@ -38,8 +33,9 @@ namespace ngraph
        kernel = R"(
 extern "C" __global__
-void cuda_)" + name + "(" + data_type +
+void cuda_)" + name +
-                             "* in, " + data_type + "* out, size_t m, size_t k, size_t n)\n" + R"(
+                 "(" + data_type + "* in, " + data_type + "* out, size_t m, size_t k, size_t n)\n" +
+                 R"(
 {
    size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
    if(tid < n)
@@ -69,7 +65,4 @@ void cuda_)" + name + "(" + data_type +
                                  args_list,
                                  0));  // arguments
    CUDA_SAFE_CALL(cuCtxSynchronize()); // Retrieve and print output.
-            }
-        }
-    }
 }
--- a/src/ngraph/runtime/gpu/gpu_external_function.cpp
+++ b/src/ngraph/runtime/gpu/gpu_external_function.cpp
@@ -114,6 +114,7 @@
 #include "ngraph/runtime/gpu/gpu_kernel_emitters.hpp"
 using namespace std;
+using namespace ngraph;
 static const string s_output_dir = "gpu_codegen";
@@ -159,110 +160,104 @@ static StaticInitializers s_static_initializers;
 #define TI(x) type_index(typeid(x))
-namespace ngraph
+static const runtime::gpu::OpMap dispatcher{
-{
+    {TI(ngraph::op::Add), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Add>},
-    namespace runtime
+    {TI(ngraph::op::Dot), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Dot>},
-    {
+    {TI(ngraph::op::Multiply), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Multiply>},
-        namespace gpu
+    {TI(ngraph::op::Parameter), &runtime::gpu::GPU_Emitter::nop},
-        {
+    {TI(ngraph::op::Abs), &runtime::gpu::GPU_Emitter::EmitElementwise},
-            static const OpMap dispatcher{
+    {TI(ngraph::op::Concat), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Concat>},
-                {TI(ngraph::op::Add), &GPU_Emitter::emit<ngraph::op::Add>},
+    {TI(ngraph::op::Divide), &runtime::gpu::GPU_Emitter::EmitElementwise},
-                {TI(ngraph::op::Dot), &GPU_Emitter::emit<ngraph::op::Dot>},
+    {TI(ngraph::op::Equal), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Equal>},
-                {TI(ngraph::op::Multiply), &GPU_Emitter::emit<ngraph::op::Multiply>},
-                {TI(ngraph::op::Parameter), &GPU_Emitter::nop},
-                {TI(ngraph::op::Abs), &GPU_Emitter::EmitElementwise},
-                {TI(ngraph::op::Concat), &GPU_Emitter::emit<ngraph::op::Concat>},
-                {TI(ngraph::op::Divide), &GPU_Emitter::EmitElementwise},
-                {TI(ngraph::op::Equal), &GPU_Emitter::emit<ngraph::op::Equal>},
    {TI(ngraph::op::GetOutputElement),
-                 &GPU_Emitter::emit<ngraph::op::GetOutputElement>},
+     &runtime::gpu::GPU_Emitter::emit<ngraph::op::GetOutputElement>},
-                {TI(ngraph::op::Greater), &GPU_Emitter::emit<ngraph::op::Greater>},
+    {TI(ngraph::op::Greater), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Greater>},
-                {TI(ngraph::op::GreaterEq), &GPU_Emitter::emit<ngraph::op::GreaterEq>},
+    {TI(ngraph::op::GreaterEq), &runtime::gpu::GPU_Emitter::emit<ngraph::op::GreaterEq>},
-                {TI(ngraph::op::Less), &GPU_Emitter::emit<ngraph::op::Less>},
+    {TI(ngraph::op::Less), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Less>},
-                {TI(ngraph::op::LessEq), &GPU_Emitter::emit<ngraph::op::LessEq>},
+    {TI(ngraph::op::LessEq), &runtime::gpu::GPU_Emitter::emit<ngraph::op::LessEq>},
-                {TI(ngraph::op::Log), &GPU_Emitter::EmitElementwise},
+    {TI(ngraph::op::Log), &runtime::gpu::GPU_Emitter::EmitElementwise},
-                {TI(ngraph::op::Maximum), &GPU_Emitter::emit<ngraph::op::Maximum>},
+    {TI(ngraph::op::Maximum), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Maximum>},
-                {TI(ngraph::op::Minimum), &GPU_Emitter::emit<ngraph::op::Minimum>},
+    {TI(ngraph::op::Minimum), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Minimum>},
-                {TI(ngraph::op::Negative), &GPU_Emitter::emit<ngraph::op::Negative>},
+    {TI(ngraph::op::Negative), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Negative>},
-                {TI(ngraph::op::NotEqual), &GPU_Emitter::emit<ngraph::op::NotEqual>},
+    {TI(ngraph::op::NotEqual), &runtime::gpu::GPU_Emitter::emit<ngraph::op::NotEqual>},
-                {TI(ngraph::op::Power), &GPU_Emitter::EmitElementwise},
+    {TI(ngraph::op::Power), &runtime::gpu::GPU_Emitter::EmitElementwise},
-                {TI(ngraph::op::Select), &GPU_Emitter::emit<ngraph::op::Select>},
+    {TI(ngraph::op::Select), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Select>},
-                {TI(ngraph::op::Subtract), &GPU_Emitter::EmitElementwise},
+    {TI(ngraph::op::Subtract), &runtime::gpu::GPU_Emitter::EmitElementwise},
-                {TI(ngraph::op::Broadcast), &GPU_Emitter::emit<ngraph::op::Broadcast>},
+    {TI(ngraph::op::Broadcast), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Broadcast>},
-                {TI(ngraph::op::Convert), &GPU_Emitter::emit<ngraph::op::Convert>},
+    {TI(ngraph::op::Convert), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Convert>},
-                {TI(ngraph::op::Constant), &GPU_Emitter::emit<ngraph::op::Constant>},
+    {TI(ngraph::op::Constant), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Constant>},
-                {TI(ngraph::op::Reshape), &GPU_Emitter::emit<ngraph::op::Reshape>},
+    {TI(ngraph::op::Reshape), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Reshape>},
-                {TI(ngraph::op::FunctionCall), &GPU_Emitter::emit<ngraph::op::FunctionCall>},
+    {TI(ngraph::op::FunctionCall), &runtime::gpu::GPU_Emitter::emit<ngraph::op::FunctionCall>},
-                {TI(ngraph::op::Reduce), &GPU_Emitter::emit<ngraph::op::Reduce>},
+    {TI(ngraph::op::Reduce), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Reduce>},
-                {TI(ngraph::op::Sign), &GPU_Emitter::EmitElementwise},
+    {TI(ngraph::op::Sign), &runtime::gpu::GPU_Emitter::EmitElementwise},
-                {TI(ngraph::op::Slice), &GPU_Emitter::emit<ngraph::op::Slice>},
+    {TI(ngraph::op::Slice), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Slice>},
-                {TI(ngraph::op::Sum), &GPU_Emitter::emit<ngraph::op::Sum>},
+    {TI(ngraph::op::Sum), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Sum>},
-                {TI(ngraph::op::Exp), &GPU_Emitter::EmitElementwise},
+    {TI(ngraph::op::Exp), &runtime::gpu::GPU_Emitter::EmitElementwise},
-                {TI(ngraph::op::Sin), &GPU_Emitter::EmitElementwise},
+    {TI(ngraph::op::Sin), &runtime::gpu::GPU_Emitter::EmitElementwise},
-                {TI(ngraph::op::Sinh), &GPU_Emitter::EmitElementwise},
+    {TI(ngraph::op::Sinh), &runtime::gpu::GPU_Emitter::EmitElementwise},
-                {TI(ngraph::op::Cos), &GPU_Emitter::EmitElementwise},
+    {TI(ngraph::op::Cos), &runtime::gpu::GPU_Emitter::EmitElementwise},
-                {TI(ngraph::op::Cosh), &GPU_Emitter::EmitElementwise},
+    {TI(ngraph::op::Cosh), &runtime::gpu::GPU_Emitter::EmitElementwise},
-                {TI(ngraph::op::Tan), &GPU_Emitter::EmitElementwise},
+    {TI(ngraph::op::Tan), &runtime::gpu::GPU_Emitter::EmitElementwise},
-                {TI(ngraph::op::Tanh), &GPU_Emitter::EmitElementwise},
+    {TI(ngraph::op::Tanh), &runtime::gpu::GPU_Emitter::EmitElementwise},
-                {TI(ngraph::op::Asin), &GPU_Emitter::EmitElementwise},
+    {TI(ngraph::op::Asin), &runtime::gpu::GPU_Emitter::EmitElementwise},
-                {TI(ngraph::op::Acos), &GPU_Emitter::EmitElementwise},
+    {TI(ngraph::op::Acos), &runtime::gpu::GPU_Emitter::EmitElementwise},
-                {TI(ngraph::op::Atan), &GPU_Emitter::EmitElementwise},
+    {TI(ngraph::op::Atan), &runtime::gpu::GPU_Emitter::EmitElementwise},
-                {TI(ngraph::op::ReplaceSlice), &GPU_Emitter::emit<ngraph::op::ReplaceSlice>},
+    {TI(ngraph::op::ReplaceSlice), &runtime::gpu::GPU_Emitter::emit<ngraph::op::ReplaceSlice>},
-                {TI(ngraph::op::OneHot), &GPU_Emitter::emit<ngraph::op::OneHot>},
+    {TI(ngraph::op::OneHot), &runtime::gpu::GPU_Emitter::emit<ngraph::op::OneHot>},
-                {TI(ngraph::op::Floor), &GPU_Emitter::EmitElementwise},
+    {TI(ngraph::op::Floor), &runtime::gpu::GPU_Emitter::EmitElementwise},
-                {TI(ngraph::op::Ceiling), &GPU_Emitter::EmitElementwise},
+    {TI(ngraph::op::Ceiling), &runtime::gpu::GPU_Emitter::EmitElementwise},
-                {TI(ngraph::op::Sqrt), &GPU_Emitter::emit<ngraph::op::Sqrt>},
+    {TI(ngraph::op::Sqrt), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Sqrt>},
-                {TI(ngraph::op::Convolution), &GPU_Emitter::emit<ngraph::op::Convolution>},
+    {TI(ngraph::op::Convolution), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Convolution>},
    {TI(ngraph::op::ConvolutionBackpropFilters),
-                 &GPU_Emitter::emit<ngraph::op::ConvolutionBackpropFilters>},
+     &runtime::gpu::GPU_Emitter::emit<ngraph::op::ConvolutionBackpropFilters>},
    {TI(ngraph::op::ConvolutionBackpropData),
-                 &GPU_Emitter::emit<ngraph::op::ConvolutionBackpropData>},
+     &runtime::gpu::GPU_Emitter::emit<ngraph::op::ConvolutionBackpropData>},
-                {TI(ngraph::op::Not), &GPU_Emitter::EmitElementwise},
+    {TI(ngraph::op::Not), &runtime::gpu::GPU_Emitter::EmitElementwise},
-                {TI(ngraph::op::MaxPool), &GPU_Emitter::emit<ngraph::op::MaxPool>},
+    {TI(ngraph::op::MaxPool), &runtime::gpu::GPU_Emitter::emit<ngraph::op::MaxPool>},
-                {TI(ngraph::op::Reverse), &GPU_Emitter::emit<ngraph::op::Reverse>},
+    {TI(ngraph::op::Reverse), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Reverse>},
-                {TI(ngraph::op::Result), &GPU_Emitter::emit<ngraph::op::Result>},
+    {TI(ngraph::op::Result), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Result>},
-                {TI(ngraph::op::ReduceWindow), &GPU_Emitter::emit<ngraph::op::ReduceWindow>},
+    {TI(ngraph::op::ReduceWindow), &runtime::gpu::GPU_Emitter::emit<ngraph::op::ReduceWindow>},
    {TI(ngraph::op::SelectAndScatter),
-                 &GPU_Emitter::emit<ngraph::op::SelectAndScatter>},
+     &runtime::gpu::GPU_Emitter::emit<ngraph::op::SelectAndScatter>},
-                {TI(ngraph::op::AvgPool), &GPU_Emitter::emit<ngraph::op::AvgPool>},
+    {TI(ngraph::op::AvgPool), &runtime::gpu::GPU_Emitter::emit<ngraph::op::AvgPool>},
-                {TI(ngraph::op::AvgPoolBackprop), &GPU_Emitter::emit<ngraph::op::AvgPoolBackprop>},
+    {TI(ngraph::op::AvgPoolBackprop),
-                {TI(ngraph::op::Pad), &GPU_Emitter::emit<ngraph::op::Pad>},
+     &runtime::gpu::GPU_Emitter::emit<ngraph::op::AvgPoolBackprop>},
-                {TI(ngraph::op::BatchNorm), &GPU_Emitter::emit<ngraph::op::BatchNorm>},
+    {TI(ngraph::op::Pad), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Pad>},
+    {TI(ngraph::op::BatchNorm), &runtime::gpu::GPU_Emitter::emit<ngraph::op::BatchNorm>},
    {TI(ngraph::op::BatchNormBackprop),
-                 &GPU_Emitter::emit<ngraph::op::BatchNormBackprop>},
+     &runtime::gpu::GPU_Emitter::emit<ngraph::op::BatchNormBackprop>},
-                {TI(ngraph::op::MaxPoolBackprop), &GPU_Emitter::emit<ngraph::op::MaxPoolBackprop>},
+    {TI(ngraph::op::MaxPoolBackprop),
-                {TI(ngraph::op::Product), &GPU_Emitter::emit<ngraph::op::Product>},
+     &runtime::gpu::GPU_Emitter::emit<ngraph::op::MaxPoolBackprop>},
-                {TI(ngraph::op::Max), &GPU_Emitter::emit<ngraph::op::Max>},
+    {TI(ngraph::op::Product), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Product>},
-                {TI(ngraph::op::Min), &GPU_Emitter::emit<ngraph::op::Min>},
+    {TI(ngraph::op::Max), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Max>},
-                {TI(ngraph::op::Relu), &GPU_Emitter::emit<ngraph::op::Relu>},
+    {TI(ngraph::op::Min), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Min>},
-                {TI(ngraph::op::ReluBackprop), &GPU_Emitter::emit<ngraph::op::ReluBackprop>},
+    {TI(ngraph::op::Relu), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Relu>},
-                {TI(ngraph::op::Softmax), &GPU_Emitter::emit<ngraph::op::Softmax>},
+    {TI(ngraph::op::ReluBackprop), &runtime::gpu::GPU_Emitter::emit<ngraph::op::ReluBackprop>},
-            };
+    {TI(ngraph::op::Softmax), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Softmax>},
+};
-            GPU_ExternalFunction::GPU_ExternalFunction(const shared_ptr<ngraph::Function>& function,
-                                                       bool release_function)
+runtime::gpu::GPU_ExternalFunction::GPU_ExternalFunction(
+    const shared_ptr<ngraph::Function>& function, bool release_function)
    : ngraph::runtime::ExternalFunction(function, release_function)
    , m_compiled_function(nullptr)
    , m_emit_timing(std::getenv("NGRAPH_GPU_EMIT_TIMING") != nullptr)
-            {
+{
-            }
+}
-            void GPU_ExternalFunction::compile()
+void runtime::gpu::GPU_ExternalFunction::compile()
-            {
+{
    if (m_is_compiled)
    {
        return;
    }
    string function_name = m_function->get_name();
-                string dump_filename =
+    string dump_filename = file_util::path_join(s_output_dir, function_name + "_ops.txt");
-                    file_util::path_join(s_output_dir, function_name + "_ops.txt");
    pass::Manager pass_manager;
    // pass_manager.register_pass<pass::TopologicalSort>();
    // For now, just make everyone row-major.
-                pass_manager
+    pass_manager.register_pass<pass::AssignLayout<descriptor::layout::DenseTensorViewLayout>>();
-                    .register_pass<pass::AssignLayout<descriptor::layout::DenseTensorViewLayout>>();
    pass_manager.register_pass<pass::Liveness>();
    pass_manager.register_pass<pass::MemoryLayout>(64);
    pass_manager.register_pass<pass::DumpSorted>(dump_filename);
@@ -308,8 +303,7 @@ using namespace std;
    {
        writer << "// Declare debug timers\n";
        vector<string> names;
-                    for (shared_ptr<Function> current_function :
+        for (shared_ptr<Function> current_function : pass_manager.get_state().get_functions())
-                         pass_manager.get_state().get_functions())
        {
            for (shared_ptr<Node> node : current_function->get_ordered_ops())
            {
@@ -323,8 +317,8 @@ using namespace std;
        {
            writer << "ngraph::stopwatch timer_" << s << ";\n";
        }
-                    writer << "extern \"C\" size_t get_debug_timer_count() { return "
+        writer << "extern \"C\" size_t get_debug_timer_count() { return " << names.size()
-                           << names.size() << "; }\n";
+               << "; }\n";
        writer << "extern \"C\" const char* get_debug_timer_name(size_t index)\n";
        writer << "{\n";
        writer.indent++;
@@ -340,8 +334,7 @@ using namespace std;
        writer << "return rc;\n";
        writer.indent--;
        writer << "}\n";
-                    writer
+        writer << "extern \"C\" const size_t get_debug_timer_microseconds(size_t index)\n";
-                        << "extern \"C\" const size_t get_debug_timer_microseconds(size_t index)\n";
        writer << "{\n";
        writer.indent++;
        writer << "size_t rc;\n";
@@ -357,8 +350,7 @@ using namespace std;
        writer << "return rc;\n";
        writer.indent--;
        writer << "}\n";
-                    writer
+        writer << "extern \"C\" const size_t get_debug_timer_call_count(size_t index)\n";
-                        << "extern \"C\" const size_t get_debug_timer_call_count(size_t index)\n";
        writer << "{\n";
        writer.indent++;
        writer << "size_t rc;\n";
@@ -366,8 +358,7 @@ using namespace std;
        writer << "{\n";
        for (size_t i = 0; i < names.size(); i++)
        {
-                        writer << "case " << i << ": rc = timer_" << names[i]
+            writer << "case " << i << ": rc = timer_" << names[i] << ".get_call_count(); break;\n";
-                               << ".get_call_count(); break;\n";
        }
        writer << "default: rc = 0;\n";
        writer << "}\n";
@@ -383,31 +374,26 @@ using namespace std;
    writer << "void *__dso_handle = 0;\n\n";
    writer << "// Declare all constants\n";
-                for (shared_ptr<Function> current_function :
+    for (shared_ptr<Function> current_function : pass_manager.get_state().get_functions())
-                     pass_manager.get_state().get_functions())
    {
        for (shared_ptr<Node> node : current_function->get_ordered_ops())
        {
            const op::Constant* c = dynamic_cast<ngraph::op::Constant*>(node.get());
            if (c)
            {
-                            shared_ptr<descriptor::TensorView> tv =
+                shared_ptr<descriptor::TensorView> tv = node->get_outputs()[0].get_tensor_view();
-                                node->get_outputs()[0].get_tensor_view();
                auto c_value_strings = c->get_value_strings();
-                            writer << "static "
+                writer << "static " << tv->get_tensor().get_element_type().c_type_string() << " "
-                                   << tv->get_tensor().get_element_type().c_type_string() << " "
+                       << tv->get_tensor().get_name() << "_cpu[" << c_value_strings.size()
-                                   << tv->get_tensor().get_name() << "_cpu["
+                       << "] =\n";
-                                   << c_value_strings.size() << "] =\n";
                writer << "{\n";
                writer.indent++;
                writer << emit_string_array(c_value_strings, 100 - writer.indent * 4);
                writer.indent--;
                writer << "\n};\n\n";
-                            writer << "static "
+                writer << "static " << tv->get_tensor().get_element_type().c_type_string() << " *"
-                                   << tv->get_tensor().get_element_type().c_type_string() << " *"
                       << tv->get_tensor().get_name() << ";\n";
-                            m_variable_name_map[tv->get_tensor().get_name()] =
+                m_variable_name_map[tv->get_tensor().get_name()] = tv->get_tensor().get_name();
-                                tv->get_tensor().get_name();
            }
        }
    }
@@ -415,8 +401,7 @@ using namespace std;
    writer << "// Declare all functions\n";
    for (shared_ptr<Function> f : pass_manager.get_state().get_functions())
    {
-                    writer << "extern \"C\" void " << f->get_name()
+        writer << "extern \"C\" void " << f->get_name() << "(void** inputs, void** outputs, "
-                           << "(void** inputs, void** outputs, "
                                                           "cublasHandle_t& cublas_handle, "
                                                           "cudnnHandle_t& cudnn_handle);\n";
    }
@@ -424,8 +409,7 @@ using namespace std;
    writer << "\n";
    unordered_map<Node*, string> match_functions;
-                for (shared_ptr<Function> current_function :
+    for (shared_ptr<Function> current_function : pass_manager.get_state().get_functions())
-                     pass_manager.get_state().get_functions())
    {
        set<string> output_names;
        for (shared_ptr<Node> op : current_function->get_results())
@@ -503,8 +487,7 @@ using namespace std;
        }
    }
-                for (shared_ptr<Function> current_function :
+    for (shared_ptr<Function> current_function : pass_manager.get_state().get_functions())
-                     pass_manager.get_state().get_functions())
    {
        set<string> output_names;
        for (shared_ptr<Node> op : current_function->get_results())
@@ -517,8 +500,7 @@ using namespace std;
        {
            if (dynamic_cast<ngraph::op::Constant*>(node.get()))
            {
-                            shared_ptr<descriptor::TensorView> tv =
+                shared_ptr<descriptor::TensorView> tv = node->get_outputs()[0].get_tensor_view();
-                                node->get_outputs()[0].get_tensor_view();
                constants.insert(tv.get());
            }
        }
@@ -535,14 +517,13 @@ using namespace std;
            const op::Constant* c = dynamic_cast<op::Constant*>(node.get());
            if (c)
            {
-                            shared_ptr<descriptor::TensorView> tv =
+                shared_ptr<descriptor::TensorView> tv = node->get_outputs()[0].get_tensor_view();
-                                node->get_outputs()[0].get_tensor_view();
                writer << "if(" << tv->get_tensor().get_name() << " == NULL)\n";
                writer << "{\n";
                writer.indent++;
-                            writer << "runtime::gpu::cuda_memcpyHtD(" << tv->get_tensor().get_name()
+                writer << "runtime::gpu::cuda_memcpyHtD(" << tv->get_tensor().get_name() << ", "
-                                   << ", " << tv->get_tensor().get_name() << "_cpu, "
+                       << tv->get_tensor().get_name() << "_cpu, " << tv->get_tensor().size()
-                                   << tv->get_tensor().size() << ");\n";
+                       << ");\n";
                writer.indent--;
                writer << "}\n";
            }
@@ -576,8 +557,7 @@ using namespace std;
                {
                    stringstream ss;
                    ss << "((" << tensor->get_element_type().c_type_string()
-                                   << "*)((char *)pool_base_ptr + " << tensor->get_pool_offset()
+                       << "*)((char *)pool_base_ptr + " << tensor->get_pool_offset() << "))";
-                                   << "))";
                    m_variable_name_map[tensor->get_name()] = ss.str();
                }
            }
@@ -585,15 +565,12 @@ using namespace std;
        // Add inputs to the variable name map
        size_t arg_index = 0;
-                    for (shared_ptr<ngraph::op::Parameter> param :
+        for (shared_ptr<ngraph::op::Parameter> param : current_function->get_parameters())
-                         current_function->get_parameters())
        {
            for (size_t i = 0; i < param->get_output_size(); ++i)
            {
-                            shared_ptr<descriptor::TensorView> tv =
+                shared_ptr<descriptor::TensorView> tv = param->get_output_tensor_view(i);
-                                param->get_output_tensor_view(i);
+                const element::Type& et = tv->get_tensor_view_type()->get_element_type();
-                            const element::Type& et =
-                                tv->get_tensor_view_type()->get_element_type();
                string type = et.c_type_string();
                stringstream ss;
                ss << "((" << type << "*)(inputs[" << arg_index << "]))";
@@ -627,8 +604,7 @@ using namespace std;
            shared_ptr<descriptor::TensorView> tv = op->get_output_tensor_view();
            const element::Type& et = tv->get_tensor_view_type()->get_element_type();
            bool parameter_as_output = false;
-                        for (shared_ptr<ngraph::op::Parameter> param :
+            for (shared_ptr<ngraph::op::Parameter> param : current_function->get_parameters())
-                             current_function->get_parameters())
            {
                for (const descriptor::Output& pout : param->get_outputs())
                {
@@ -636,10 +612,8 @@ using namespace std;
                    if (tv == ptv)
                    {
                        parameter_as_output = true;
-                                    writer
+                        writer << "ngraph::runtime::gpu::cuda_memcpyDtD(reinterpret_cast<"
-                                        << "ngraph::runtime::gpu::cuda_memcpyDtD(reinterpret_cast<"
+                               << et.c_type_string() << "*>(outputs[" << output_index << "]), "
-                                        << et.c_type_string() << "*>(outputs[" << output_index
-                                        << "]), "
                               << m_variable_name_map[ptv->get_tensor().get_name()] << ", "
                               << ptv->get_tensor().size() << ");\n";
                        break;
@@ -650,9 +624,9 @@ using namespace std;
            {
                if (contains(constants, tv.get()))
                {
-                                writer << "ngraph::runtime::gpu::cuda_memcpyHtD(outputs["
+                    writer << "ngraph::runtime::gpu::cuda_memcpyHtD(outputs[" << output_index
-                                       << output_index << "], " << tv->get_tensor().get_name()
+                           << "], " << tv->get_tensor().get_name() << ", "
-                                       << ", " << tv->get_tensor().size() << ");\n";
+                           << tv->get_tensor().size() << ");\n";
                }
                else
                {
@@ -667,29 +641,27 @@ using namespace std;
        for (shared_ptr<Node> node : current_function->get_ordered_ops())
        {
-                        auto& n =
+            auto& n = *node; // Work around a compiler warning (*node inside typeid may have effects
-                            *node; // Work around a compiler warning (*node inside typeid may have effects
            // with shared pointers, which is fine here but clang doesn't like it.)
            auto handler = dispatcher.find(type_index(typeid(n)));
            if (handler == dispatcher.end())
            {
-                            throw ngraph_error("Unhandled op during code generation : " +
+                throw ngraph_error("Unhandled op during code generation : " + node->description());
-                                               node->description());
            }
            vector<GPU_TensorViewWrapper> in;
            for (const descriptor::Input& input : node->get_inputs())
            {
                const descriptor::Output& output = input.get_output();
                shared_ptr<descriptor::TensorView> tv = output.get_tensor_view();
-                            in.push_back(GPU_TensorViewWrapper(
+                in.push_back(
-                                tv, m_variable_name_map[tv->get_tensor().get_name()]));
+                    GPU_TensorViewWrapper(tv, m_variable_name_map[tv->get_tensor().get_name()]));
            }
            vector<GPU_TensorViewWrapper> out;
            for (const descriptor::Output& output : node->get_outputs())
            {
                shared_ptr<descriptor::TensorView> tv = output.get_tensor_view();
-                            out.push_back(GPU_TensorViewWrapper(
+                out.push_back(
-                                tv, m_variable_name_map[tv->get_tensor().get_name()]));
+                    GPU_TensorViewWrapper(tv, m_variable_name_map[tv->get_tensor().get_name()]));
            }
            // Emit operation prologue
@@ -743,8 +715,7 @@ using namespace std;
    // TODO: Cleanup and make this a utility function
    file_util::make_directory(s_output_dir);
-                string filename =
+    string filename = file_util::path_join(s_output_dir, function_name + "_codegen.cpp");
-                    file_util::path_join(s_output_dir, function_name + "_codegen.cpp");
    ofstream out(filename);
    string code = writer.get_code();
    out << code;
@@ -763,8 +734,7 @@ using namespace std;
    }
    m_execution_engine->add_module(codegen_module);
    m_execution_engine->finalize();
-                m_compiled_function =
+    m_compiled_function = m_execution_engine->find_function<EntryPoint_t>(function_name);
-                    m_execution_engine->find_function<EntryPoint_t>(function_name);
    assert(m_compiled_function);
    m_is_compiled = true;
@@ -772,13 +742,13 @@ using namespace std;
    {
        release_function();
    }
-            }
+}
-            void GPU_ExternalFunction::handle_output_alias(
+void runtime::gpu::GPU_ExternalFunction::handle_output_alias(
    codegen::CodeWriter& writer,
    const Node& node,
    const unordered_map<descriptor::TensorView*, vector<size_t>>& output_alias_map)
-            {
+{
    for (const descriptor::Output& output : node.get_outputs())
    {
        shared_ptr<descriptor::TensorView> otv = output.get_tensor_view();
@@ -794,44 +764,40 @@ using namespace std;
                {
                    writer << "ngraph::runtime::gpu::cuda_memcpyDtD(static_cast<void*>("
                              "outputs["
-                                       << outputs[i] << "]), static_cast<void*>(outputs["
+                           << outputs[i] << "]), static_cast<void*>(outputs[" << outputs[0]
-                                       << outputs[0] << "]), " << otv->get_tensor().size()
+                           << "]), " << otv->get_tensor().size() << ");\n";
-                                       << ");\n";
                }
                writer.indent--;
                writer << "}\n";
            }
        }
    }
-            }
+}
-            shared_ptr<ngraph::runtime::CallFrame> GPU_ExternalFunction::make_call_frame()
+shared_ptr<ngraph::runtime::CallFrame> runtime::gpu::GPU_ExternalFunction::make_call_frame()
-            {
+{
    if (!m_is_compiled)
    {
        compile();
    }
    return make_shared<GPU_CallFrame>(shared_from_this(), m_compiled_function);
-            }
+}
-            void GPU_ExternalFunction::emit_debug_function_entry(
+void runtime::gpu::GPU_ExternalFunction::emit_debug_function_entry(
    codegen::CodeWriter& writer,
    Node* node,
    const std::vector<GPU_TensorViewWrapper>& in,
    const std::vector<GPU_TensorViewWrapper>& out)
-            {
+{
    writer << "timer_" << node->get_name() << ".start();\n";
-            }
+}
-            void GPU_ExternalFunction::emit_debug_function_exit(
+void runtime::gpu::GPU_ExternalFunction::emit_debug_function_exit(
    codegen::CodeWriter& writer,
    Node* node,
    const std::vector<GPU_TensorViewWrapper>& in,
    const std::vector<GPU_TensorViewWrapper>& out)
-            {
+{
    writer << "timer_" << node->get_name() << ".stop();\n";
-            }
-        }
-    }
 }