Unverified Commit 95594d93 authored by Jayaram Bobba, committed by GitHub

Merge branch 'master' into jmenon/dex2

parents c829a9c7 3d66cba4
......@@ -68,16 +68,16 @@ public:
std::string generate_temporary_name(std::string prefix = "tempvar");
void block_begin(std::string block_prefix = "")
void block_begin()
{
*this << "{" << block_prefix << "\n";
*this << "{\n";
indent++;
}
void block_end(std::string block_suffix = "")
void block_end()
{
indent--;
*this << "}" << block_suffix << "\n";
*this << "}\n";
}
private:
......
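Editor's sketch (not part of this commit): the block_begin/block_end helpers above only emit a brace and adjust an indent counter, and this hunk drops their optional prefix/suffix arguments. A minimal, self-contained version of the pattern, using a hypothetical SimpleCodeWriter rather than the real ngraph::codegen::CodeWriter, could look like:

```cpp
#include <iostream>
#include <sstream>
#include <string>

// Hypothetical writer mirroring block_begin()/block_end(): emit a brace,
// then raise or lower the indent applied to subsequent lines.
class SimpleCodeWriter
{
public:
    void block_begin()
    {
        line("{");
        ++m_indent;
    }
    void block_end()
    {
        --m_indent;
        line("}");
    }
    void line(const std::string& text)
    {
        for (int i = 0; i < m_indent; ++i)
        {
            m_out << "    ";
        }
        m_out << text << '\n';
    }
    std::string get_code() const { return m_out.str(); }

private:
    int m_indent = 0;
    std::ostringstream m_out;
};

int main()
{
    SimpleCodeWriter writer;
    writer.line("void f()");
    writer.block_begin();
    writer.line("return;");
    writer.block_end();
    std::cout << writer.get_code();
}
```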
......@@ -265,7 +265,6 @@ void codegen::StaticCompiler::add_header_search_path(const string& p)
vector<string> paths = split(p, ';');
for (const string& path : paths)
{
NGRAPH_INFO << path;
if (!contains(m_extra_search_path_list, path))
{
m_extra_search_path_list.push_back(path);
......
......@@ -268,8 +268,8 @@ size_t runtime::gpu::CUDAEmitter::build_pad_dynamic(const runtime::gpu::GPURunti
compiled_kernel = ctx->compiled_kernel_pool->set(kernel_name.str(), writer.get_code());
}
unsigned int rank = static_cast<unsigned int>(input_shape.size());
unsigned int nthreads = static_cast<unsigned int>(shape_size(input_shape));
uint32_t rank = static_cast<uint32_t>(input_shape.size());
uint32_t nthreads = static_cast<uint32_t>(shape_size(input_shape));
GPUShape pad_below(input_shape.size(), 0);
GPUShape pad_interior(input_shape.size(), 1);
......@@ -286,14 +286,14 @@ size_t runtime::gpu::CUDAEmitter::build_pad_dynamic(const runtime::gpu::GPURunti
// get an allocator for transient per kernel gpu memory
GPUAllocator allocator = this->m_primitive_emitter->get_memory_allocator();
size_t idx_input_strides = allocator.reserve_argspace(
input_strides.data(), input_strides.size() * sizeof(unsigned int));
size_t idx_output_strides = allocator.reserve_argspace(
output_strides.data(), output_strides.size() * sizeof(unsigned int));
size_t idx_input_strides =
allocator.reserve_argspace(input_strides.data(), input_strides.size() * sizeof(uint32_t));
size_t idx_output_strides =
allocator.reserve_argspace(output_strides.data(), output_strides.size() * sizeof(uint32_t));
size_t idx_padding_below =
allocator.reserve_argspace(pad_below.data(), pad_below.size() * sizeof(unsigned int));
allocator.reserve_argspace(pad_below.data(), pad_below.size() * sizeof(uint32_t));
size_t idx_padding_interior =
allocator.reserve_argspace(pad_interior.data(), pad_interior.size() * sizeof(unsigned int));
allocator.reserve_argspace(pad_interior.data(), pad_interior.size() * sizeof(uint32_t));
// create the launch primitive
std::unique_ptr<gpu::primitive> pad_dynamic(new gpu::primitive{[=](void** inputs,
......@@ -1015,7 +1015,7 @@ size_t runtime::gpu::CUDAEmitter::build_reduce_window(const GPURuntimeContext* c
args_list[6] = &nthreads;
CUDA_SAFE_CALL(cuLaunchKernel(*compiled_kernel.get(),
static_cast<unsigned int>(nthreads),
static_cast<uint32_t>(nthreads),
1,
1, // grid dim
1,
......
......@@ -285,19 +285,19 @@ void runtime::gpu::CudaKernelBuilder::get_pad_dynamic_op(
const std::array<std::string, 2>& data_types)
{
writer << "extern \"C\" __global__ void cuda_" << name << "(" << data_types[0] << "* in, "
<< data_types[1] << "* out, unsigned int* input_strides, unsigned int* output_strides, "
"unsigned int* padding_below, unsigned int* "
"padding_interior, unsigned int rank, unsigned int n)\n";
<< data_types[1] << "* out, uint32_t* input_strides, uint32_t* output_strides, "
"uint32_t* padding_below, uint32_t* "
"padding_interior, uint32_t rank, uint32_t n)\n";
writer.block_begin();
{
writer << "unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;\n";
writer << "uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n";
writer << "if (tid < n)\n";
writer.block_begin();
{
writer << "unsigned int output_idx = 0;\n";
writer << "unsigned int input_idx = tid;\n";
writer << "uint32_t output_idx = 0;\n";
writer << "uint32_t input_idx = tid;\n";
writer << "for(unsigned int i = 0; i < rank; i++)\n";
writer << "for(uint32_t i = 0; i < rank; i++)\n";
writer.block_begin();
{
writer << "output_idx += (input_idx / input_strides[i] * padding_interior[i] + "
......
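Editor's sketch (not part of this commit): the generated pad kernel maps each flat input index to its padded output offset using the input strides, interior padding, and below padding. The same arithmetic can be checked on the host; the helper below is an illustration of that index formula, not the emitted CUDA source:

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

// Host-side illustration of the pad-dynamic index computation: one "thread"
// per input element, mapping its flat index to the flat output index from
// row-major strides and per-axis padding (interior == 1 means no dilation).
uint32_t pad_output_index(uint32_t input_idx,
                          const std::vector<uint32_t>& input_strides,
                          const std::vector<uint32_t>& output_strides,
                          const std::vector<uint32_t>& padding_below,
                          const std::vector<uint32_t>& padding_interior)
{
    uint32_t output_idx = 0;
    for (size_t i = 0; i < input_strides.size(); i++)
    {
        uint32_t coord = input_idx / input_strides[i];
        output_idx += (coord * padding_interior[i] + padding_below[i]) * output_strides[i];
        input_idx %= input_strides[i];
    }
    return output_idx;
}

int main()
{
    // A 2x2 input written at offset (1, 1) inside a 4x4 output.
    std::vector<uint32_t> in_strides{2, 1};
    std::vector<uint32_t> out_strides{4, 1};
    std::vector<uint32_t> below{1, 1};
    std::vector<uint32_t> interior{1, 1};
    for (uint32_t idx = 0; idx < 4; idx++)
    {
        std::printf("input %u -> output %u\n",
                    idx,
                    pad_output_index(idx, in_strides, out_strides, below, interior));
    }
}
```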
......@@ -47,7 +47,7 @@ void runtime::gpu::emit_onehot(const std::string& name,
void* args_list[] = {&in, &out, &repeat_size, &repeat_times, &count};
CUDA_SAFE_CALL(cuLaunchKernel(*compiled_kernel.get(),
static_cast<unsigned int>(count),
static_cast<uint32_t>(count),
1,
1, // grid dim
1,
......@@ -84,7 +84,7 @@ void runtime::gpu::emit_reshape(const std::string& name,
void* args_list[] = {&in, &out, &input_strides, &trans_strides, &rank, &count};
CUDA_SAFE_CALL(cuLaunchKernel(*compiled_kernel.get(),
static_cast<unsigned int>(count),
static_cast<uint32_t>(count),
1,
1, // grid dim
1,
......@@ -124,7 +124,7 @@ void runtime::gpu::emit_slice(const std::string& name,
void* args_list[] = {
&in, &out, &input_strides, &lower_bounds, &slice_strides, &output_strides, &rank, &count};
CUDA_SAFE_CALL(cuLaunchKernel(*compiled_kernel.get(),
static_cast<unsigned int>(count),
static_cast<uint32_t>(count),
1,
1, // grid dim
1,
......@@ -161,7 +161,7 @@ void runtime::gpu::emit_reverse(const std::string& name,
void* args_list[] = {&in, &out, &input_shapes, &reverse_axes, &rank, &count};
CUDA_SAFE_CALL(cuLaunchKernel(*compiled_kernel.get(),
static_cast<unsigned int>(count),
static_cast<uint32_t>(count),
1,
1, // grid dim
1,
......
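Editor's sketch (not part of this commit): these hunks only change the grid-dimension cast from unsigned int to uint32_t; cuLaunchKernel itself still takes unsigned int dimensions, so the fixed-width type is a host-side consistency choice. A simplified launch helper in the same style, with the compiled CUfunction, device pointers, and stream assumed to be set up elsewhere, might read:

```cpp
#include <cstdint>
#include <cuda.h>

// Illustrative wrapper around the launch pattern above: a 1-D grid with one
// block per element and a single thread per block, matching the grid/block
// arguments shown in the hunks.
CUresult launch_elementwise(CUfunction kernel,
                            CUdeviceptr in,
                            CUdeviceptr out,
                            uint32_t count,
                            CUstream stream)
{
    void* args[] = {&in, &out, &count};
    return cuLaunchKernel(kernel,
                          count, 1, 1, // grid dimensions
                          1, 1, 1,     // block dimensions
                          0,           // dynamic shared memory bytes
                          stream,
                          args,
                          nullptr);    // no extra launch options
}
```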
......@@ -118,7 +118,7 @@ namespace ngraph
{
return;
}
writer.block_begin(" // " + node->get_name());
writer.block_begin();
writer << "int count = " << out[0].get_size() << ";\n";
writer += R"(
float alpha1 = 1.0, alpha2 = 1.0, beta = 0;
......@@ -136,7 +136,7 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
CUDNN_OP_TENSOR_ADD,
CUDNN_DATA_FLOAT,
CUDNN_NOT_PROPAGATE_NAN));
)";
)";
writer << "CUDNN_SAFE_CALL(cudnnOpTensor(*ctx->cudnn_handle,"
<< "opTensorDesc,"
......@@ -193,7 +193,7 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
auto input_shape = args[0].get_shape();
Shape input_shape_padded = input_shape;
Shape padding_interior(data_dilation_strides);
writer.block_begin(" // " + node->get_name());
writer.block_begin();
if (pad_required || is_deconvolution)
{
input_shape_padded = get_padded_shape(
......@@ -314,7 +314,7 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
}
Shape padding_interior(data_dilation_strides);
writer.block_begin(" // " + node->get_name());
writer.block_begin();
if (pad_required || is_deconvolution)
{
output_shape_padded = get_padded_shape(
......@@ -467,7 +467,7 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
auto input_shape = args[0].get_shape();
auto input_shape_padded = input_shape;
Shape padding_interior(data_dilation_strides);
writer.block_begin(" // " + node->get_name());
writer.block_begin();
if (pad_required || is_deconvolution)
{
input_shape_padded = get_padded_shape(
......@@ -549,7 +549,7 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
auto& first = (arg0_shape.empty() ? args[0] : args[1]);
auto& second = (arg0_shape.empty() ? args[1] : args[0]);
writer.block_begin(" // " + node->get_name());
writer.block_begin();
writer << "int count = " << second.get_size() << ";\n";
writer << "CUBLAS_SAFE_CALL(cublasScopy("
<< "*ctx->cublas_handle,"
......@@ -566,7 +566,7 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
// set output to 0 if input size is 0
if (args[0].get_size() == 0 || args[1].get_size() == 0)
{
writer.block_begin(" // " + node->get_name());
writer.block_begin();
writer << "runtime::gpu::cuda_memset(" << out[0].get_name() << ", 0, "
<< out[0].get_size() << " * sizeof(float));\n";
writer.block_end();
......@@ -586,7 +586,7 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
"arg0 and arg1 shape does not match for dot.");
}
}
writer.block_begin(" // " + node->get_name());
writer.block_begin();
writer << "CUBLAS_SAFE_CALL(cublasSdot("
<< "*ctx->cublas_handle," << args[0].get_size() << ","
<< args[0].get_name() << ","
......@@ -598,7 +598,7 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
else if ((arg0_shape.size() == 2) && (arg1_shape.size() == 1) &&
(dot->get_reduction_axes_count() == 1))
{
writer.block_begin(" // " + node->get_name());
writer.block_begin();
writer << "const float alpha = 1.0;\n";
writer << "const float beta = 0;\n";
writer << "CUBLAS_SAFE_CALL(cublasSetPointerMode(*ctx->cublas_handle, "
......@@ -668,7 +668,7 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
}
// GEMM Call
writer.block_begin(" // " + node->get_name());
writer.block_begin();
writer << "const float alpha = 1.0;\n";
writer << "const float beta = 0.0;\n";
writer << "int m = " << m << ";\n";
......@@ -703,7 +703,7 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
{
return;
}
writer.block_begin(" // " + node->get_name());
writer.block_begin();
writer << "int count = " << out[0].get_size() << ";\n";
writer += R"(
float alpha1 = 1.0, alpha2 = 1.0, beta = 0;
......@@ -721,7 +721,7 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
CUDNN_OP_TENSOR_MAX,
CUDNN_DATA_FLOAT,
CUDNN_NOT_PROPAGATE_NAN));
)";
)";
writer << "CUDNN_SAFE_CALL(cudnnOpTensor(*ctx->cudnn_handle,"
<< "opTensorDesc,"
......@@ -741,7 +741,7 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
{
return;
}
writer.block_begin(" // " + node->get_name());
writer.block_begin();
writer << "int count = " << out[0].get_size() << ";\n";
writer += R"(
float alpha1 = 1.0, alpha2 = 1.0, beta = 0;
......@@ -759,7 +759,7 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
CUDNN_OP_TENSOR_MIN,
CUDNN_DATA_FLOAT,
CUDNN_NOT_PROPAGATE_NAN));
)";
)";
writer << "CUDNN_SAFE_CALL(cudnnOpTensor(*ctx->cudnn_handle,"
<< "opTensorDesc,"
......@@ -779,7 +779,7 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
{
return;
}
writer.block_begin(" // " + node->get_name());
writer.block_begin();
writer << "int count = " << out[0].get_size() << ";\n";
writer += R"(
float alpha1 = -1.0, alpha2 = 0, beta = 0;
......@@ -797,7 +797,7 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
CUDNN_OP_TENSOR_ADD,
CUDNN_DATA_FLOAT,
CUDNN_NOT_PROPAGATE_NAN));
)";
)";
writer << "CUDNN_SAFE_CALL(cudnnOpTensor(*ctx->cudnn_handle,"
<< "opTensorDesc,"
......@@ -825,7 +825,7 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
// broadcast axes is empty, do a copy
if (axes.empty())
{
writer.block_begin(" // " + node->get_name());
writer.block_begin();
kernel::emit_memcpyDtD(writer, out[0], args[0]);
writer.block_end();
return;
......@@ -867,7 +867,7 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
block_size += block_strides[i];
}
writer.block_begin(" // " + node->get_name());
writer.block_begin();
writer << "int count = " << out[0].get_size() << ";\n";
writer << "int num_inputs = " << args.size() << ";\n";
......@@ -910,7 +910,7 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
return;
}
auto reshape = static_cast<const op::Reshape*>(node);
writer.block_begin(" // " + node->get_name());
writer.block_begin();
auto arg_shape = args[0].get_shape();
auto arg_rank = arg_shape.size();
auto result_shape = out[0].get_shape();
......@@ -1013,7 +1013,7 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
const auto input_strides = row_major_strides(arg_shape);
const auto output_strides = row_major_strides(result_shape);
writer.block_begin(" // " + node->get_name());
writer.block_begin();
if (args[0].get_size() == out[0].get_size())
{
kernel::emit_memcpyDtD(writer, out[0], args[0]);
......@@ -1077,7 +1077,7 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
{
reverse_axes_flag[a] = 1;
}
writer.block_begin(" // " + node->get_name());
writer.block_begin();
if (out[0].get_size() == 1)
{
kernel::emit_memcpyDtD(writer, out[0], args[0]);
......@@ -1124,7 +1124,7 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
{
return;
}
writer.block_begin(" // " + node->get_name());
writer.block_begin();
writer << "int count = " << out[0].get_size() << ";\n";
writer += R"(
float alpha1 = 1.0, alpha2 = 1.0, beta = 0;
......@@ -1142,7 +1142,7 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
CUDNN_OP_TENSOR_MUL,
CUDNN_DATA_FLOAT,
CUDNN_NOT_PROPAGATE_NAN));
)";
)";
writer << "CUDNN_SAFE_CALL(cudnnOpTensor(*ctx->cudnn_handle,"
<< "opTensorDesc,"
......@@ -1173,7 +1173,7 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
repeat_size *= result_shape[i];
}
writer.block_begin(" // " + node->get_name());
writer.block_begin();
writer << "runtime::gpu::cuda_memset(" << out[0].get_name() << ", 0, "
<< out[0].get_size() << " * " << out[0].get_element_type().size() << ");\n";
writer << "runtime::gpu::emit_onehot(\"" << node->description() << "\", {\""
......@@ -1193,7 +1193,7 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
{
return;
}
writer.block_begin(" // " + node->get_name());
writer.block_begin();
writer << "int count = " << out[0].get_size() << ";\n";
writer += R"(
float alpha1 = 1.0, alpha2 = 0, beta = 0;
......@@ -1211,7 +1211,7 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
CUDNN_OP_TENSOR_SQRT,
CUDNN_DATA_FLOAT,
CUDNN_NOT_PROPAGATE_NAN));
)";
)";
writer << "CUDNN_SAFE_CALL(cudnnOpTensor(*ctx->cudnn_handle,"
<< "opTensorDesc,"
......@@ -1227,7 +1227,7 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
template <>
void GPU_Emitter::EMITTER_DECL(ngraph::op::Result)
{
writer.block_begin(" // " + node->get_name());
writer.block_begin();
kernel::emit_memcpyDtD(writer, out[0], args[0]);
writer.block_end();
return;
......@@ -1237,7 +1237,7 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
void GPU_Emitter::EMITTER_DECL(ngraph::op::Max)
{
const ngraph::op::Max* max_op = static_cast<const ngraph::op::Max*>(node);
writer.block_begin(" // " + node->get_name());
writer.block_begin();
{
if (out[0].get_size() != 0)
{
......@@ -1285,7 +1285,7 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
void GPU_Emitter::EMITTER_DECL(ngraph::op::Min)
{
const ngraph::op::Min* min_op = static_cast<const ngraph::op::Min*>(node);
writer.block_begin(" // " + node->get_name());
writer.block_begin();
{
if (out[0].get_size() != 0)
{
......@@ -1333,7 +1333,7 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
void GPU_Emitter::EMITTER_DECL(ngraph::op::Sum)
{
const ngraph::op::Sum* sum = static_cast<const ngraph::op::Sum*>(node);
writer.block_begin(" // " + node->get_name());
writer.block_begin();
{
if (out[0].get_size() != 0)
{
......@@ -1372,7 +1372,7 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
void GPU_Emitter::EMITTER_DECL(ngraph::op::Product)
{
const ngraph::op::Product* product = static_cast<const ngraph::op::Product*>(node);
writer.block_begin(" // " + node->get_name());
writer.block_begin();
{
if (out[0].get_size() != 0)
{
......@@ -1432,7 +1432,7 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
{TI(ngraph::op::Maximum), CUDNN_REDUCE_TENSOR_MAX},
{TI(ngraph::op::Minimum), CUDNN_REDUCE_TENSOR_MIN}};
const ngraph::op::Reduce* reduce_op = static_cast<const ngraph::op::Reduce*>(node);
writer.block_begin(" // " + node->get_name());
writer.block_begin();
{
if (out[0].get_size() != 0)
{
......@@ -1521,7 +1521,7 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
const ngraph::op::ReduceWindow* reduce_window_op =
static_cast<const ngraph::op::ReduceWindow*>(node);
writer.block_begin(" // " + node->get_name());
writer.block_begin();
{
if (out[0].get_size() != 0)
{
......@@ -1620,7 +1620,7 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
void GPU_Emitter::EMITTER_DECL(ngraph::op::Pad)
{
auto pad = static_cast<const ngraph::op::Pad*>(node);
writer.block_begin(" // " + node->get_name());
writer.block_begin();
{
auto input_shape = args[0].get_shape();
auto output_shape = out[0].get_shape();
......@@ -1653,7 +1653,7 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
{
// assumes NC{d1,d2,...} format
auto max_pool = static_cast<const ngraph::op::MaxPool*>(node);
writer.block_begin(" // " + node->get_name());
writer.block_begin();
{
auto& input_shape = args[0].get_shape();
auto& result_shape = out[0].get_shape();
......@@ -1785,7 +1785,7 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
template <>
void GPU_Emitter::EMITTER_DECL(ngraph::op::MaxPoolBackprop)
{
writer.block_begin(" // " + node->get_name());
writer.block_begin();
{
auto mpb = static_cast<const ngraph::op::MaxPoolBackprop*>(node);
auto fp_input_shape = out[0].get_shape();
......@@ -1843,7 +1843,7 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
args[0].get_shape(),
batchnorm->get_eps_value());
writer.block_begin(" // " + node->get_name());
writer.block_begin();
{
writer << "gpu::invoke_primitive(ctx, " << bn_index << ", ";
writer << "std::vector<void*>{" << args.front().get_name();
......@@ -1879,7 +1879,7 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
args[0].get_shape(),
batchnorm->get_eps_value());
writer.block_begin(" // " + node->get_name());
writer.block_begin();
{
writer << "gpu::invoke_primitive(ctx, " << bn_index << ", ";
writer << "std::vector<void*>{" << args.front().get_name();
......@@ -1904,7 +1904,7 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
{
auto get_tuple_element = static_cast<const ngraph::op::GetOutputElement*>(node);
writer.block_begin(" // " + node->get_name());
writer.block_begin();
writer << "runtime::gpu::cuda_memcpyDtH(" << out[0].get_name() << ", "
<< args[get_tuple_element->get_n()].get_name() << ", "
<< out[0].get_size() * out[0].get_element_type().size() << ");\n";
......@@ -1959,7 +1959,7 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
{
// assumes NC{d1,d2,...} format
auto avg_pool = static_cast<const ngraph::op::AvgPool*>(node);
writer.block_begin(" // " + node->get_name());
writer.block_begin();
{
auto& input_shape = args[0].get_shape();
auto& result_shape = out[0].get_shape();
......@@ -2034,7 +2034,7 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
template <>
void GPU_Emitter::EMITTER_DECL(ngraph::op::AvgPoolBackprop)
{
writer.block_begin(" // " + node->get_name());
writer.block_begin();
{
auto apb = static_cast<const ngraph::op::AvgPoolBackprop*>(node);
auto output_shape = out[0].get_shape();
......@@ -2079,7 +2079,7 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
{
// assumes NC{d1,d2,...} format
auto rep_slice = static_cast<const ngraph::op::ReplaceSlice*>(node);
writer.block_begin(" // " + node->get_name());
writer.block_begin();
{
auto& input_shape = args[0].get_shape();
auto& source_shape = args[1].get_shape();
......@@ -2129,7 +2129,7 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
template <>
void GPU_Emitter::EMITTER_DECL(ngraph::op::Softmax)
{
writer.block_begin(" // " + node->get_name());
writer.block_begin();
{
auto softmax = static_cast<const ngraph::op::Softmax*>(node);
auto tensor_shape = args[0].get_shape();
......
......@@ -77,7 +77,7 @@ namespace ngraph
auto& cuda_emitter =
external_function->get_primitive_emitter()->get_cuda_emitter();
writer.block_begin(" // " + node->get_name());
writer.block_begin();
{
std::vector<std::string> dtypes;
for (auto& arg : args)
......
......@@ -312,32 +312,32 @@ void runtime::gpu::GPU_ExternalFunction::compile()
writer +=
R"(// Generated by the NGraph GPU backend
#include <cublas_v2.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cudnn.h>
#include "ngraph/descriptor/input.hpp"
#include "ngraph/descriptor/layout/dense_tensor_view_layout.hpp"
#include "ngraph/descriptor/output.hpp"
#include "ngraph/descriptor/primary_tensor_view.hpp"
#include "ngraph/file_util.hpp"
#include "ngraph/function.hpp"
#include "ngraph/graph_util.hpp"
#include "ngraph/node.hpp"
#include "ngraph/pass/assign_layout.hpp"
#include "ngraph/pass/dump_sorted.hpp"
#include "ngraph/pass/liveness.hpp"
#include "ngraph/pass/manager.hpp"
#include "ngraph/pass/memory_layout.hpp"
#include "ngraph/runtime/aligned_buffer.hpp"
#include "ngraph/runtime/gpu/cudnn_descriptors.hpp"
#include "ngraph/runtime/gpu/gpu_cuda_kernel_emitters.hpp"
#include "ngraph/runtime/gpu/gpu_cuda_kernel_ops.hpp"
#include "ngraph/runtime/gpu/gpu_invoke.hpp"
#include "ngraph/runtime/gpu/gpu_runtime_context.hpp"
#include "ngraph/runtime/gpu/gpu_util.hpp"
#include "ngraph/util.hpp"
#include <cublas_v2.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cudnn.h>
#include "ngraph/descriptor/input.hpp"
#include "ngraph/descriptor/layout/dense_tensor_view_layout.hpp"
#include "ngraph/descriptor/output.hpp"
#include "ngraph/descriptor/primary_tensor_view.hpp"
#include "ngraph/file_util.hpp"
#include "ngraph/function.hpp"
#include "ngraph/graph_util.hpp"
#include "ngraph/node.hpp"
#include "ngraph/pass/assign_layout.hpp"
#include "ngraph/pass/dump_sorted.hpp"
#include "ngraph/pass/liveness.hpp"
#include "ngraph/pass/manager.hpp"
#include "ngraph/pass/memory_layout.hpp"
#include "ngraph/runtime/aligned_buffer.hpp"
#include "ngraph/runtime/gpu/cudnn_descriptors.hpp"
#include "ngraph/runtime/gpu/gpu_cuda_kernel_emitters.hpp"
#include "ngraph/runtime/gpu/gpu_cuda_kernel_ops.hpp"
#include "ngraph/runtime/gpu/gpu_invoke.hpp"
#include "ngraph/runtime/gpu/gpu_runtime_context.hpp"
#include "ngraph/runtime/gpu/gpu_util.hpp"
#include "ngraph/util.hpp"
)";
string pch_header_source = writer.get_code();
......@@ -346,81 +346,12 @@ void runtime::gpu::GPU_ExternalFunction::compile()
using namespace ngraph;
using namespace ngraph::runtime;
using namespace std;
)";
if (m_emit_timing)
{
writer << "// Declare debug timers\n";
vector<string> names;
for (shared_ptr<Function> current_function : pass_manager.get_state().get_functions())
{
for (shared_ptr<Node> node : current_function->get_ordered_ops())
{
if (!node->is_parameter() && !node->is_constant())
{
names.push_back(node->get_name());
}
}
}
for (const string& s : names)
{
writer << "ngraph::stopwatch timer_" << s << ";\n";
}
writer << "extern \"C\" size_t get_debug_timer_count() { return " << names.size()
<< "; }\n";
writer << "extern \"C\" const char* get_debug_timer_name(size_t index)\n";
writer << "{\n";
writer.indent++;
writer << "const char* rc;\n";
writer << "switch(index)\n";
writer << "{\n";
for (size_t i = 0; i < names.size(); i++)
{
writer << "case " << i << ": rc = \"" << names[i] << "\"; break;\n";
}
writer << "default: rc = \"\";\n";
writer << "}\n";
writer << "return rc;\n";
writer.indent--;
writer << "}\n";
writer << "extern \"C\" const size_t get_debug_timer_microseconds(size_t index)\n";
writer << "{\n";
writer.indent++;
writer << "size_t rc;\n";
writer << "switch(index)\n";
writer << "{\n";
for (size_t i = 0; i < names.size(); i++)
{
writer << "case " << i << ": rc = timer_" << names[i]
<< ".get_total_microseconds(); break;\n";
}
writer << "default: rc = 0;\n";
writer << "}\n";
writer << "return rc;\n";
writer.indent--;
writer << "}\n";
writer << "extern \"C\" const size_t get_debug_timer_call_count(size_t index)\n";
writer << "{\n";
writer.indent++;
writer << "size_t rc;\n";
writer << "switch(index)\n";
writer << "{\n";
for (size_t i = 0; i < names.size(); i++)
{
writer << "case " << i << ": rc = timer_" << names[i] << ".get_call_count(); break;\n";
}
writer << "default: rc = 0;\n";
writer << "}\n";
writer << "return rc;\n";
writer.indent--;
writer << "}\n";
writer << "\n";
}
// // The "dso_handle" symbol is required by __cxa_atexit()
// // which is enabled because the JIT uses it as the default mechanism
// // to register cleanup handlers. We use it, and not atexit(), because
// // atexit() happens too late, when the JIT is no longer alive
)";
// The "dso_handle" symbol is required by __cxa_atexit()
// which is enabled because the JIT uses it as the default mechanism
// to register cleanup handlers. We use it, and not atexit(), because
// atexit() happens too late, when the JIT is no longer alive
writer << "void *__dso_handle = 0;\n\n";
writer << "// Declare all constants\n";
for (shared_ptr<Function> current_function : pass_manager.get_state().get_functions())
......@@ -432,6 +363,8 @@ using namespace std;
{
shared_ptr<descriptor::TensorView> tv = node->get_outputs()[0].get_tensor_view();
auto c_value_strings = c->get_value_strings();
writer << "static " << tv->get_tensor().get_element_type().c_type_string() << " *"
<< tv->get_tensor().get_name() << ";\n";
writer << "static " << tv->get_tensor().get_element_type().c_type_string() << " "
<< tv->get_tensor().get_name() << "_cpu[" << c_value_strings.size()
<< "] =\n";
......@@ -440,8 +373,6 @@ using namespace std;
writer << emit_string_array(c_value_strings, 100 - writer.indent * 4);
writer.indent--;
writer << "\n};\n\n";
writer << "static " << tv->get_tensor().get_element_type().c_type_string() << " *"
<< tv->get_tensor().get_name() << ";\n";
m_variable_name_map[tv->get_tensor().get_name()] = tv->get_tensor().get_name();
}
}
......@@ -449,7 +380,7 @@ using namespace std;
// Add cudnn descriptor factory for descriptor management.
// After the cuDNN code emitted in gpu_emitter.cc is refactored
// into the CUDNNEmitter class, this can be removed.
writer << "static runtime::gpu::CUDNNDescriptors descriptors;\n";
writer << "static runtime::gpu::CUDNNDescriptors descriptors;\n\n";
writer << "// Declare all functions\n";
for (shared_ptr<Function> f : pass_manager.get_state().get_functions())
......@@ -457,85 +388,52 @@ using namespace std;
writer << "extern \"C\" void " << f->get_name() << "(void** inputs, void** outputs, "
<< "gpu::GPURuntimeContext* ctx);\n";
}
writer << "\n";
unordered_map<Node*, string> match_functions;
// This for loop collects functions that are called more than once
// and emits them as globally callable functions.
// ops implement the is_functionally_identical method
unordered_map<string, string> match_function_map;
unordered_map<const Node*, string> node_function_map;
for (shared_ptr<Function> current_function : pass_manager.get_state().get_functions())
{
set<string> output_names;
for (shared_ptr<Node> op : current_function->get_results())
{
shared_ptr<descriptor::TensorView> tv = op->get_output_tensor_view();
output_names.insert(tv->get_tensor().get_name());
}
const list<shared_ptr<Node>>& tmp = current_function->get_ordered_ops();
list<shared_ptr<Node>> tmp = current_function->get_ordered_ops();
if (tmp.size() < 2)
{
// Since we are comparing ops there must be at least two ops to proceed.
continue;
}
vector<shared_ptr<Node>> op_list{tmp.begin(), tmp.end()};
for (size_t i = 0; i < op_list.size() - 1; i++)
for (size_t i = 0; i < op_list.size(); i++)
{
if (op_list[i]->is_constant() || op_list[i]->is_parameter())
{
continue;
}
if (contains_key(match_functions, op_list[i].get()))
Node& node = *op_list[i];
auto handler = dispatcher.find(type_index(typeid(node)));
if (handler == dispatcher.end())
{
continue;
throw ngraph_error("Unhandled op during code generation : " + node.description());
}
string match_function = emit_op_as_function(node, "__f__");
string match_function_name;
if (!match_function_name.empty())
if (contains_key(match_function_map, match_function))
{
writer << "static void " << match_function_name << "(";
writer.indent++;
// Work around a compiler warning (*node inside typeid may have effects
// with shared pointers, which is fine here but clang doesn't like it.)
auto& n = *op_list[i];
auto handler = dispatcher.find(type_index(typeid(n)));
vector<GPU_TensorViewWrapper> in;
size_t arg_index = 0;
set<string> arg_names;
for (const descriptor::Input& input : n.get_inputs())
{
const descriptor::Output& output = input.get_output();
shared_ptr<descriptor::TensorView> tv = output.get_tensor_view();
GPU_TensorViewWrapper tvw{tv, "_arg" + to_string(arg_index)};
if (!contains(arg_names, tvw.get_name()))
{
arg_names.insert(tvw.get_name());
if (arg_index++ > 0)
{
writer << ",";
}
writer << "\n";
writer << tvw.get_type() << "* " << tvw.get_name();
}
in.push_back(tvw);
}
vector<GPU_TensorViewWrapper> out;
for (const descriptor::Output& output : n.get_outputs())
{
shared_ptr<descriptor::TensorView> tv = output.get_tensor_view();
GPU_TensorViewWrapper tvw{tv, "_out" + to_string(arg_index)};
if (arg_index++ > 0)
{
writer << ",";
}
writer << "\n";
writer << tvw.get_type() << "* " << tvw.get_name();
out.push_back(tvw);
}
writer.indent--;
writer << "\n)\n";
writer << "{\n";
writer.indent++;
handler->second(this, writer, &n, in, out);
writer.indent--;
writer << "}\n";
match_function_name = match_function_map[match_function];
}
else
{
auto offset = match_function.find("__f__");
string emitted_function = match_function;
match_function_name = "func_" + node.get_name();
emitted_function.replace(offset, 5, match_function_name);
match_function_map.insert({match_function, match_function_name});
writer << emitted_function << "\n";
}
node_function_map.insert({&node, match_function_name});
}
}
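Editor's sketch (not part of this commit): the rewritten loop deduplicates op implementations by the text of the emitted function. Each op body is emitted under the placeholder name __f__, that source string is used as a map key, and any later op with an identical body reuses the first generated function name. The standalone toy program below illustrates the idea with plain strings instead of Node pointers:

```cpp
#include <iostream>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

// Illustrative reduction of the dedup scheme: identical emitted bodies
// share one generated function name, keyed by the placeholder source.
int main()
{
    std::vector<std::pair<std::string, std::string>> ops = {
        {"Add_0", "static void __f__(float* a, float* b, float* c) { /* add */ }"},
        {"Add_1", "static void __f__(float* a, float* b, float* c) { /* add */ }"},
        {"Mul_0", "static void __f__(float* a, float* b, float* c) { /* mul */ }"},
    };

    std::unordered_map<std::string, std::string> match_function_map; // body -> name
    std::unordered_map<std::string, std::string> node_function_map;  // node -> name

    for (const auto& op : ops)
    {
        const std::string& node_name = op.first;
        const std::string& body = op.second;
        std::string function_name;
        auto it = match_function_map.find(body);
        if (it != match_function_map.end())
        {
            function_name = it->second; // reuse the already-emitted function
        }
        else
        {
            function_name = "func_" + node_name;
            std::string emitted = body;
            emitted.replace(emitted.find("__f__"), 5, function_name);
            match_function_map.emplace(body, function_name);
            std::cout << emitted << "\n"; // emit the shared definition once
        }
        node_function_map.emplace(node_name, function_name);
    }

    for (const auto& entry : node_function_map)
    {
        std::cout << entry.first << " -> " << entry.second << "\n";
    }
}
```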
......@@ -704,12 +602,15 @@ using namespace std;
throw ngraph_error("Unhandled op during code generation : " + node->description());
}
vector<GPU_TensorViewWrapper> in;
vector<string> node_input_names;
vector<string> node_output_names;
for (const descriptor::Input& input : node->get_inputs())
{
const descriptor::Output& output = input.get_output();
shared_ptr<descriptor::TensorView> tv = output.get_tensor_view();
in.push_back(
GPU_TensorViewWrapper(tv, m_variable_name_map[tv->get_tensor().get_name()]));
node_input_names.emplace_back(tv->get_tensor().get_name());
}
vector<GPU_TensorViewWrapper> out;
for (const descriptor::Output& output : node->get_outputs())
......@@ -717,6 +618,18 @@ using namespace std;
shared_ptr<descriptor::TensorView> tv = output.get_tensor_view();
out.push_back(
GPU_TensorViewWrapper(tv, m_variable_name_map[tv->get_tensor().get_name()]));
node_output_names.emplace_back(tv->get_tensor().get_name());
}
// Emit function description comment
if (!node->is_parameter() && !node->is_constant())
{
writer << "\n// " << node->get_name() << "(";
vector<string> parameter_nodes = node_input_names;
parameter_nodes.insert(
parameter_nodes.end(), node_output_names.begin(), node_output_names.end());
writer << join(parameter_nodes);
writer << ")\n";
}
// Emit operation prologue
......@@ -730,13 +643,10 @@ using namespace std;
// Emit operation body
string func_name;
auto it = match_functions.find(node.get());
if (it != match_functions.end())
{
func_name = it->second;
}
func_name = node_function_map[node.get()];
if (func_name.empty())
{
//throw runtime_error("No matching function found for '" + node->get_name() + "'");
handler->second(this, writer, node.get(), in, out);
}
else
......@@ -750,6 +660,7 @@ using namespace std;
{
names.push_back(tv.get_name());
}
names.push_back("ctx");
writer << func_name << "(" << join(names) << ");\n";
}
......@@ -875,3 +786,117 @@ std::unique_ptr<runtime::gpu::GPURuntimeContext>& runtime::gpu::GPU_ExternalFunc
{
return m_ctx;
}
bool runtime::gpu::GPU_ExternalFunction::is_functionally_identical(
const Node& n1, const Node& n2, const unordered_map<const Node*, string>& node_cache) const
{
return node_cache.at(&n1) == node_cache.at(&n2);
}
string runtime::gpu::GPU_ExternalFunction::emit_op_as_function(const Node& node,
const string& function_name)
{
codegen::CodeWriter writer;
writer << "static void " << function_name << "(";
writer.indent++;
// Work around a compiler warning (*node inside typeid may have effects
// with shared pointers, which is fine here but clang doesn't like it.)
auto handler = dispatcher.find(type_index(typeid(node)));
vector<GPU_TensorViewWrapper> in;
size_t arg_index = 0;
set<string> arg_names;
for (const descriptor::Input& input : node.get_inputs())
{
const descriptor::Output& output = input.get_output();
shared_ptr<descriptor::TensorView> tv = output.get_tensor_view();
GPU_TensorViewWrapper tvw{tv, "_arg" + to_string(arg_index)};
if (!contains(arg_names, tvw.get_name()))
{
arg_names.insert(tvw.get_name());
if (arg_index++ > 0)
{
writer << ",";
}
writer << "\n";
writer << tvw.get_type() << "* " << tvw.get_name();
}
in.push_back(tvw);
}
vector<GPU_TensorViewWrapper> out;
for (const descriptor::Output& output : node.get_outputs())
{
shared_ptr<descriptor::TensorView> tv = output.get_tensor_view();
GPU_TensorViewWrapper tvw{tv, "_out" + to_string(arg_index)};
if (arg_index++ > 0)
{
writer << ",";
}
writer << "\n";
writer << tvw.get_type() << "* " << tvw.get_name();
out.push_back(tvw);
}
writer << ",\ngpu::GPURuntimeContext* ctx";
writer.indent--;
writer << "\n)\n";
codegen::CodeWriter tmp_writer;
handler->second(this, tmp_writer, &node, in, out);
string body = tmp_writer.get_code();
if (body.size() > 0 && body[0] == '{')
{
// Body already surrounded by curly braces so don't add more
writer << body;
}
else
{
writer.block_begin();
writer << body;
writer.block_end();
}
string rc = writer.get_code();
if (function_name == "f")
{
rc = strip_comments(rc);
}
return rc;
}
string runtime::gpu::GPU_ExternalFunction::strip_comments(const string& s) const
{
stringstream out;
for (size_t i = 0; i < s.size(); i++)
{
if (i < s.size() - 2)
{
if (s[i] == '/' && s[i + 1] == '/')
{
// line comment
i += 2;
while (s[i] != '\n')
{
i++;
}
out << '\n';
}
else if (s[i] == '/' && s[i + 1] == '*')
{
// multi-line comment
i += 2;
while (!(s[i] == '*' && s[i + 1] == '/'))
{
i++;
}
i++;
}
else
{
out << s[i];
}
}
else
{
out << s[i];
}
}
return out.str();
}
......@@ -83,6 +83,13 @@ namespace ngraph
const Node&,
const std::unordered_map<descriptor::TensorView*, std::vector<size_t>>&);
void release_function() { m_function = nullptr; }
std::string emit_op_as_function(const Node& node, const std::string& function_name);
std::string strip_comments(const std::string& s) const;
bool is_functionally_identical(
const Node& n1,
const Node& n2,
const std::unordered_map<const Node*, std::string>& node_cache) const;
std::unique_ptr<codegen::Compiler> m_compiler;
std::unique_ptr<codegen::ExecutionEngine> m_execution_engine;
bool m_emit_timing;
......