Commit bb94fa85 authored by Chris Sullivan, committed by Scott Cyphers

softmax & convolution memory primitive caching (#1290)

* Updated softmax.

* Formatting.

* Updated convolution.

* Use build_primitive overloading. Add helper to emit type_string given a node.

* Formatting.

* Update ConvolutionBackpropData.

* convolution backprop & max pool memory primitive caching (#1303)

* Updated ConvolutionBackpropFilters.
* Update MaxPool.

* Update Max and Min. (#1307)
parent eba9439b
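
The pattern applied throughout this change is hash-keyed primitive caching: each build_primitive overload derives a string key from the node's types and shapes, returns an already-built primitive index if that key is cached, and otherwise builds the primitive, inserts it, and caches the index. A minimal sketch of the pattern, mirroring the member calls used in the diff below (op::SomeOp and the kernel_launch placeholder are illustrative, not part of the change):

size_t build_primitive(const op::SomeOp* node) // hypothetical op type, for illustration only
{
    // key the primitive on everything that affects code generation
    std::stringstream ss;
    ss << "some_op_" << runtime::gpu::kernel::emit_type_string(node);
    std::string hash = ss.str();
    // reuse an existing primitive if one was already built for this key
    size_t primitive_index = m_primitive_emitter->lookup(hash);
    if (primitive_index != std::numeric_limits<size_t>::max())
    {
        return primitive_index;
    }
    std::unique_ptr<gpu::primitive> kernel_launch; // populated with the op's kernel(s)
    // ... build kernel_launch for this node (see the functions below) ...
    primitive_index = m_primitive_emitter->insert(std::move(kernel_launch));
    m_primitive_emitter->cache(hash, primitive_index);
    return primitive_index;
}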
......@@ -22,10 +22,13 @@
#include "ngraph/codegen/code_writer.hpp"
#include "ngraph/runtime/gpu/cuda_emitter.hpp"
#include "ngraph/runtime/gpu/gpu_cuda_kernel_builder.hpp"
#include "ngraph/runtime/gpu/gpu_emitter.hpp"
#include "ngraph/runtime/gpu/gpu_invoke.hpp"
#include "ngraph/runtime/gpu/gpu_kernel_emitters.hpp"
#include "ngraph/runtime/gpu/gpu_primitive_emitter.hpp"
#include "ngraph/runtime/gpu/gpu_runtime_context.hpp"
#include "ngraph/runtime/gpu/gpu_util.hpp"
#include "ngraph/runtime/gpu/type_info.hpp"
#include "ngraph/util.hpp"
using namespace ngraph;
......@@ -1153,6 +1156,160 @@ size_t runtime::gpu::CUDAEmitter::build_elementwise_n_to_1(const std::vector<std
return primitive_index;
}
size_t runtime::gpu::CUDAEmitter::build_primitive(const op::MaxPool* node)
{
auto& args = node->get_inputs();
auto& out = node->get_outputs();
auto& input_shape = args[0].get_shape();
auto& result_shape = out[0].get_shape();
auto padding_below = node->get_padding_below();
auto padding_above = node->get_padding_above();
auto input_type = args[0].get_element_type().c_type_string();
auto output_type = out[0].get_element_type().c_type_string();
// construct hash to determine if kernel needs to be emitted
// or if it already exists in the primitive list
std::stringstream ss;
ss << "max_pool_" << runtime::gpu::kernel::emit_type_string(node) << "_i"
<< join(input_shape, "_") << "_o" << join(result_shape, "_") << "_ws"
<< join(node->get_window_shape(), "_") << "_wst"
<< join(node->get_window_movement_strides(), "_") << "_pb" << join(padding_below, "_")
<< "_pb" << join(padding_above, "_");
std::string hash = ss.str();
// check if the requested kernel is already an inserted primitive
size_t primitive_index = m_primitive_emitter->lookup(hash);
if (primitive_index != std::numeric_limits<size_t>::max())
{
return primitive_index;
}
/// asymmetric padding detection
bool pad_required = false;
auto shape_to_pool =
runtime::gpu::get_padded_shape(input_shape, padding_below, padding_above, {});
if (shape_to_pool != input_shape)
{
pad_required = true;
}
pad_required = pad_required && (padding_below != padding_above);
// asymmetric padding
size_t idx_workspace = std::numeric_limits<size_t>::max();
size_t pad_index = std::numeric_limits<size_t>::max();
if (pad_required)
{
auto temp_size = shape_size(shape_to_pool) * args[0].get_element_type().size();
GPUAllocator allocator = m_primitive_emitter->get_memory_allocator();
idx_workspace = allocator.reserve_workspace(temp_size);
auto pad_value = TypeInfo::Get(args[0].get_element_type())->lowest();
pad_index = build_pad({{input_type, output_type}},
input_shape,
shape_to_pool,
padding_below,
padding_above,
Shape{},
pad_value);
}
/// end asymmetric padding detection
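// the CUDA max pool kernel is 1d: only the innermost window extent and stride are used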
size_t max_pool_index = build_1d_max_pool({{input_type, output_type}},
input_shape,
result_shape,
node->get_window_shape().back(),
node->get_window_movement_strides().back());
std::unique_ptr<gpu::primitive> kernel_launch(
new gpu::primitive{[=](void** inputs, void** outputs) mutable {
if (idx_workspace != std::numeric_limits<size_t>::max() &&
pad_index != std::numeric_limits<size_t>::max())
{
void* pad_buffer = runtime::gpu::invoke_memory_primitive(m_ctx, idx_workspace);
gpu::invoke_primitive(m_ctx,
pad_index,
std::vector<void*>{inputs[0]}.data(),
std::vector<void*>{pad_buffer}.data());
gpu::invoke_primitive(
m_ctx, max_pool_index, std::vector<void*>{pad_buffer}.data(), outputs);
}
else
{
gpu::invoke_primitive(m_ctx, max_pool_index, inputs, outputs);
}
}});
primitive_index = this->m_primitive_emitter->insert(std::move(kernel_launch));
m_primitive_emitter->cache(hash, primitive_index);
return primitive_index;
}
size_t runtime::gpu::CUDAEmitter::build_primitive(const op::Softmax* node)
{
auto& args = node->get_inputs();
auto& out = node->get_outputs();
auto tensor_shape = args[0].get_shape();
auto axes = node->get_axes();
std::stringstream ss;
ss << "softmax_" << runtime::gpu::kernel::emit_type_string(node) << "_s"
<< join(tensor_shape, "_") << "_ra" << join(axes, "_");
auto hash = ss.str();
size_t primitive_index = m_primitive_emitter->lookup(hash);
if (primitive_index != std::numeric_limits<size_t>::max())
{
return primitive_index;
}
// build composite primitive
// reserve a temporary buffer for the intermediate reduction
GPUAllocator allocator = this->m_primitive_emitter->get_memory_allocator();
auto reduced_shape = tensor_shape;
for (auto const& axis : axes)
{
reduced_shape[axis] = 1;
}
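// the workspace holds one partial sum (the softmax denominator) per reduced output element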
size_t reduced_size = shape_size(reduced_shape);
size_t workspace_idx =
allocator.reserve_workspace(reduced_size * out[0].get_element_type().size());
// exponentiate with fused sum reduction to calculate softmax denominator
auto input_type = args[0].get_element_type().c_type_string();
auto output_type = out[0].get_element_type().c_type_string();
size_t exp_sum_reduce = build_elementwise_collective<ngraph::op::Exp, ngraph::op::Add>(
{{input_type, output_type}}, tensor_shape, {}, axes, true /* multi-output */);
// inplace binary division with fused broadcast to calculate softmax
size_t div_broadcast = build_elementwise_collective<ngraph::op::Divide>(
std::vector<std::string>(3, output_type), tensor_shape, {1}, axes);
std::unique_ptr<gpu::primitive> kernel_launch(
new gpu::primitive{[=](void** inputs, void** outputs) mutable {
void* workspace = runtime::gpu::invoke_memory_primitive(m_ctx, workspace_idx);
// cache the elementwise result and the fused result (multi-output)
runtime::gpu::invoke_primitive(
m_ctx, exp_sum_reduce, inputs, std::vector<void*>{workspace, outputs[0]}.data());
runtime::gpu::invoke_primitive(
m_ctx, div_broadcast, std::vector<void*>{outputs[0], workspace}.data(), outputs);
}});
primitive_index = this->m_primitive_emitter->insert(std::move(kernel_launch));
m_primitive_emitter->cache(hash, primitive_index);
return primitive_index;
}
size_t
runtime::gpu::CUDAEmitter::build_fused_ew_to_collective(const std::vector<std::string>& dtypes,
GPUShape tensor_shape,
......@@ -1653,6 +1810,133 @@ size_t runtime::gpu::CUDAEmitter::build_broadcast(const std::array<std::string,
return primitive_index;
}
size_t runtime::gpu::CUDAEmitter::build_primitive(const op::Convolution* node)
{
std::stringstream ss;
ss << "convolution_fprop_" << runtime::gpu::kernel::emit_type_string(node);
auto& args = node->get_inputs();
auto& out = node->get_outputs();
auto input_shape = args[0].get_shape();
auto filter_shape = args[1].get_shape();
auto output_shape = out[0].get_shape();
auto tensor_size = input_shape.size();
// primitive cache parameters
ss << "_s" << join(input_shape, "_") << "_pb" << join(node->get_padding_below(), "_") << "_pi"
<< join(node->get_data_dilation_strides(), "_") << "_fs" << join(filter_shape, "_") << "_fst"
<< join(node->get_window_movement_strides(), "_") << "_fdi"
<< join(node->get_window_dilation_strides(), "_");
auto hash = ss.str();
// check if the requested primitive is already built
size_t primitive_index = m_primitive_emitter->lookup(hash);
if (primitive_index != std::numeric_limits<size_t>::max())
{
return primitive_index;
}
// Reshape from NC{d1,..,dn} -> C{d1,...,dn}N
// and from KC{df1,...,dfn} -> C{df1,...,dfn}N.
// TODO: This should be done via a pass similar to
// what is done for convolution in the IA transformer
// c.f runtime/cpu/pass/cpu_layout.cpp
GPUAllocator allocator = m_primitive_emitter->get_memory_allocator();
size_t transposed_data_idx =
allocator.reserve_workspace(shape_size(input_shape) * args[0].get_element_type().size());
size_t transposed_filter_idx =
allocator.reserve_workspace(shape_size(filter_shape) * args[1].get_element_type().size());
size_t transposed_output_idx =
allocator.reserve_workspace(shape_size(output_shape) * out[0].get_element_type().size());
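// input_order is the cyclic permutation {1, 2, ..., n-1, 0}: it moves the batch axis to the end (NC{d} -> C{d}N)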
GPUShape input_order;
for (int i = 1; i <= tensor_size; i++)
{
input_order.push_back(i % tensor_size);
}
size_t reshape_data_index = build_reshape(
{{args[0].get_element_type().c_type_string(), args[0].get_element_type().c_type_string()}},
input_shape,
input_order);
size_t reshape_filter_index = build_reshape(
{{args[1].get_element_type().c_type_string(), args[1].get_element_type().c_type_string()}},
filter_shape,
input_order);
// local helper to reshape tensor shape objects
auto reshape = [](const Shape& shape, const GPUShape& order) {
Shape output(shape.size(), 0);
for (size_t i = 0; i < shape.size(); i++)
{
output[i] = shape[order[i]];
}
return output;
};
// reorder axes of the input shape (NC{d_1,...,d_n} -> C{d_1,...,d_n}N)
input_shape = reshape(input_shape, input_order);
// reorder axes of the filter shape (KC{df_1,...,df_n} -> C{df_1,...,df_n}K)
filter_shape = reshape(filter_shape, input_order);
// reorder axes of the output shape (NK{do_1,...,do_n} -> K{do_1,...,do_n}N)
output_shape = reshape(output_shape, input_order);
size_t conv_index = build_convolution({{args[0].get_element_type().c_type_string(),
args[1].get_element_type().c_type_string(),
out[0].get_element_type().c_type_string()}},
input_shape,
node->get_padding_below(),
node->get_data_dilation_strides(),
filter_shape,
node->get_window_movement_strides(),
node->get_window_dilation_strides(),
output_shape);
// reshape output tensor (K{do_1,...,do_n}N -> NK{do_1,...,do_n})
input_order.clear();
input_order.push_back(static_cast<int>(tensor_size - 1));
for (int i = 0; i < tensor_size - 1; i++)
{
input_order.push_back(i);
}
size_t reshape_output_index = build_reshape(
{{args[1].get_element_type().c_type_string(), args[1].get_element_type().c_type_string()}},
output_shape,
input_order);
std::unique_ptr<gpu::primitive> kernel_launch(
new gpu::primitive{[=](void** inputs, void** outputs) mutable {
void* data = gpu::invoke_memory_primitive(m_ctx, transposed_data_idx);
void* filter = gpu::invoke_memory_primitive(m_ctx, transposed_filter_idx);
void* output = gpu::invoke_memory_primitive(m_ctx, transposed_output_idx);
gpu::invoke_primitive(m_ctx,
reshape_data_index,
std::vector<void*>{inputs[0]}.data(),
std::vector<void*>{data}.data());
gpu::invoke_primitive(m_ctx,
reshape_filter_index,
std::vector<void*>{inputs[1]}.data(),
std::vector<void*>{filter}.data());
gpu::invoke_primitive(m_ctx,
conv_index,
std::vector<void*>{data, filter}.data(),
std::vector<void*>{output}.data());
gpu::invoke_primitive(m_ctx,
reshape_output_index,
std::vector<void*>{output}.data(),
std::vector<void*>{outputs[0]}.data());
}});
primitive_index = this->m_primitive_emitter->insert(std::move(kernel_launch));
m_primitive_emitter->cache(hash, primitive_index);
return primitive_index;
}
size_t runtime::gpu::CUDAEmitter::build_convolution(const std::array<std::string, 3>& dtypes,
GPUShape input_shape,
GPUShape input_pad_below,
......@@ -1675,7 +1959,7 @@ size_t runtime::gpu::CUDAEmitter::build_convolution(const std::array<std::string
// coalescing and vectorization is maintained regardless of coordinate access
// (e.g. data and filter dilation).
std::string kernel_name = "convolution_fprop_c_nd_n" + join(dtypes, "_");
std::replace(kernel_name.begin(), kernel_name.end(), ' ', '_');
// prerequisites for kernel caching and building
......@@ -1693,20 +1977,6 @@ size_t runtime::gpu::CUDAEmitter::build_convolution(const std::array<std::string
kernel_name = kernel_name + "_n" + std::to_string(N) + "_k" + std::to_string(K) + "_fsz" +
std::to_string(filter_size) + "_r" + std::to_string(rank);
// primitive cache parameters
std::stringstream ss;
ss << kernel_name << "_s" << join(input_shape, "_") << "_pb" << join(input_pad_below, "_")
<< "_pi" << join(input_dilation, "_") << "_fs" << join(filter_shape, "_") << "_fst"
<< join(filter_stride, "_") << "_fdi" << join(filter_dilation, "_");
auto hash = ss.str();
// check if the requested kernel is already an inserted primitive
size_t primitive_index = m_primitive_emitter->lookup(hash);
if (primitive_index != std::numeric_limits<size_t>::max())
{
return primitive_index;
}
// tiling options are determined by
// batch size (N) and number of filters (K)
int reg_tile_size = 1;
......@@ -1904,9 +2174,7 @@ size_t runtime::gpu::CUDAEmitter::build_convolution(const std::array<std::string
debug_sync();
}});
primitive_index = this->m_primitive_emitter->insert(std::move(conv));
m_primitive_emitter->cache(hash, primitive_index);
return primitive_index;
return this->m_primitive_emitter->insert(std::move(conv));
}
void runtime::gpu::CUDAEmitter::print_tensor_from_gpu(codegen::CodeWriter& writer,
......
......@@ -22,6 +22,10 @@
#include "ngraph/runtime/gpu/gpu_shape.hpp"
#include "ngraph/strides.hpp"
#include "ngraph/op/convolution.hpp"
#include "ngraph/op/max_pool.hpp"
#include "ngraph/op/softmax.hpp"
namespace ngraph
{
class GPUShape;
......@@ -37,6 +41,11 @@ namespace ngraph
{
friend class GPUPrimitiveEmitter;
public:
size_t build_primitive(const op::Softmax* node);
size_t build_primitive(const op::Convolution* node);
size_t build_primitive(const op::MaxPool* node);
public:
size_t build_pad(const std::array<std::string, 2>& dtypes,
GPUShape input_shape,
......
......@@ -20,10 +20,12 @@
#include "ngraph/log.hpp"
#include "ngraph/runtime/gpu/cudnn_emitter.hpp"
#include "ngraph/runtime/gpu/gpu_emitter.hpp"
#include "ngraph/runtime/gpu/gpu_invoke.hpp"
#include "ngraph/runtime/gpu/gpu_primitive_emitter.hpp"
#include "ngraph/runtime/gpu/gpu_runtime_context.hpp"
#include "ngraph/runtime/gpu/gpu_util.hpp"
#include "ngraph/runtime/gpu/type_info.hpp"
#include "ngraph/util.hpp"
using namespace ngraph;
......@@ -327,22 +329,538 @@ cudnnConvolutionDescriptor_t& runtime::gpu::CUDNNEmitter::get_cudnn_convolution_
return conv_descriptor;
}
size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::Convolution* node)
{
auto& args = node->get_inputs();
auto& out = node->get_outputs();
auto input_shape = args[0].get_shape();
auto filter_shape = args[1].get_shape();
auto output_shape = out[0].get_shape();
Strides window_dilation_strides = node->get_window_dilation_strides();
Strides window_movement_strides = node->get_window_movement_strides();
Strides data_dilation_strides = node->get_data_dilation_strides();
CoordinateDiff padding_below_diff = node->get_padding_below();
CoordinateDiff padding_above_diff = node->get_padding_above();
auto dtype = out[0].get_element_type().c_type_string();
// construct hash to determine if kernel needs to be emitted
// or if it already exists in the primitive list
std::stringstream ss;
ss << "convolution_op_" << dtype << "_i" << join(input_shape, "_") << "_w"
<< join(filter_shape, "_") << "_o" << join(output_shape, "_") << "_ws"
<< join(window_movement_strides, "_") << "_wd" << join(window_dilation_strides, "_") << "_p"
<< join(padding_below_diff, "_");
std::string hash = ss.str();
// check if the requested kernel is already an inserted primitive
size_t primitive_index = m_primitive_emitter->lookup(hash);
if (primitive_index != std::numeric_limits<size_t>::max())
{
return primitive_index;
}
bool is_deconvolution = false;
for (auto a : data_dilation_strides)
{
if (a != 1)
{
is_deconvolution = true;
break;
}
}
bool pad_required = (padding_below_diff != padding_above_diff);
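// cuDNN convolution descriptors only express symmetric padding, so asymmetric padding must be applied explicitly beforehand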
Shape padding_below(padding_below_diff.size(), 0);
Shape padding_above(padding_above_diff.size(), 0);
for (int i = 0; i < padding_below.size(); i++)
{
padding_below[i] = static_cast<size_t>(padding_below_diff[i]);
padding_above[i] = static_cast<size_t>(padding_above_diff[i]);
}
Shape input_shape_padded = input_shape;
Shape padding_interior(data_dilation_strides);
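// data dilation (deconvolution) is realized here by interior padding of the input before calling cuDNN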
size_t idx_workspace = std::numeric_limits<size_t>::max();
size_t pad_dynamic_index = std::numeric_limits<size_t>::max();
if (pad_required || is_deconvolution)
{
input_shape_padded = runtime::gpu::get_padded_shape(
input_shape, padding_below, padding_above, padding_interior);
auto temp_size = shape_size(input_shape_padded) * args[0].get_element_type().size();
GPUAllocator allocator = m_primitive_emitter->get_memory_allocator();
// reserve zero initialized workspace
idx_workspace = allocator.reserve_workspace(temp_size, true);
auto& cuda_emitter = m_primitive_emitter->get_cuda_emitter();
pad_dynamic_index =
cuda_emitter->build_pad_dynamic({{args[0].get_element_type().c_type_string(),
out[0].get_element_type().c_type_string()}},
input_shape,
input_shape_padded,
padding_below,
padding_interior);
// asymmetric padding has been applied, zero out padding vectors to
// ensure cudnn does not assume padding
std::fill(padding_below.begin(), padding_below.end(), 0);
}
size_t conv_index = build_convolution(dtype,
input_shape_padded,
filter_shape,
output_shape,
window_movement_strides,
window_dilation_strides,
padding_below);
std::unique_ptr<gpu::primitive> kernel_launch(
new gpu::primitive{[=](void** inputs, void** outputs) mutable {
if (idx_workspace != std::numeric_limits<size_t>::max() &&
pad_dynamic_index != std::numeric_limits<size_t>::max())
{
void* pad_buffer = runtime::gpu::invoke_memory_primitive(m_ctx, idx_workspace);
gpu::invoke_primitive(m_ctx,
pad_dynamic_index,
std::vector<void*>{inputs[0]}.data(),
std::vector<void*>{pad_buffer}.data());
gpu::invoke_primitive(
m_ctx, conv_index, std::vector<void*>{pad_buffer, inputs[1]}.data(), outputs);
}
else
{
gpu::invoke_primitive(m_ctx, conv_index, inputs, outputs);
}
}});
primitive_index = this->m_primitive_emitter->insert(std::move(kernel_launch));
m_primitive_emitter->cache(hash, primitive_index);
return primitive_index;
}
size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::ConvolutionBackpropData* node)
{
auto& args = node->get_inputs();
auto& out = node->get_outputs();
auto input_shape = args[0].get_shape();
auto filter_shape = args[1].get_shape();
auto output_shape = out[0].get_shape();
Strides window_dilation_strides = node->get_window_dilation_strides_forward();
Strides window_movement_strides = node->get_window_movement_strides_forward();
Strides data_dilation_strides = node->get_data_dilation_strides_forward();
CoordinateDiff padding_below_diff = node->get_padding_below_forward();
CoordinateDiff padding_above_diff = node->get_padding_above_forward();
auto input_type = args[0].get_element_type().c_type_string();
auto output_type = out[0].get_element_type().c_type_string();
// construct hash to determine if kernel needs to be emitted
// or if it already exists in the primitive list
std::stringstream ss;
ss << "convolution_bp_data_op_" << output_type << "_i" << join(input_shape, "_") << "_w"
<< join(filter_shape, "_") << "_o" << join(output_shape, "_") << "_ws"
<< join(window_movement_strides, "_") << "_wd" << join(window_dilation_strides, "_") << "_p"
<< join(padding_below_diff, "_");
std::string hash = ss.str();
// check if the requested kernel is already an inserted primitive
size_t primitive_index = m_primitive_emitter->lookup(hash);
if (primitive_index != std::numeric_limits<size_t>::max())
{
return primitive_index;
}
bool is_deconvolution = false;
for (auto a : data_dilation_strides)
{
if (a != 1)
{
is_deconvolution = true;
break;
}
}
bool pad_required = (padding_below_diff != padding_above_diff);
Shape padding_below(padding_below_diff.size(), 0);
Shape padding_above(padding_above_diff.size(), 0);
for (int i = 0; i < padding_below.size(); i++)
{
padding_below[i] = static_cast<size_t>(padding_below_diff[i]);
padding_above[i] = static_cast<size_t>(padding_above_diff[i]);
}
auto output_shape_padded = output_shape;
Shape padding_below_back(output_shape.size(), 0);
Shape padding_interior_back(output_shape.size(), 1);
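// offsets and strides used below to slice the true output back out of the padded (and dilated) result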
size_t i = padding_below_back.size() - padding_below.size();
size_t j = 0;
for (; i < padding_below_back.size(); i++)
{
padding_below_back[i] = padding_below[j];
padding_interior_back[i] = data_dilation_strides[j];
j++;
}
Shape padding_interior(data_dilation_strides);
size_t idx_workspace = std::numeric_limits<size_t>::max();
size_t pad_dynamic_index = std::numeric_limits<size_t>::max();
size_t slice_index = std::numeric_limits<size_t>::max();
if (pad_required || is_deconvolution)
{
output_shape_padded =
get_padded_shape(output_shape, padding_below, padding_above, padding_interior);
auto temp_size = shape_size(output_shape_padded) * args[0].get_element_type().size();
GPUAllocator allocator = m_primitive_emitter->get_memory_allocator();
// reserve zero initialized workspace
idx_workspace = allocator.reserve_workspace(temp_size, true);
auto& cuda_emitter = m_primitive_emitter->get_cuda_emitter();
pad_dynamic_index = cuda_emitter->build_pad_dynamic({{input_type, output_type}},
output_shape,
output_shape_padded,
padding_below,
padding_interior);
slice_index = cuda_emitter->build_slice({{input_type, output_type}},
output_shape_padded,
padding_below_back,
padding_interior_back,
output_shape);
// asymmetric padding has been applied, zero out padding vectors to
// ensure cudnn does not assume padding
std::fill(padding_below.begin(), padding_below.end(), 0);
}
size_t conv_index = build_convolution_backward_data(output_type,
args[0].get_shape(),
args[1].get_shape(),
output_shape_padded,
window_movement_strides,
window_dilation_strides,
padding_below);
std::unique_ptr<gpu::primitive> kernel_launch(new gpu::primitive{[=](void** inputs,
void** outputs) mutable {
if (idx_workspace != std::numeric_limits<size_t>::max() &&
pad_dynamic_index != std::numeric_limits<size_t>::max() &&
slice_index != std::numeric_limits<size_t>::max())
{
void* pad_buffer = runtime::gpu::invoke_memory_primitive(m_ctx, idx_workspace);
gpu::invoke_primitive(m_ctx,
pad_dynamic_index,
std::vector<void*>{inputs[0]}.data(),
std::vector<void*>{pad_buffer}.data());
gpu::invoke_primitive(m_ctx, conv_index, inputs, std::vector<void*>{pad_buffer}.data());
gpu::invoke_primitive(
m_ctx, slice_index, std::vector<void*>{pad_buffer}.data(), outputs);
}
else
{
gpu::invoke_primitive(m_ctx, conv_index, inputs, outputs);
}
}});
primitive_index = this->m_primitive_emitter->insert(std::move(kernel_launch));
m_primitive_emitter->cache(hash, primitive_index);
return primitive_index;
}
size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::ConvolutionBackpropFilters* node)
{
auto& args = node->get_inputs();
auto& out = node->get_outputs();
auto input_shape_0 = args[0].get_shape();
auto input_shape_1 = args[1].get_shape();
auto filter_shape = out[0].get_shape();
Strides window_dilation_strides = node->get_window_dilation_strides_forward();
Strides window_movement_strides = node->get_window_movement_strides_forward();
Strides data_dilation_strides = node->get_data_dilation_strides_forward();
CoordinateDiff padding_below_diff = node->get_padding_below_forward();
CoordinateDiff padding_above_diff = node->get_padding_above_forward();
auto input_type = args[0].get_element_type().c_type_string();
auto output_type = out[0].get_element_type().c_type_string();
// construct hash to determine if kernel needs to be emitted
// or if it already exists in the primitive list
std::stringstream ss;
ss << "convolution_bp_filter_op_" << output_type << "_i" << join(input_shape_0, "_") << "_w"
<< join(filter_shape, "_") << "_o" << join(input_shape_1, "_") << "_ws"
<< join(window_movement_strides, "_") << "_wd" << join(window_dilation_strides, "_") << "_p"
<< join(padding_below_diff, "_");
std::string hash = ss.str();
// check if the requested kernel is already an inserted primitive
size_t primitive_index = m_primitive_emitter->lookup(hash);
if (primitive_index != std::numeric_limits<size_t>::max())
{
return primitive_index;
}
bool is_deconvolution = false;
for (auto a : data_dilation_strides)
{
if (a != 1)
{
is_deconvolution = true;
break;
}
}
bool pad_required = (padding_below_diff != padding_above_diff);
Shape padding_below(padding_below_diff.size(), 0);
Shape padding_above(padding_above_diff.size(), 0);
for (int i = 0; i < padding_below.size(); i++)
{
padding_below[i] = static_cast<size_t>(padding_below_diff[i]);
padding_above[i] = static_cast<size_t>(padding_above_diff[i]);
}
auto input_shape_padded = input_shape_0;
Shape padding_interior(data_dilation_strides);
size_t idx_workspace = std::numeric_limits<size_t>::max();
size_t pad_dynamic_index = std::numeric_limits<size_t>::max();
if (pad_required || is_deconvolution)
{
input_shape_padded = runtime::gpu::get_padded_shape(
input_shape_0, padding_below, padding_above, padding_interior);
auto temp_size = shape_size(input_shape_padded) * args[0].get_element_type().size();
GPUAllocator allocator = m_primitive_emitter->get_memory_allocator();
// reserve zero initialized workspace
idx_workspace = allocator.reserve_workspace(temp_size, true);
auto& cuda_emitter = m_primitive_emitter->get_cuda_emitter();
pad_dynamic_index = cuda_emitter->build_pad_dynamic({{input_type, output_type}},
input_shape_0,
input_shape_padded,
padding_below,
padding_interior);
// asymmetric padding has been applied, zero out padding vectors to
// ensure cudnn does not assume padding
std::fill(padding_below.begin(), padding_below.end(), 0);
}
size_t conv_index = build_convolution_backward_filter(output_type,
input_shape_padded,
input_shape_1,
filter_shape,
window_movement_strides,
window_dilation_strides,
padding_below);
std::unique_ptr<gpu::primitive> kernel_launch(
new gpu::primitive{[=](void** inputs, void** outputs) mutable {
if (idx_workspace != std::numeric_limits<size_t>::max() &&
pad_dynamic_index != std::numeric_limits<size_t>::max())
{
void* pad_buffer = runtime::gpu::invoke_memory_primitive(m_ctx, idx_workspace);
gpu::invoke_primitive(m_ctx,
pad_dynamic_index,
std::vector<void*>{inputs[0]}.data(),
std::vector<void*>{pad_buffer}.data());
gpu::invoke_primitive(
m_ctx, conv_index, std::vector<void*>{pad_buffer, inputs[1]}.data(), outputs);
}
else
{
gpu::invoke_primitive(m_ctx, conv_index, inputs, outputs);
}
}});
primitive_index = this->m_primitive_emitter->insert(std::move(kernel_launch));
m_primitive_emitter->cache(hash, primitive_index);
return primitive_index;
}
size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::MaxPool* node)
{
auto& args = node->get_inputs();
auto& out = node->get_outputs();
auto& input_shape = args[0].get_shape();
auto& result_shape = out[0].get_shape();
auto padding_below = node->get_padding_below();
auto padding_above = node->get_padding_above();
auto input_type = args[0].get_element_type().c_type_string();
auto output_type = out[0].get_element_type().c_type_string();
// construct hash to determine if kernel needs to be emitted
// or if it already exists in the primitive list
std::stringstream ss;
ss << "max_pool_" << output_type << "_i" << join(input_shape, "_") << "_o"
<< join(result_shape, "_") << "_ws" << join(node->get_window_shape(), "_") << "_wst"
<< join(node->get_window_movement_strides(), "_") << "_pb" << join(padding_below, "_")
<< "_pb" << join(padding_above, "_");
std::string hash = ss.str();
// check if the requested kernel is already an inserted primitive
size_t primitive_index = m_primitive_emitter->lookup(hash);
if (primitive_index != std::numeric_limits<size_t>::max())
{
return primitive_index;
}
/// asymmetric padding detection
bool pad_required = false;
auto shape_to_pool =
runtime::gpu::get_padded_shape(input_shape, padding_below, padding_above, {});
if (shape_to_pool != input_shape)
{
pad_required = true;
}
pad_required = pad_required && (padding_below != padding_above);
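// symmetric padding is handled by the cuDNN pooling descriptor itself; only asymmetric padding needs an explicit pad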
// asymmetric padding
size_t idx_workspace = std::numeric_limits<size_t>::max();
size_t pad_index = std::numeric_limits<size_t>::max();
if (pad_required)
{
auto temp_size = shape_size(shape_to_pool) * args[0].get_element_type().size();
GPUAllocator allocator = m_primitive_emitter->get_memory_allocator();
idx_workspace = allocator.reserve_workspace(temp_size);
auto pad_value = TypeInfo::Get(args[0].get_element_type())->lowest();
auto& cuda_emitter = m_primitive_emitter->get_cuda_emitter();
pad_index = cuda_emitter->build_pad({{input_type, output_type}},
input_shape,
shape_to_pool,
padding_below,
padding_above,
Shape{},
pad_value);
// asymmetric padding has been applied, zero out padding vectors to
// ensure cuDNN does not assume padding during pooling
std::fill(padding_below.begin(), padding_below.end(), 0);
std::fill(padding_above.begin(), padding_above.end(), 0);
}
/// end asymmetric padding detection
size_t max_pool_index = build_pooling(CUDNN_POOLING_MAX,
output_type,
CUDNNEmitter::Prop::Forward,
shape_to_pool,
result_shape,
node->get_window_movement_strides(),
node->get_window_shape(),
padding_below,
padding_above);
std::unique_ptr<gpu::primitive> kernel_launch(
new gpu::primitive{[=](void** inputs, void** outputs) mutable {
if (idx_workspace != std::numeric_limits<size_t>::max() &&
pad_index != std::numeric_limits<size_t>::max())
{
void* pad_buffer = runtime::gpu::invoke_memory_primitive(m_ctx, idx_workspace);
gpu::invoke_primitive(m_ctx,
pad_index,
std::vector<void*>{inputs[0]}.data(),
std::vector<void*>{pad_buffer}.data());
gpu::invoke_primitive(
m_ctx, max_pool_index, std::vector<void*>{pad_buffer}.data(), outputs);
}
else
{
gpu::invoke_primitive(m_ctx, max_pool_index, inputs, outputs);
}
}});
primitive_index = this->m_primitive_emitter->insert(std::move(kernel_launch));
m_primitive_emitter->cache(hash, primitive_index);
return primitive_index;
}
size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::Max* node)
{
auto& args = node->get_inputs();
auto& out = node->get_outputs();
auto& input_shape = args[0].get_shape();
auto& output_shape = out[0].get_shape();
auto input_size = shape_size(input_shape);
auto output_size = shape_size(output_shape);
auto output_element_size = out[0].get_element_type().size();
auto output_type = out[0].get_element_type().c_type_string();
std::stringstream ss;
ss << "max_" << output_type << "_i" << join(input_shape, "_") << "_ra"
<< join(node->get_reduction_axes(), "_");
std::string hash = ss.str();
// check if the requested kernel is already an inserted primitive
size_t primitive_index = m_primitive_emitter->lookup(hash);
if (primitive_index != std::numeric_limits<size_t>::max())
{
return primitive_index;
}
std::unique_ptr<gpu::primitive> kernel_launch;
// one of the input axes has zero size; fill the output with -inf, the identity element of max
if (input_size == 0)
{
GPUAllocator allocator = m_primitive_emitter->get_memory_allocator();
std::vector<float> negative_inf(output_size, -std::numeric_limits<float>::infinity());
size_t idx_float_inf =
allocator.reserve_argspace(negative_inf.data(), negative_inf.size() * sizeof(float));
kernel_launch.reset(new gpu::primitive{[=](void** inputs, void** outputs) mutable {
void* temp_d = runtime::gpu::invoke_memory_primitive(m_ctx, idx_float_inf);
runtime::gpu::cuda_memcpyDtD(outputs[0], temp_d, output_size * output_element_size);
}});
}
else if (input_size == output_size)
{
// no reduction
kernel_launch.reset(new gpu::primitive{[=](void** inputs, void** outputs) mutable {
runtime::gpu::cuda_memcpyDtD(outputs[0], inputs[0], output_size * output_element_size);
}});
}
else
{
auto& cudnn_emitter = m_primitive_emitter->get_cudnn_emitter();
auto max_index = cudnn_emitter->build_reduce_forward(
CUDNN_REDUCE_TENSOR_MAX, output_type, input_shape, node->get_reduction_axes());
kernel_launch.reset(new gpu::primitive{[=](void** inputs, void** outputs) mutable {
gpu::invoke_primitive(m_ctx, max_index, inputs, outputs);
}});
}
primitive_index = this->m_primitive_emitter->insert(std::move(kernel_launch));
m_primitive_emitter->cache(hash, primitive_index);
return primitive_index;
}
size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::Min* node)
{
auto& args = node->get_inputs();
auto& out = node->get_outputs();
auto& input_shape = args[0].get_shape();
auto& output_shape = out[0].get_shape();
auto input_size = shape_size(input_shape);
auto output_size = shape_size(output_shape);
auto output_element_size = out[0].get_element_type().size();
auto output_type = out[0].get_element_type().c_type_string();
std::stringstream ss;
ss << "min_" << output_type << "_i" << join(input_shape, "_") << "_ra"
<< join(node->get_reduction_axes(), "_");
std::string hash = ss.str();
// check if the requested kernel is already an inserted primitive
size_t primitive_index = m_primitive_emitter->lookup(hash);
if (primitive_index != std::numeric_limits<size_t>::max())
......@@ -350,6 +868,52 @@ size_t runtime::gpu::CUDNNEmitter::build_convolution(const std::string& dtype,
return primitive_index;
}
std::unique_ptr<gpu::primitive> kernel_launch;
// one of the input axes has zero size; fill the output with +inf, the identity element of min
if (input_size == 0)
{
GPUAllocator allocator = m_primitive_emitter->get_memory_allocator();
std::vector<float> positive_inf(output_size, std::numeric_limits<float>::infinity());
size_t idx_float_inf =
allocator.reserve_argspace(positive_inf.data(), positive_inf.size() * sizeof(float));
kernel_launch.reset(new gpu::primitive{[=](void** inputs, void** outputs) mutable {
void* temp_d = runtime::gpu::invoke_memory_primitive(m_ctx, idx_float_inf);
runtime::gpu::cuda_memcpyDtD(outputs[0], temp_d, output_size * output_element_size);
}});
}
else if (input_size == output_size)
{
// no reduction
kernel_launch.reset(new gpu::primitive{[=](void** inputs, void** outputs) mutable {
runtime::gpu::cuda_memcpyDtD(outputs[0], inputs[0], output_size * output_element_size);
}});
}
else
{
auto& cudnn_emitter = m_primitive_emitter->get_cudnn_emitter();
auto min_index = cudnn_emitter->build_reduce_forward(
CUDNN_REDUCE_TENSOR_MIN, output_type, input_shape, node->get_reduction_axes());
kernel_launch.reset(new gpu::primitive{[=](void** inputs, void** outputs) mutable {
gpu::invoke_primitive(m_ctx, min_index, inputs, outputs);
}});
}
primitive_index = this->m_primitive_emitter->insert(std::move(kernel_launch));
m_primitive_emitter->cache(hash, primitive_index);
return primitive_index;
}
size_t runtime::gpu::CUDNNEmitter::build_convolution(const std::string& dtype,
const Shape& input_tensor_shape,
const Shape& input_filter_shape,
const Shape& output_tensor_shape,
const Strides& window_movement_strides,
const Strides& window_dilation_strides,
const Shape& padding_below)
{
cudnnDataType_t data_type = get_cudnn_datatype(dtype);
const cudnnTensorFormat_t tensor_format = CUDNN_TENSOR_NCHW;
const cudnnConvolutionMode_t mode = CUDNN_CROSS_CORRELATION;
......@@ -399,9 +963,7 @@ size_t runtime::gpu::CUDNNEmitter::build_convolution(const std::string& dtype,
debug_sync();
}});
primitive_index = this->m_primitive_emitter->insert(std::move(conv));
m_primitive_emitter->cache(hash, primitive_index);
return primitive_index;
return this->m_primitive_emitter->insert(std::move(conv));
}
size_t runtime::gpu::CUDNNEmitter::build_convolution_backward_data(
......@@ -413,20 +975,6 @@ size_t runtime::gpu::CUDNNEmitter::build_convolution_backward_data(
const Strides& window_dilation_strides,
const Shape& padding_below)
{
// construct hash to determine if kernel needs to be emitted
// or if it already exists in the primitive list
std::stringstream ss;
ss << "convolution_bp_data_op_" << dtype << "_i" << join(input_tensor_shape, "_") << "_w"
<< join(input_filter_shape, "_") << "_o" << join(output_tensor_shape, "_") << "_ws"
<< join(window_movement_strides, "_") << "_wd" << join(window_dilation_strides, "_") << "_p"
<< join(padding_below, "_");
std::string hash = ss.str();
// check if the requested kernel is already an inserted primitive
size_t primitive_index = m_primitive_emitter->lookup(hash);
if (primitive_index != std::numeric_limits<size_t>::max())
{
return primitive_index;
}
const cudnnDataType_t data_type = get_cudnn_datatype(dtype);
const cudnnTensorFormat_t tensor_format = CUDNN_TENSOR_NCHW;
const cudnnConvolutionMode_t mode = CUDNN_CROSS_CORRELATION;
......@@ -476,9 +1024,7 @@ size_t runtime::gpu::CUDNNEmitter::build_convolution_backward_data(
debug_sync();
}});
primitive_index = this->m_primitive_emitter->insert(std::move(conv));
m_primitive_emitter->cache(hash, primitive_index);
return primitive_index;
return this->m_primitive_emitter->insert(std::move(conv));
}
size_t runtime::gpu::CUDNNEmitter::build_convolution_backward_filter(
......@@ -490,21 +1036,6 @@ size_t runtime::gpu::CUDNNEmitter::build_convolution_backward_filter(
const Strides& window_dilation_strides,
const Shape& padding_below)
{
// construct hash to determine if kernel needs to be emitted
// or if it already exists in the primitive list
std::stringstream ss;
ss << "convolution_bp_filter_op_" << dtype << "_i" << join(input_tensor_shape_0, "_") << "_w"
<< join(output_filter_shape, "_") << "_o" << join(input_tensor_shape_1, "_") << "_ws"
<< join(window_movement_strides, "_") << "_wd" << join(window_dilation_strides, "_") << "_p"
<< join(padding_below, "_");
std::string hash = ss.str();
// check if the requested kernel is already an inserted primitive
size_t primitive_index = m_primitive_emitter->lookup(hash);
if (primitive_index != std::numeric_limits<size_t>::max())
{
return primitive_index;
}
const cudnnDataType_t data_type = get_cudnn_datatype(dtype);
const cudnnTensorFormat_t tensor_format = CUDNN_TENSOR_NCHW;
const cudnnConvolutionMode_t mode = CUDNN_CROSS_CORRELATION;
......@@ -554,9 +1085,8 @@ size_t runtime::gpu::CUDNNEmitter::build_convolution_backward_filter(
outputs[0]));
debug_sync();
}});
primitive_index = this->m_primitive_emitter->insert(std::move(conv));
m_primitive_emitter->cache(hash, primitive_index);
return primitive_index;
return this->m_primitive_emitter->insert(std::move(conv));
}
size_t runtime::gpu::CUDNNEmitter::build_pooling(const cudnnPoolingMode_t& pool_op,
......@@ -591,7 +1121,10 @@ size_t runtime::gpu::CUDNNEmitter::build_pooling(const cudnnPoolingMode_t& pool_
auto& input_desc = tensor_descriptor_from_shape(input_shape, data_type, tensor_format);
auto& output_desc = tensor_descriptor_from_shape(output_shape, data_type, tensor_format);
if (input_shape.size() == 3)
{
}
else if (input_shape.size() == 4)
{
CUDNN_SAFE_CALL(cudnnSetPooling2dDescriptor(desc,
pool_op,
......
......@@ -30,6 +30,11 @@
#include "ngraph/runtime/gpu/gpu_runtime_context.hpp"
#include "ngraph/shape.hpp"
#include "ngraph/op/convolution.hpp"
#include "ngraph/op/max.hpp"
#include "ngraph/op/max_pool.hpp"
#include "ngraph/op/min.hpp"
namespace ngraph
{
namespace runtime
......@@ -48,6 +53,14 @@ namespace ngraph
{
friend class GPUPrimitiveEmitter;
public:
size_t build_primitive(const op::Convolution* node);
size_t build_primitive(const op::ConvolutionBackpropData* node);
size_t build_primitive(const op::ConvolutionBackpropFilters* node);
size_t build_primitive(const op::MaxPool* node);
size_t build_primitive(const op::Max* node);
size_t build_primitive(const op::Min* node);
public:
enum class Prop
{
......
......@@ -144,190 +144,26 @@ namespace ngraph
auto convolution = static_cast<const ngraph::op::Convolution*>(node);
auto input_shape = args[0].get_shape();
auto filter_shape = args[1].get_shape();
auto output_shape = out[0].get_shape();
auto rank = input_shape.size();
Strides window_dilation_strides = convolution->get_window_dilation_strides();
Strides window_movement_strides = convolution->get_window_movement_strides();
Strides data_dilation_strides = convolution->get_data_dilation_strides();
CoordinateDiff padding_below_diff = convolution->get_padding_below();
CoordinateDiff padding_above_diff = convolution->get_padding_above();
if (padding_below_diff.size() > 3)
size_t conv_index = 0;
if (convolution->get_padding_below().size() > 3)
{
// Reshape from NC{d1,..,dn} -> C{d1,...,dn}N
// and from KC{df1,...,dfn} -> C{df1,...,dfn}N.
// TODO: This should be done via a pass similar to
// what is done for convolution in the IA transformer
// c.f runtime/cpu/pass/cpu_layout.cpp
GPUAllocator allocator =
external_function->get_primitive_emitter()->get_memory_allocator();
size_t transposed_data_idx = allocator.reserve_workspace(
args[0].get_size() * args[0].get_element_type().size());
size_t transposed_filter_idx = allocator.reserve_workspace(
args[1].get_size() * args[1].get_element_type().size());
size_t transposed_output_idx = allocator.reserve_workspace(
out[0].get_size() * out[0].get_element_type().size());
GPUShape input_order;
for (int i = 1; i <= rank; i++)
{
input_order.push_back(i % rank);
}
auto& cuda_emitter =
external_function->get_primitive_emitter()->get_cuda_emitter();
size_t reshape_data_index = cuda_emitter->build_reshape(
{{args[0].get_type(), args[0].get_type()}}, input_shape, input_order);
writer << "void* data = gpu::invoke_memory_primitive(ctx, "
<< transposed_data_idx << ");\n";
writer << "gpu::invoke_primitive(ctx, " << reshape_data_index << ", ";
writer << "std::vector<void*>{" << args[0].get_name() << "}.data(), ";
writer << "std::vector<void*>{data}.data());\n";
size_t reshape_filter_index = cuda_emitter->build_reshape(
{{args[1].get_type(), args[1].get_type()}}, filter_shape, input_order);
writer << "void* filter = gpu::invoke_memory_primitive(ctx, "
<< transposed_filter_idx << ");\n";
writer << "gpu::invoke_primitive(ctx, " << reshape_filter_index << ", ";
writer << "std::vector<void*>{" << args[1].get_name() << "}.data(), ";
writer << "std::vector<void*>{filter}.data());\n";
// local helper to reshape tensor shape objects
auto reshape = [](const Shape& shape, const GPUShape& order) {
Shape output(shape.size(), 0);
for (size_t i = 0; i < shape.size(); i++)
{
output[i] = shape[order[i]];
}
return output;
};
// reorder axes of the input shape (NC{d_1,...,d_n} -> C{d_1,...,d_n}N)
input_shape = reshape(input_shape, input_order);
// reorder axes of the filter shape (KC{df_1,...,df_n} -> C{df_1,...,df_n}K)
filter_shape = reshape(filter_shape, input_order);
// reorder axes of the output shape (NK{do_1,...,do_n} -> K{do_1,...,do_n}N)
output_shape = reshape(output_shape, input_order);
size_t conv_index = cuda_emitter->build_convolution(
{{args[0].get_type(), args[1].get_type(), out[0].get_type()}},
input_shape,
padding_below_diff,
data_dilation_strides,
filter_shape,
window_movement_strides,
window_dilation_strides,
output_shape);
writer << "void* output = gpu::invoke_memory_primitive(ctx, "
<< transposed_output_idx << ");\n";
writer << "gpu::invoke_primitive(ctx, " << conv_index << ", ";
writer << "std::vector<void*>{data, filter}.data(), ";
writer << "std::vector<void*>{output}.data());\n";
// reshape output tensor (K{do_1,...,do_n}N -> NK{do_1,...,do_n})
input_order.clear();
input_order.push_back(static_cast<int>(rank - 1));
for (int i = 0; i < rank - 1; i++)
{
input_order.push_back(i);
}
size_t reshape_output_index = cuda_emitter->build_reshape(
{{args[1].get_type(), args[1].get_type()}}, output_shape, input_order);
writer << "gpu::invoke_primitive(ctx, " << reshape_output_index << ", ";
writer << "std::vector<void*>{output}.data(), ";
writer << "std::vector<void*>{" << out[0].get_name() << "}.data());\n";
conv_index = cuda_emitter->build_primitive(convolution);
}
else
{
bool is_deconvolution = false;
for (auto a : data_dilation_strides)
{
if (a != 1)
{
is_deconvolution = true;
break;
}
}
bool pad_required = (padding_below_diff != padding_above_diff);
Shape padding_below(padding_below_diff.size(), 0);
Shape padding_above(padding_above_diff.size(), 0);
for (int i = 0; i < padding_below.size(); i++)
{
padding_below[i] = static_cast<size_t>(padding_below_diff[i]);
padding_above[i] = static_cast<size_t>(padding_above_diff[i]);
}
Shape input_shape_padded = input_shape;
Shape padding_interior(data_dilation_strides);
writer.block_begin();
if (pad_required || is_deconvolution)
{
input_shape_padded = get_padded_shape(
input_shape, padding_below, padding_above, padding_interior);
Shape input_padded_strides = row_major_strides(input_shape_padded);
auto temp_size =
shape_size(input_shape_padded) * args[0].get_element_type().size();
GPUAllocator allocator =
external_function->get_primitive_emitter()->get_memory_allocator();
size_t idx_workspace = allocator.reserve_workspace(temp_size);
writer << "void* pad_buffer = runtime::gpu::invoke_memory_primitive(ctx, "
<< idx_workspace << ");\n";
writer << "std::vector<" << args[0].get_type() << "> pad_buffer_host("
<< shape_size(input_shape_padded) << ", 0);\n";
writer
<< "runtime::gpu::cuda_memcpyHtD(pad_buffer, pad_buffer_host.data(), "
<< temp_size << ");\n";
auto& cuda_emitter =
external_function->get_primitive_emitter()->get_cuda_emitter();
auto pad_dynamic_index = cuda_emitter->build_pad_dynamic(
{{args[0].get_type(), out[0].get_type()}},
input_shape,
input_shape_padded,
padding_below,
padding_interior);
writer << "gpu::invoke_primitive(ctx, " << pad_dynamic_index << ", ";
writer << "std::vector<void*>{" << args[0].get_name() << "}.data(), ";
writer << "std::vector<void*>{pad_buffer}.data()";
writer << ");\n";
// asymmetric padding has been applied, zero out padding vectors to
// ensure cudnn does not assume padding
std::fill(padding_below.begin(), padding_below.end(), 0);
}
auto& cudnn_emitter =
external_function->get_primitive_emitter()->get_cudnn_emitter();
size_t index = cudnn_emitter->build_convolution(out[0].get_type(),
input_shape_padded,
args[1].get_shape(),
out[0].get_shape(),
window_movement_strides,
window_dilation_strides,
padding_below);
writer << "gpu::invoke_primitive(ctx, " << index << ", ";
if (pad_required || is_deconvolution)
{
writer << "std::vector<void*>{pad_buffer, " << args[1].get_name()
<< "}.data(), ";
}
else
{
writer << "std::vector<void*>{" << args[0].get_name() << ","
<< args[1].get_name() << "}.data(), ";
}
writer << "std::vector<void*>{" << out[0].get_name() << "}.data()";
writer << ");\n";
writer.block_end();
conv_index = cudnn_emitter->build_primitive(convolution);
}
writer << "gpu::invoke_primitive(ctx, " << conv_index << ", ";
writer << "std::vector<void*>{";
writer << args[0].get_name() << ", ";
writer << args[1].get_name() << ", ";
writer << "}.data(), ";
writer << "std::vector<void*>{" << out[0].get_name() << "}.data());\n";
}
template <>
......@@ -339,128 +175,23 @@ namespace ngraph
}
auto convolution = static_cast<const ngraph::op::ConvolutionBackpropData*>(node);
Strides window_dilation_strides =
convolution->get_window_dilation_strides_forward();
Strides window_movement_strides =
convolution->get_window_movement_strides_forward();
Strides data_dilation_strides = convolution->get_data_dilation_strides_forward();
CoordinateDiff padding_below_diff = convolution->get_padding_below_forward();
CoordinateDiff padding_above_diff = convolution->get_padding_above_forward();
if (padding_below_diff.size() > 3)
if (convolution->get_padding_below_forward().size() > 3)
{
throw std::runtime_error(node->get_name() +
"with more than 3D is not implemented.");
}
bool is_deconvolution = false;
for (auto a : data_dilation_strides)
{
if (a != 1)
{
is_deconvolution = true;
break;
}
}
bool pad_required = (padding_below_diff != padding_above_diff);
Shape padding_below(padding_below_diff.size(), 0);
Shape padding_above(padding_above_diff.size(), 0);
for (int i = 0; i < padding_below.size(); i++)
{
padding_below[i] = static_cast<size_t>(padding_below_diff[i]);
padding_above[i] = static_cast<size_t>(padding_above_diff[i]);
}
auto output_shape = out[0].get_shape();
auto output_shape_padded = output_shape;
Shape padding_below_back(output_shape.size(), 0);
Shape padding_interior_back(output_shape.size(), 1);
size_t i = padding_below_back.size() - padding_below.size();
size_t j = 0;
for (; i < padding_below_back.size(); i++)
{
padding_below_back[i] = padding_below[j];
padding_interior_back[i] = data_dilation_strides[j];
j++;
}
Shape padding_interior(data_dilation_strides);
writer.block_begin();
if (pad_required || is_deconvolution)
{
output_shape_padded = get_padded_shape(
output_shape, padding_below, padding_above, padding_interior);
auto temp_size =
shape_size(output_shape_padded) * args[0].get_element_type().size();
GPUAllocator allocator =
external_function->get_primitive_emitter()->get_memory_allocator();
size_t idx_workspace = allocator.reserve_workspace(temp_size);
writer << "void* pad_buffer = runtime::gpu::invoke_memory_primitive(ctx, "
<< idx_workspace << ");\n";
writer << "std::vector<" << args[0].get_type() << "> pad_buffer_host("
<< shape_size(output_shape_padded) << ", 0);\n";
writer << "runtime::gpu::cuda_memcpyHtD(pad_buffer, pad_buffer_host.data(), "
<< temp_size << ");\n";
auto& cuda_emitter =
external_function->get_primitive_emitter()->get_cuda_emitter();
auto pad_dynamic_index =
cuda_emitter->build_pad_dynamic({{args[0].get_type(), out[0].get_type()}},
output_shape,
output_shape_padded,
padding_below,
padding_interior);
writer << "gpu::invoke_primitive(ctx, " << pad_dynamic_index << ", ";
writer << "std::vector<void*>{" << args[0].get_name() << "}.data(), ";
writer << "std::vector<void*>{pad_buffer}.data()";
writer << ");\n";
// asymmetric padding has been applied, zero out padding vectors to
// ensure cuDNN does not assume padding
std::fill(padding_below.begin(), padding_below.end(), 0);
}
auto& cudnn_emitter =
external_function->get_primitive_emitter()->get_cudnn_emitter();
size_t index =
cudnn_emitter->build_convolution_backward_data(out[0].get_type(),
args[0].get_shape(),
args[1].get_shape(),
output_shape_padded,
window_movement_strides,
window_dilation_strides,
padding_below);
size_t conv_index = cudnn_emitter->build_primitive(convolution);
writer << "gpu::invoke_primitive(ctx, " << index << ", ";
writer << "std::vector<void*>{" << args[0].get_name() << "," << args[1].get_name()
<< "}.data(), ";
if (pad_required || is_deconvolution)
{
writer << "std::vector<void*>{pad_buffer}.data()";
}
else
{
writer << "std::vector<void*>{" << out[0].get_name() << "}.data()";
}
writer << ");\n";
// since we padded output with temp buffer, we need to copy back to real output
if (pad_required || is_deconvolution)
{
auto& cuda_emitter =
external_function->get_primitive_emitter()->get_cuda_emitter();
auto slice_index =
cuda_emitter->build_slice({{args[0].get_type(), out[0].get_type()}},
output_shape_padded,
padding_below_back,
padding_interior_back,
output_shape);
writer << "gpu::invoke_primitive(ctx, " << slice_index << ", ";
writer << "std::vector<void*>{pad_buffer}.data(), ";
writer << "std::vector<void*>{" << out[0].get_name() << "}.data()";
writer << ");\n";
}
writer.block_end();
writer << "gpu::invoke_primitive(ctx, " << conv_index << ", ";
writer << "std::vector<void*>{";
writer << args[0].get_name() << ", ";
writer << args[1].get_name() << ", ";
writer << "}.data(), ";
writer << "std::vector<void*>{" << out[0].get_name() << "}.data());\n";
}
template <>
......@@ -473,102 +204,22 @@ namespace ngraph
auto convolution = static_cast<const ngraph::op::ConvolutionBackpropFilters*>(node);
Strides window_dilation_strides =
convolution->get_window_dilation_strides_forward();
Strides window_movement_strides =
convolution->get_window_movement_strides_forward();
Strides data_dilation_strides = convolution->get_data_dilation_strides_forward();
CoordinateDiff padding_below_diff = convolution->get_padding_below_forward();
CoordinateDiff padding_above_diff = convolution->get_padding_above_forward();
if (padding_below_diff.size() > 3)
if (convolution->get_padding_below_forward().size() > 3)
{
throw std::runtime_error(node->get_name() +
"with more than 3D is not implemented.");
}
bool is_deconvolution = false;
for (auto a : data_dilation_strides)
{
if (a != 1)
{
is_deconvolution = true;
break;
}
}
bool pad_required = (padding_below_diff != padding_above_diff);
Shape padding_below(padding_below_diff.size(), 0);
Shape padding_above(padding_above_diff.size(), 0);
for (int i = 0; i < padding_below.size(); i++)
{
padding_below[i] = static_cast<size_t>(padding_below_diff[i]);
padding_above[i] = static_cast<size_t>(padding_above_diff[i]);
}
auto input_shape = args[0].get_shape();
auto input_shape_padded = input_shape;
Shape padding_interior(data_dilation_strides);
writer.block_begin();
if (pad_required || is_deconvolution)
{
input_shape_padded = get_padded_shape(
input_shape, padding_below, padding_above, padding_interior);
auto temp_size =
shape_size(input_shape_padded) * args[0].get_element_type().size();
GPUAllocator allocator =
external_function->get_primitive_emitter()->get_memory_allocator();
size_t idx_workspace = allocator.reserve_workspace(temp_size);
writer << "void* pad_buffer = runtime::gpu::invoke_memory_primitive(ctx, "
<< idx_workspace << ");\n";
writer << "std::vector<" << args[0].get_type() << "> pad_buffer_host("
<< shape_size(input_shape_padded) << ", 0);\n";
writer << "runtime::gpu::cuda_memcpyHtD(pad_buffer, pad_buffer_host.data(), "
<< temp_size << ");\n";
auto& cuda_emitter =
external_function->get_primitive_emitter()->get_cuda_emitter();
auto pad_dynamic_index =
cuda_emitter->build_pad_dynamic({{args[0].get_type(), out[0].get_type()}},
input_shape,
input_shape_padded,
padding_below,
padding_interior);
writer << "gpu::invoke_primitive(ctx, " << pad_dynamic_index << ", ";
writer << "std::vector<void*>{" << args[0].get_name() << "}.data(), ";
writer << "std::vector<void*>{pad_buffer}.data()";
writer << ");\n";
// asymmetric padding has been applied, zero out padding vectors to
// ensure cuDNN does not assume padding
std::fill(padding_below.begin(), padding_below.end(), 0);
}
auto& cudnn_emitter =
external_function->get_primitive_emitter()->get_cudnn_emitter();
size_t index =
cudnn_emitter->build_convolution_backward_filter(out[0].get_type(),
input_shape_padded,
args[1].get_shape(),
out[0].get_shape(),
window_movement_strides,
window_dilation_strides,
padding_below);
size_t conv_index = cudnn_emitter->build_primitive(convolution);
writer << "gpu::invoke_primitive(ctx, " << index << ", ";
if (pad_required || is_deconvolution)
{
writer << "std::vector<void*>{pad_buffer, " << args[1].get_name()
<< "}.data(), ";
}
else
{
writer << "std::vector<void*>{" << args[0].get_name() << ","
<< args[1].get_name() << "}.data(), ";
}
writer << "std::vector<void*>{" << out[0].get_name() << "}.data()";
writer << ");\n";
writer.block_end();
writer << "gpu::invoke_primitive(ctx, " << conv_index << ", ";
writer << "std::vector<void*>{";
writer << args[0].get_name() << ", ";
writer << args[1].get_name() << ", ";
writer << "}.data(), ";
writer << "std::vector<void*>{" << out[0].get_name() << "}.data());\n";
}
template <>
......@@ -1146,96 +797,40 @@ namespace ngraph
template <>
void GPU_Emitter::EMITTER_DECL(ngraph::op::Max)
{
const ngraph::op::Max* max_op = static_cast<const ngraph::op::Max*>(node);
writer.block_begin();
{
if (out[0].get_size() != 0)
{
// one of args[] axes has zero size, zero output
if (args[0].get_size() == 0)
{
GPUAllocator allocator =
external_function->get_primitive_emitter()->get_memory_allocator();
std::vector<float> negative_inf(
out[0].get_size(), -std::numeric_limits<float>::infinity());
size_t idx_float_inf = allocator.reserve_argspace(
negative_inf.data(), negative_inf.size() * sizeof(float));
writer << "void* temp_d = runtime::gpu::invoke_memory_primitive(ctx, "
<< idx_float_inf << ");\n";
writer << "runtime::gpu::cuda_memcpyDtD(" << out[0].get_name()
<< ", temp_d, " << out[0].get_size() << " * "
<< out[0].get_element_type().size() << ");\n";
}
else if (args[0].get_size() == out[0].get_size())
if (out[0].get_size() == 0)
{
kernel::emit_memcpyDtD(writer, out[0], args[0]);
return;
}
else
{
const ngraph::op::Max* max = static_cast<const ngraph::op::Max*>(node);
auto& cudnn_emitter =
external_function->get_primitive_emitter()->get_cudnn_emitter();
auto max_index =
cudnn_emitter->build_reduce_forward(CUDNN_REDUCE_TENSOR_MAX,
out[0].get_type(),
args[0].get_shape(),
max_op->get_reduction_axes());
auto index = cudnn_emitter->build_primitive(max);
writer << "gpu::invoke_primitive(ctx, " << max_index << ", ";
writer << "gpu::invoke_primitive(ctx, " << index << ", ";
writer << "std::vector<void*>{" << args[0].get_name() << "}.data(), ";
writer << "std::vector<void*>{" << out[0].get_name() << "}.data()";
writer << ");\n";
}
}
}
writer.block_end();
writer << "std::vector<void*>{" << out[0].get_name() << "}.data());\n";
return;
}
template <>
void GPU_Emitter::EMITTER_DECL(ngraph::op::Min)
{
const ngraph::op::Min* min_op = static_cast<const ngraph::op::Min*>(node);
writer.block_begin();
{
if (out[0].get_size() != 0)
{
// a reduced input axis has zero size; fill the output with +inf (the identity for min)
if (args[0].get_size() == 0)
{
GPUAllocator allocator =
external_function->get_primitive_emitter()->get_memory_allocator();
std::vector<float> positive_inf(out[0].get_size(),
std::numeric_limits<float>::infinity());
size_t idx_float_inf = allocator.reserve_argspace(
positive_inf.data(), positive_inf.size() * sizeof(float));
writer << "void* temp_d = runtime::gpu::invoke_memory_primitive(ctx, "
<< idx_float_inf << ");\n";
writer << "runtime::gpu::cuda_memcpyDtD(" << out[0].get_name()
<< ", temp_d, " << out[0].get_size() << " * "
<< out[0].get_element_type().size() << ");\n";
}
else if (args[0].get_size() == out[0].get_size())
if (out[0].get_size() == 0)
{
kernel::emit_memcpyDtD(writer, out[0], args[0]);
return;
}
else
{
const ngraph::op::Min* min = static_cast<const ngraph::op::Min*>(node);
auto& cudnn_emitter =
external_function->get_primitive_emitter()->get_cudnn_emitter();
auto min_index =
cudnn_emitter->build_reduce_forward(CUDNN_REDUCE_TENSOR_MIN,
out[0].get_type(),
args[0].get_shape(),
min_op->get_reduction_axes());
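// cached cuDNN min-reduction primitive for the general case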
auto index = cudnn_emitter->build_primitive(min);
writer << "gpu::invoke_primitive(ctx, " << min_index << ", ";
writer << "gpu::invoke_primitive(ctx, " << index << ", ";
writer << "std::vector<void*>{" << args[0].get_name() << "}.data(), ";
writer << "std::vector<void*>{" << out[0].get_name() << "}.data()";
writer << ");\n";
}
}
}
writer.block_end();
writer << "std::vector<void*>{" << out[0].get_name() << "}.data());\n";
return;
}
......@@ -1561,10 +1156,8 @@ namespace ngraph
{
// assumes NC{d1,d2,...} format
auto max_pool = static_cast<const ngraph::op::MaxPool*>(node);
writer.block_begin();
{
auto& input_shape = args[0].get_shape();
auto& result_shape = out[0].get_shape();
auto padding_below = max_pool->get_padding_below();
auto padding_above = max_pool->get_padding_above();
if (input_shape.size() < 3)
......@@ -1574,78 +1167,20 @@ namespace ngraph
"Tensors should have at least one spatial dimension, dim(NC{d1...dN}) "
"<= 3");
}
bool pad_required = false;
auto shape_to_pool =
get_padded_shape(input_shape, padding_below, padding_above, {});
if (shape_to_pool != input_shape)
{
pad_required = true;
}
pad_required = pad_required && (padding_below != padding_above);
// asymmetric padding
if (pad_required)
{
auto& cuda_emitter =
external_function->get_primitive_emitter()->get_cuda_emitter();
// auto temp_buffer = create_gpu_buffer(shape_size(output_shape)*type_size);
auto temp_size =
shape_size(shape_to_pool) * args[0].get_element_type().size();
GPUAllocator allocator =
external_function->get_primitive_emitter()->get_memory_allocator();
size_t idx_workspace = allocator.reserve_workspace(temp_size);
writer << "void* pad_buffer = runtime::gpu::invoke_memory_primitive(ctx, "
<< idx_workspace << ");\n";
std::stringstream ss;
ss << TypeInfo::Get(args[0].get_element_type())->lowest();
auto pad_index =
cuda_emitter->build_pad({{args[0].get_type(), out[0].get_type()}},
input_shape,
shape_to_pool,
padding_below,
padding_above,
Shape{},
ss.str());
writer << "gpu::invoke_primitive(ctx, " << pad_index << ", ";
writer << "std::vector<void*>{" << args[0].get_name() << "}.data(), ";
writer << "std::vector<void*>{pad_buffer}.data()";
writer << ");\n";
// asymmetric padding has been applied; zero out the padding vectors to
// ensure cuDNN does not assume padding during pooling
std::fill(padding_below.begin(), padding_below.end(), 0);
std::fill(padding_above.begin(), padding_above.end(), 0);
}
int num_nontrivial_dims = 0;
for (int64_t i = shape_to_pool.size() - 1; i > 1; i--)
{
if (shape_to_pool[i] > 1)
else if (input_shape.size() > 5)
{
num_nontrivial_dims++;
}
throw std::runtime_error(
"Pooling currently only supports up to 3 spatial dimensions.");
}
if (input_shape.size() <= 5)
{
size_t max_pool_index = 0;
size_t max_pool_index;
// 1d max pool (NCW)
if ((input_shape.size() == 3 || num_nontrivial_dims == 1))
if (input_shape.size() == 3)
{
auto& cuda_emitter =
external_function->get_primitive_emitter()->get_cuda_emitter();
max_pool_index = cuda_emitter->build_1d_max_pool(
{{args[0].get_type(), out[0].get_type()}},
input_shape,
result_shape,
max_pool->get_window_shape().back(),
max_pool->get_window_movement_strides().back());
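// cached CUDA 1d max-pool primitive; the padding and workspace handling
// above is performed inside build_primitive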
max_pool_index = cuda_emitter->build_primitive(max_pool);
}
// 2d and 3d max pool (NCHW)
else if (input_shape.size() == 4 || input_shape.size() == 5)
......@@ -1653,38 +1188,13 @@ namespace ngraph
auto& cudnn_emitter =
external_function->get_primitive_emitter()->get_cudnn_emitter();
max_pool_index = cudnn_emitter->build_pooling(
CUDNN_POOLING_MAX,
out[0].get_type(),
CUDNNEmitter::Prop::Forward,
shape_to_pool,
result_shape,
max_pool->get_window_movement_strides(),
max_pool->get_window_shape(),
padding_below,
padding_above);
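// cached cuDNN pooling primitive for the 2d/3d (NCHW/NCDHW) case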
max_pool_index = cudnn_emitter->build_primitive(max_pool);
}
writer << "gpu::invoke_primitive(ctx, " << max_pool_index << ", ";
if (pad_required)
{
writer << "std::vector<void*>{pad_buffer}.data(), ";
}
else
{
writer << "std::vector<void*>{" << args[0].get_name() << "}.data(), ";
}
writer << "std::vector<void*>{" << out[0].get_name() << "}.data()";
writer << ");\n";
}
else
{
throw std::runtime_error(
"Pooling currently only supports up to 3 spatial dimensions.");
}
}
writer.block_end();
}
template <>
void GPU_Emitter::EMITTER_DECL(ngraph::op::MaxPoolBackprop)
......@@ -1866,8 +1376,7 @@ namespace ngraph
size_t avg_pool_index = 0;
// if 1d or has asymmetric padding, must handle pooling manually
if (input_shape.size() == 3 || num_nontrivial_dims == 1 ||
padding_below != padding_above)
if (input_shape.size() == 3 || padding_below != padding_above)
{
auto& cuda_emitter =
external_function->get_primitive_emitter()->get_cuda_emitter();
......@@ -2015,80 +1524,34 @@ namespace ngraph
template <>
void GPU_Emitter::EMITTER_DECL(ngraph::op::Softmax)
{
auto softmax = static_cast<const ngraph::op::Softmax*>(node);
writer.block_begin();
{
auto softmax = static_cast<const ngraph::op::Softmax*>(node);
auto tensor_shape = args[0].get_shape();
auto axes = softmax->get_axes();
auto& cudnn_emitter =
external_function->get_primitive_emitter()->get_cudnn_emitter();
if (axes.size() != tensor_shape.size())
size_t softmax_index;
if (softmax->get_axes().size() != args[0].get_shape().size())
{
auto& cuda_emitter =
external_function->get_primitive_emitter()->get_cuda_emitter();
// reserve a temporary buffer for the intermediate reduction
GPUAllocator allocator =
external_function->get_primitive_emitter()->get_memory_allocator();
auto reduced_shape = tensor_shape;
for (auto const& axis : axes)
{
reduced_shape[axis] = 1;
}
size_t reduced_size = shape_size(reduced_shape);
size_t workspace_idx = allocator.reserve_workspace(
reduced_size * out[0].get_element_type().size());
// exponentiate with fused sum reduction to calculate softmax denominator
size_t exp_sum_reduce =
cuda_emitter
->build_elementwise_collective<ngraph::op::Exp, ngraph::op::Add>(
{{args[0].get_type(), out[0].get_type()}},
args[0].get_shape(),
{},
axes,
true /* multi-output */);
writer << "void* workspace = gpu::invoke_memory_primitive(ctx, "
<< workspace_idx << ");\n";
writer << "gpu::invoke_primitive(ctx, " << exp_sum_reduce << ", ";
writer << "std::vector<void*>{" << args[0].get_name();
writer << "}.data(), ";
// cache the elementwise result and the fused result (multi-output)
writer << "std::vector<void*>{ workspace, ";
writer << out[0].get_name() << "}.data()";
writer << ");\n";
// in-place binary division with fused broadcast to calculate the softmax
size_t div_broadcast =
cuda_emitter->build_elementwise_collective<ngraph::op::Divide>(
{{out[0].get_type(), out[0].get_type(), out[0].get_type()}},
out[0].get_shape(),
{1},
axes);
writer << "gpu::invoke_primitive(ctx, " << div_broadcast << ", ";
writer << "std::vector<void*>{" << out[0].get_name();
writer << ", workspace}.data(), ";
writer << "std::vector<void*>{" << out[0].get_name() << "}.data()";
writer << ");\n";
softmax_index = cuda_emitter->build_primitive(softmax);
}
else
{
size_t softmax_index =
cudnn_emitter->build_softmax(CUDNN_SOFTMAX_FAST,
auto& cudnn_emitter =
external_function->get_primitive_emitter()->get_cudnn_emitter();
softmax_index = cudnn_emitter->build_softmax(CUDNN_SOFTMAX_FAST,
CUDNN_SOFTMAX_MODE_INSTANCE,
out[0].get_type(),
CUDNNEmitter::Prop::Forward,
tensor_shape);
args[0].get_shape());
}
writer << "gpu::invoke_primitive(ctx, " << softmax_index << ", ";
writer << "std::vector<void*>{" << args[0].get_name() << "}.data(), ";
writer << "std::vector<void*>{" << out[0].get_name() << "}.data()";
writer << ");\n";
}
}
writer.block_end();
}
}
......
......@@ -232,3 +232,19 @@ void runtime::gpu::kernel::emit_cudnnReduceTensor(codegen::CodeWriter& writer,
writer << " " << out.get_name() << "));\n";
writer << "ngraph::runtime::gpu::free_gpu_buffer(workspace_ptr);\n";
}
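// emit_type_string concatenates the C type names of a node's inputs and
// outputs (e.g. "float_float_" for a single-input, single-output f32 op)
// for use as part of a primitive-cache hash key; any spaces in type names
// are replaced with underscores so the key stays whitespace-free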
std::string runtime::gpu::kernel::emit_type_string(const Node* node)
{
std::stringstream ss;
for (auto const& input : node->get_inputs())
{
ss << input.get_element_type().c_type_string() << "_";
}
for (auto const& output : node->get_outputs())
{
ss << output.get_element_type().c_type_string() << "_";
}
std::string types = ss.str();
std::replace(types.begin(), types.end(), ' ', '_');
return types;
}
......@@ -82,6 +82,8 @@ namespace ngraph
const std::string& output_desc,
const float& alpha,
const float& beta);
std::string emit_type_string(const Node* node);
}
}
}
......