Commit 55a25d41 authored by Chris Sullivan, committed by Robert Kimball

Refactored GPU backend state into BackendContext (#1186)

* Refactored GPU backend state into BackendContext and moved it to the highest level GPU_Backend.
Some bugs have appeared in so doing. Needs investigation.

* extra *block_size

* change grid_size to threads

* Bug fix in softmax cache parameters.

* Additional bug fix for maxpool1d cache parameters.

* Remove temporary print statements.

* Use nthreads in primitive hash.

* Switched from using stack references for the cuDNN and cuBLAS handles to heap pointers held only by the c-struct GPURuntimeContext but managed by the GPU_Backend (a condensed sketch of this ownership pattern follows the commit message).

* Refactored the use of GPURuntimeContext* ctx throughout the emitters.

* Use std::prev instead of operator-- for memory iterator capture (a short std::prev example also follows the commit message).

* bug fix from abaf1d7
parent 8bde818c
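
The heap-handle ownership described above is implemented in the BackendContext changes in the diff below. As a condensed summary, the following standalone sketch captures the same pattern; RuntimeContextSketch and BackendContextSketch are illustrative stand-ins for GPURuntimeContext and GPU_Backend::BackendContext, not the actual ngraph types.

// Sketch only: a plain C-style struct carries raw pointers to the library
// handles, while a backend-owned context object allocates the handles on
// the heap, configures them, and releases them in its destructor.
#include <cublas_v2.h>
#include <cudnn.h>
#include <memory>
#include <stdexcept>

struct RuntimeContextSketch // handed to generated code as a raw pointer
{
    cublasHandle_t* cublas_handle;
    cudnnHandle_t* cudnn_handle;
};

class BackendContextSketch
{
public:
    BackendContextSketch()
        : m_runtime_context(new RuntimeContextSketch)
    {
        m_runtime_context->cublas_handle = new cublasHandle_t;
        if (cublasCreate(m_runtime_context->cublas_handle) != CUBLAS_STATUS_SUCCESS)
        {
            throw std::runtime_error("cuBLAS create handle failed");
        }
        // scalars (alpha/beta) are passed as device pointers
        cublasSetPointerMode(*m_runtime_context->cublas_handle, CUBLAS_POINTER_MODE_DEVICE);

        m_runtime_context->cudnn_handle = new cudnnHandle_t;
        if (cudnnCreate(m_runtime_context->cudnn_handle) != CUDNN_STATUS_SUCCESS)
        {
            throw std::runtime_error("cuDNN create handle failed");
        }
    }
    ~BackendContextSketch()
    {
        cublasDestroy(*m_runtime_context->cublas_handle);
        delete m_runtime_context->cublas_handle;
        cudnnDestroy(*m_runtime_context->cudnn_handle);
        delete m_runtime_context->cudnn_handle;
    }
    std::unique_ptr<RuntimeContextSketch> m_runtime_context;
};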
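
The std::prev change is a one-liner, but the distinction matters when the iterator is captured and reused: std::prev returns a new iterator one position back and leaves its argument untouched, whereas operator-- mutates the iterator itself. A minimal, self-contained example (names are illustrative):

#include <cassert>
#include <iterator>
#include <list>

int main()
{
    std::list<int> allocations{10, 20, 30};
    auto end = allocations.end();

    // std::prev: 'end' keeps pointing at the end sentinel; 'last' is a new iterator
    auto last = std::prev(end);
    assert(*last == 30);
    assert(end == allocations.end());

    // By contrast, --end would move 'end' itself one step back, which is
    // wrong if 'end' has been captured for later use.
    return 0;
}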
......@@ -62,13 +62,14 @@ std::ostream& operator<<(std::ostream& os, pooling_op_shape& shape)
<< shape.PAD_D << "_" << shape.PAD_H << "_" << shape.PAD_W;
}
runtime::gpu::CUDAEmitter::CUDAEmitter(runtime::gpu::GPUPrimitiveEmitter* emitter)
runtime::gpu::CUDAEmitter::CUDAEmitter(runtime::gpu::GPUPrimitiveEmitter* emitter,
runtime::gpu::GPURuntimeContext* ctx)
: m_primitive_emitter(emitter)
{
m_ctx = ctx;
}
size_t runtime::gpu::CUDAEmitter::build_pad(const runtime::gpu::GPURuntimeContext* ctx,
const std::array<std::string, 2>& dtypes,
size_t runtime::gpu::CUDAEmitter::build_pad(const std::array<std::string, 2>& dtypes,
GPUShape input_shape,
GPUShape output_shape,
GPUShape padding_below,
......@@ -103,7 +104,7 @@ size_t runtime::gpu::CUDAEmitter::build_pad(const runtime::gpu::GPURuntimeContex
uint32_t aligned_grid_size_x = align_to_block_size(nthreads, block_size_x);
// if the kernel has not been compiled, build it
auto compiled_kernel = ctx->compiled_kernel_pool->get(hash);
auto compiled_kernel = m_ctx->compiled_kernel_pool->get(hash);
if (compiled_kernel == nullptr)
{
// normalize pad dimensions to shape dimensions
......@@ -186,7 +187,7 @@ size_t runtime::gpu::CUDAEmitter::build_pad(const runtime::gpu::GPURuntimeContex
}
writer.block_end();
compiled_kernel = ctx->compiled_kernel_pool->set(hash, writer.get_code());
compiled_kernel = m_ctx->compiled_kernel_pool->set(hash, writer.get_code());
}
std::unique_ptr<gpu::primitive> pad;
......@@ -233,8 +234,7 @@ size_t runtime::gpu::CUDAEmitter::build_pad(const runtime::gpu::GPURuntimeContex
return primitive_index;
}
size_t runtime::gpu::CUDAEmitter::build_pad_dynamic(const runtime::gpu::GPURuntimeContext* ctx,
const std::array<std::string, 2>& dtypes,
size_t runtime::gpu::CUDAEmitter::build_pad_dynamic(const std::array<std::string, 2>& dtypes,
GPUShape input_shape,
GPUShape output_shape,
GPUShape padding_below,
......@@ -261,13 +261,13 @@ size_t runtime::gpu::CUDAEmitter::build_pad_dynamic(const runtime::gpu::GPURunti
// a launch primitive for it based on the input tensor shape
// but do not recompile the kernel. otherwise, do it all:
// recompile the kernel and then create the primitive
auto compiled_kernel = ctx->compiled_kernel_pool->get(kernel_name.str());
auto compiled_kernel = m_ctx->compiled_kernel_pool->get(kernel_name.str());
if (compiled_kernel == nullptr)
{
codegen::CodeWriter writer;
CudaKernelBuilder::add_pod_typedefs(writer);
CudaKernelBuilder::get_pad_dynamic_op(writer, kernel_name.str(), dtypes);
compiled_kernel = ctx->compiled_kernel_pool->set(kernel_name.str(), writer.get_code());
compiled_kernel = m_ctx->compiled_kernel_pool->set(kernel_name.str(), writer.get_code());
}
uint32_t rank = static_cast<uint32_t>(input_shape.size());
......@@ -300,11 +300,12 @@ size_t runtime::gpu::CUDAEmitter::build_pad_dynamic(const runtime::gpu::GPURunti
// create the launch primitive
std::unique_ptr<gpu::primitive> pad_dynamic(new gpu::primitive{[=](void** inputs,
void** outputs) mutable {
void* param_input_strides = runtime::gpu::invoke_memory_primitive(ctx, idx_input_strides);
void* param_output_strides = runtime::gpu::invoke_memory_primitive(ctx, idx_output_strides);
void* param_padding_below = runtime::gpu::invoke_memory_primitive(ctx, idx_padding_below);
void* param_input_strides = runtime::gpu::invoke_memory_primitive(m_ctx, idx_input_strides);
void* param_output_strides =
runtime::gpu::invoke_memory_primitive(m_ctx, idx_output_strides);
void* param_padding_below = runtime::gpu::invoke_memory_primitive(m_ctx, idx_padding_below);
void* param_padding_interior =
runtime::gpu::invoke_memory_primitive(ctx, idx_padding_interior);
runtime::gpu::invoke_memory_primitive(m_ctx, idx_padding_interior);
std::vector<void*> args_list{&inputs[0],
&outputs[0],
&param_input_strides,
......@@ -332,9 +333,7 @@ size_t runtime::gpu::CUDAEmitter::build_pad_dynamic(const runtime::gpu::GPURunti
m_primitive_emitter->cache(hash, primitive_index);
return primitive_index;
}
size_t runtime::gpu::CUDAEmitter::build_reshape(const runtime::gpu::GPURuntimeContext* ctx,
const std::array<std::string, 2>& dtypes,
size_t runtime::gpu::CUDAEmitter::build_reshape(const std::array<std::string, 2>& dtypes,
GPUShape input_shape,
GPUShape input_order)
{
......@@ -359,13 +358,13 @@ size_t runtime::gpu::CUDAEmitter::build_reshape(const runtime::gpu::GPURuntimeCo
// a launch primitive for it based on the input tensor shape
// but do not recompile the kernel. otherwise, do it all:
// recompile the kernel and then create the primitive
auto compiled_kernel = ctx->compiled_kernel_pool->get(kernel_name.str());
auto compiled_kernel = m_ctx->compiled_kernel_pool->get(kernel_name.str());
if (compiled_kernel == nullptr)
{
codegen::CodeWriter writer;
CudaKernelBuilder::add_pod_typedefs(writer);
CudaKernelBuilder::get_reshape_op(writer, kernel_name.str(), dtypes, rank);
compiled_kernel = ctx->compiled_kernel_pool->set(kernel_name.str(), writer.get_code());
compiled_kernel = m_ctx->compiled_kernel_pool->set(kernel_name.str(), writer.get_code());
}
uint32_t nthreads = static_cast<uint32_t>(shape_size(input_shape));
......@@ -396,8 +395,8 @@ size_t runtime::gpu::CUDAEmitter::build_reshape(const runtime::gpu::GPURuntimeCo
// create the launch primitive
std::unique_ptr<gpu::primitive> kernel_launch(new gpu::primitive{[=](void** inputs,
void** outputs) mutable {
void* param_input_strides = runtime::gpu::invoke_memory_primitive(ctx, idx_input_strides);
void* param_trans_strides = runtime::gpu::invoke_memory_primitive(ctx, idx_trans_strides);
void* param_input_strides = runtime::gpu::invoke_memory_primitive(m_ctx, idx_input_strides);
void* param_trans_strides = runtime::gpu::invoke_memory_primitive(m_ctx, idx_trans_strides);
std::vector<void*> args_list{
&inputs[0], &outputs[0], &param_input_strides, &param_trans_strides, &nthreads};
......@@ -420,8 +419,7 @@ size_t runtime::gpu::CUDAEmitter::build_reshape(const runtime::gpu::GPURuntimeCo
return primitive_index;
}
size_t runtime::gpu::CUDAEmitter::build_slice(const runtime::gpu::GPURuntimeContext* ctx,
const std::array<std::string, 2>& dtypes,
size_t runtime::gpu::CUDAEmitter::build_slice(const std::array<std::string, 2>& dtypes,
GPUShape input_shape,
GPUShape lower_bounds,
GPUShape slice_strides,
......@@ -448,13 +446,13 @@ size_t runtime::gpu::CUDAEmitter::build_slice(const runtime::gpu::GPURuntimeCont
// a launch primitive for it based on the input tensor shape
// but do not recompile the kernel. otherwise, do it all:
// recompile the kernel and then create the primitive
auto compiled_kernel = ctx->compiled_kernel_pool->get(kernel_name.str());
auto compiled_kernel = m_ctx->compiled_kernel_pool->get(kernel_name.str());
if (compiled_kernel == nullptr)
{
codegen::CodeWriter writer;
CudaKernelBuilder::add_pod_typedefs(writer);
CudaKernelBuilder::get_slice_op(writer, kernel_name.str(), dtypes, output_shape.size());
compiled_kernel = ctx->compiled_kernel_pool->set(kernel_name.str(), writer.get_code());
compiled_kernel = m_ctx->compiled_kernel_pool->set(kernel_name.str(), writer.get_code());
}
uint32_t nthreads = static_cast<uint32_t>(shape_size(output_shape));
......@@ -478,10 +476,11 @@ size_t runtime::gpu::CUDAEmitter::build_slice(const runtime::gpu::GPURuntimeCont
// create the launch primitive
std::unique_ptr<gpu::primitive> kernel_launch(new gpu::primitive{[=](void** inputs,
void** outputs) mutable {
void* param_input_strides = runtime::gpu::invoke_memory_primitive(ctx, idx_input_strides);
void* param_output_strides = runtime::gpu::invoke_memory_primitive(ctx, idx_output_strides);
void* param_lower_bounds = runtime::gpu::invoke_memory_primitive(ctx, idx_lower_bounds);
void* param_slice_strides = runtime::gpu::invoke_memory_primitive(ctx, idx_slice_strides);
void* param_input_strides = runtime::gpu::invoke_memory_primitive(m_ctx, idx_input_strides);
void* param_output_strides =
runtime::gpu::invoke_memory_primitive(m_ctx, idx_output_strides);
void* param_lower_bounds = runtime::gpu::invoke_memory_primitive(m_ctx, idx_lower_bounds);
void* param_slice_strides = runtime::gpu::invoke_memory_primitive(m_ctx, idx_slice_strides);
std::vector<void*> args_list{&inputs[0],
&outputs[0],
&param_input_strides,
......@@ -509,8 +508,7 @@ size_t runtime::gpu::CUDAEmitter::build_slice(const runtime::gpu::GPURuntimeCont
return primitive_index;
}
size_t runtime::gpu::CUDAEmitter::build_reverse_sequence(const runtime::gpu::GPURuntimeContext* ctx,
const std::array<std::string, 3>& dtypes,
size_t runtime::gpu::CUDAEmitter::build_reverse_sequence(const std::array<std::string, 3>& dtypes,
GPUShape input_shape0,
GPUShape input_shape1,
GPUShape output_shape,
......@@ -538,14 +536,14 @@ size_t runtime::gpu::CUDAEmitter::build_reverse_sequence(const runtime::gpu::GPU
// a launch primitive for it based on the input tensor shape
// but do not recompile the kernel. otherwise, do it all:
// recompile the kernel and then create the primitive
auto compiled_kernel = ctx->compiled_kernel_pool->get(kernel_name.str());
auto compiled_kernel = m_ctx->compiled_kernel_pool->get(kernel_name.str());
if (compiled_kernel == nullptr)
{
codegen::CodeWriter writer;
CudaKernelBuilder::add_pod_typedefs(writer);
CudaKernelBuilder::get_reverse_sequence_op(
writer, kernel_name.str(), dtypes, batch_axis, sequence_axis, output_shape.size());
compiled_kernel = ctx->compiled_kernel_pool->set(kernel_name.str(), writer.get_code());
compiled_kernel = m_ctx->compiled_kernel_pool->set(kernel_name.str(), writer.get_code());
}
uint32_t nthreads = static_cast<uint32_t>(shape_size(output_shape));
......@@ -564,8 +562,9 @@ size_t runtime::gpu::CUDAEmitter::build_reverse_sequence(const runtime::gpu::GPU
// create the launch primitive
std::unique_ptr<gpu::primitive> kernel_launch(new gpu::primitive{[=](void** inputs,
void** outputs) mutable {
void* param_output_shape = runtime::gpu::invoke_memory_primitive(ctx, idx_output_shape);
void* param_output_strides = runtime::gpu::invoke_memory_primitive(ctx, idx_output_strides);
void* param_output_shape = runtime::gpu::invoke_memory_primitive(m_ctx, idx_output_shape);
void* param_output_strides =
runtime::gpu::invoke_memory_primitive(m_ctx, idx_output_strides);
std::vector<void*> args_list{&inputs[0],
&inputs[1],
&outputs[0],
......@@ -592,8 +591,7 @@ size_t runtime::gpu::CUDAEmitter::build_reverse_sequence(const runtime::gpu::GPU
return primitive_index;
}
size_t runtime::gpu::CUDAEmitter::build_1d_max_pool(const GPURuntimeContext* ctx,
const std::array<std::string, 2>& dtypes,
size_t runtime::gpu::CUDAEmitter::build_1d_max_pool(const std::array<std::string, 2>& dtypes,
GPUShape input_shape,
GPUShape output_shape,
size_t window_width,
......@@ -617,13 +615,13 @@ size_t runtime::gpu::CUDAEmitter::build_1d_max_pool(const GPURuntimeContext* ctx
}
// if the kernel has not been compiled, build it
auto compiled_kernel = ctx->compiled_kernel_pool->get(hash);
auto compiled_kernel = m_ctx->compiled_kernel_pool->get(hash);
if (compiled_kernel == nullptr)
{
codegen::CodeWriter writer;
CudaKernelBuilder::get_max_pool_1d(
writer, kernel_name, dtypes, input_width, output_width, window_width, window_stride);
compiled_kernel = ctx->compiled_kernel_pool->set(kernel_name, writer.get_code());
compiled_kernel = m_ctx->compiled_kernel_pool->set(kernel_name, writer.get_code());
}
//TODO: currently we set it to 64, will add tuning method later
......@@ -722,8 +720,7 @@ pooling_op_shape
return shape;
}
size_t runtime::gpu::CUDAEmitter::build_avg_pool(const GPURuntimeContext* ctx,
const std::array<std::string, 2>& dtypes,
size_t runtime::gpu::CUDAEmitter::build_avg_pool(const std::array<std::string, 2>& dtypes,
GPUShape input_shape,
GPUShape output_shape,
GPUShape window_shape,
......@@ -750,13 +747,13 @@ size_t runtime::gpu::CUDAEmitter::build_avg_pool(const GPURuntimeContext* ctx,
// if the kernel has not been compiled, build it
kernel_name += "_ip" + std::to_string(int(include_pad));
auto compiled_kernel = ctx->compiled_kernel_pool->get(kernel_name);
auto compiled_kernel = m_ctx->compiled_kernel_pool->get(kernel_name);
if (compiled_kernel == nullptr)
{
codegen::CodeWriter writer;
writer << include_helpers();
CudaKernelBuilder::get_avg_pool(writer, kernel_name, dtypes, include_pad);
compiled_kernel = ctx->compiled_kernel_pool->set(kernel_name, writer.get_code());
compiled_kernel = m_ctx->compiled_kernel_pool->set(kernel_name, writer.get_code());
}
// precompute for fast constant memory access
......@@ -843,8 +840,7 @@ size_t runtime::gpu::CUDAEmitter::build_avg_pool(const GPURuntimeContext* ctx,
return primitive_index;
}
size_t runtime::gpu::CUDAEmitter::build_elementwise_n_to_1(const GPURuntimeContext* ctx,
const std::vector<std::string>& dtypes,
size_t runtime::gpu::CUDAEmitter::build_elementwise_n_to_1(const std::vector<std::string>& dtypes,
GPUShape tensor_shape,
const char* op,
const char* kernel)
......@@ -870,7 +866,7 @@ size_t runtime::gpu::CUDAEmitter::build_elementwise_n_to_1(const GPURuntimeConte
// a launch primitive for it based on the input tensor shape
// but do not recompile the kernel. otherwise, do it all:
// recompile the kernel and then create the primitive
auto compiled_kernel = ctx->compiled_kernel_pool->get(kernel_name.str());
auto compiled_kernel = m_ctx->compiled_kernel_pool->get(kernel_name.str());
if (compiled_kernel == nullptr)
{
codegen::CodeWriter writer;
......@@ -882,7 +878,7 @@ size_t runtime::gpu::CUDAEmitter::build_elementwise_n_to_1(const GPURuntimeConte
CudaKernelBuilder::get_elementwise_op(writer, kernel_name.str(), op, dtypes);
compiled_kernel = ctx->compiled_kernel_pool->set(kernel_name.str(), writer.get_code());
compiled_kernel = m_ctx->compiled_kernel_pool->set(kernel_name.str(), writer.get_code());
}
size_t nthreads = shape_size(tensor_shape);
//TODO: currently we set it to 64, will add tuning method later
......@@ -920,8 +916,7 @@ size_t runtime::gpu::CUDAEmitter::build_elementwise_n_to_1(const GPURuntimeConte
}
size_t
runtime::gpu::CUDAEmitter::build_fused_ew_to_collective(const GPURuntimeContext* ctx,
const std::vector<std::string>& dtypes,
runtime::gpu::CUDAEmitter::build_fused_ew_to_collective(const std::vector<std::string>& dtypes,
GPUShape tensor_shape,
const std::set<size_t>& reduced_tensors,
const std::set<size_t>& axes,
......@@ -954,7 +949,7 @@ size_t
// a launch primitive for it based on the input tensor shape
// but do not recompile the kernel. otherwise, do it all:
// recompile the kernel and then create the primitive
auto compiled_kernel = ctx->compiled_kernel_pool->get(kernel_name.str());
auto compiled_kernel = m_ctx->compiled_kernel_pool->get(kernel_name.str());
if (compiled_kernel == nullptr)
{
codegen::CodeWriter writer;
......@@ -972,7 +967,7 @@ size_t
reduced_tensors,
save_elementwise,
tensor_shape.size());
compiled_kernel = ctx->compiled_kernel_pool->set(kernel_name.str(), writer.get_code());
compiled_kernel = m_ctx->compiled_kernel_pool->set(kernel_name.str(), writer.get_code());
}
// calculate strides
......@@ -1016,10 +1011,10 @@ size_t
// TODO: check if mutable is necessary
std::unique_ptr<gpu::primitive> ew_collective(new gpu::primitive{[=](void** inputs,
void** outputs) mutable {
void* strides_d = runtime::gpu::invoke_memory_primitive(ctx, idx_strides);
void* stride_magic_d = runtime::gpu::invoke_memory_primitive(ctx, idx_stride_magic);
void* stride_shift_d = runtime::gpu::invoke_memory_primitive(ctx, idx_stride_shift);
void* reduced_strides_d = runtime::gpu::invoke_memory_primitive(ctx, idx_reduced_strides);
void* strides_d = runtime::gpu::invoke_memory_primitive(m_ctx, idx_strides);
void* stride_magic_d = runtime::gpu::invoke_memory_primitive(m_ctx, idx_stride_magic);
void* stride_shift_d = runtime::gpu::invoke_memory_primitive(m_ctx, idx_stride_shift);
void* reduced_strides_d = runtime::gpu::invoke_memory_primitive(m_ctx, idx_reduced_strides);
std::vector<void*> args_list;
for (auto i = 0u; i < dtypes.size() - 1; i++)
......@@ -1056,8 +1051,7 @@ size_t
return primitive_index;
}
size_t runtime::gpu::CUDAEmitter::build_reduce_window(const GPURuntimeContext* ctx,
const OpName op_name,
size_t runtime::gpu::CUDAEmitter::build_reduce_window(const OpName op_name,
const std::vector<std::string>& dtypes,
GPUShape input_shape,
GPUShape output_shape,
......@@ -1106,7 +1100,7 @@ size_t runtime::gpu::CUDAEmitter::build_reduce_window(const GPURuntimeContext* c
// a launch primitive for it based on the input tensor shape
// but do not recompile the kernel. otherwise, do it all:
// recompile the kernel and then create the primitive
auto compiled_kernel = ctx->compiled_kernel_pool->get(kernel_name.str());
auto compiled_kernel = m_ctx->compiled_kernel_pool->get(kernel_name.str());
if (compiled_kernel == nullptr)
{
codegen::CodeWriter writer;
......@@ -1116,7 +1110,7 @@ size_t runtime::gpu::CUDAEmitter::build_reduce_window(const GPURuntimeContext* c
CudaKernelBuilder::get_device_helper(writer, op, kernel, dtypes);
}
CudaKernelBuilder::get_reduce_window_op(writer, kernel_name.str(), op, dtypes, rank);
compiled_kernel = ctx->compiled_kernel_pool->set(kernel_name.str(), writer.get_code());
compiled_kernel = m_ctx->compiled_kernel_pool->set(kernel_name.str(), writer.get_code());
}
size_t nthreads = shape_size(output_shape);
......@@ -1136,12 +1130,12 @@ size_t runtime::gpu::CUDAEmitter::build_reduce_window(const GPURuntimeContext* c
// create the launch primitive
std::unique_ptr<gpu::primitive> f(new gpu::primitive{[=](void** inputs,
void** outputs) mutable {
void* param_input_strides = runtime::gpu::invoke_memory_primitive(ctx, idx_input_strides);
void* param_output_shape = runtime::gpu::invoke_memory_primitive(ctx, idx_output_shape);
void* param_input_strides = runtime::gpu::invoke_memory_primitive(m_ctx, idx_input_strides);
void* param_output_shape = runtime::gpu::invoke_memory_primitive(m_ctx, idx_output_shape);
void* param_reduce_window_shape =
runtime::gpu::invoke_memory_primitive(ctx, idx_reduce_window_shape);
runtime::gpu::invoke_memory_primitive(m_ctx, idx_reduce_window_shape);
void* param_reduce_window_strides =
runtime::gpu::invoke_memory_primitive(ctx, idx_reduce_window_strides);
runtime::gpu::invoke_memory_primitive(m_ctx, idx_reduce_window_strides);
std::vector<void*> args_list(7, NULL);
args_list[0] = &inputs[0];
......@@ -1172,8 +1166,7 @@ size_t runtime::gpu::CUDAEmitter::build_reduce_window(const GPURuntimeContext* c
return primitive_index;
}
size_t runtime::gpu::CUDAEmitter::build_replace_slice(const GPURuntimeContext* ctx,
const std::array<std::string, 3>& dtypes,
size_t runtime::gpu::CUDAEmitter::build_replace_slice(const std::array<std::string, 3>& dtypes,
GPUShape tensor_shape,
GPUShape source_shape,
GPUShape lower_bounds,
......@@ -1200,14 +1193,14 @@ size_t runtime::gpu::CUDAEmitter::build_replace_slice(const GPURuntimeContext* c
constexpr const int nthreads_per_block = 32;
// if the kernel has not been compiled, build it
auto compiled_kernel = ctx->compiled_kernel_pool->get(kernel_name);
auto compiled_kernel = m_ctx->compiled_kernel_pool->get(kernel_name);
if (compiled_kernel == nullptr)
{
codegen::CodeWriter writer;
writer << include_helpers();
runtime::gpu::CudaKernelBuilder::get_replace_slice_op(
writer, kernel_name, dtypes, nthreads_per_block);
compiled_kernel = ctx->compiled_kernel_pool->set(kernel_name, writer.get_code());
compiled_kernel = m_ctx->compiled_kernel_pool->set(kernel_name, writer.get_code());
}
// calculate strides
......@@ -1261,18 +1254,18 @@ size_t runtime::gpu::CUDAEmitter::build_replace_slice(const GPURuntimeContext* c
float alpha = 1.0f;
float beta = 0.0f;
std::unique_ptr<gpu::primitive> replace_slice(
new gpu::primitive{[=](void** inputs, void** outputs) mutable {
void* param_dstr = runtime::gpu::invoke_memory_primitive(ctx, idx_input_strides);
void* param_dmagic = runtime::gpu::invoke_memory_primitive(ctx, idx_dmagics);
void* param_dshift = runtime::gpu::invoke_memory_primitive(ctx, idx_dshifts);
void* param_lbound = runtime::gpu::invoke_memory_primitive(ctx, idx_lower_bounds);
void* param_ubound = runtime::gpu::invoke_memory_primitive(ctx, idx_upper_bounds);
void* param_slice_str = runtime::gpu::invoke_memory_primitive(ctx, idx_slice_strides);
void* param_slice_magic = runtime::gpu::invoke_memory_primitive(ctx, idx_smagics);
void* param_slice_shift = runtime::gpu::invoke_memory_primitive(ctx, idx_sshifts);
void* param_dsource = runtime::gpu::invoke_memory_primitive(ctx, idx_source_shape);
void* param_sourcestr = runtime::gpu::invoke_memory_primitive(ctx, idx_source_strides);
std::unique_ptr<gpu::primitive> replace_slice(new gpu::primitive{[=](void** inputs,
void** outputs) mutable {
void* param_dstr = runtime::gpu::invoke_memory_primitive(m_ctx, idx_input_strides);
void* param_dmagic = runtime::gpu::invoke_memory_primitive(m_ctx, idx_dmagics);
void* param_dshift = runtime::gpu::invoke_memory_primitive(m_ctx, idx_dshifts);
void* param_lbound = runtime::gpu::invoke_memory_primitive(m_ctx, idx_lower_bounds);
void* param_ubound = runtime::gpu::invoke_memory_primitive(m_ctx, idx_upper_bounds);
void* param_slice_str = runtime::gpu::invoke_memory_primitive(m_ctx, idx_slice_strides);
void* param_slice_magic = runtime::gpu::invoke_memory_primitive(m_ctx, idx_smagics);
void* param_slice_shift = runtime::gpu::invoke_memory_primitive(m_ctx, idx_sshifts);
void* param_dsource = runtime::gpu::invoke_memory_primitive(m_ctx, idx_source_shape);
void* param_sourcestr = runtime::gpu::invoke_memory_primitive(m_ctx, idx_source_strides);
void* args_list[] = {&inputs[0],
&inputs[1],
......@@ -1311,8 +1304,7 @@ size_t runtime::gpu::CUDAEmitter::build_replace_slice(const GPURuntimeContext* c
return primitive_index;
}
size_t runtime::gpu::CUDAEmitter::build_broadcast(const GPURuntimeContext* ctx,
const std::array<std::string, 2>& dtypes,
size_t runtime::gpu::CUDAEmitter::build_broadcast(const std::array<std::string, 2>& dtypes,
GPUShape result_shape,
const std::set<size_t>& reduce_axes)
{
......@@ -1333,14 +1325,14 @@ size_t runtime::gpu::CUDAEmitter::build_broadcast(const GPURuntimeContext* ctx,
}
// if the kernel has not been compiled, build it
auto compiled_kernel = ctx->compiled_kernel_pool->get(kernel_name);
auto compiled_kernel = m_ctx->compiled_kernel_pool->get(kernel_name);
if (compiled_kernel == nullptr)
{
codegen::CodeWriter writer;
writer << include_helpers();
runtime::gpu::CudaKernelBuilder::get_broadcast_op(
writer, kernel_name, dtypes, result_shape.size());
compiled_kernel = ctx->compiled_kernel_pool->set(kernel_name, writer.get_code());
compiled_kernel = m_ctx->compiled_kernel_pool->set(kernel_name, writer.get_code());
}
// calculate strides
......@@ -1389,10 +1381,10 @@ size_t runtime::gpu::CUDAEmitter::build_broadcast(const GPURuntimeContext* ctx,
std::unique_ptr<gpu::primitive> broadcast(new gpu::primitive{[=](void** inputs,
void** outputs) mutable {
void* strides_d = runtime::gpu::invoke_memory_primitive(ctx, idx_strides);
void* stride_magic_d = runtime::gpu::invoke_memory_primitive(ctx, idx_stride_magic);
void* stride_shift_d = runtime::gpu::invoke_memory_primitive(ctx, idx_stride_shift);
void* reduced_strides_d = runtime::gpu::invoke_memory_primitive(ctx, idx_reduced_strides);
void* strides_d = runtime::gpu::invoke_memory_primitive(m_ctx, idx_strides);
void* stride_magic_d = runtime::gpu::invoke_memory_primitive(m_ctx, idx_stride_magic);
void* stride_shift_d = runtime::gpu::invoke_memory_primitive(m_ctx, idx_stride_shift);
void* reduced_strides_d = runtime::gpu::invoke_memory_primitive(m_ctx, idx_reduced_strides);
void* args_list[] = {&inputs[0],
&outputs[0],
......@@ -1423,8 +1415,7 @@ size_t runtime::gpu::CUDAEmitter::build_broadcast(const GPURuntimeContext* ctx,
return primitive_index;
}
size_t runtime::gpu::CUDAEmitter::build_convolution(const GPURuntimeContext* ctx,
const std::array<std::string, 3>& dtypes,
size_t runtime::gpu::CUDAEmitter::build_convolution(const std::array<std::string, 3>& dtypes,
GPUShape input_shape,
GPUShape input_pad_below,
GPUShape input_dilation,
......@@ -1489,14 +1480,14 @@ size_t runtime::gpu::CUDAEmitter::build_convolution(const GPURuntimeContext* ctx
}
// if the kernel has not been compiled, build it
auto compiled_kernel = ctx->compiled_kernel_pool->get(kernel_name);
auto compiled_kernel = m_ctx->compiled_kernel_pool->get(kernel_name);
if (compiled_kernel == nullptr)
{
codegen::CodeWriter writer;
writer << include_helpers();
CudaKernelBuilder::get_convolution_forward(
writer, kernel_name, dtypes, N, K, filter_size, rank, sm_tile_size, reg_tile_size);
compiled_kernel = ctx->compiled_kernel_pool->set(kernel_name, writer.get_code());
compiled_kernel = m_ctx->compiled_kernel_pool->set(kernel_name, writer.get_code());
}
// ----- build primitive arguments -----
......@@ -1609,24 +1600,28 @@ size_t runtime::gpu::CUDAEmitter::build_convolution(const GPURuntimeContext* ctx
std::unique_ptr<gpu::primitive> conv(new gpu::primitive{[=](void** inputs,
void** outputs) mutable {
void* pad_d = runtime::gpu::invoke_memory_primitive(ctx, idx_pad);
void* data_dilation_d = runtime::gpu::invoke_memory_primitive(ctx, idx_data_dilation);
void* pad_d = runtime::gpu::invoke_memory_primitive(m_ctx, idx_pad);
void* data_dilation_d = runtime::gpu::invoke_memory_primitive(m_ctx, idx_data_dilation);
void* data_dilation_magic_d =
runtime::gpu::invoke_memory_primitive(ctx, idx_data_dilation_magic);
runtime::gpu::invoke_memory_primitive(m_ctx, idx_data_dilation_magic);
void* data_dilation_shift_d =
runtime::gpu::invoke_memory_primitive(ctx, idx_data_dilation_shift);
void* filter_strides_d = runtime::gpu::invoke_memory_primitive(ctx, idx_filter_strides);
void* filter_dilation_d = runtime::gpu::invoke_memory_primitive(ctx, idx_filter_dilation);
void* input_shape_d = runtime::gpu::invoke_memory_primitive(ctx, idx_input_shape);
void* input_shape_str_d = runtime::gpu::invoke_memory_primitive(ctx, idx_input_shape_str);
runtime::gpu::invoke_memory_primitive(m_ctx, idx_data_dilation_shift);
void* filter_strides_d = runtime::gpu::invoke_memory_primitive(m_ctx, idx_filter_strides);
void* filter_dilation_d = runtime::gpu::invoke_memory_primitive(m_ctx, idx_filter_dilation);
void* input_shape_d = runtime::gpu::invoke_memory_primitive(m_ctx, idx_input_shape);
void* input_shape_str_d = runtime::gpu::invoke_memory_primitive(m_ctx, idx_input_shape_str);
void* output_dim_strides_d =
runtime::gpu::invoke_memory_primitive(ctx, idx_output_dim_strides);
void* output_str_magic_d = runtime::gpu::invoke_memory_primitive(ctx, idx_output_str_magic);
void* output_str_shift_d = runtime::gpu::invoke_memory_primitive(ctx, idx_output_str_shift);
runtime::gpu::invoke_memory_primitive(m_ctx, idx_output_dim_strides);
void* output_str_magic_d =
runtime::gpu::invoke_memory_primitive(m_ctx, idx_output_str_magic);
void* output_str_shift_d =
runtime::gpu::invoke_memory_primitive(m_ctx, idx_output_str_shift);
void* filter_dim_strides_d =
runtime::gpu::invoke_memory_primitive(ctx, idx_filter_dim_strides);
void* filter_str_magic_d = runtime::gpu::invoke_memory_primitive(ctx, idx_filter_str_magic);
void* filter_str_shift_d = runtime::gpu::invoke_memory_primitive(ctx, idx_filter_str_shift);
runtime::gpu::invoke_memory_primitive(m_ctx, idx_filter_dim_strides);
void* filter_str_magic_d =
runtime::gpu::invoke_memory_primitive(m_ctx, idx_filter_str_magic);
void* filter_str_shift_d =
runtime::gpu::invoke_memory_primitive(m_ctx, idx_filter_str_shift);
void* args_list[] = {&inputs[0],
&inputs[1],
......
......@@ -38,8 +38,7 @@ namespace ngraph
friend class GPUPrimitiveEmitter;
public:
size_t build_pad(const GPURuntimeContext* ctx,
const std::array<std::string, 2>& dtypes,
size_t build_pad(const std::array<std::string, 2>& dtypes,
GPUShape input_shape,
GPUShape output_shape,
GPUShape pad_below,
......@@ -47,22 +46,19 @@ namespace ngraph
GPUShape pad_interior,
const std::string& pad_value = "");
size_t build_pad_dynamic(const GPURuntimeContext* ctx,
const std::array<std::string, 2>& dtypes,
size_t build_pad_dynamic(const std::array<std::string, 2>& dtypes,
GPUShape input_shape,
GPUShape output_shape,
GPUShape padding_below,
GPUShape padding_interior);
size_t build_1d_max_pool(const GPURuntimeContext* ctx,
const std::array<std::string, 2>& dtypes,
size_t build_1d_max_pool(const std::array<std::string, 2>& dtypes,
GPUShape input_shape,
GPUShape output_shape,
size_t window_width,
size_t window_stride);
size_t build_avg_pool(const GPURuntimeContext* ctx,
const std::array<std::string, 2>& dtypes,
size_t build_avg_pool(const std::array<std::string, 2>& dtypes,
GPUShape input_shape,
GPUShape output_shape,
GPUShape window_shape,
......@@ -70,23 +66,20 @@ namespace ngraph
GPUShape padding_below,
bool include_pad = false);
size_t build_slice(const GPURuntimeContext* ctx,
const std::array<std::string, 2>& dtypes,
size_t build_slice(const std::array<std::string, 2>& dtypes,
GPUShape input_shape,
GPUShape lower_bounds,
GPUShape slice_strides,
GPUShape output_shape);
size_t build_reduce_window(const GPURuntimeContext* ctx,
const OpName op_name,
size_t build_reduce_window(const OpName op_name,
const std::vector<std::string>& dtypes,
GPUShape input_shape,
GPUShape output_shape,
GPUShape reduce_window_shape,
GPUShape reduce_window_strides);
size_t build_reverse_sequence(const runtime::gpu::GPURuntimeContext* ctx,
const std::array<std::string, 3>& dtypes,
size_t build_reverse_sequence(const std::array<std::string, 3>& dtypes,
GPUShape input_shape0,
GPUShape input_shape1,
GPUShape output_shape,
......@@ -94,24 +87,21 @@ namespace ngraph
size_t sequence_axis);
template <typename T>
size_t build_elementwise(const GPURuntimeContext* ctx,
const std::vector<std::string>& dtypes,
size_t build_elementwise(const std::vector<std::string>& dtypes,
GPUShape tensor_shape)
{
return build_elementwise_n_to_1(
ctx, dtypes, tensor_shape, CudaOpMap<T>::op, CudaOpMap<T>::math_kernel);
dtypes, tensor_shape, CudaOpMap<T>::op, CudaOpMap<T>::math_kernel);
}
template <typename ELEMENTWISE_OP_TYPE, typename REDUCE_OP_TYPE = ngraph::op::Nop>
size_t build_elementwise_collective(const GPURuntimeContext* ctx,
const std::vector<std::string>& dtypes,
size_t build_elementwise_collective(const std::vector<std::string>& dtypes,
GPUShape tensor_shape,
const std::set<size_t>& reduced_tensors = {},
const std::set<size_t>& axes = {},
bool save_elementwise = false)
{
return build_fused_ew_to_collective(ctx,
dtypes,
return build_fused_ew_to_collective(dtypes,
tensor_shape,
reduced_tensors,
axes,
......@@ -121,26 +111,22 @@ namespace ngraph
save_elementwise);
}
size_t build_replace_slice(const GPURuntimeContext* ctx,
const std::array<std::string, 3>& dtypes,
size_t build_replace_slice(const std::array<std::string, 3>& dtypes,
GPUShape tensor_shape,
GPUShape source_shape,
GPUShape lower_bounds,
GPUShape upper_bounds,
GPUShape slice_stride);
size_t build_broadcast(const GPURuntimeContext* ctx,
const std::array<std::string, 2>& dtypes,
size_t build_broadcast(const std::array<std::string, 2>& dtypes,
GPUShape result_shape,
const std::set<size_t>& bcast_axes);
size_t build_reshape(const GPURuntimeContext* ctx,
const std::array<std::string, 2>& dtypes,
size_t build_reshape(const std::array<std::string, 2>& dtypes,
GPUShape input_shape,
GPUShape input_order);
size_t build_convolution(const GPURuntimeContext* ctx,
const std::array<std::string, 3>& dtypes,
size_t build_convolution(const std::array<std::string, 3>& dtypes,
GPUShape input_shape,
GPUShape input_pad_below,
GPUShape input_dilation,
......@@ -150,19 +136,17 @@ namespace ngraph
GPUShape output_shape);
private:
CUDAEmitter(GPUPrimitiveEmitter* emitter);
CUDAEmitter(GPUPrimitiveEmitter* emitter, GPURuntimeContext* ctx);
uint32_t align_to_block_size(uint32_t threads, uint32_t block_size);
void print_tensor_from_gpu(codegen::CodeWriter& writer,
const std::string& tensor_name,
GPUShape shape);
std::string include_helpers();
size_t build_elementwise_n_to_1(const GPURuntimeContext* ctx,
const std::vector<std::string>& dtypes,
size_t build_elementwise_n_to_1(const std::vector<std::string>& dtypes,
GPUShape tensor_shape,
const char* op,
const char* kernel);
size_t build_fused_ew_to_collective(const GPURuntimeContext* ctx,
const std::vector<std::string>& dtypes,
size_t build_fused_ew_to_collective(const std::vector<std::string>& dtypes,
GPUShape tensor_shape,
const std::set<size_t>& reduced_tensors,
const std::set<size_t>& axes,
......@@ -172,6 +156,7 @@ namespace ngraph
bool save_elementwise);
GPUPrimitiveEmitter* m_primitive_emitter;
GPURuntimeContext* m_ctx;
};
}
}
......
......@@ -107,9 +107,10 @@ std::vector<int>
return low_vec;
}
runtime::gpu::CUDNNEmitter::CUDNNEmitter(GPUPrimitiveEmitter* emitter)
runtime::gpu::CUDNNEmitter::CUDNNEmitter(GPUPrimitiveEmitter* emitter, GPURuntimeContext* ctx)
: m_primitive_emitter(emitter)
{
m_ctx = ctx;
}
cudnnDataType_t runtime::gpu::CUDNNEmitter::get_cudnn_datatype(std::string dtype)
......@@ -128,8 +129,7 @@ cudnnDataType_t runtime::gpu::CUDNNEmitter::get_cudnn_datatype(std::string dtype
return p->second;
}
size_t runtime::gpu::CUDNNEmitter::build_reduce_forward(const runtime::gpu::GPURuntimeContext* ctx,
const cudnnReduceTensorOp_t& reduce_op,
size_t runtime::gpu::CUDNNEmitter::build_reduce_forward(const cudnnReduceTensorOp_t& reduce_op,
const std::string& dtype,
const Shape& input_shape,
const AxisSet& reduction_axes)
......@@ -162,7 +162,7 @@ size_t runtime::gpu::CUDNNEmitter::build_reduce_forward(const runtime::gpu::GPUR
GPUAllocator allocator = this->m_primitive_emitter->get_memory_allocator();
size_t workspace_size = 0;
CUDNN_SAFE_CALL(cudnnGetReductionWorkspaceSize(
*ctx->cudnn_handle, desc, input_desc, output_desc, &workspace_size));
*m_ctx->cudnn_handle, desc, input_desc, output_desc, &workspace_size));
size_t workspace_idx = allocator.reserve_workspace(workspace_size);
void* alpha = m_host_parameters.allocate_by_datatype(data_type, 1.0);
void* beta = m_host_parameters.allocate_by_datatype(data_type, 0);
......@@ -177,8 +177,8 @@ size_t runtime::gpu::CUDNNEmitter::build_reduce_forward(const runtime::gpu::GPUR
CUDNN_REDUCE_TENSOR_NO_INDICES,
CUDNN_32BIT_INDICES));
void* workspace_ptr = runtime::gpu::invoke_memory_primitive(ctx, workspace_idx);
CUDNN_SAFE_CALL(cudnnReduceTensor(*ctx->cudnn_handle,
void* workspace_ptr = runtime::gpu::invoke_memory_primitive(m_ctx, workspace_idx);
CUDNN_SAFE_CALL(cudnnReduceTensor(*m_ctx->cudnn_handle,
desc,
nullptr,
0,
......@@ -197,8 +197,7 @@ size_t runtime::gpu::CUDNNEmitter::build_reduce_forward(const runtime::gpu::GPUR
return primitive_index;
}
size_t runtime::gpu::CUDNNEmitter::build_tensor_op(const GPURuntimeContext* ctx,
const cudnnOpTensorOp_t& tensor_op,
size_t runtime::gpu::CUDNNEmitter::build_tensor_op(const cudnnOpTensorOp_t& tensor_op,
const std::string& dtype,
const Shape& input_shape,
const double alpha0,
......@@ -231,7 +230,7 @@ size_t runtime::gpu::CUDNNEmitter::build_tensor_op(const GPURuntimeContext* ctx,
CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(
opTensorDesc, tensor_op, data_type, CUDNN_NOT_PROPAGATE_NAN));
CUDNN_SAFE_CALL(cudnnOpTensor(*ctx->cudnn_handle,
CUDNN_SAFE_CALL(cudnnOpTensor(*m_ctx->cudnn_handle,
opTensorDesc,
alpha_dt0,
descriptor,
......@@ -326,8 +325,7 @@ cudnnConvolutionDescriptor_t& runtime::gpu::CUDNNEmitter::get_cudnn_convolution_
return conv_descriptor;
}
size_t runtime::gpu::CUDNNEmitter::build_convolution(const runtime::gpu::GPURuntimeContext* ctx,
const std::string& dtype,
size_t runtime::gpu::CUDNNEmitter::build_convolution(const std::string& dtype,
const Shape& input_tensor_shape,
const Shape& input_filter_shape,
const Shape& output_tensor_shape,
......@@ -366,7 +364,7 @@ size_t runtime::gpu::CUDNNEmitter::build_convolution(const runtime::gpu::GPURunt
void* beta = m_host_parameters.allocate_by_datatype(data_type, 0);
size_t workspace_size_in_bytes = 0;
CUDNN_SAFE_CALL(cudnnGetConvolutionForwardWorkspaceSize(*ctx->cudnn_handle,
CUDNN_SAFE_CALL(cudnnGetConvolutionForwardWorkspaceSize(*m_ctx->cudnn_handle,
tensor_desc_0,
filter_desc,
conv_desc,
......@@ -382,8 +380,8 @@ size_t runtime::gpu::CUDNNEmitter::build_convolution(const runtime::gpu::GPURunt
std::unique_ptr<gpu::primitive> conv;
conv.reset(new gpu::primitive{[=, &conv_desc, &tensor_desc_0, &filter_desc, &tensor_desc_1](
void** inputs, void** outputs) {
void* workspace_ptr = runtime::gpu::invoke_memory_primitive(ctx, workspace_idx);
CUDNN_SAFE_CALL(cudnnConvolutionForward(*ctx->cudnn_handle,
void* workspace_ptr = runtime::gpu::invoke_memory_primitive(m_ctx, workspace_idx);
CUDNN_SAFE_CALL(cudnnConvolutionForward(*m_ctx->cudnn_handle,
alpha,
tensor_desc_0,
inputs[0],
......@@ -404,7 +402,6 @@ size_t runtime::gpu::CUDNNEmitter::build_convolution(const runtime::gpu::GPURunt
}
size_t runtime::gpu::CUDNNEmitter::build_convolution_backward_data(
const runtime::gpu::GPURuntimeContext* ctx,
const std::string& dtype,
const Shape& input_filter_shape,
const Shape& input_tensor_shape,
......@@ -443,7 +440,7 @@ size_t runtime::gpu::CUDNNEmitter::build_convolution_backward_data(
void* beta = m_host_parameters.allocate_by_datatype(data_type, 0);
size_t workspace_size_in_bytes = 0;
CUDNN_SAFE_CALL(cudnnGetConvolutionBackwardDataWorkspaceSize(*ctx->cudnn_handle,
CUDNN_SAFE_CALL(cudnnGetConvolutionBackwardDataWorkspaceSize(*m_ctx->cudnn_handle,
filter_desc,
tensor_desc_0,
conv_desc,
......@@ -459,8 +456,8 @@ size_t runtime::gpu::CUDNNEmitter::build_convolution_backward_data(
std::unique_ptr<gpu::primitive> conv;
conv.reset(new gpu::primitive{[=, &conv_desc, &tensor_desc_0, &filter_desc, &tensor_desc_1](
void** inputs, void** outputs) {
void* workspace_ptr = runtime::gpu::invoke_memory_primitive(ctx, workspace_idx);
CUDNN_SAFE_CALL(cudnnConvolutionBackwardData(*ctx->cudnn_handle,
void* workspace_ptr = runtime::gpu::invoke_memory_primitive(m_ctx, workspace_idx);
CUDNN_SAFE_CALL(cudnnConvolutionBackwardData(*m_ctx->cudnn_handle,
alpha,
filter_desc,
inputs[0],
......@@ -481,7 +478,6 @@ size_t runtime::gpu::CUDNNEmitter::build_convolution_backward_data(
}
size_t runtime::gpu::CUDNNEmitter::build_convolution_backward_filter(
const runtime::gpu::GPURuntimeContext* ctx,
const std::string& dtype,
const Shape& input_tensor_shape_0,
const Shape& input_tensor_shape_1,
......@@ -520,7 +516,7 @@ size_t runtime::gpu::CUDNNEmitter::build_convolution_backward_filter(
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
size_t workspace_size_in_bytes = 0;
CUDNN_SAFE_CALL(cudnnGetConvolutionBackwardFilterWorkspaceSize(*ctx->cudnn_handle,
CUDNN_SAFE_CALL(cudnnGetConvolutionBackwardFilterWorkspaceSize(*m_ctx->cudnn_handle,
tensor_desc_0,
tensor_desc_1,
conv_desc,
......@@ -538,8 +534,8 @@ size_t runtime::gpu::CUDNNEmitter::build_convolution_backward_filter(
std::unique_ptr<gpu::primitive> conv;
conv.reset(new gpu::primitive{[=, &conv_desc, &tensor_desc_0, &filter_desc, &tensor_desc_1](
void** inputs, void** outputs) {
void* workspace_ptr = runtime::gpu::invoke_memory_primitive(ctx, workspace_idx);
CUDNN_SAFE_CALL(cudnnConvolutionBackwardFilter(*ctx->cudnn_handle,
void* workspace_ptr = runtime::gpu::invoke_memory_primitive(m_ctx, workspace_idx);
CUDNN_SAFE_CALL(cudnnConvolutionBackwardFilter(*m_ctx->cudnn_handle,
alpha,
tensor_desc_0,
inputs[0],
......@@ -558,8 +554,7 @@ size_t runtime::gpu::CUDNNEmitter::build_convolution_backward_filter(
return primitive_index;
}
size_t runtime::gpu::CUDNNEmitter::build_pooling(const runtime::gpu::GPURuntimeContext* ctx,
const cudnnPoolingMode_t& pool_op,
size_t runtime::gpu::CUDNNEmitter::build_pooling(const cudnnPoolingMode_t& pool_op,
const std::string& dtype,
const Prop& direction,
const Shape& input_shape,
......@@ -638,7 +633,7 @@ size_t runtime::gpu::CUDNNEmitter::build_pooling(const runtime::gpu::GPURuntimeC
{
pool.reset(new gpu::primitive{
[=, &desc, &input_desc, &output_desc](void** inputs, void** outputs) {
CUDNN_SAFE_CALL(cudnnPoolingForward(*ctx->cudnn_handle,
CUDNN_SAFE_CALL(cudnnPoolingForward(*m_ctx->cudnn_handle,
desc,
alpha,
input_desc,
......@@ -660,7 +655,7 @@ size_t runtime::gpu::CUDNNEmitter::build_pooling(const runtime::gpu::GPURuntimeC
// cuDNN requires the output tensor of the maxpool fprop to be passed even though
// it is not mathematically necessary. It appears, however, that it is not actually
// used as the adjoints are passed in place and the correct result is achieved.
CUDNN_SAFE_CALL(cudnnPoolingBackward(*ctx->cudnn_handle,
CUDNN_SAFE_CALL(cudnnPoolingBackward(*m_ctx->cudnn_handle,
desc,
alpha,
// output (wrt maxpool) tensor
......@@ -686,8 +681,7 @@ size_t runtime::gpu::CUDNNEmitter::build_pooling(const runtime::gpu::GPURuntimeC
return primitive_index;
}
size_t runtime::gpu::CUDNNEmitter::build_batchnorm(const runtime::gpu::GPURuntimeContext* ctx,
const cudnnBatchNormMode_t& bn_op,
size_t runtime::gpu::CUDNNEmitter::build_batchnorm(const cudnnBatchNormMode_t& bn_op,
const std::string& dtype,
const Prop& direction,
const Shape& tensor_shape,
......@@ -728,7 +722,7 @@ size_t runtime::gpu::CUDNNEmitter::build_batchnorm(const runtime::gpu::GPURuntim
{
batchnorm.reset(new gpu::primitive{
[=, &tensor_desc, &derived_param_desc](void** inputs, void** outputs) {
CUDNN_SAFE_CALL(cudnnBatchNormalizationForwardInference(*ctx->cudnn_handle,
CUDNN_SAFE_CALL(cudnnBatchNormalizationForwardInference(*m_ctx->cudnn_handle,
bn_op,
alpha,
beta,
......@@ -762,7 +756,7 @@ size_t runtime::gpu::CUDNNEmitter::build_batchnorm(const runtime::gpu::GPURuntim
void* bias_factor = m_host_parameters.allocate_by_datatype(data_type, (m - 1) / m);
batchnorm.reset(new gpu::primitive{
[=, &op_desc, &tensor_desc, &derived_param_desc](void** inputs, void** outputs) {
CUDNN_SAFE_CALL(cudnnBatchNormalizationForwardTraining(*ctx->cudnn_handle,
CUDNN_SAFE_CALL(cudnnBatchNormalizationForwardTraining(*m_ctx->cudnn_handle,
bn_op,
alpha,
beta,
......@@ -781,7 +775,7 @@ size_t runtime::gpu::CUDNNEmitter::build_batchnorm(const runtime::gpu::GPURuntim
NULL));
// convert to biased variance
CUDNN_SAFE_CALL(cudnnOpTensor(*ctx->cudnn_handle,
CUDNN_SAFE_CALL(cudnnOpTensor(*m_ctx->cudnn_handle,
op_desc,
beta,
derived_param_desc,
......@@ -800,7 +794,7 @@ size_t runtime::gpu::CUDNNEmitter::build_batchnorm(const runtime::gpu::GPURuntim
batchnorm.reset(new gpu::primitive{
[=, &tensor_desc, &derived_param_desc](void** inputs, void** outputs) {
CUDNN_SAFE_CALL(cudnnBatchNormalizationBackward(
*ctx->cudnn_handle,
*m_ctx->cudnn_handle,
bn_op,
alpha,
beta,
......@@ -829,8 +823,7 @@ size_t runtime::gpu::CUDNNEmitter::build_batchnorm(const runtime::gpu::GPURuntim
return primitive_index;
}
size_t runtime::gpu::CUDNNEmitter::build_softmax(const runtime::gpu::GPURuntimeContext* ctx,
const cudnnSoftmaxAlgorithm_t& algorithm,
size_t runtime::gpu::CUDNNEmitter::build_softmax(const cudnnSoftmaxAlgorithm_t& algorithm,
const cudnnSoftmaxMode_t& mode,
const std::string& dtype,
const Prop& direction,
......@@ -862,7 +855,7 @@ size_t runtime::gpu::CUDNNEmitter::build_softmax(const runtime::gpu::GPURuntimeC
case Prop::Inference:
{
softmax.reset(new gpu::primitive{[=, &tensor_desc](void** inputs, void** outputs) {
CUDNN_SAFE_CALL(cudnnSoftmaxForward(*ctx->cudnn_handle,
CUDNN_SAFE_CALL(cudnnSoftmaxForward(*m_ctx->cudnn_handle,
algorithm,
mode,
alpha,
......@@ -877,7 +870,7 @@ size_t runtime::gpu::CUDNNEmitter::build_softmax(const runtime::gpu::GPURuntimeC
case Prop::Backward:
{
softmax.reset(new gpu::primitive{[=, &tensor_desc](void** inputs, void** outputs) {
CUDNN_SAFE_CALL(cudnnSoftmaxBackward(*ctx->cudnn_handle,
CUDNN_SAFE_CALL(cudnnSoftmaxBackward(*m_ctx->cudnn_handle,
algorithm,
mode,
alpha,
......
......@@ -56,8 +56,7 @@ namespace ngraph
Backward
};
size_t build_convolution(const runtime::gpu::GPURuntimeContext* ctx,
const std::string& dtype,
size_t build_convolution(const std::string& dtype,
const Shape& input_tensor_shape,
const Shape& input_filter_shape,
const Shape& output_tensor_shape,
......@@ -65,8 +64,7 @@ namespace ngraph
const Strides& window_dilation_strides,
const Shape& padding_below);
size_t build_convolution_backward_data(const runtime::gpu::GPURuntimeContext* ctx,
const std::string& dtype,
size_t build_convolution_backward_data(const std::string& dtype,
const Shape& input_filter_shape,
const Shape& input_tensor_shape,
const Shape& output_tensor_shape,
......@@ -74,8 +72,7 @@ namespace ngraph
const Strides& window_dilation_strides,
const Shape& padding_below);
size_t build_convolution_backward_filter(const runtime::gpu::GPURuntimeContext* ctx,
const std::string& dtype,
size_t build_convolution_backward_filter(const std::string& dtype,
const Shape& input_tensor_shape_0,
const Shape& input_tensor_shape_1,
const Shape& output_filter_shape,
......@@ -83,22 +80,19 @@ namespace ngraph
const Strides& window_dilation_strides,
const Shape& padding_below);
size_t build_reduce_forward(const GPURuntimeContext* ctx,
const cudnnReduceTensorOp_t& reduce_op,
size_t build_reduce_forward(const cudnnReduceTensorOp_t& reduce_op,
const std::string& dtype,
const Shape& input_shape,
const AxisSet& reduction_axes);
size_t build_tensor_op(const GPURuntimeContext* ctx,
const cudnnOpTensorOp_t& tensor_op,
size_t build_tensor_op(const cudnnOpTensorOp_t& tensor_op,
const std::string& dtype,
const Shape& input_shape,
const double alpha0,
const double alpha1,
const double beta);
size_t build_pooling(const GPURuntimeContext* ctx,
const cudnnPoolingMode_t& pool_op,
size_t build_pooling(const cudnnPoolingMode_t& pool_op,
const std::string& dtype,
const Prop& direction,
const ngraph::Shape& input_shape,
......@@ -108,23 +102,21 @@ namespace ngraph
const ngraph::Shape& padding_below,
const ngraph::Shape& padding_above);
size_t build_batchnorm(const runtime::gpu::GPURuntimeContext* ctx,
const cudnnBatchNormMode_t& bn_op,
size_t build_batchnorm(const cudnnBatchNormMode_t& bn_op,
const std::string& dtype,
const Prop& direction,
const Shape& tensor_shape,
const Shape& param_shape,
double epsilon);
size_t build_softmax(const runtime::gpu::GPURuntimeContext* ctx,
const cudnnSoftmaxAlgorithm_t& algorithm,
size_t build_softmax(const cudnnSoftmaxAlgorithm_t& algorithm,
const cudnnSoftmaxMode_t& mode,
const std::string& dtype,
const Prop& direction,
const Shape& tensor_shape);
private:
CUDNNEmitter(GPUPrimitiveEmitter* emitter);
CUDNNEmitter(GPUPrimitiveEmitter* emitter, GPURuntimeContext* ctx);
void* get_data_by_type(cudnnDataType_t data_type, double value);
......@@ -149,6 +141,7 @@ namespace ngraph
CUDNNHostParameters m_host_parameters;
GPUPrimitiveEmitter* m_primitive_emitter;
GPURuntimeContext* m_ctx;
};
}
}
......
......@@ -14,9 +14,15 @@
* limitations under the License.
*******************************************************************************/
#include "ngraph/runtime/gpu/gpu_backend.hpp"
#include <cublas_v2.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cudnn.h>
#include "ngraph/graph_util.hpp"
#include "ngraph/runtime/gpu/gpu_backend.hpp"
#include "ngraph/runtime/gpu/gpu_external_function.hpp"
#include "ngraph/runtime/gpu/gpu_primitive_emitter.hpp"
#include "ngraph/runtime/gpu/gpu_tensor_view.hpp"
#include "ngraph/util.hpp"
......@@ -38,6 +44,57 @@ extern "C" void delete_backend(runtime::Backend* backend)
delete backend;
}
runtime::gpu::GPU_Backend::GPU_Backend()
: runtime::Backend()
, m_context(new BackendContext())
{
}
runtime::gpu::GPU_Backend::BackendContext::BackendContext()
: m_runtime_context(new GPURuntimeContext)
, m_primitive_emitter(new GPUPrimitiveEmitter(m_runtime_context))
{
// Create context use driver API and make it current, the runtime call will pickup the context
// http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html
// #interoperability-between-runtime-and-driver-apis
ngraph::runtime::gpu::CudaContextManager::Instance().SetContextCurrent();
m_runtime_context->cublas_handle = new cublasHandle_t;
cublasStatus_t cublasStatus = cublasCreate(m_runtime_context->cublas_handle);
if (cublasStatus != CUBLAS_STATUS_SUCCESS)
{
throw runtime_error("cuBLAS create handle failed");
}
// Pass scalars as reference on the Device
cublasSetPointerMode(*m_runtime_context->cublas_handle, CUBLAS_POINTER_MODE_DEVICE);
m_runtime_context->cudnn_handle = new cudnnHandle_t;
cudnnStatus_t cudnnStatus = cudnnCreate(m_runtime_context->cudnn_handle);
if (cudnnStatus != CUDNN_STATUS_SUCCESS)
{
throw runtime_error("cuDNN create handle failed");
}
// register with c-api runtime context
m_runtime_context->compiled_kernel_pool = new CudaFunctionPool;
}
void runtime::gpu::GPU_Backend::BackendContext::prepare_runtime_context()
{
// add pointers to gpu primitives into the gpu runtime context
m_runtime_context->gpu_primitives = m_primitive_emitter->get_primitives().data();
m_runtime_context->gpu_memory_primitives = m_primitive_emitter->get_memory_primitives().data();
}
runtime::gpu::GPU_Backend::BackendContext::~BackendContext()
{
cublasDestroy(*m_runtime_context->cublas_handle);
delete m_runtime_context->cublas_handle;
cudnnDestroy(*m_runtime_context->cudnn_handle);
delete m_runtime_context->cudnn_handle;
delete m_runtime_context->compiled_kernel_pool;
}
shared_ptr<runtime::gpu::GPU_CallFrame> runtime::gpu::GPU_Backend::make_call_frame(
const shared_ptr<GPU_ExternalFunction>& external_function)
{
......@@ -61,7 +118,7 @@ bool runtime::gpu::GPU_Backend::compile(shared_ptr<Function> func)
FunctionInstance& instance = m_function_map[func];
if (instance.m_external_function == nullptr)
{
instance.m_external_function = make_shared<GPU_ExternalFunction>(func);
instance.m_external_function = make_shared<GPU_ExternalFunction>(func, m_context);
instance.m_external_function->m_emit_timing = instance.m_performance_counters_enabled;
auto cf = instance.m_external_function->make_call_frame();
instance.m_call_frame = dynamic_pointer_cast<GPU_CallFrame>(cf);
......@@ -83,7 +140,9 @@ bool runtime::gpu::GPU_Backend::call(shared_ptr<Function> func,
rc = compile(func);
}
instance.m_call_frame->call(outputs, inputs);
// ensure the GPURuntimeContext primitive pointers are valid
m_context->prepare_runtime_context();
instance.m_call_frame->call(outputs, inputs, m_context->m_runtime_context.get());
return rc;
}
......
......@@ -31,10 +31,13 @@ namespace ngraph
class GPU_ExternalFunction;
class GPU_CallFrame;
class GPUPrimitiveEmitter;
struct GPURuntimeContext;
class GPU_Backend : public Backend
{
public:
GPU_Backend();
std::shared_ptr<ngraph::runtime::gpu::GPU_CallFrame> make_call_frame(
const std::shared_ptr<ngraph::runtime::gpu::GPU_ExternalFunction>&
external_function);
......@@ -59,6 +62,17 @@ namespace ngraph
std::vector<PerformanceCounter>
get_performance_data(std::shared_ptr<Function> func) const override;
class BackendContext
{
public:
BackendContext();
~BackendContext();
void prepare_runtime_context();
std::unique_ptr<GPURuntimeContext> m_runtime_context;
std::unique_ptr<GPUPrimitiveEmitter> m_primitive_emitter;
};
private:
class FunctionInstance
{
......@@ -69,6 +83,7 @@ namespace ngraph
};
std::map<std::shared_ptr<Function>, FunctionInstance> m_function_map;
std::shared_ptr<BackendContext> m_context;
};
}
}
......
......@@ -32,17 +32,16 @@ runtime::gpu::GPU_CallFrame::GPU_CallFrame(std::shared_ptr<GPU_ExternalFunction>
: m_external_function(external_function)
, m_compiled_function(compiled_function)
{
setup_runtime_context();
}
runtime::gpu::GPU_CallFrame::~GPU_CallFrame()
{
cleanup_runtime_context();
}
void runtime::gpu::GPU_CallFrame::call(
const std::vector<std::shared_ptr<runtime::TensorView>>& output_tvs,
const std::vector<std::shared_ptr<runtime::TensorView>>& input_tvs)
const std::vector<std::shared_ptr<runtime::TensorView>>& input_tvs,
GPURuntimeContext* ctx)
{
//Device tensors
vector<void*> inputs;
......@@ -61,18 +60,5 @@ void runtime::gpu::GPU_CallFrame::call(
outputs.push_back(tv->m_allocated_buffer_pool);
}
m_compiled_function(inputs.data(), outputs.data(), m_external_function->m_ctx.get());
}
void runtime::gpu::GPU_CallFrame::setup_runtime_context()
{
// add pointers to gpu primitives into the gpu runtime context
const auto& primitive_emitter = m_external_function->get_primitive_emitter();
m_external_function->m_ctx->gpu_primitives = primitive_emitter->get_primitives().data();
m_external_function->m_ctx->gpu_memory_primitives =
primitive_emitter->get_memory_primitives().data();
}
void runtime::gpu::GPU_CallFrame::cleanup_runtime_context()
{
m_compiled_function(inputs.data(), outputs.data(), ctx);
}
......@@ -53,10 +53,8 @@ namespace ngraph
///
/// Tuples will be expanded into their tensor views to build the call frame.
void call(const std::vector<std::shared_ptr<runtime::TensorView>>& outputs,
const std::vector<std::shared_ptr<runtime::TensorView>>& inputs);
void setup_runtime_context();
void cleanup_runtime_context();
const std::vector<std::shared_ptr<runtime::TensorView>>& inputs,
GPURuntimeContext* ctx);
protected:
std::shared_ptr<GPU_ExternalFunction> m_external_function;
......
......@@ -123,13 +123,8 @@ namespace ngraph
{
auto& cudnn_emitter =
external_function->get_primitive_emitter()->get_cudnn_emitter();
auto index = cudnn_emitter->build_tensor_op(external_function->ctx().get(),
CUDNN_OP_TENSOR_ADD,
out[0].get_type(),
args[0].get_shape(),
1.0,
1.0,
0);
auto index = cudnn_emitter->build_tensor_op(
CUDNN_OP_TENSOR_ADD, out[0].get_type(), args[0].get_shape(), 1.0, 1.0, 0);
writer << "gpu::invoke_primitive(ctx, " << index << ", ";
writer << "std::vector<void*>{" << args[0].get_name() << ","
......@@ -187,22 +182,16 @@ namespace ngraph
auto& cuda_emitter =
external_function->get_primitive_emitter()->get_cuda_emitter();
size_t reshape_data_index =
cuda_emitter->build_reshape(external_function->ctx().get(),
{{args[0].get_type(), args[0].get_type()}},
input_shape,
input_order);
size_t reshape_data_index = cuda_emitter->build_reshape(
{{args[0].get_type(), args[0].get_type()}}, input_shape, input_order);
writer << "void* data = gpu::invoke_memory_primitive(ctx, "
<< transposed_data_idx << ");\n";
writer << "gpu::invoke_primitive(ctx, " << reshape_data_index << ", ";
writer << "std::vector<void*>{" << args[0].get_name() << "}.data(), ";
writer << "std::vector<void*>{data}.data());\n";
size_t reshape_filter_index =
cuda_emitter->build_reshape(external_function->ctx().get(),
{{args[1].get_type(), args[1].get_type()}},
filter_shape,
input_order);
size_t reshape_filter_index = cuda_emitter->build_reshape(
{{args[1].get_type(), args[1].get_type()}}, filter_shape, input_order);
writer << "void* filter = gpu::invoke_memory_primitive(ctx, "
<< transposed_filter_idx << ");\n";
writer << "gpu::invoke_primitive(ctx, " << reshape_filter_index << ", ";
......@@ -227,7 +216,6 @@ namespace ngraph
output_shape = reshape(output_shape, input_order);
size_t conv_index = cuda_emitter->build_convolution(
external_function->ctx().get(),
{{args[0].get_type(), args[1].get_type(), out[0].get_type()}},
input_shape,
padding_below_diff,
......@@ -250,11 +238,8 @@ namespace ngraph
input_order.push_back(i);
}
size_t reshape_output_index =
cuda_emitter->build_reshape(external_function->ctx().get(),
{{args[1].get_type(), args[1].get_type()}},
output_shape,
input_order);
size_t reshape_output_index = cuda_emitter->build_reshape(
{{args[1].get_type(), args[1].get_type()}}, output_shape, input_order);
writer << "gpu::invoke_primitive(ctx, " << reshape_output_index << ", ";
writer << "std::vector<void*>{output}.data(), ";
writer << "std::vector<void*>{" << out[0].get_name() << "}.data());\n";
......@@ -304,7 +289,6 @@ namespace ngraph
auto& cuda_emitter =
external_function->get_primitive_emitter()->get_cuda_emitter();
auto pad_dynamic_index = cuda_emitter->build_pad_dynamic(
external_function->ctx().get(),
{{args[0].get_type(), out[0].get_type()}},
input_shape,
input_shape_padded,
......@@ -322,8 +306,7 @@ namespace ngraph
auto& cudnn_emitter =
external_function->get_primitive_emitter()->get_cudnn_emitter();
size_t index = cudnn_emitter->build_convolution(external_function->ctx().get(),
out[0].get_type(),
size_t index = cudnn_emitter->build_convolution(out[0].get_type(),
input_shape_padded,
args[1].get_shape(),
out[0].get_shape(),
......@@ -424,8 +407,7 @@ namespace ngraph
auto& cuda_emitter =
external_function->get_primitive_emitter()->get_cuda_emitter();
auto pad_dynamic_index =
cuda_emitter->build_pad_dynamic(external_function->ctx().get(),
{{args[0].get_type(), out[0].get_type()}},
cuda_emitter->build_pad_dynamic({{args[0].get_type(), out[0].get_type()}},
output_shape,
output_shape_padded,
padding_below,
......@@ -442,8 +424,7 @@ namespace ngraph
auto& cudnn_emitter =
external_function->get_primitive_emitter()->get_cudnn_emitter();
size_t index =
cudnn_emitter->build_convolution_backward_data(external_function->ctx().get(),
out[0].get_type(),
cudnn_emitter->build_convolution_backward_data(out[0].get_type(),
args[0].get_shape(),
args[1].get_shape(),
output_shape_padded,
......@@ -469,8 +450,7 @@ namespace ngraph
auto& cuda_emitter =
external_function->get_primitive_emitter()->get_cuda_emitter();
auto slice_index =
cuda_emitter->build_slice(external_function->ctx().get(),
{{args[0].get_type(), out[0].get_type()}},
cuda_emitter->build_slice({{args[0].get_type(), out[0].get_type()}},
output_shape_padded,
padding_below_back,
padding_interior_back,
......@@ -550,8 +530,7 @@ namespace ngraph
auto& cuda_emitter =
external_function->get_primitive_emitter()->get_cuda_emitter();
auto pad_dynamic_index =
cuda_emitter->build_pad_dynamic(external_function->ctx().get(),
{{args[0].get_type(), out[0].get_type()}},
cuda_emitter->build_pad_dynamic({{args[0].get_type(), out[0].get_type()}},
input_shape,
input_shape_padded,
padding_below,
......@@ -569,8 +548,7 @@ namespace ngraph
auto& cudnn_emitter =
external_function->get_primitive_emitter()->get_cudnn_emitter();
size_t index =
cudnn_emitter->build_convolution_backward_filter(external_function->ctx().get(),
out[0].get_type(),
cudnn_emitter->build_convolution_backward_filter(out[0].get_type(),
input_shape_padded,
args[1].get_shape(),
out[0].get_shape(),
......@@ -770,13 +748,8 @@ namespace ngraph
{
auto& cudnn_emitter =
external_function->get_primitive_emitter()->get_cudnn_emitter();
auto index = cudnn_emitter->build_tensor_op(external_function->ctx().get(),
CUDNN_OP_TENSOR_MAX,
out[0].get_type(),
args[0].get_shape(),
1.0,
1.0,
0);
auto index = cudnn_emitter->build_tensor_op(
CUDNN_OP_TENSOR_MAX, out[0].get_type(), args[0].get_shape(), 1.0, 1.0, 0);
writer << "gpu::invoke_primitive(ctx, " << index << ", ";
writer << "std::vector<void*>{" << args[0].get_name() << ","
......@@ -798,13 +771,8 @@ namespace ngraph
{
auto& cudnn_emitter =
external_function->get_primitive_emitter()->get_cudnn_emitter();
auto index = cudnn_emitter->build_tensor_op(external_function->ctx().get(),
CUDNN_OP_TENSOR_MIN,
out[0].get_type(),
args[0].get_shape(),
1.0,
1.0,
0);
auto index = cudnn_emitter->build_tensor_op(
CUDNN_OP_TENSOR_MIN, out[0].get_type(), args[0].get_shape(), 1.0, 1.0, 0);
writer << "gpu::invoke_primitive(ctx, " << index << ", ";
writer << "std::vector<void*>{" << args[0].get_name() << ","
......@@ -838,11 +806,8 @@ namespace ngraph
auto& cuda_emitter = external_function->get_primitive_emitter()->get_cuda_emitter();
auto bcast_index =
cuda_emitter->build_broadcast(external_function->ctx().get(),
{{args[0].get_type(), out[0].get_type()}},
result_shape,
axes);
auto bcast_index = cuda_emitter->build_broadcast(
{{args[0].get_type(), out[0].get_type()}}, result_shape, axes);
writer << "gpu::invoke_primitive(ctx, " << bcast_index << ", ";
writer << "std::vector<void*>{" << args[0].get_name() << "}.data(), ";
writer << "std::vector<void*>{" << out[0].get_name() << "}.data()";
......@@ -954,11 +919,8 @@ namespace ngraph
{
auto& cuda_emitter =
external_function->get_primitive_emitter()->get_cuda_emitter();
auto index =
cuda_emitter->build_reshape(external_function->ctx().get(),
{{args[0].get_type(), out[0].get_type()}},
arg_shape,
input_order);
auto index = cuda_emitter->build_reshape(
{{args[0].get_type(), out[0].get_type()}}, arg_shape, input_order);
writer << "gpu::invoke_primitive(ctx, " << index << ", ";
writer << "std::vector<void*>{" << args[0].get_name() << "}.data(), ";
......@@ -1031,8 +993,7 @@ namespace ngraph
auto& cuda_emitter =
external_function->get_primitive_emitter()->get_cuda_emitter();
auto index =
cuda_emitter->build_slice(external_function->ctx().get(),
{{args[0].get_type(), out[0].get_type()}},
cuda_emitter->build_slice({{args[0].get_type(), out[0].get_type()}},
arg_shape,
lower_bounds,
slice_strides,
......@@ -1117,7 +1078,6 @@ namespace ngraph
auto& cuda_emitter = external_function->get_primitive_emitter()->get_cuda_emitter();
auto rs_index = cuda_emitter->build_reverse_sequence(
external_function->ctx().get(),
{{args[0].get_type(), args[1].get_type(), out[0].get_type()}},
arg_shape0,
arg_shape1,
......@@ -1142,13 +1102,8 @@ namespace ngraph
{
auto& cudnn_emitter =
external_function->get_primitive_emitter()->get_cudnn_emitter();
auto index = cudnn_emitter->build_tensor_op(external_function->ctx().get(),
CUDNN_OP_TENSOR_MUL,
out[0].get_type(),
args[0].get_shape(),
1.0,
1.0,
0);
auto index = cudnn_emitter->build_tensor_op(
CUDNN_OP_TENSOR_MUL, out[0].get_type(), args[0].get_shape(), 1.0, 1.0, 0);
writer << "gpu::invoke_primitive(ctx, " << index << ", ";
writer << "std::vector<void*>{" << args[0].get_name() << ","
......@@ -1201,13 +1156,8 @@ namespace ngraph
{
auto& cudnn_emitter =
external_function->get_primitive_emitter()->get_cudnn_emitter();
auto index = cudnn_emitter->build_tensor_op(external_function->ctx().get(),
CUDNN_OP_TENSOR_SQRT,
out[0].get_type(),
args[0].get_shape(),
1.0,
0,
0);
auto index = cudnn_emitter->build_tensor_op(
CUDNN_OP_TENSOR_SQRT, out[0].get_type(), args[0].get_shape(), 1.0, 0, 0);
writer << "gpu::invoke_primitive(ctx, " << index << ", ";
writer << "std::vector<void*>{" << args[0].get_name() << ","
......@@ -1259,8 +1209,7 @@ namespace ngraph
auto& cudnn_emitter =
external_function->get_primitive_emitter()->get_cudnn_emitter();
auto max_index =
cudnn_emitter->build_reduce_forward(external_function->ctx().get(),
CUDNN_REDUCE_TENSOR_MAX,
cudnn_emitter->build_reduce_forward(CUDNN_REDUCE_TENSOR_MAX,
out[0].get_type(),
args[0].get_shape(),
max_op->get_reduction_axes());
......@@ -1308,8 +1257,7 @@ namespace ngraph
auto& cudnn_emitter =
external_function->get_primitive_emitter()->get_cudnn_emitter();
auto min_index =
cudnn_emitter->build_reduce_forward(external_function->ctx().get(),
CUDNN_REDUCE_TENSOR_MIN,
cudnn_emitter->build_reduce_forward(CUDNN_REDUCE_TENSOR_MIN,
out[0].get_type(),
args[0].get_shape(),
min_op->get_reduction_axes());
......@@ -1348,8 +1296,7 @@ namespace ngraph
auto& cudnn_emitter =
external_function->get_primitive_emitter()->get_cudnn_emitter();
auto sum_index =
cudnn_emitter->build_reduce_forward(external_function->ctx().get(),
CUDNN_REDUCE_TENSOR_ADD,
cudnn_emitter->build_reduce_forward(CUDNN_REDUCE_TENSOR_ADD,
out[0].get_type(),
args[0].get_shape(),
sum->get_reduction_axes());
......@@ -1393,8 +1340,7 @@ namespace ngraph
auto& cudnn_emitter =
external_function->get_primitive_emitter()->get_cudnn_emitter();
auto index =
cudnn_emitter->build_reduce_forward(external_function->ctx().get(),
CUDNN_REDUCE_TENSOR_MUL,
cudnn_emitter->build_reduce_forward(CUDNN_REDUCE_TENSOR_MUL,
out[0].get_type(),
args[0].get_shape(),
product->get_reduction_axes());
......@@ -1491,7 +1437,6 @@ namespace ngraph
auto& cudnn_emitter =
external_function->get_primitive_emitter()->get_cudnn_emitter();
auto reduce_index = cudnn_emitter->build_reduce_forward(
external_function->ctx().get(),
reduce_tensor_op,
out[0].get_type(),
args[0].get_shape(),
......@@ -1596,7 +1541,6 @@ namespace ngraph
args[0].get_type(), args[0].get_type(), out[0].get_type()};
reduce_index = cuda_emitter->build_reduce_window(
external_function->ctx().get(),
it->second,
dtypes,
args[0].get_shape(),
......@@ -1631,8 +1575,7 @@ namespace ngraph
external_function->get_primitive_emitter()->get_cuda_emitter();
auto pad_index =
cuda_emitter->build_pad(external_function->ctx().get(),
{{args[0].get_type(), out[0].get_type()}},
cuda_emitter->build_pad({{args[0].get_type(), out[0].get_type()}},
input_shape,
output_shape,
padding_below,
......@@ -1694,8 +1637,7 @@ namespace ngraph
ss << TypeInfo::Get(args[0].get_element_type())->lowest();
auto pad_index =
cuda_emitter->build_pad(external_function->ctx().get(),
{{args[0].get_type(), out[0].get_type()}},
cuda_emitter->build_pad({{args[0].get_type(), out[0].get_type()}},
input_shape,
shape_to_pool,
padding_below,
......@@ -1733,7 +1675,6 @@ namespace ngraph
external_function->get_primitive_emitter()->get_cuda_emitter();
max_pool_index = cuda_emitter->build_1d_max_pool(
external_function->ctx().get(),
{{args[0].get_type(), out[0].get_type()}},
input_shape,
result_shape,
......@@ -1747,7 +1688,6 @@ namespace ngraph
external_function->get_primitive_emitter()->get_cudnn_emitter();
max_pool_index = cudnn_emitter->build_pooling(
external_function->ctx().get(),
CUDNN_POOLING_MAX,
out[0].get_type(),
CUDNNEmitter::Prop::Forward,
......@@ -1795,8 +1735,7 @@ namespace ngraph
if (fp_input_shape.size() >= 4)
{
auto max_pool_bp_index =
cudnn_emitter->build_pooling(external_function->ctx().get(),
CUDNN_POOLING_MAX,
cudnn_emitter->build_pooling(CUDNN_POOLING_MAX,
out[0].get_type(),
CUDNNEmitter::Prop::Backward,
fp_input_shape,
......@@ -1835,8 +1774,7 @@ namespace ngraph
direction = CUDNNEmitter::Prop::Inference;
}
auto bn_index = cudnn_emitter->build_batchnorm(external_function->ctx().get(),
CUDNN_BATCHNORM_SPATIAL,
auto bn_index = cudnn_emitter->build_batchnorm(CUDNN_BATCHNORM_SPATIAL,
out[0].get_type(),
direction,
args[2].get_shape(),
......@@ -1872,8 +1810,7 @@ namespace ngraph
auto& cudnn_emitter =
external_function->get_primitive_emitter()->get_cudnn_emitter();
auto bn_index = cudnn_emitter->build_batchnorm(external_function->ctx().get(),
CUDNN_BATCHNORM_SPATIAL,
auto bn_index = cudnn_emitter->build_batchnorm(CUDNN_BATCHNORM_SPATIAL,
out[0].get_type(),
CUDNNEmitter::Prop::Backward,
args[2].get_shape(),
......@@ -1970,8 +1907,7 @@ namespace ngraph
external_function->get_primitive_emitter()->get_cuda_emitter();
avg_pool_index =
cuda_emitter->build_avg_pool(external_function->ctx().get(),
{{args[0].get_type(), out[0].get_type()}},
cuda_emitter->build_avg_pool({{args[0].get_type(), out[0].get_type()}},
input_shape,
result_shape,
avg_pool->get_window_shape(),
......@@ -1991,7 +1927,6 @@ namespace ngraph
: CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING;
avg_pool_index = cudnn_emitter->build_pooling(
external_function->ctx().get(),
cudnn_avg_type,
out[0].get_type(),
CUDNNEmitter::Prop::Forward,
......@@ -2036,8 +1971,7 @@ namespace ngraph
: CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING;
auto avg_pool_bp_index =
cudnn_emitter->build_pooling(external_function->ctx().get(),
cudnn_avg_type,
cudnn_emitter->build_pooling(cudnn_avg_type,
out[0].get_type(),
CUDNNEmitter::Prop::Backward,
output_shape,
......@@ -2095,7 +2029,6 @@ namespace ngraph
external_function->get_primitive_emitter()->get_cuda_emitter();
auto replace_slice_index = cuda_emitter->build_replace_slice(
external_function->ctx().get(),
{{args[0].get_type(), args[1].get_type(), out[0].get_type()}},
input_shape,
source_shape,
......@@ -2146,7 +2079,6 @@ namespace ngraph
size_t exp_sum_reduce =
cuda_emitter
->build_elementwise_collective<ngraph::op::Exp, ngraph::op::Add>(
external_function->ctx().get(),
{{args[0].get_type(), out[0].get_type()}},
args[0].get_shape(),
{},
......@@ -2166,7 +2098,6 @@ namespace ngraph
// inplace binary division with fused broadcast to calculate softmax
size_t div_broadcast =
cuda_emitter->build_elementwise_collective<ngraph::op::Divide>(
external_function->ctx().get(),
{{out[0].get_type(), out[0].get_type(), out[0].get_type()}},
out[0].get_shape(),
{1},
......@@ -2181,8 +2112,7 @@ namespace ngraph
else
{
size_t softmax_index =
cudnn_emitter->build_softmax(external_function->ctx().get(),
CUDNN_SOFTMAX_FAST,
cudnn_emitter->build_softmax(CUDNN_SOFTMAX_FAST,
CUDNN_SOFTMAX_MODE_INSTANCE,
out[0].get_type(),
CUDNNEmitter::Prop::Forward,
......
......@@ -85,8 +85,8 @@ namespace ngraph
dtypes.push_back(arg.get_type());
}
dtypes.push_back(out[0].get_type());
auto ew_index = cuda_emitter->build_elementwise<T>(
external_function->ctx().get(), dtypes, out[0].get_shape());
auto ew_index =
cuda_emitter->build_elementwise<T>(dtypes, out[0].get_shape());
writer << "gpu::invoke_primitive(ctx, " << ew_index << ", ";
writer << "std::vector<void*>{" << args.front().get_name();
for (size_t i = 1; i < args.size(); i++)
......
......@@ -237,44 +237,21 @@ static const runtime::gpu::OpMap dispatcher{
{TI(ngraph::op::Or), &runtime::gpu::GPU_Emitter::emit_elementwise<ngraph::op::Or>}};
runtime::gpu::GPU_ExternalFunction::GPU_ExternalFunction(
const shared_ptr<ngraph::Function>& function, bool release_function)
const shared_ptr<ngraph::Function>& function,
std::shared_ptr<GPU_Backend::BackendContext>& shared_context,
bool release_function)
: m_compiled_function(nullptr)
, m_ctx(new GPURuntimeContext)
, m_function(function)
, m_emit_timing(false)
, m_is_compiled(false)
, m_release_function(release_function)
, m_temporaries_used(false)
, m_shared_context(shared_context)
{
// Create context use driver API and make it current, the runtime call will pickup the context
// http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html
// #interoperability-between-runtime-and-driver-apis
ngraph::runtime::gpu::CudaContextManager::Instance().SetContextCurrent();
cublasStatus_t cublasStatus = cublasCreate(&m_cublas_handle);
if (cublasStatus != CUBLAS_STATUS_SUCCESS)
{
throw runtime_error("cuBLAS create handle failed");
}
cudnnStatus_t cudnnStatus = cudnnCreate(&m_cudnn_handle);
if (cudnnStatus != CUDNN_STATUS_SUCCESS)
{
throw runtime_error("cuDNN create handle failed");
}
// Pass scalars as reference on the Device
cublasSetPointerMode(m_cublas_handle, CUBLAS_POINTER_MODE_DEVICE);
// register with c-api runtime context
m_ctx->cublas_handle = &m_cublas_handle;
m_ctx->cudnn_handle = &m_cudnn_handle;
m_ctx->compiled_kernel_pool = new CudaFunctionPool;
}
runtime::gpu::GPU_ExternalFunction::~GPU_ExternalFunction()
{
cublasDestroy(m_cublas_handle);
cudnnDestroy(m_cudnn_handle);
delete m_ctx->compiled_kernel_pool;
}
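The handle setup and teardown deleted here presumably migrate into GPU_Backend::BackendContext, which now owns the GPURuntimeContext shared by all external functions. The sketch below is a hedged reconstruction that simply mirrors the removed constructor body with heap-allocated handles; the member name m_runtime_context and the exact constructor shape are assumptions, since the real BackendContext definition is not part of these hunks.

runtime::gpu::GPU_Backend::BackendContext::BackendContext()
    : m_runtime_context(new GPURuntimeContext) // assumed member name, see note above
{
    // make the driver-API context current so the runtime API picks it up
    ngraph::runtime::gpu::CudaContextManager::Instance().SetContextCurrent();

    // handles now live on the heap so the POD GPURuntimeContext can hold plain pointers
    cublasHandle_t* cublas_handle = new cublasHandle_t;
    if (cublasCreate(cublas_handle) != CUBLAS_STATUS_SUCCESS)
    {
        throw std::runtime_error("cuBLAS create handle failed");
    }
    cudnnHandle_t* cudnn_handle = new cudnnHandle_t;
    if (cudnnCreate(cudnn_handle) != CUDNN_STATUS_SUCCESS)
    {
        throw std::runtime_error("cuDNN create handle failed");
    }
    // pass scalars by device reference, as before
    cublasSetPointerMode(*cublas_handle, CUBLAS_POINTER_MODE_DEVICE);

    m_runtime_context->cublas_handle = cublas_handle;
    m_runtime_context->cudnn_handle = cudnn_handle;
    m_runtime_context->compiled_kernel_pool = new CudaFunctionPool;
}

A matching destructor would presumably call cublasDestroy/cudnnDestroy on the heap handles and delete the kernel pool, mirroring the removed ~GPU_ExternalFunction body.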
void runtime::gpu::GPU_ExternalFunction::emit_header()
......@@ -387,7 +364,8 @@ void runtime::gpu::GPU_ExternalFunction::emit_constant_declarations()
{
shared_ptr<descriptor::TensorView> tv = node->get_outputs()[0].get_tensor_view();
// get an allocator for transient per kernel gpu memory
GPUAllocator allocator = this->m_primitive_emitter->get_memory_allocator();
GPUAllocator allocator =
m_shared_context->m_primitive_emitter->get_memory_allocator();
size_t idx = allocator.reserve_argspace(
c->get_data_ptr(),
tv->get_tensor().size() * tv->get_tensor().get_element_type().size());
......@@ -698,8 +676,6 @@ void runtime::gpu::GPU_ExternalFunction::compile()
return;
}
m_primitive_emitter.reset(new GPUPrimitiveEmitter());
m_function_name = m_function->get_name();
string dump_filename = file_util::path_join(s_output_dir, m_function_name + "_ops.txt");
......@@ -722,8 +698,9 @@ void runtime::gpu::GPU_ExternalFunction::compile()
emit_function_declarations();
collect_unique_functions();
emit_functions();
// allocate device buffers for primitive arguments and workspace
m_primitive_emitter->allocate_primitive_memory();
m_shared_context->m_primitive_emitter->allocate_primitive_memory();
string code = m_writer.get_code();
store_emitted_functions(code);
......@@ -781,11 +758,6 @@ void runtime::gpu::GPU_ExternalFunction::emit_debug_function_exit(Node* node)
}
}
unique_ptr<runtime::gpu::GPURuntimeContext>& runtime::gpu::GPU_ExternalFunction::ctx()
{
return m_ctx;
}
string runtime::gpu::GPU_ExternalFunction::emit_op_as_function(const Node& node,
const string& function_name)
{
......
......@@ -32,6 +32,7 @@
#include "ngraph/pass/manager.hpp"
#include "ngraph/pass/memory_layout.hpp"
#include "ngraph/pass/result_copy_elimination.hpp"
#include "ngraph/runtime/gpu/gpu_backend.hpp"
#include "ngraph/runtime/gpu/gpu_call_frame.hpp"
#include "ngraph/runtime/gpu/gpu_primitive_emitter.hpp"
#include "ngraph/runtime/gpu/gpu_tensor_view_wrapper.hpp"
......@@ -62,6 +63,7 @@ namespace ngraph
public:
GPU_ExternalFunction(const std::shared_ptr<ngraph::Function>& function,
std::shared_ptr<GPU_Backend::BackendContext>& shared_context,
bool release_function = true);
~GPU_ExternalFunction();
......@@ -69,7 +71,7 @@ namespace ngraph
std::unique_ptr<runtime::gpu::GPURuntimeContext>& ctx();
const std::unique_ptr<GPUPrimitiveEmitter>& get_primitive_emitter() const
{
return m_primitive_emitter;
return m_shared_context->m_primitive_emitter;
}
protected:
......@@ -98,8 +100,6 @@ namespace ngraph
std::unique_ptr<codegen::Compiler> m_compiler;
std::unique_ptr<codegen::ExecutionEngine> m_execution_engine;
std::unique_ptr<GPUPrimitiveEmitter> m_primitive_emitter;
std::unique_ptr<GPURuntimeContext> m_ctx;
std::shared_ptr<ngraph::Function> m_function;
std::map<std::string, size_t> m_name_index_map;
......@@ -116,8 +116,7 @@ namespace ngraph
std::string m_function_name;
std::string m_pch_header_source;
cublasHandle_t m_cublas_handle;
cudnnHandle_t m_cudnn_handle;
std::shared_ptr<GPU_Backend::BackendContext> m_shared_context;
};
}
}
......
......@@ -28,17 +28,36 @@ runtime::gpu::GPUMemoryManager::GPUMemoryManager(GPUPrimitiveEmitter* emitter)
: m_buffer_offset(0)
, m_buffered_mem(initial_buffer_size)
, m_workspace_manager(alignment)
, m_argspace(nullptr)
, m_workspace(nullptr)
, m_allocation_size(0)
, m_argspace_mem(1, {nullptr, 0})
, m_workspace_mem(1, {nullptr, 0})
, m_primitive_emitter(emitter)
{
}
size_t runtime::gpu::GPUMemoryManager::get_allocation_size() const
{
size_t allocation_size = 0;
for (auto const& alloc : m_argspace_mem)
{
allocation_size += alloc.size;
}
for (auto const& alloc : m_workspace_mem)
{
allocation_size += alloc.size;
}
return allocation_size;
}
runtime::gpu::GPUMemoryManager::~GPUMemoryManager()
{
runtime::gpu::free_gpu_buffer(m_argspace);
runtime::gpu::free_gpu_buffer(m_workspace);
for (auto& alloc : m_argspace_mem)
{
runtime::gpu::free_gpu_buffer(alloc.ptr);
}
for (auto& alloc : m_workspace_mem)
{
runtime::gpu::free_gpu_buffer(alloc.ptr);
}
}
void runtime::gpu::GPUMemoryManager::allocate()
......@@ -46,15 +65,25 @@ void runtime::gpu::GPUMemoryManager::allocate()
if (m_buffer_offset)
{
m_buffer_offset = pass::MemoryManager::align(m_buffer_offset, alignment);
m_argspace = runtime::gpu::create_gpu_buffer(m_buffer_offset);
runtime::gpu::cuda_memcpyHtD(m_argspace, m_buffered_mem.data(), m_buffer_offset);
m_allocation_size += m_buffer_offset;
// the back most node is always empty, fill it here
m_argspace_mem.back().ptr = runtime::gpu::create_gpu_buffer(m_buffer_offset);
m_argspace_mem.back().size = m_buffer_offset;
// copy buffered kernel arguments to device
runtime::gpu::cuda_memcpyHtD(
m_argspace_mem.back().ptr, m_buffered_mem.data(), m_buffer_offset);
// add an empty node to the end of the list and zero offset
m_argspace_mem.push_back({nullptr, 0});
m_buffer_offset = 0;
}
auto workspace_size = m_workspace_manager.max_allocated();
if (workspace_size)
{
m_workspace = runtime::gpu::create_gpu_buffer(workspace_size);
m_allocation_size += workspace_size;
m_workspace_mem.back().ptr = runtime::gpu::create_gpu_buffer(workspace_size);
m_workspace_mem.back().size = workspace_size;
m_workspace_mem.push_back({nullptr, 0});
// construct a new manager if the current one was used
m_workspace_manager = pass::MemoryManager(alignment);
}
}
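The memory manager no longer keeps single m_argspace/m_workspace pointers: each call to allocate() fills the trailing placeholder node of a std::list<allocation> and pushes a fresh empty node, so repeated allocations each get their own device buffer and get_allocation_size() sums over all of them. The standalone toy below (host memory, made-up main) shows only the container idiom, not the CUDA calls.

#include <cstdlib>
#include <iostream>
#include <list>

struct allocation
{
    void* ptr;
    std::size_t size;
};

int main()
{
    std::list<allocation> argspace{{nullptr, 0}}; // back node is always an empty placeholder

    auto allocate = [&](std::size_t bytes) {
        argspace.back().ptr = std::malloc(bytes); // stands in for create_gpu_buffer
        argspace.back().size = bytes;
        argspace.push_back({nullptr, 0});         // new empty tail for the next allocation
    };

    allocate(64);
    allocate(128);

    std::size_t total = 0;
    for (auto const& a : argspace)
    {
        total += a.size;
    }
    std::cout << "total allocation size: " << total << " bytes\n"; // prints 192

    for (auto& a : argspace)
    {
        std::free(a.ptr); // free(nullptr) is a no-op, so the empty tail is harmless
    }
    return 0;
}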
......@@ -86,17 +115,16 @@ size_t runtime::gpu::GPUAllocator::reserve_argspace(const void* data, size_t siz
// add parameter data to host buffer that will be transferred to device
size = pass::MemoryManager::align(size, runtime::gpu::GPUMemoryManager::alignment);
size_t offset = m_manager->queue_for_transfer(data, size);
// required to capture m_manager pointer
// directly rather than `this` pointer
auto manager = m_manager;
auto local = std::prev(m_manager->m_argspace_mem.end());
// return a lambda that will yield the gpu memory address. this
// should only be evaluated by the runtime invoked primitive
gpu::memory_primitive mem_primitive = [=]() {
if (manager->m_argspace == nullptr)
void* argspace = (*local).ptr;
if (argspace == nullptr)
{
throw std::runtime_error("An attempt was made to use unallocated device memory.");
}
auto gpu_mem = static_cast<uint8_t*>(manager->m_argspace);
auto gpu_mem = static_cast<uint8_t*>(argspace);
return static_cast<void*>(gpu_mem + offset);
};
return m_manager->m_primitive_emitter->insert(mem_primitive);
......@@ -106,23 +134,22 @@ size_t runtime::gpu::GPUAllocator::reserve_workspace(size_t size, bool zero_init
{
size_t offset = m_manager->m_workspace_manager.allocate(size);
m_active.push(offset);
// required to capture m_manager pointer
// directly rather than `this` pointer
auto manager = m_manager;
auto local = std::prev(m_manager->m_workspace_mem.end());
// return a lambda that will yield the gpu memory address. this
// should only be evaluated by the runtime invoked primitive
gpu::memory_primitive mem_primitive = [=]() {
if (manager->m_workspace == nullptr)
void* workspace = (*local).ptr;
if (workspace == nullptr)
{
throw std::runtime_error("An attempt was made to use unallocated device memory.");
}
auto gpu_mem = static_cast<uint8_t*>(manager->m_workspace);
auto workspace = static_cast<void*>(gpu_mem + offset);
auto gpu_mem = static_cast<uint8_t*>(workspace);
auto workspace_ptr = static_cast<void*>(gpu_mem + offset);
if (zero_initialize)
{
runtime::gpu::cuda_memset(workspace, 0, size);
runtime::gpu::cuda_memset(workspace_ptr, 0, size);
}
return workspace;
return workspace_ptr;
};
return m_manager->m_primitive_emitter->insert(mem_primitive);
}
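Both reserve_argspace and reserve_workspace now capture, via std::prev, an iterator to the placeholder node that a later allocate() call will populate. This works because std::list iterators are not invalidated by push_back, so the deferred address lookup inside the memory primitive still reaches the right node even after new placeholders are appended. A minimal standalone check of that property:

#include <cassert>
#include <iterator>
#include <list>

int main()
{
    std::list<int> nodes{0};                // one empty placeholder node
    auto captured = std::prev(nodes.end()); // iterator to the placeholder

    nodes.back() = 42;                      // "allocate": fill the placeholder
    nodes.push_back(0);                     // append a fresh placeholder

    assert(*captured == 42);                // the captured iterator still points at the filled node
    return 0;
}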
......
......@@ -16,6 +16,7 @@
#pragma once
#include <list>
#include <memory>
#include <stack>
#include <vector>
......@@ -65,7 +66,7 @@ namespace ngraph
~GPUMemoryManager();
void allocate();
size_t get_allocation_size() { return m_allocation_size; }
size_t get_allocation_size() const;
GPUAllocator build_allocator() { return GPUAllocator(this); }
private:
GPUMemoryManager(GPUPrimitiveEmitter* emitter);
......@@ -75,10 +76,15 @@ namespace ngraph
std::vector<uint8_t> m_buffered_mem;
pass::MemoryManager m_workspace_manager;
static constexpr const uint16_t alignment = 8;
void* m_argspace;
void* m_workspace;
size_t m_allocation_size;
struct allocation
{
void* ptr;
size_t size;
};
std::list<allocation> m_argspace_mem;
std::list<allocation> m_workspace_mem;
GPUPrimitiveEmitter* m_primitive_emitter;
};
}
......
......@@ -23,8 +23,15 @@ using namespace ngraph;
using namespace ngraph::runtime::gpu;
GPUPrimitiveEmitter::GPUPrimitiveEmitter()
: m_cuda_emitter(new CUDAEmitter(this))
, m_cudnn_emitter(new CUDNNEmitter(this))
: m_cuda_emitter(new CUDAEmitter(this, nullptr))
, m_cudnn_emitter(new CUDNNEmitter(this, nullptr))
, m_memory_manager(this)
{
}
GPUPrimitiveEmitter::GPUPrimitiveEmitter(const std::unique_ptr<GPURuntimeContext>& ctx)
: m_cuda_emitter(new CUDAEmitter(this, ctx.get()))
, m_cudnn_emitter(new CUDNNEmitter(this, ctx.get()))
, m_memory_manager(this)
{
}
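The default constructor keeps the old behaviour by handing the emitters a null context, while the new overload threads ctx.get() from the backend-owned GPURuntimeContext into both emitters; presumably GPU_Backend::BackendContext constructs the primitive emitter through this overload. Purely as a design note, a C++11 delegating constructor could avoid the duplicated member-initializer list; the raw-pointer signature below is hypothetical and not the one used in this commit.

GPUPrimitiveEmitter::GPUPrimitiveEmitter(GPURuntimeContext* ctx)
    : m_cuda_emitter(new CUDAEmitter(this, ctx))
    , m_cudnn_emitter(new CUDNNEmitter(this, ctx))
    , m_memory_manager(this)
{
}

GPUPrimitiveEmitter::GPUPrimitiveEmitter()
    : GPUPrimitiveEmitter(nullptr) // delegate instead of repeating the init list
{
}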
......
......@@ -31,11 +31,11 @@ namespace ngraph
{
class CUDAEmitter;
class CUDNNEmitter;
class GPUPrimitiveEmitter
{
public:
GPUPrimitiveEmitter();
GPUPrimitiveEmitter(const std::unique_ptr<GPURuntimeContext>& ctx);
std::unique_ptr<CUDAEmitter>& get_cuda_emitter();
std::unique_ptr<CUDNNEmitter>& get_cudnn_emitter();
std::vector<gpu::primitive*>& get_primitives() { return m_gpu_primitives; }
......