Commit e26d602a authored by Amy Zhuang, committed by Scott Cyphers

Use mkl-dnn v1.0 or v0.x depending on compilation flag. (#3227)

* Use mkl-dnn v1.0 or v0.x depending on compilation flag.

* Change cpu builder files.

* Modify cmake files.

Use mkldnn-v1.0 for DEX if NGRAPH_USE_MKLDNN_V1 is set to true; otherwise use mkldnn-v0.x.

CODEGEN only builds with mkldnn-v1.0.

* Implement mkldnn utility functions for mkldnn-v1.0.

User mode scratchpad management for mkldnn-v1.0.

* Query scratchpad size and allocate a buffer of max scratchpad size.

* Do not create mkldnn::memory when query scratchpad size of Reorder.

Modify mkldnn utility functions.

Fix convolution_forward_init and inner_product_forward_init.

Modify CPURuntimeContextCG.

* Add user mode scratchpad to CODEGEN.

* mkldnn-v1.0 splits LSTM states. Update Rnn/Lstm Op accordingly.

* Address PR feedback: use MKLDNN_MAJOR_VERSION.

* Modify cpu rnn fusion pass and related unit tests.

* Change Rnn/Lstm arg types to Output.

* Fix Lstm for CODEGEN.

* Set native layout for Slice when input format is blocked.

* Do not print scratchpad size.

* Change external_mkldnn_v1.cmake.

Fix a typo.

* Add mkldnn_v1.patch for mkldnn-v1.0.

* Address PR feedback.

* Define MKLDNN_ERROR_MESSAGE.

* Address PR feedback: change to NGRAPH_USE_LEGACY_MKLDNN.

* Fix a bug.

* Remove unused variable.

* Fix compiler warnings.

* Fix a bug for CODEGEN.

* Move variable only needed for mkldnn-v0.20 inside #if.

* Remove unused variables.

* No in place Reshape rotation for blocked data layout with mkldnn-v1.0.

* Modify mkldnn_v1.patch to force mkldnn to link to libiomp.

* Fix style.

* Change path for find_library and find_file.

* Do not insert ConvertLayout before/after Quantize/DeQuantize for blocked data layout.

* Write strides information to visualized graph.

* Move variables only needed for mkldnn-v0 under #if.

* Move more variables in rnn fusion.

* Fix ConvertLayout constant folding for mkldnn-v1.0.
parent cc1daca8
......@@ -168,6 +168,7 @@ ngraph_var(NGRAPH_USE_PREBUILT_LLVM DEFAULT "FALSE")
option(NGRAPH_UNIT_TEST_ENABLE "Control the building of unit tests" TRUE)
option(NGRAPH_TOOLS_ENABLE "Control the building of tool" TRUE)
option(NGRAPH_CPU_ENABLE "Control the building of the CPU backend" TRUE)
option(NGRAPH_USE_LEGACY_MKLDNN "Use legacy MKLDNN" TRUE)
option(NGRAPH_MLIR_ENABLE "Control the building of MLIR backend" FALSE)
option(NGRAPH_INTELGPU_ENABLE "Control the building of the Intel GPU backend with clDNN" FALSE)
option(NGRAPH_INTERPRETER_ENABLE "Control the building of the INTERPRETER backend" TRUE)
......@@ -241,6 +242,7 @@ endmacro()
NORMALIZE_BOOL(NGRAPH_UNIT_TEST_ENABLE)
NORMALIZE_BOOL(NGRAPH_TOOLS_ENABLE)
NORMALIZE_BOOL(NGRAPH_CPU_ENABLE)
NORMALIZE_BOOL(NGRAPH_USE_LEGACY_MKLDNN)
NORMALIZE_BOOL(NGRAPH_MLIR_ENABLE)
NORMALIZE_BOOL(NGRAPH_INTELGPU_ENABLE)
NORMALIZE_BOOL(NGRAPH_INTERPRETER_ENABLE)
......@@ -257,6 +259,7 @@ NORMALIZE_BOOL(NGRAPH_PYTHON_BUILD_ENABLE)
NORMALIZE_BOOL(NGRAPH_USE_PREBUILT_LLVM)
NORMALIZE_BOOL(NGRAPH_PLAIDML_ENABLE)
NORMALIZE_BOOL(NGRAPH_JSON_ENABLE)
NORMALIZE_BOOL(NGRAPH_STATIC_LIB_ENABLE)
NORMALIZE_BOOL(NGRAPH_INTERPRETER_STATIC_LIB_ENABLE)
NORMALIZE_BOOL(NGRAPH_CPU_STATIC_LIB_ENABLE)
......@@ -266,6 +269,7 @@ message(STATUS "NGRAPH_CXX_STANDARD: ${NGRAPH_CXX_STANDARD}")
message(STATUS "NGRAPH_UNIT_TEST_ENABLE: ${NGRAPH_UNIT_TEST_ENABLE}")
message(STATUS "NGRAPH_TOOLS_ENABLE: ${NGRAPH_TOOLS_ENABLE}")
message(STATUS "NGRAPH_CPU_ENABLE: ${NGRAPH_CPU_ENABLE}")
message(STATUS "NGRAPH_USE_LEGACY_MKLDNN: ${NGRAPH_USE_LEGACY_MKLDNN}")
message(STATUS "NGRAPH_MLIR_ENABLE: ${NGRAPH_MLIR_ENABLE}")
message(STATUS "NGRAPH_INTELGPU_ENABLE: ${NGRAPH_INTELGPU_ENABLE}")
message(STATUS "NGRAPH_INTERPRETER_ENABLE: ${NGRAPH_INTERPRETER_ENABLE}")
......@@ -391,6 +395,10 @@ if (NGRAPH_CPU_ENABLE)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DNGRAPH_CPU_ENABLE")
endif()
if (NGRAPH_USE_LEGACY_MKLDNN)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DNGRAPH_USE_LEGACY_MKLDNN")
endif()
if (NGRAPH_MLIR_ENABLE)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DNGRAPH_MLIR_ENABLE")
set(NGRAPH_MLIR_SOURCE_DIR ${CMAKE_SOURCE_DIR}/src/contrib/mlir)
......@@ -534,7 +542,12 @@ if(NGRAPH_CPU_ENABLE OR NGRAPH_GENERIC_CPU_ENABLE)
include(cmake/external_eigen.cmake)
endif()
if(NGRAPH_CPU_ENABLE)
include(cmake/external_mkldnn.cmake)
if(NGRAPH_USE_LEGACY_MKLDNN)
include(cmake/external_mkldnn.cmake)
set(NGRAPH_DEX_ONLY TRUE)
else()
include(cmake/external_mkldnn_v1.cmake)
endif()
endif()
if (NGRAPH_MLIR_ENABLE)
include(cmake/external_mlir.cmake)
......
This diff is collapsed.
diff --git a/cmake/OpenMP.cmake b/cmake/OpenMP.cmake
index 99970659..ef88a0a7 100644
--- a/cmake/OpenMP.cmake
+++ b/cmake/OpenMP.cmake
@@ -28,7 +28,7 @@ if (APPLE AND CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
# But we still want to build the library.
set(_omp_severity "WARNING")
else()
- set(_omp_severity "FATAL_ERROR")
+ set(_omp_severity "WARNING")
endif()
macro(forbid_link_compiler_omp_rt)
@@ -45,6 +45,42 @@ macro(forbid_link_compiler_omp_rt)
endif()
endmacro()
+macro(use_intel_omp_rt)
+ # fast return
+ if (CMAKE_CXX_COMPILER_ID STREQUAL "Intel")
+ set(MKLDNN_USES_INTEL_OPENMP TRUE)
+ return()
+ endif()
+
+ # Do not link with compiler-native OpenMP library if Intel MKL is present.
+ # Rationale: Intel MKL comes with Intel OpenMP library which is compatible
+ # with all libraries shipped with compilers that Intel MKL-DNN supports.
+ find_library(IOMP5LIB
+ NAMES "iomp5" "iomp5md" "libiomp5" "libiomp5md"
+ PATHS
+ ${CMAKE_CURRENT_SOURCE_DIR}/external/mkl/lib
+ NO_DEFAULT_PATH)
+ if(IOMP5LIB)
+ forbid_link_compiler_omp_rt()
+ if (WIN32)
+ find_file(IOMP5DLL
+ NAMES "libiomp5.dll" "libiomp5md.dll"
+ PATHS
+ ${CMAKE_CURRENT_SOURCE_DIR}/external/mkl/lib
+ NO_DEFAULT_PATH)
+ endif()
+ list(APPEND EXTRA_SHARED_LIBS ${IOMP5LIB})
+ else()
+ if (MKLDNN_THREADING STREQUAL "OMP:INTEL")
+ message(${_omp_severity} "Intel OpenMP runtime could not be found. "
+ "Please either use OpenMP runtime that comes with the compiler "
+ "(via -DMKLDNN_THREADING={OMP,OMP:COMP}), or "
+ "explicitely provide the path to libiomp with the "
+ "-DCMAKE_LIBRARY_PATH option")
+ endif()
+ endif()
+endmacro()
+
if(WIN32 AND ${CMAKE_CXX_COMPILER_ID} STREQUAL MSVC)
add_definitions(/Qpar)
add_definitions(/openmp)
@@ -78,6 +114,7 @@ if (MKLDNN_CPU_RUNTIME MATCHES "OMP")
message(${_omp_severity} "OpenMP library could not be found. "
"Proceeding might lead to highly sub-optimal performance.")
endif()
+ use_intel_omp_rt()
else()
# Compilation happens with OpenMP to enable `#pragma omp simd`
# but during linkage OpenMP dependency should be avoided
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 60bb0c94..cc3fc9d6 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -73,8 +73,10 @@ endif()
add_library(${LIB_NAME}
${MKLDNN_LIBRARY_TYPE} ${HEADERS} ${${LIB_NAME}_SUB_OBJS})
-set_property(TARGET ${LIB_NAME} PROPERTY VERSION "${PROJECT_VERSION}.0")
-set_property(TARGET ${LIB_NAME} PROPERTY SOVERSION "0")
+if(MKLDNN_LIB_VERSIONING_ENABLE)
+ set_property(TARGET ${LIB_NAME} PROPERTY VERSION "${PROJECT_VERSION}.0")
+ set_property(TARGET ${LIB_NAME} PROPERTY SOVERSION "0")
+endif()
set_property(TARGET ${LIB_NAME} PROPERTY PUBLIC_HEADER ${HEADERS})
target_include_directories(${LIB_NAME} PUBLIC
......@@ -190,6 +190,10 @@ void codegen::CompilerCore::initialize()
// Prevent Eigen from using any LGPL3 code
args.push_back("-DEIGEN_MPL2_ONLY");
#if defined(NGRAPH_USE_LEGACY_MKLDNN)
args.push_back("-DNGRAPH_USE_LEGACY_MKLDNN");
#endif
// Prepare DiagnosticEngine
IntrusiveRefCntPtr<DiagnosticOptions> diag_options = new DiagnosticOptions();
diag_options->ErrorLimit = 20;
......
......@@ -124,7 +124,6 @@ set(SRC
pass/cpu_mat_fusion.cpp
pass/cpu_memory_assignment.cpp
pass/cpu_memory_optimization.cpp
pass/cpu_mkldnn_primitive_build.cpp
pass/cpu_post_layout_optimizations.cpp
pass/cpu_rnn_fusion.cpp
pass/cpu_workspace_insertion.cpp
......@@ -136,6 +135,7 @@ if (NOT NGRAPH_DEX_ONLY)
cpu_emitter.cpp
cpu_kernel_emitters.cpp
cpu_kernel_utils.cpp
pass/cpu_mkldnn_primitive_build.cpp
)
endif()
......@@ -168,6 +168,9 @@ if (NGRAPH_CPU_ENABLE)
VERSION ${NGRAPH_VERSION}
SOVERSION ${NGRAPH_API_VERSION})
endif()
if (NGRAPH_USE_LEGACY_MKLDNN)
target_compile_definitions(cpu_backend PRIVATE "NGRAPH_USE_LEGACY_MKLDNN")
endif()
if (NGRAPH_DEX_ONLY)
target_compile_definitions(cpu_backend PRIVATE "NGRAPH_DEX_ONLY")
endif()
......@@ -207,7 +210,8 @@ if (NGRAPH_CPU_ENABLE)
target_compile_definitions(cpu_backend PRIVATE CPU_BACKEND_DLL_EXPORTS)
add_dependencies(cpu_backend libmkldnn ext_eigen)
target_link_libraries(cpu_backend PUBLIC ngraph libmkldnn libmkl libeigen libtbb)
target_link_libraries(cpu_backend PUBLIC ngraph libmkldnn libmkl libeigen libtbb)
if (NGRAPH_JSON_ENABLE)
target_link_libraries(cpu_backend PUBLIC libjson)
endif()
......
......@@ -40,6 +40,8 @@ namespace ngraph
auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
auto sum_pd = mkldnn_emitter->get_elementwise_add_desc(node);
QUERY_SCRATCHPAD(sum, sum_pd);
// Add needs 4 primitives: input0, input1, result, and sum.
size_t add_index = mkldnn_emitter->reserve_primitive_space(4);
auto& deps = mkldnn_emitter->get_primitive_deps(add_index);
......@@ -59,8 +61,12 @@ namespace ngraph
CPUExecutionContext* ectx) {
if (ctx->first_iteration)
{
mkldnn_emitter->build_elementwise_add(
ctx->mkldnn_primitives, sum_pd, deps, add_index);
mkldnn_emitter->build_elementwise_add(ctx->mkldnn_memories,
ctx->mkldnn_primitives,
ctx->mkldnn_scratchpad_mds,
sum_pd,
deps,
add_index);
}
cpu::mkldnn_utils::set_memory_ptr(
ctx, deps[0], ctx->buffer_data[arg0_buffer_index]);
......@@ -68,7 +74,9 @@ namespace ngraph
ctx, deps[1], ctx->buffer_data[arg1_buffer_index]);
cpu::mkldnn_utils::set_memory_ptr(
ctx, deps[2], ctx->buffer_data[out_buffer_index]);
cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, add_index);
cpu::mkldnn_utils::mkldnn_invoke_primitive(
ctx, add_index, deps, cpu::mkldnn_utils::OpType::ADD);
};
functors.emplace_back(functor);
}
......
......@@ -55,6 +55,8 @@ namespace ngraph
auto avg_pool_desc =
mkldnn_emitter->get_avg_pooling_forward_desc<ngraph::op::AvgPool>(node,
false);
QUERY_SCRATCHPAD(pooling_forward, avg_pool_desc);
// AvgPool needs 3 primitives: input, result, and pooling_forward.
size_t avg_pool_index = mkldnn_emitter->reserve_primitive_space(3);
auto& deps = mkldnn_emitter->get_primitive_deps(avg_pool_index);
......@@ -64,14 +66,20 @@ namespace ngraph
CPURuntimeContext* ctx, CPUExecutionContext* ectx) {
if (ctx->first_iteration)
{
mkldnn_emitter->build_pooling_forward(
ctx->mkldnn_primitives, avg_pool_desc, deps, avg_pool_index);
mkldnn_emitter->build_pooling_forward(ctx->mkldnn_memories,
ctx->mkldnn_primitives,
ctx->mkldnn_scratchpad_mds,
avg_pool_desc,
deps,
avg_pool_index);
}
cpu::mkldnn_utils::set_memory_ptr(
ctx, deps[0], ctx->buffer_data[arg0_buffer_index]);
cpu::mkldnn_utils::set_memory_ptr(
ctx, deps[1], ctx->buffer_data[out_buffer_index]);
cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, avg_pool_index);
cpu::mkldnn_utils::mkldnn_invoke_primitive(
ctx, avg_pool_index, deps, cpu::mkldnn_utils::OpType::AVGPOOL);
};
functors.emplace_back(functor);
}
......@@ -137,6 +145,8 @@ namespace ngraph
auto avg_pool_desc =
mkldnn_emitter->get_avg_pooling_backward_desc<ngraph::op::AvgPoolBackprop>(
node);
QUERY_SCRATCHPAD_2ARGS(avg_pooling_backward, avg_pool_fwd_desc, avg_pool_desc);
// AvgPoolBackprop needs 3 primitives: input, result, and pooling_backward.
size_t avg_pool_index = mkldnn_emitter->reserve_primitive_space(3);
auto& deps = mkldnn_emitter->get_primitive_deps(avg_pool_index);
......@@ -150,7 +160,9 @@ namespace ngraph
CPUExecutionContext* ectx) {
if (ctx->first_iteration)
{
mkldnn_emitter->build_pooling_backward(ctx->mkldnn_primitives,
mkldnn_emitter->build_pooling_backward(ctx->mkldnn_memories,
ctx->mkldnn_primitives,
ctx->mkldnn_scratchpad_mds,
avg_pool_desc,
avg_pool_fwd_desc,
deps,
......@@ -160,7 +172,9 @@ namespace ngraph
ctx, deps[0], ctx->buffer_data[delta_buffer_index]);
cpu::mkldnn_utils::set_memory_ptr(
ctx, deps[1], ctx->buffer_data[out_buffer_index]);
cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, avg_pool_index);
cpu::mkldnn_utils::mkldnn_invoke_primitive(
ctx, avg_pool_index, deps, cpu::mkldnn_utils::OpType::AVGPOOLBACKPROP);
};
functors.emplace_back(functor);
}
......
......@@ -84,10 +84,11 @@ namespace ngraph
auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
auto batchnorm_desc =
mkldnn_emitter->get_batchnorm_forward_desc<OP>(node, true);
QUERY_SCRATCHPAD_2ARGS(batchnorm_forward, batchnorm_desc, ops);
auto weights_shape = Shape{2, args[0].get_size()};
auto weights_desc = mkldnn_emitter->build_memory_descriptor(
weights_shape, args[0].get_element_type(), mkldnn::memory::format::nc);
weights_shape, args[0].get_element_type(), mkldnn::memory::FORMAT::nc);
// batchnorm forward needs 6 primitives: input, weights, result, mean,
// variance, and batch_normalization_forward.
......@@ -111,7 +112,9 @@ namespace ngraph
CPUExecutionContext* ectx) {
if (ctx->first_iteration)
{
mkldnn_emitter->build_batchnorm_forward(ctx->mkldnn_primitives,
mkldnn_emitter->build_batchnorm_forward(ctx->mkldnn_memories,
ctx->mkldnn_primitives,
ctx->mkldnn_scratchpad_mds,
batchnorm_desc,
weights_desc,
training,
......@@ -136,7 +139,8 @@ namespace ngraph
cpu::mkldnn_utils::set_memory_ptr(
ctx, deps[4], ctx->buffer_data[out2_buffer_index]);
cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, batchnorm_index);
cpu::mkldnn_utils::mkldnn_invoke_primitive(
ctx, batchnorm_index, deps, cpu::mkldnn_utils::OpType::BATCHNORM3ARGS);
};
functors.emplace_back(functor);
}
......@@ -151,9 +155,11 @@ namespace ngraph
auto batchnorm_desc =
mkldnn_emitter->get_batchnorm_forward_desc<OP>(node, false);
QUERY_SCRATCHPAD_2ARGS(batchnorm_forward, batchnorm_desc, ops);
auto weights_shape = Shape{2, args[0].get_size()};
auto weights_desc = mkldnn_emitter->build_memory_descriptor(
weights_shape, args[0].get_element_type(), mkldnn::memory::format::nc);
weights_shape, args[0].get_element_type(), mkldnn::memory::FORMAT::nc);
// batchnorm forward needs 6 primitives: input, weights, result, mean,
// variance, and batch_normalization_forward.
......@@ -177,7 +183,9 @@ namespace ngraph
CPUExecutionContext* ectx) {
if (ctx->first_iteration)
{
mkldnn_emitter->build_batchnorm_forward(ctx->mkldnn_primitives,
mkldnn_emitter->build_batchnorm_forward(ctx->mkldnn_memories,
ctx->mkldnn_primitives,
ctx->mkldnn_scratchpad_mds,
batchnorm_desc,
weights_desc,
training,
......@@ -202,7 +210,8 @@ namespace ngraph
cpu::mkldnn_utils::set_memory_ptr(
ctx, deps[4], ctx->buffer_data[out0_buffer_index]);
cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, batchnorm_index);
cpu::mkldnn_utils::mkldnn_invoke_primitive(
ctx, batchnorm_index, deps, cpu::mkldnn_utils::OpType::BATCHNORM5ARGS);
};
functors.emplace_back(functor);
}
......@@ -421,17 +430,24 @@ namespace ngraph
auto batchnorm_desc = mkldnn_emitter->get_batchnorm_backward_desc(node);
auto weights_shape = Shape{2, args[0].get_size()};
auto weights_desc = mkldnn_emitter->build_memory_descriptor(
weights_shape, args[0].get_element_type(), mkldnn::memory::format::nc);
weights_shape, args[0].get_element_type(), mkldnn::memory::FORMAT::nc);
auto dweights_desc = mkldnn_emitter->build_memory_descriptor(
weights_shape, args[0].get_element_type(), mkldnn::memory::format::nc);
weights_shape, args[0].get_element_type(), mkldnn::memory::FORMAT::nc);
auto input_desc = mkldnn_utils::get_input_mkldnn_md(node, 2);
// batchnorm backward needs 8 primitives: weights, input, mean, variance,
// dinput, dweights, and batch_normalization_backward.
auto batchnorm_index = mkldnn_emitter->reserve_primitive_space(8);
auto& deps = mkldnn_emitter->get_primitive_deps(batchnorm_index);
const ngraph::op::BatchNormTrainingBackprop* batchnorm =
static_cast<const ngraph::op::BatchNormTrainingBackprop*>(node);
auto eps = batchnorm->get_eps_value();
QUERY_SCRATCHPAD_3ARGS(batchnorm_backward, batchnorm_desc, input_desc, eps);
auto functor = [&,
batchnorm_desc,
input_desc,
weights_desc,
dweights_desc,
batchnorm_index,
......@@ -450,10 +466,14 @@ namespace ngraph
CPUExecutionContext* ectx) {
if (ctx->first_iteration)
{
mkldnn_emitter->build_batchnorm_backward(ctx->mkldnn_primitives,
mkldnn_emitter->build_batchnorm_backward(ctx->mkldnn_memories,
ctx->mkldnn_primitives,
ctx->mkldnn_scratchpad_mds,
batchnorm_desc,
input_desc,
weights_desc,
dweights_desc,
eps,
deps,
batchnorm_index);
}
......@@ -477,7 +497,8 @@ namespace ngraph
ctx, deps[5], ctx->buffer_data[out0_buffer_index]);
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[6], stacked_dweights.get());
cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, batchnorm_index);
cpu::mkldnn_utils::mkldnn_invoke_primitive(
ctx, batchnorm_index, deps, cpu::mkldnn_utils::OpType::BATCHNORMBACKPROP);
memcpy(ctx->buffer_data[out1_buffer_index],
stacked_dweights.get(),
......
......@@ -44,6 +44,8 @@ namespace ngraph
{
auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
auto bounded_relu_desc = mkldnn_emitter->get_bounded_relu_desc(node);
QUERY_SCRATCHPAD(eltwise_forward, bounded_relu_desc);
// BoundedRelu needs 3 primitives: input, result, and eltwise_forward.
auto bounded_relu_index = mkldnn_emitter->reserve_primitive_space(3);
auto& deps = mkldnn_emitter->get_primitive_deps(bounded_relu_index);
......@@ -56,7 +58,9 @@ namespace ngraph
CPUExecutionContext* ectx) {
if (ctx->first_iteration)
{
mkldnn_emitter->build_bounded_relu(ctx->mkldnn_primitives,
mkldnn_emitter->build_bounded_relu(ctx->mkldnn_memories,
ctx->mkldnn_primitives,
ctx->mkldnn_scratchpad_mds,
bounded_relu_desc,
deps,
bounded_relu_index);
......@@ -65,7 +69,9 @@ namespace ngraph
ctx, deps[0], ctx->buffer_data[input_buffer_index]);
cpu::mkldnn_utils::set_memory_ptr(
ctx, deps[1], ctx->buffer_data[out_buffer_index]);
cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, bounded_relu_index);
cpu::mkldnn_utils::mkldnn_invoke_primitive(
ctx, bounded_relu_index, deps, cpu::mkldnn_utils::OpType::BOUNDEDRELU);
};
functors.emplace_back(functor);
}
......
......@@ -101,6 +101,8 @@ namespace ngraph
auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
auto concat_pd =
mkldnn_emitter->get_concat_desc<ngraph::op::Concat>(node, nargs);
QUERY_SCRATCHPAD(concat, concat_pd);
std::vector<mkldnn::memory::desc> inputs_data_desc;
for (size_t i = 0; i < nargs; i++)
{
......@@ -121,7 +123,9 @@ namespace ngraph
CPUExecutionContext* ectx) {
if (ctx->first_iteration)
{
mkldnn_emitter->build_concat(ctx->mkldnn_primitives,
mkldnn_emitter->build_concat(ctx->mkldnn_memories,
ctx->mkldnn_primitives,
ctx->mkldnn_scratchpad_mds,
concat_pd,
inputs_data_desc,
deps,
......@@ -134,7 +138,9 @@ namespace ngraph
}
cpu::mkldnn_utils::set_memory_ptr(
ctx, deps[nargs], ctx->buffer_data[out_buffer_index]);
cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, concat_index);
cpu::mkldnn_utils::mkldnn_invoke_primitive(
ctx, concat_index, deps, cpu::mkldnn_utils::OpType::CONCAT);
};
functors.emplace_back(functor);
......
......@@ -43,6 +43,7 @@ namespace ngraph
auto input_desc = mkldnn_utils::get_input_mkldnn_md(node, 0);
auto result_desc = mkldnn_utils::get_output_mkldnn_md(node, 0);
#if MKLDNN_VERSION_MAJOR < 1
if (input_desc.data.format == mkldnn_nchw &&
result_desc.data.format == mkldnn_goihw)
{
......@@ -80,7 +81,58 @@ namespace ngraph
mkldnn_utils::get_mkldnn_data_type(args[0].get_element_type()),
mkldnn::memory::format::goihw);
}
#else
bool input_format_is_nchw = mkldnn_utils::mkldnn_md_matches_format_tag(
input_desc.data, mkldnn::memory::format_tag::nchw);
if (input_format_is_nchw &&
mkldnn_utils::mkldnn_md_matches_format_tag(result_desc.data,
mkldnn::memory::format_tag::goihw))
{
// becomes a copy
input_desc = result_desc;
}
else if ((input_format_is_nchw ||
mkldnn_utils::mkldnn_md_matches_format_tag(
input_desc.data, mkldnn::memory::format_tag::nhwc)) &&
(mkldnn_utils::mkldnn_md_matches_format_tag(
result_desc.data, mkldnn::memory::format_tag::OIhw4i16o4i) &&
// check if compensation is conv_s8s8(1U)
result_desc.data.extra.flags & 0x1U))
{
auto arg0_shape = args[0].get_shape();
input_desc = mkldnn::memory::desc(
mkldnn::memory::dims(arg0_shape.begin(), arg0_shape.end()),
mkldnn_utils::get_mkldnn_data_type(args[0].get_element_type()),
mkldnn::memory::format_tag::oihw);
}
else if (input_format_is_nchw && input_desc.data.ndims == 4 &&
result_desc.data.ndims == 5 && node->get_users().size() == 1)
{
Shape weights_shape_groups;
if (auto gconv = std::dynamic_pointer_cast<ngraph::op::GroupConvolution>(
node->get_users()[0]))
{
weights_shape_groups = gconv->get_weights_dimensions();
}
else if (auto gconvb =
std::dynamic_pointer_cast<ngraph::op::GroupConvolutionBias>(
node->get_users()[0]))
{
weights_shape_groups = gconvb->get_weights_dimensions();
}
else
{
throw ngraph_error("Incompatible input/output shape in ConvertLayout op");
}
input_desc = mkldnn::memory::desc(
mkldnn::memory::dims(weights_shape_groups.begin(),
weights_shape_groups.end()),
mkldnn_utils::get_mkldnn_data_type(args[0].get_element_type()),
mkldnn::memory::format_tag::goihw);
}
mkldnn_emitter->query_scratchpad_reorder(input_desc, result_desc);
#endif
// ConvertLayout needs 3 primitives: input, result, and reorder.
size_t reorder_index = mkldnn_emitter->reserve_primitive_space(3);
auto& deps = mkldnn_emitter->get_primitive_deps(reorder_index);
......@@ -89,7 +141,9 @@ namespace ngraph
CPURuntimeContext* ctx, CPUExecutionContext* ectx) {
if (ctx->first_iteration)
{
mkldnn_emitter->build_reorder(ctx->mkldnn_primitives,
mkldnn_emitter->build_reorder(ctx->mkldnn_memories,
ctx->mkldnn_primitives,
ctx->mkldnn_scratchpad_mds,
input_desc,
result_desc,
deps,
......@@ -99,7 +153,9 @@ namespace ngraph
ctx, deps[0], ctx->buffer_data[arg_buffer_index]);
cpu::mkldnn_utils::set_memory_ptr(
ctx, deps[1], ctx->buffer_data[out_buffer_index]);
cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, reorder_index);
cpu::mkldnn_utils::mkldnn_invoke_primitive(
ctx, reorder_index, deps, cpu::mkldnn_utils::OpType::CONVERTLAYOUT);
};
functors.emplace_back(functor);
}
......
......@@ -44,6 +44,8 @@ namespace ngraph
{
auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
auto leaky_relu_desc = mkldnn_emitter->get_leaky_relu_desc(node);
QUERY_SCRATCHPAD(eltwise_forward, leaky_relu_desc);
// CPULeakyRelu needs 3 primitives: input, result, and eltwise_forward.
auto leaky_relu_index = mkldnn_emitter->reserve_primitive_space(3);
auto& deps = mkldnn_emitter->get_primitive_deps(leaky_relu_index);
......@@ -56,14 +58,20 @@ namespace ngraph
CPUExecutionContext* ectx) {
if (ctx->first_iteration)
{
mkldnn_emitter->build_leaky_relu(
ctx->mkldnn_primitives, leaky_relu_desc, deps, leaky_relu_index);
mkldnn_emitter->build_leaky_relu(ctx->mkldnn_memories,
ctx->mkldnn_primitives,
ctx->mkldnn_scratchpad_mds,
leaky_relu_desc,
deps,
leaky_relu_index);
}
cpu::mkldnn_utils::set_memory_ptr(
ctx, deps[0], ctx->buffer_data[input_buffer_index]);
cpu::mkldnn_utils::set_memory_ptr(
ctx, deps[1], ctx->buffer_data[out_buffer_index]);
cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, leaky_relu_index);
cpu::mkldnn_utils::mkldnn_invoke_primitive(
ctx, leaky_relu_index, deps, cpu::mkldnn_utils::OpType::LEAKYRELU);
};
functors.emplace_back(functor);
}
......
......@@ -44,6 +44,8 @@ namespace ngraph
{
auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
auto lrn_desc = mkldnn_emitter->get_lrn_forward_desc(node);
QUERY_SCRATCHPAD(lrn_forward, lrn_desc);
// LRN needs 3 primitives: input, result, and lrn_forward.
auto lrn_index = mkldnn_emitter->reserve_primitive_space(3);
auto& deps = mkldnn_emitter->get_primitive_deps(lrn_index);
......@@ -52,14 +54,20 @@ namespace ngraph
CPURuntimeContext* ctx, CPUExecutionContext* ectx) {
if (ctx->first_iteration)
{
mkldnn_emitter->build_lrn_forward(
ctx->mkldnn_primitives, lrn_desc, deps, lrn_index);
mkldnn_emitter->build_lrn_forward(ctx->mkldnn_memories,
ctx->mkldnn_primitives,
ctx->mkldnn_scratchpad_mds,
lrn_desc,
deps,
lrn_index);
}
cpu::mkldnn_utils::set_memory_ptr(
ctx, deps[0], ctx->buffer_data[arg_buffer_index]);
cpu::mkldnn_utils::set_memory_ptr(
ctx, deps[1], ctx->buffer_data[out_buffer_index]);
cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, lrn_index);
cpu::mkldnn_utils::mkldnn_invoke_primitive(
ctx, lrn_index, deps, cpu::mkldnn_utils::OpType::LRN);
};
}
else
......
......@@ -37,23 +37,27 @@ namespace ngraph
"Lstm is supported only through MKLDNN and doesnt have reference "
"INTERPRETER implementation");
}
#if MKLDNN_VERSION_MAJOR < 1
if (args.size() != 5)
{
throw ngraph_error(
"Lstm op doesnt have the required number of inputs to create MKLDNN "
"kernel");
}
#else
if (args.size() != 6)
{
throw ngraph_error(
"Lstm op doesnt have the required number of inputs to create MKLDNN "
"kernel");
}
#endif
auto& functors = external_function->get_functors();
auto src_layer_buffer_index =
external_function->get_buffer_index(args[0].get_name());
auto src_iter_buffer_index =
external_function->get_buffer_index(args[1].get_name());
auto weights_layer_buffer_index =
external_function->get_buffer_index(args[2].get_name());
auto weights_iter_buffer_index =
external_function->get_buffer_index(args[3].get_name());
auto bias_buffer_index = external_function->get_buffer_index(args[4].get_name());
auto dst_layer_buffer_index =
external_function->get_buffer_index(out[0].get_name());
auto dst_iter_buffer_index = external_function->get_buffer_index(out[1].get_name());
......@@ -61,6 +65,14 @@ namespace ngraph
auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
auto lstm_desc =
mkldnn_emitter->get_rnn_forward_desc<ngraph::op::Lstm>(node, args, out);
#if MKLDNN_VERSION_MAJOR < 1
auto weights_layer_buffer_index =
external_function->get_buffer_index(args[2].get_name());
auto weights_iter_buffer_index =
external_function->get_buffer_index(args[3].get_name());
auto bias_buffer_index = external_function->get_buffer_index(args[4].get_name());
// Lstm needs 9 primitives: src_layer, src_iter, weights_layer, weights_iter, bias,
// dst_layer, dst_iter, workspace, and rnn_forward.
// It needs a new workspace.
......@@ -81,7 +93,9 @@ namespace ngraph
CPUExecutionContext* ectx) {
if (ctx->first_iteration)
{
mkldnn_emitter->build_rnn_forward(ctx->mkldnn_primitives,
mkldnn_emitter->build_rnn_forward(ctx->mkldnn_memories,
ctx->mkldnn_primitives,
ctx->mkldnn_scratchpad_mds,
ctx->mkldnn_workspaces,
lstm_desc,
deps,
......@@ -103,9 +117,81 @@ namespace ngraph
ctx, deps[6], ctx->buffer_data[dst_iter_buffer_index]);
cpu::mkldnn_utils::set_memory_ptr(
ctx, deps[7], ctx->mkldnn_workspaces[deps[8]]);
cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, lstm_index);
cpu::mkldnn_utils::mkldnn_invoke_primitive(
ctx, lstm_index, deps, cpu::mkldnn_utils::OpType::LSTM);
};
functors.emplace_back(functor);
#else
mkldnn_emitter->query_scratchpad_rnn_forward(lstm_desc);
auto src_iter_c_buffer_index =
external_function->get_buffer_index(args[2].get_name());
auto weights_layer_buffer_index =
external_function->get_buffer_index(args[3].get_name());
auto weights_iter_buffer_index =
external_function->get_buffer_index(args[4].get_name());
auto bias_buffer_index = external_function->get_buffer_index(args[5].get_name());
auto dst_iter_c_buffer_index =
external_function->get_buffer_index(out[2].get_name());
// Lstm needs 11 primitives: src_layer, src_iter, src_iter_c, weights_layer,
// weights_iter, bias,
// dst_layer, dst_iter, dst_iter_c, workspace, and lstm_forward.
// It needs a new workspace.
auto lstm_index =
mkldnn_emitter->reserve_primitive_space(11, true /* new workspace */);
auto& deps = mkldnn_emitter->get_primitive_deps(lstm_index);
auto functor = [&,
lstm_desc,
lstm_index,
src_layer_buffer_index,
src_iter_buffer_index,
src_iter_c_buffer_index,
weights_layer_buffer_index,
weights_iter_buffer_index,
bias_buffer_index,
dst_layer_buffer_index,
dst_iter_buffer_index,
dst_iter_c_buffer_index](CPURuntimeContext* ctx,
CPUExecutionContext* ectx) {
if (ctx->first_iteration)
{
mkldnn_emitter->build_rnn_forward(ctx->mkldnn_memories,
ctx->mkldnn_primitives,
ctx->mkldnn_scratchpad_mds,
ctx->mkldnn_workspaces,
lstm_desc,
deps,
lstm_index);
}
cpu::mkldnn_utils::set_memory_ptr(
ctx, deps[0], ctx->buffer_data[src_layer_buffer_index]);
cpu::mkldnn_utils::set_memory_ptr(
ctx, deps[1], ctx->buffer_data[src_iter_buffer_index]);
cpu::mkldnn_utils::set_memory_ptr(
ctx, deps[2], ctx->buffer_data[src_iter_c_buffer_index]);
cpu::mkldnn_utils::set_memory_ptr(
ctx, deps[3], ctx->buffer_data[weights_layer_buffer_index]);
cpu::mkldnn_utils::set_memory_ptr(
ctx, deps[4], ctx->buffer_data[weights_iter_buffer_index]);
cpu::mkldnn_utils::set_memory_ptr(
ctx, deps[5], ctx->buffer_data[bias_buffer_index]);
cpu::mkldnn_utils::set_memory_ptr(
ctx, deps[6], ctx->buffer_data[dst_layer_buffer_index]);
cpu::mkldnn_utils::set_memory_ptr(
ctx, deps[7], ctx->buffer_data[dst_iter_buffer_index]);
cpu::mkldnn_utils::set_memory_ptr(
ctx, deps[8], ctx->buffer_data[dst_iter_c_buffer_index]);
cpu::mkldnn_utils::set_memory_ptr(
ctx, deps[9], ctx->mkldnn_workspaces[deps[10]]);
cpu::mkldnn_utils::mkldnn_invoke_primitive(
ctx, lstm_index, deps, cpu::mkldnn_utils::OpType::LSTM);
};
functors.emplace_back(functor);
#endif
}
void register_builders_lstm_cpp() { REGISTER_OP_BUILDER(Lstm); }
......
......@@ -54,6 +54,8 @@ namespace ngraph
auto max_pool_desc =
mkldnn_emitter->get_max_pooling_forward_desc<ngraph::op::MaxPool>(node,
false);
QUERY_SCRATCHPAD(pooling_forward, max_pool_desc);
// MaxPool needs 3 primitives: input, result, and pooling_forward.
size_t max_pool_index = mkldnn_emitter->reserve_primitive_space(3);
auto& deps = mkldnn_emitter->get_primitive_deps(max_pool_index);
......@@ -63,14 +65,20 @@ namespace ngraph
CPURuntimeContext* ctx, CPUExecutionContext* ectx) {
if (ctx->first_iteration)
{
mkldnn_emitter->build_pooling_forward(
ctx->mkldnn_primitives, max_pool_desc, deps, max_pool_index);
mkldnn_emitter->build_pooling_forward(ctx->mkldnn_memories,
ctx->mkldnn_primitives,
ctx->mkldnn_scratchpad_mds,
max_pool_desc,
deps,
max_pool_index);
}
cpu::mkldnn_utils::set_memory_ptr(
ctx, deps[0], ctx->buffer_data[arg0_buffer_index]);
cpu::mkldnn_utils::set_memory_ptr(
ctx, deps[1], ctx->buffer_data[out_buffer_index]);
cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, max_pool_index);
cpu::mkldnn_utils::mkldnn_invoke_primitive(
ctx, max_pool_index, deps, cpu::mkldnn_utils::OpType::MAXPOOL);
};
functors.emplace_back(functor);
}
......@@ -134,6 +142,7 @@ namespace ngraph
mkldnn_emitter->get_max_pooling_backward_desc<ngraph::op::MaxPoolBackprop>(
node);
auto fprop_src_desc = mkldnn_utils::get_input_mkldnn_md(node, 0);
QUERY_SCRATCHPAD_2ARGS(max_pooling_backward, fwd_pool_desc, bwd_pool_desc);
// MaxPoolBackprop forward needs 4 primitives: fprop_src, diff_src, workspace,
// and pooling_forward.
......@@ -151,7 +160,11 @@ namespace ngraph
ctx, fdeps[1], ctx->buffer_data[out_buffer_index]);
cpu::mkldnn_utils::set_memory_ptr(
ctx, fdeps[2], ctx->mkldnn_workspaces[fdeps[3]]);
cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, fwd_pool_index);
cpu::mkldnn_utils::mkldnn_invoke_primitive(
ctx,
fwd_pool_index,
fdeps,
cpu::mkldnn_utils::OpType::MAXPOOLBACKPROPFORWARD);
};
// MaxPoolBackprop backward needs 4 primitives: diff_dst, workspace, diff_src,
......@@ -168,7 +181,11 @@ namespace ngraph
ctx, bdeps[1], ctx->mkldnn_workspaces[bdeps[3]]);
cpu::mkldnn_utils::set_memory_ptr(
ctx, bdeps[2], ctx->buffer_data[out_buffer_index]);
cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, bwd_pool_index);
cpu::mkldnn_utils::mkldnn_invoke_primitive(
ctx,
bwd_pool_index,
bdeps,
cpu::mkldnn_utils::OpType::MAXPOOLBACKPROPBACKWARD);
};
auto functor = [&,
bwd_pool_desc,
......@@ -181,7 +198,9 @@ namespace ngraph
CPUExecutionContext* ectx) {
if (ctx->first_iteration)
{
mkldnn_emitter->build_max_pooling_backward(ctx->mkldnn_primitives,
mkldnn_emitter->build_max_pooling_backward(ctx->mkldnn_memories,
ctx->mkldnn_primitives,
ctx->mkldnn_scratchpad_mds,
ctx->mkldnn_workspaces,
bwd_pool_desc,
fwd_pool_desc,
......@@ -249,6 +268,7 @@ namespace ngraph
mkldnn_emitter
->get_max_pooling_with_indices_forward_desc<ngraph::op::MaxPoolWithIndices>(
node);
QUERY_SCRATCHPAD(pooling_forward, max_pool_desc);
// MaxPoolWithIndices needs 4 primitives: src, dst, workspace, and pooling_forward.
size_t max_pool_index = mkldnn_emitter->reserve_primitive_space(4);
......@@ -264,7 +284,12 @@ namespace ngraph
if (ctx->first_iteration)
{
mkldnn_emitter->build_max_pooling_with_indices_forward(
ctx->mkldnn_primitives, max_pool_desc, deps, max_pool_index);
ctx->mkldnn_memories,
ctx->mkldnn_primitives,
ctx->mkldnn_scratchpad_mds,
max_pool_desc,
deps,
max_pool_index);
}
cpu::mkldnn_utils::set_memory_ptr(
ctx, deps[0], ctx->buffer_data[arg0_buffer_index]);
......@@ -272,7 +297,9 @@ namespace ngraph
ctx, deps[1], ctx->buffer_data[out0_buffer_index]);
cpu::mkldnn_utils::set_memory_ptr(
ctx, deps[2], ctx->buffer_data[out1_buffer_index]);
cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, max_pool_index);
cpu::mkldnn_utils::mkldnn_invoke_primitive(
ctx, max_pool_index, deps, cpu::mkldnn_utils::OpType::MAXPOOLWITHINDICES);
};
functors.emplace_back(functor);
}
......@@ -300,6 +327,9 @@ namespace ngraph
mkldnn_emitter
->get_max_pooling_backward_desc<ngraph::op::MaxPoolWithIndicesBackprop>(
node);
QUERY_SCRATCHPAD_2ARGS(
max_pooling_with_indices_backward, fwd_pool_desc, bwd_pool_desc);
// MaxPoolWithIndicesBackprop needs 4 primitives: diff_dst, fprop_workspace,
// diff_src, and pooling_backward.
size_t max_pool_index = mkldnn_emitter->reserve_primitive_space(4);
......@@ -316,7 +346,9 @@ namespace ngraph
if (ctx->first_iteration)
{
mkldnn_emitter->build_max_pooling_with_indices_backward(
ctx->mkldnn_memories,
ctx->mkldnn_primitives,
ctx->mkldnn_scratchpad_mds,
bwd_pool_desc,
fwd_pool_desc,
deps,
......@@ -328,7 +360,12 @@ namespace ngraph
ctx, deps[1], ctx->buffer_data[arg2_buffer_index]);
cpu::mkldnn_utils::set_memory_ptr(
ctx, deps[2], ctx->buffer_data[out_buffer_index]);
cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, max_pool_index);
cpu::mkldnn_utils::mkldnn_invoke_primitive(
ctx,
max_pool_index,
deps,
cpu::mkldnn_utils::OpType::MAXPOOLWITHINDICESBACKPROP);
};
functors.emplace_back(functor);
}
......
......@@ -53,8 +53,11 @@ namespace ngraph
auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
auto input_desc = mkldnn_utils::get_input_mkldnn_md(node, 0);
auto result_desc = mkldnn_utils::get_output_mkldnn_md(node, 0);
QUERY_SCRATCHPAD_2ARGS(reorder, input_desc, result_desc);
auto scale_const_op = std::dynamic_pointer_cast<ngraph::op::Constant>(
dequantize->get_argument(1));
if (scale_const_op == nullptr)
{
auto arg1_buffer_index =
......@@ -83,7 +86,9 @@ namespace ngraph
static_cast<float*>(ctx->buffer_data[arg1_buffer_index]),
static_cast<float*>(ctx->buffer_data[arg1_buffer_index]) +
scales_size);
mkldnn_emitter->build_quantize_reorder(ctx->mkldnn_primitives,
mkldnn_emitter->build_quantize_reorder(ctx->mkldnn_memories,
ctx->mkldnn_primitives,
ctx->mkldnn_scratchpad_mds,
input_desc,
result_desc,
dyn_scales,
......@@ -94,7 +99,9 @@ namespace ngraph
ctx, deps[0], ctx->buffer_data[arg0_buffer_index]);
cpu::mkldnn_utils::set_memory_ptr(
ctx, deps[1], ctx->buffer_data[out_buffer_index]);
cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, dequantize_index);
cpu::mkldnn_utils::mkldnn_invoke_primitive(
ctx, dequantize_index, deps, cpu::mkldnn_utils::OpType::DEQUANTIZE);
};
functors.emplace_back(functor);
}
......@@ -116,7 +123,9 @@ namespace ngraph
CPUExecutionContext* ectx) {
if (ctx->first_iteration)
{
mkldnn_emitter->build_quantize_reorder(ctx->mkldnn_primitives,
mkldnn_emitter->build_quantize_reorder(ctx->mkldnn_memories,
ctx->mkldnn_primitives,
ctx->mkldnn_scratchpad_mds,
input_desc,
result_desc,
scales,
......@@ -127,7 +136,9 @@ namespace ngraph
ctx, deps[0], ctx->buffer_data[arg0_buffer_index]);
cpu::mkldnn_utils::set_memory_ptr(
ctx, deps[1], ctx->buffer_data[out_buffer_index]);
cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, dequantize_index);
cpu::mkldnn_utils::mkldnn_invoke_primitive(
ctx, dequantize_index, deps, cpu::mkldnn_utils::OpType::DEQUANTIZE);
};
functors.emplace_back(functor);
}
......@@ -314,6 +325,7 @@ namespace ngraph
auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
auto input_desc = mkldnn_utils::get_input_mkldnn_md(node, 0);
auto result_desc = mkldnn_utils::get_output_mkldnn_md(node, 0);
QUERY_SCRATCHPAD_2ARGS(reorder, input_desc, result_desc);
auto scale_const_op =
std::dynamic_pointer_cast<ngraph::op::Constant>(quantize->get_argument(1));
......@@ -351,7 +363,9 @@ namespace ngraph
}
// quantize across first dim (mask=2^0) if dyn_scales is a vector
const int mask = scales_size == 1 ? 0 : 1;
mkldnn_emitter->build_quantize_reorder(ctx->mkldnn_primitives,
mkldnn_emitter->build_quantize_reorder(ctx->mkldnn_memories,
ctx->mkldnn_primitives,
ctx->mkldnn_scratchpad_mds,
input_desc,
result_desc,
dyn_scales,
......@@ -363,7 +377,9 @@ namespace ngraph
ctx, deps[0], ctx->buffer_data[arg0_buffer_index]);
cpu::mkldnn_utils::set_memory_ptr(
ctx, deps[1], ctx->buffer_data[out_buffer_index]);
cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, quantize_index);
cpu::mkldnn_utils::mkldnn_invoke_primitive(
ctx, quantize_index, deps, cpu::mkldnn_utils::OpType::QUANTIZE);
};
functors.emplace_back(functor);
}
......@@ -385,7 +401,9 @@ namespace ngraph
CPUExecutionContext* ectx) {
if (ctx->first_iteration)
{
mkldnn_emitter->build_quantize_reorder(ctx->mkldnn_primitives,
mkldnn_emitter->build_quantize_reorder(ctx->mkldnn_memories,
ctx->mkldnn_primitives,
ctx->mkldnn_scratchpad_mds,
input_desc,
result_desc,
scales,
......@@ -396,7 +414,9 @@ namespace ngraph
ctx, deps[0], ctx->buffer_data[arg0_buffer_index]);
cpu::mkldnn_utils::set_memory_ptr(
ctx, deps[1], ctx->buffer_data[out_buffer_index]);
cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, quantize_index);
cpu::mkldnn_utils::mkldnn_invoke_primitive(
ctx, quantize_index, deps, cpu::mkldnn_utils::OpType::QUANTIZE);
};
functors.emplace_back(functor);
}
......
......@@ -66,6 +66,8 @@ namespace ngraph
auto conv_attr =
mkldnn_emitter
->get_convolution_forward_attr<ngraph::op::QuantizedConvolution>(node);
QUERY_SCRATCHPAD_2ARGS(convolution_forward, conv_desc, conv_attr);
size_t conv_index = mkldnn_emitter->convolution_forward_init();
auto& deps = mkldnn_emitter->get_primitive_deps(conv_index);
......@@ -94,7 +96,9 @@ namespace ngraph
// use conv channelwise (dim 1, mask=2^1) if dyn_scales is a vector
conv_attr.set_output_scales(0, dyn_scales);
mkldnn_emitter->build_convolution_forward<false>(
ctx->mkldnn_memories,
ctx->mkldnn_primitives,
ctx->mkldnn_scratchpad_mds,
conv_desc,
conv_attr,
executor::global_cpu_engine,
......@@ -107,7 +111,9 @@ namespace ngraph
ctx, deps[1], ctx->buffer_data[arg1_buffer_index]);
cpu::mkldnn_utils::set_memory_ptr(
ctx, deps[2], ctx->buffer_data[out0_buffer_index]);
cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, conv_index);
cpu::mkldnn_utils::mkldnn_invoke_primitive(
ctx, conv_index, deps, cpu::mkldnn_utils::OpType::QUANTIZEDCONVOLUTION);
};
functors.emplace_back(functor);
}
......@@ -339,6 +345,8 @@ namespace ngraph
mkldnn_emitter
->get_convolution_forward_attr<ngraph::op::QuantizedConvolutionRelu>(
node);
QUERY_SCRATCHPAD_2ARGS(convolution_forward, conv_desc, conv_attr);
size_t conv_index = mkldnn_emitter->convolution_forward_init();
auto& deps = mkldnn_emitter->get_primitive_deps(conv_index);
......@@ -364,7 +372,9 @@ namespace ngraph
const int mask = scales_size == 1 ? 0 : 2;
conv_attr.set_output_scales(mask, dyn_scales);
mkldnn_emitter->build_convolution_forward<false>(
ctx->mkldnn_memories,
ctx->mkldnn_primitives,
ctx->mkldnn_scratchpad_mds,
conv_desc,
conv_attr,
executor::global_cpu_engine,
......@@ -377,7 +387,12 @@ namespace ngraph
ctx, deps[1], ctx->buffer_data[arg1_buffer_index]);
cpu::mkldnn_utils::set_memory_ptr(
ctx, deps[2], ctx->buffer_data[out0_buffer_index]);
cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, conv_index);
cpu::mkldnn_utils::mkldnn_invoke_primitive(
ctx,
conv_index,
deps,
cpu::mkldnn_utils::OpType::QUANTIZEDCONVOLUTIONRELU);
};
functors.emplace_back(functor);
}
......@@ -415,6 +430,8 @@ namespace ngraph
mkldnn_emitter
->get_convolution_forward_attr<ngraph::op::QuantizedConvolutionBias>(
node);
QUERY_SCRATCHPAD_2ARGS(convolution_forward, conv_desc, conv_attr);
size_t conv_index = mkldnn_emitter->convolution_forward_init(true);
auto& deps = mkldnn_emitter->get_primitive_deps(conv_index);
......@@ -441,7 +458,9 @@ namespace ngraph
const int mask = scales_size == 1 ? 0 : 2;
conv_attr.set_output_scales(mask, dyn_scales);
mkldnn_emitter->build_convolution_forward<true>(
ctx->mkldnn_memories,
ctx->mkldnn_primitives,
ctx->mkldnn_scratchpad_mds,
conv_desc,
conv_attr,
executor::global_cpu_engine,
......@@ -456,7 +475,12 @@ namespace ngraph
ctx, deps[2], ctx->buffer_data[arg2_buffer_index]);
cpu::mkldnn_utils::set_memory_ptr(
ctx, deps[3], ctx->buffer_data[out0_buffer_index]);
cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, conv_index);
cpu::mkldnn_utils::mkldnn_invoke_primitive(
ctx,
conv_index,
deps,
cpu::mkldnn_utils::OpType::QUANTIZEDCONVOLUTIONBIAS);
};
functors.emplace_back(functor);
}
......@@ -501,6 +525,8 @@ namespace ngraph
mkldnn_emitter
->get_convolution_forward_attr<ngraph::op::QuantizedConvolutionBiasAdd>(
node);
QUERY_SCRATCHPAD_2ARGS(convolution_forward, conv_desc, conv_attr);
size_t conv_index = mkldnn_emitter->convolution_forward_init(true);
auto& deps = mkldnn_emitter->get_primitive_deps(conv_index);
......@@ -553,7 +579,9 @@ namespace ngraph
conv_attr.set_output_scales(mask, dyn_scales);
conv_attr.set_post_ops(new_pops);
mkldnn_emitter->build_convolution_forward<true>(
ctx->mkldnn_memories,
ctx->mkldnn_primitives,
ctx->mkldnn_scratchpad_mds,
conv_desc,
conv_attr,
executor::global_cpu_engine,
......@@ -576,7 +604,12 @@ namespace ngraph
ctx, deps[2], ctx->buffer_data[arg2_buffer_index]);
cpu::mkldnn_utils::set_memory_ptr(
ctx, deps[3], ctx->buffer_data[out0_buffer_index]);
cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, conv_index);
cpu::mkldnn_utils::mkldnn_invoke_primitive(
ctx,
conv_index,
deps,
cpu::mkldnn_utils::OpType::QUANTIZEDCONVOLUTIONBIASADD);
};
functors.emplace_back(functor);
}
......@@ -617,6 +650,8 @@ namespace ngraph
ngraph::op::QuantizedConvolutionBiasSignedAdd>(node);
auto conv_attr = mkldnn_emitter->get_convolution_forward_attr<
ngraph::op::QuantizedConvolutionBiasSignedAdd>(node);
QUERY_SCRATCHPAD_2ARGS(convolution_forward, conv_desc, conv_attr);
size_t conv_index = mkldnn_emitter->convolution_forward_init(true);
auto& deps = mkldnn_emitter->get_primitive_deps(conv_index);
......@@ -669,7 +704,9 @@ namespace ngraph
const int mask = scales_size == 1 ? 0 : 2;
conv_attr.set_output_scales(mask, dyn_scales);
mkldnn_emitter->build_convolution_forward<true>(
ctx->mkldnn_memories,
ctx->mkldnn_primitives,
ctx->mkldnn_scratchpad_mds,
conv_desc,
conv_attr,
executor::global_cpu_engine,
......@@ -692,7 +729,12 @@ namespace ngraph
ctx, deps[2], ctx->buffer_data[arg2_buffer_index]);
cpu::mkldnn_utils::set_memory_ptr(
ctx, deps[3], ctx->buffer_data[out0_buffer_index]);
cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, conv_index);
cpu::mkldnn_utils::mkldnn_invoke_primitive(
ctx,
conv_index,
deps,
cpu::mkldnn_utils::OpType::QUANTIZEDCONVOLUTIONBIASSIGNEDADD);
};
functors.emplace_back(functor);
}
......
......@@ -63,6 +63,8 @@ namespace ngraph
auto ip_attr =
mkldnn_emitter
->get_inner_product_forward_attr<ngraph::op::QuantizedDotBias>(node);
QUERY_SCRATCHPAD_2ARGS(ip_forward, ip_desc, ip_attr);
size_t ip_index = mkldnn_emitter->inner_product_forward_init(true);
auto& deps = mkldnn_emitter->get_primitive_deps(ip_index);
......@@ -87,7 +89,9 @@ namespace ngraph
scales_size);
ip_attr.set_output_scales(0, dyn_scales);
mkldnn_emitter->build_inner_product_forward<true>(
ctx->mkldnn_memories,
ctx->mkldnn_primitives,
ctx->mkldnn_scratchpad_mds,
ip_desc,
ip_attr,
executor::global_cpu_engine,
......@@ -102,7 +106,9 @@ namespace ngraph
ctx, deps[2], ctx->buffer_data[arg2_buffer_index]);
cpu::mkldnn_utils::set_memory_ptr(
ctx, deps[3], ctx->buffer_data[out0_buffer_index]);
cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, ip_index);
cpu::mkldnn_utils::mkldnn_invoke_primitive(
ctx, ip_index, deps, cpu::mkldnn_utils::OpType::QUANTIZEDDOTBIAS);
};
functors.emplace_back(functor);
}
......
......@@ -56,6 +56,8 @@ namespace ngraph
auto ip_attr =
mkldnn_emitter->get_inner_product_forward_attr<ngraph::op::QuantizedMatmul>(
node);
QUERY_SCRATCHPAD_2ARGS(ip_forward, ip_desc, ip_attr);
size_t ip_index = mkldnn_emitter->inner_product_forward_init(false);
auto& deps = mkldnn_emitter->get_primitive_deps(ip_index);
......@@ -76,7 +78,9 @@ namespace ngraph
*(static_cast<float*>(ctx->buffer_data[arg2_buffer_index])));
ip_attr.set_output_scales(0, dyn_scales);
mkldnn_emitter->build_inner_product_forward<false>(
ctx->mkldnn_memories,
ctx->mkldnn_primitives,
ctx->mkldnn_scratchpad_mds,
ip_desc,
ip_attr,
executor::global_cpu_engine,
......@@ -89,7 +93,9 @@ namespace ngraph
ctx, deps[1], ctx->buffer_data[arg1_buffer_index]);
cpu::mkldnn_utils::set_memory_ptr(
ctx, deps[2], ctx->buffer_data[out0_buffer_index]);
cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, ip_index);
cpu::mkldnn_utils::mkldnn_invoke_primitive(
ctx, ip_index, deps, cpu::mkldnn_utils::OpType::QUANTIZEDMATMUL);
};
functors.emplace_back(functor);
}
......
......@@ -41,6 +41,8 @@ namespace ngraph
auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
auto relu_desc = mkldnn_emitter->get_relu_forward_desc(node);
QUERY_SCRATCHPAD(eltwise_forward, relu_desc);
// Relu needs 3 primitives: input, result, and eltwise_forward.
size_t relu_index = mkldnn_emitter->reserve_primitive_space(3);
auto& deps = mkldnn_emitter->get_primitive_deps(relu_index);
......@@ -49,14 +51,20 @@ namespace ngraph
CPURuntimeContext* ctx, CPUExecutionContext* ectx) {
if (ctx->first_iteration)
{
mkldnn_emitter->build_relu_forward(
ctx->mkldnn_primitives, relu_desc, deps, relu_index);
mkldnn_emitter->build_relu_forward(ctx->mkldnn_memories,
ctx->mkldnn_primitives,
ctx->mkldnn_scratchpad_mds,
relu_desc,
deps,
relu_index);
}
cpu::mkldnn_utils::set_memory_ptr(
ctx, deps[0], ctx->buffer_data[arg_buffer_index]);
cpu::mkldnn_utils::set_memory_ptr(
ctx, deps[1], ctx->buffer_data[out_buffer_index]);
cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, relu_index);
cpu::mkldnn_utils::mkldnn_invoke_primitive(
ctx, relu_index, deps, cpu::mkldnn_utils::OpType::RELU);
};
functors.emplace_back(functor);
}
......@@ -81,6 +89,8 @@ namespace ngraph
auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
auto bwd_desc = mkldnn_emitter->get_relu_backward_desc(node);
auto fwd_desc = mkldnn_emitter->get_relu_forward_desc(node);
QUERY_SCRATCHPAD_2ARGS(eltwise_backward, fwd_desc, bwd_desc);
// ReluBackprop needs 4 primitives: input, delta, result, and eltwise_backward.
size_t relu_index = mkldnn_emitter->reserve_primitive_space(4);
auto& deps = mkldnn_emitter->get_primitive_deps(relu_index);
......@@ -95,8 +105,13 @@ namespace ngraph
CPUExecutionContext* ectx) {
if (ctx->first_iteration)
{
mkldnn_emitter->build_relu_backward(
ctx->mkldnn_primitives, bwd_desc, fwd_desc, deps, relu_index);
mkldnn_emitter->build_relu_backward(ctx->mkldnn_memories,
ctx->mkldnn_primitives,
ctx->mkldnn_scratchpad_mds,
bwd_desc,
fwd_desc,
deps,
relu_index);
}
cpu::mkldnn_utils::set_memory_ptr(
ctx, deps[0], ctx->buffer_data[arg_fwd_buffer_index]);
......@@ -104,7 +119,9 @@ namespace ngraph
ctx, deps[1], ctx->buffer_data[delta_buffer_index]);
cpu::mkldnn_utils::set_memory_ptr(
ctx, deps[2], ctx->buffer_data[out_buffer_index]);
cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, relu_index);
cpu::mkldnn_utils::mkldnn_invoke_primitive(
ctx, relu_index, deps, cpu::mkldnn_utils::OpType::RELUBACKPROP);
};
functors.emplace_back(functor);
}
......
......@@ -44,11 +44,6 @@ namespace ngraph
external_function->get_buffer_index(args[0].get_name());
auto src_iter_buffer_index =
external_function->get_buffer_index(args[1].get_name());
auto weights_layer_buffer_index =
external_function->get_buffer_index(args[2].get_name());
auto weights_iter_buffer_index =
external_function->get_buffer_index(args[3].get_name());
auto bias_buffer_index = external_function->get_buffer_index(args[4].get_name());
auto dst_layer_buffer_index =
external_function->get_buffer_index(out[0].get_name());
auto dst_iter_buffer_index = external_function->get_buffer_index(out[1].get_name());
......@@ -56,6 +51,14 @@ namespace ngraph
auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
auto rnn_desc =
mkldnn_emitter->get_rnn_forward_desc<ngraph::op::Rnn>(node, args, out);
#if MKLDNN_VERSION_MAJOR < 1
auto weights_layer_buffer_index =
external_function->get_buffer_index(args[2].get_name());
auto weights_iter_buffer_index =
external_function->get_buffer_index(args[3].get_name());
auto bias_buffer_index = external_function->get_buffer_index(args[4].get_name());
// Rnn needs 9 primitives: src_layer, src_iter, weights_layer, weights_iter, bias,
// dst_layer, dst_iter, workspace, and rnn_forward.
// It needs a new workspace.
......@@ -76,7 +79,9 @@ namespace ngraph
CPUExecutionContext* ectx) {
if (ctx->first_iteration)
{
mkldnn_emitter->build_rnn_forward(ctx->mkldnn_primitives,
mkldnn_emitter->build_rnn_forward(ctx->mkldnn_memories,
ctx->mkldnn_primitives,
ctx->mkldnn_scratchpad_mds,
ctx->mkldnn_workspaces,
rnn_desc,
deps,
......@@ -98,9 +103,81 @@ namespace ngraph
ctx, deps[6], ctx->buffer_data[dst_iter_buffer_index]);
cpu::mkldnn_utils::set_memory_ptr(
ctx, deps[7], ctx->mkldnn_workspaces[deps[8]]);
cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, rnn_index);
cpu::mkldnn_utils::mkldnn_invoke_primitive(
ctx, rnn_index, deps, cpu::mkldnn_utils::OpType::RNN);
};
functors.emplace_back(functor);
#else
mkldnn_emitter->query_scratchpad_rnn_forward(rnn_desc);
auto src_iter_c_buffer_index =
external_function->get_buffer_index(args[2].get_name());
auto weights_layer_buffer_index =
external_function->get_buffer_index(args[3].get_name());
auto weights_iter_buffer_index =
external_function->get_buffer_index(args[4].get_name());
auto bias_buffer_index = external_function->get_buffer_index(args[5].get_name());
auto dst_iter_c_buffer_index =
external_function->get_buffer_index(out[2].get_name());
// Rnn needs 11 primitives: src_layer, src_iter, src_iter_c, weights_layer,
// weights_iter, bias,
// dst_layer, dst_iter, dst_iter_c, workspace, and lstm_forward.
// It needs a new workspace.
auto rnn_index =
mkldnn_emitter->reserve_primitive_space(11, true /* new workspace */);
auto& deps = mkldnn_emitter->get_primitive_deps(rnn_index);
auto functor = [&,
rnn_desc,
rnn_index,
src_layer_buffer_index,
src_iter_buffer_index,
src_iter_c_buffer_index,
weights_layer_buffer_index,
weights_iter_buffer_index,
bias_buffer_index,
dst_layer_buffer_index,
dst_iter_buffer_index,
dst_iter_c_buffer_index](CPURuntimeContext* ctx,
CPUExecutionContext* ectx) {
if (ctx->first_iteration)
{
mkldnn_emitter->build_rnn_forward(ctx->mkldnn_memories,
ctx->mkldnn_primitives,
ctx->mkldnn_scratchpad_mds,
ctx->mkldnn_workspaces,
rnn_desc,
deps,
rnn_index);
}
cpu::mkldnn_utils::set_memory_ptr(
ctx, deps[0], ctx->buffer_data[src_layer_buffer_index]);
cpu::mkldnn_utils::set_memory_ptr(
ctx, deps[1], ctx->buffer_data[src_iter_buffer_index]);
cpu::mkldnn_utils::set_memory_ptr(
ctx, deps[2], ctx->buffer_data[src_iter_c_buffer_index]);
cpu::mkldnn_utils::set_memory_ptr(
ctx, deps[3], ctx->buffer_data[weights_layer_buffer_index]);
cpu::mkldnn_utils::set_memory_ptr(
ctx, deps[4], ctx->buffer_data[weights_iter_buffer_index]);
cpu::mkldnn_utils::set_memory_ptr(
ctx, deps[5], ctx->buffer_data[bias_buffer_index]);
cpu::mkldnn_utils::set_memory_ptr(
ctx, deps[6], ctx->buffer_data[dst_layer_buffer_index]);
cpu::mkldnn_utils::set_memory_ptr(
ctx, deps[7], ctx->buffer_data[dst_iter_buffer_index]);
cpu::mkldnn_utils::set_memory_ptr(
ctx, deps[8], ctx->buffer_data[dst_iter_c_buffer_index]);
cpu::mkldnn_utils::set_memory_ptr(
ctx, deps[9], ctx->mkldnn_workspaces[deps[10]]);
cpu::mkldnn_utils::mkldnn_invoke_primitive(
ctx, rnn_index, deps, cpu::mkldnn_utils::OpType::RNN);
};
functors.emplace_back(functor);
#endif
}
// Registers the Rnn op with the CPU (DEX) builder registry so the
// functor-emitting builder above is dispatched for ngraph::op::Rnn nodes.
void register_builders_rnn_cpp() { REGISTER_OP_BUILDER(Rnn); }
......
......@@ -43,6 +43,8 @@ namespace ngraph
auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
auto sigmoid_desc = mkldnn_emitter->get_sigmoid_forward_desc(node, false);
QUERY_SCRATCHPAD(eltwise_forward, sigmoid_desc);
// Sigmoid needs 3 primitives: input, result, and eltwise_forward.
auto sigmoid_index = mkldnn_emitter->reserve_primitive_space(3);
auto& deps = mkldnn_emitter->get_primitive_deps(sigmoid_index);
......@@ -52,14 +54,20 @@ namespace ngraph
CPURuntimeContext* ctx, CPUExecutionContext* ectx) {
if (ctx->first_iteration)
{
mkldnn_emitter->build_sigmoid_forward(
ctx->mkldnn_primitives, sigmoid_desc, deps, sigmoid_index);
mkldnn_emitter->build_sigmoid_forward(ctx->mkldnn_memories,
ctx->mkldnn_primitives,
ctx->mkldnn_scratchpad_mds,
sigmoid_desc,
deps,
sigmoid_index);
}
cpu::mkldnn_utils::set_memory_ptr(
ctx, deps[0], ctx->buffer_data[arg0_buffer_index]);
cpu::mkldnn_utils::set_memory_ptr(
ctx, deps[1], ctx->buffer_data[out_buffer_index]);
cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, sigmoid_index);
cpu::mkldnn_utils::mkldnn_invoke_primitive(
ctx, sigmoid_index, deps, cpu::mkldnn_utils::OpType::SIGMOID);
};
functors.emplace_back(functor);
}
......@@ -80,6 +88,8 @@ namespace ngraph
auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
auto fwd_desc = mkldnn_emitter->get_sigmoid_forward_desc(node, true);
auto bwd_desc = mkldnn_emitter->get_sigmoid_backward_desc(node);
QUERY_SCRATCHPAD_2ARGS(eltwise_backward, fwd_desc, bwd_desc);
// SigmoidBackprop needs 4 primitives: input, delta, result, and eltwise_backward.
size_t sigmoid_index = mkldnn_emitter->reserve_primitive_space(4);
auto& deps = mkldnn_emitter->get_primitive_deps(sigmoid_index);
......@@ -94,8 +104,13 @@ namespace ngraph
CPUExecutionContext* ectx) {
if (ctx->first_iteration)
{
mkldnn_emitter->build_sigmoid_backward(
ctx->mkldnn_primitives, bwd_desc, fwd_desc, deps, sigmoid_index);
mkldnn_emitter->build_sigmoid_backward(ctx->mkldnn_memories,
ctx->mkldnn_primitives,
ctx->mkldnn_scratchpad_mds,
bwd_desc,
fwd_desc,
deps,
sigmoid_index);
}
cpu::mkldnn_utils::set_memory_ptr(
ctx, deps[0], ctx->buffer_data[arg0_buffer_index]);
......@@ -103,7 +118,9 @@ namespace ngraph
ctx, deps[1], ctx->buffer_data[arg1_buffer_index]);
cpu::mkldnn_utils::set_memory_ptr(
ctx, deps[2], ctx->buffer_data[out_buffer_index]);
cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, sigmoid_index);
cpu::mkldnn_utils::mkldnn_invoke_primitive(
ctx, sigmoid_index, deps, cpu::mkldnn_utils::OpType::SIGMOIDBACKPROP);
};
functors.emplace_back(functor);
}
......
......@@ -93,6 +93,8 @@ namespace ngraph
auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
auto input_desc = mkldnn_utils::get_input_mkldnn_md(node, 0);
auto result_desc = mkldnn_utils::get_output_mkldnn_md(node, 0);
QUERY_SCRATCHPAD_4ARGS(slice, input_desc, result_desc, lower_bounds, out_shape);
// Slice needs 3 primitives: input, result, and reorder.
auto slice_index = mkldnn_emitter->reserve_primitive_space(3);
auto& deps = mkldnn_emitter->get_primitive_deps(slice_index);
......@@ -108,7 +110,9 @@ namespace ngraph
CPUExecutionContext* ectx) {
if (ctx->first_iteration)
{
mkldnn_emitter->build_slice(ctx->mkldnn_primitives,
mkldnn_emitter->build_slice(ctx->mkldnn_memories,
ctx->mkldnn_primitives,
ctx->mkldnn_scratchpad_mds,
input_desc,
result_desc,
lower_bounds,
......@@ -120,7 +124,9 @@ namespace ngraph
ctx, deps[0], ctx->buffer_data[arg_buffer_index]);
cpu::mkldnn_utils::set_memory_ptr(
ctx, deps[1], ctx->buffer_data[out_buffer_index]);
cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, slice_index);
cpu::mkldnn_utils::mkldnn_invoke_primitive(
ctx, slice_index, deps, cpu::mkldnn_utils::OpType::SLICE);
};
functors.emplace_back(functor);
......
......@@ -48,6 +48,8 @@ namespace ngraph
{
auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
auto softmax_desc = mkldnn_emitter->get_softmax_forward_desc(node);
QUERY_SCRATCHPAD(softmax_forward, softmax_desc);
// Softmax needs 3 primitives: input, result, and softmax_forward.
size_t softmax_index = mkldnn_emitter->reserve_primitive_space(3);
auto& deps = mkldnn_emitter->get_primitive_deps(softmax_index);
......@@ -57,14 +59,20 @@ namespace ngraph
CPURuntimeContext* ctx, CPUExecutionContext* ectx) {
if (ctx->first_iteration)
{
mkldnn_emitter->build_softmax_forward(
ctx->mkldnn_primitives, softmax_desc, deps, softmax_index);
mkldnn_emitter->build_softmax_forward(ctx->mkldnn_memories,
ctx->mkldnn_primitives,
ctx->mkldnn_scratchpad_mds,
softmax_desc,
deps,
softmax_index);
}
cpu::mkldnn_utils::set_memory_ptr(
ctx, deps[0], ctx->buffer_data[arg_buffer_index]);
cpu::mkldnn_utils::set_memory_ptr(
ctx, deps[1], ctx->buffer_data[out_buffer_index]);
cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, softmax_index);
cpu::mkldnn_utils::mkldnn_invoke_primitive(
ctx, softmax_index, deps, cpu::mkldnn_utils::OpType::SOFTMAX);
};
functors.emplace_back(functor);
}
......
......@@ -207,11 +207,17 @@ void runtime::cpu::CPU_CallFrame::setup_runtime_context(Allocator* allocator)
ctx->memory_buffers.push_back(buffer);
}
const auto& mkldnn_emitter = m_external_function->get_mkldnn_emitter();
// Create scratchpad
auto scratchpad_size = mkldnn_emitter->get_max_scratchpad_size();
if (m_external_function->is_direct_execution())
{
ctx->mkldnn_primitives =
std::vector<mkldnn::primitive*>(mkldnn_emitter->get_mkldnn_primitives().size());
ctx->mkldnn_memories =
std::vector<mkldnn::memory*>(mkldnn_emitter->get_mkldnn_memories().size());
ctx->mkldnn_scratchpad_mds = std::vector<mkldnn::memory::desc*>(
mkldnn_emitter->get_mkldnn_scratchpad_mds().size());
ctx->scratchpad_buffer = new AlignedBuffer(scratchpad_size, alignment);
}
else
{
......@@ -249,10 +255,23 @@ void runtime::cpu::CPU_CallFrame::cleanup_runtime_context()
{
delete p;
}
for (auto m : ctx->mkldnn_memories)
{
delete m;
}
for (auto buffer : ctx->memory_buffers)
{
delete buffer;
}
for (auto s : ctx->mkldnn_scratchpad_mds)
{
delete s;
}
if (m_external_function->is_direct_execution())
{
delete ctx->scratchpad_buffer;
}
if (m_external_function->is_direct_execution() &&
std::getenv("NGRAPH_CPU_USE_TBB") != nullptr)
{
......
This diff is collapsed.
......@@ -135,8 +135,11 @@ namespace ngraph
static CPUExecutor cpu_executor(num_thread_pools < 1 ? 1 : num_thread_pools);
return cpu_executor;
}
// Process-wide mkldnn engine shared by all CPU primitives.
// mkldnn v1.0 moved the engine-kind enumerators under engine::kind
// (engine::kind::cpu), whereas v0.x exposed them directly on engine
// (engine::cpu); the constructed engine is equivalent in both branches.
#if MKLDNN_VERSION_MAJOR < 1
mkldnn::engine global_cpu_engine(mkldnn::engine::cpu, 0);
#else
mkldnn::engine global_cpu_engine(mkldnn::engine::kind::cpu, 0);
#endif
}
}
}
......
......@@ -679,7 +679,14 @@ using namespace ngraph::runtime;
writer << "void inline CPURuntimeContextCG::init_mkldnn_primitives()\n";
writer.block_begin();
writer << "mkldnn_primitives = std::vector<mkldnn::primitive*>("
<< to_string(m_mkldnn_emitter->get_mkldnn_primitives_cg().size()) << ");\n";
<< to_string(m_mkldnn_emitter->get_mkldnn_primitives().size()) << ");\n";
writer << "mkldnn_memories = std::vector<mkldnn::memory*>("
<< to_string(m_mkldnn_emitter->get_mkldnn_memories().size()) << ");\n";
writer << "mkldnn_scratchpad_mds = std::vector<mkldnn::memory::desc*>("
<< to_string(m_mkldnn_emitter->get_mkldnn_scratchpad_mds().size()) << ");\n";
writer << "size_t scratchpad_size = " << m_mkldnn_emitter->get_max_scratchpad_size() << ";\n";
writer << "size_t alignment = 4096;\n";
writer << "scratchpad_buffer = new AlignedBuffer(scratchpad_size, alignment);\n";
writer.block_end();
writer << "\n";
......@@ -742,9 +749,8 @@ using namespace ngraph::runtime;
writer.block_begin();
writer << "// read in memory descriptors and build mkldnn primitives\n";
writer << "std::ifstream desc_file (\"" << m_desc_filename << "\", std::ios::binary);\n";
writer << "deserialize_memory_descs_and_build_memory_primitives(" << m_desc_filename
<< ", cg_ctx, " << to_string(m_mkldnn_emitter->get_mkldnn_descriptors_size())
<< ");\n";
writer << "deserialize_memory_descs_and_build_memory(" << m_desc_filename << ", cg_ctx, "
<< to_string(m_mkldnn_emitter->get_mkldnn_descriptors_size()) << ");\n";
writer.block_end();
}
......
......@@ -21,6 +21,14 @@
#include "ngraph/runtime/cpu/cpu_executor.hpp"
#include "ngraph/runtime/cpu/mkldnn_utils.hpp"
// Version-compatibility aliases: mkldnn v1.0 renamed the "undefined format"
// enumerator (format_undef -> undef) and moved f32 under data_type.
// UNDEF and F32 let the code below spell both spellings uniformly.
#if MKLDNN_VERSION_MAJOR < 1
#define UNDEF format_undef
#define F32 f32
#else
#define UNDEF undef
#define F32 data_type::f32
#endif
namespace ngraph
{
namespace runtime
......@@ -29,8 +37,8 @@ namespace ngraph
{
const mkldnn::memory::desc
LayoutDescriptor::DummyDesc(mkldnn::memory::dims(TENSOR_MAX_DIMS),
mkldnn::memory::f32,
mkldnn::memory::format::format_undef);
mkldnn::memory::F32,
mkldnn::memory::FORMAT::UNDEF);
LayoutDescriptor::LayoutDescriptor(const ngraph::descriptor::Tensor& tv)
: TensorLayout(tv)
......@@ -109,15 +117,18 @@ namespace ngraph
// http://intel.github.io/mkl-dnn/understanding_memory_formats.html
try
{
#if MKLDNN_VERSION_MAJOR < 1
auto mem_prim_desc =
mkldnn::memory::primitive_desc(md, executor::global_cpu_engine);
m_buffer_size = mem_prim_desc.get_size();
#else
m_buffer_size = md.get_size();
#endif
}
catch (const mkldnn::error& e)
{
throw ngraph_error(
"error in computing mkldnn memory size from memory primitive desc: " +
e.message);
throw ngraph_error("error in computing mkldnn memory size from memory desc: " +
MKLDNN_ERROR_MESSAGE);
}
}
......
......@@ -50,7 +50,12 @@ namespace ngraph
void set_mkldnn_md(const mkldnn::memory::desc& md);
// True when this tensor layout carries an mkldnn memory descriptor with a
// defined format, i.e. mkldnn owns the physical layout (possibly blocked).
bool is_mkldnn_layout() const
{
#if MKLDNN_VERSION_MAJOR < 1
// v0.x: the format enum lives directly in md.data.format.
return m_mkldnn_md.data.format != mkldnn::memory::format::format_undef;
#else
// v1.0: format was replaced by format_kind; the C field is the raw
// C enum, hence the cast to the C++ format_kind before comparing.
return static_cast<mkldnn::memory::format_kind>(m_mkldnn_md.data.format_kind) !=
mkldnn::memory::format_kind::undef;
#endif
}
bool is_row_major_layout();
......
......@@ -63,8 +63,11 @@ namespace ngraph
bool first_iteration;
// stores tensor pointers
std::vector<void*> buffer_data;
std::vector<mkldnn::memory*> mkldnn_memories;
std::vector<mkldnn::primitive*> mkldnn_primitives;
std::vector<AlignedBuffer*> memory_buffers;
std::vector<mkldnn::memory::desc*> mkldnn_scratchpad_mds;
AlignedBuffer* scratchpad_buffer;
std::vector<char*> mkldnn_workspaces;
tbb::flow::graph* G;
tbb::global_control* c;
......
......@@ -142,11 +142,20 @@ void runtime::cpu::CPUTensorView::read(void* target, size_t n) const
auto output_desc = mkldnn_utils::create_blocked_mkldnn_md(
this->get_shape(), cpu_tvl->get_strides(), this->get_element_type());
#if MKLDNN_VERSION_MAJOR < 1
memory input{{input_desc, executor::global_cpu_engine}, aligned_buffer};
memory output{{output_desc, executor::global_cpu_engine}, target};
reorder prim{input, output};
mkldnn::stream s(mkldnn::stream::kind::eager);
s.submit({prim}).wait();
#else
memory input{input_desc, executor::global_cpu_engine, aligned_buffer};
memory output{output_desc, executor::global_cpu_engine, target};
reorder prim{input, output};
mkldnn::stream s(executor::global_cpu_engine);
prim.execute(s, {{MKLDNN_ARG_SRC, input}, {MKLDNN_ARG_DST, output}});
s.wait();
#endif
}
else
{
......
......@@ -15,6 +15,7 @@
//*****************************************************************************
#include "cpu_visualize_tree.hpp"
#include <string>
#include "ngraph/op/reshape.hpp"
#include "ngraph/runtime/cpu/cpu_layout_descriptor.hpp"
#include "ngraph/runtime/cpu/cpu_op_annotations.hpp"
......@@ -49,12 +50,29 @@ static void visualize_layout_format(const Node& node, ostream& ss)
{
ss << "\ninput_order=" << reshape->get_input_order();
}
ss << "\nin="
<< runtime::cpu::mkldnn_utils::get_mkldnn_format_string(
static_cast<mkldnn::memory::format>(in_tvl->get_mkldnn_md().data.format));
ss << " out="
<< runtime::cpu::mkldnn_utils::get_mkldnn_format_string(
static_cast<mkldnn::memory::format>(out_tvl->get_mkldnn_md().data.format));
#if MKLDNN_VERSION_MAJOR >= 1
auto in_md = in_tvl->get_mkldnn_md();
auto out_md = out_tvl->get_mkldnn_md();
ss << "\nin strides={";
for (auto i = 0; i < in_md.data.ndims - 1; i++)
{
ss << in_md.data.format_desc.blocking.strides[i] << ",";
}
ss << in_md.data.format_desc.blocking.strides[in_md.data.ndims - 1] << "}";
ss << "\nout strides={";
for (auto i = 0; i < out_md.data.ndims - 1; i++)
{
ss << out_md.data.format_desc.blocking.strides[i] << ",";
}
ss << out_md.data.format_desc.blocking.strides[out_md.data.ndims - 1] << "}";
#else
ss << "\nin=" << runtime::cpu::mkldnn_utils::get_mkldnn_format_string(
static_cast<mkldnn::memory::FORMAT_KIND>(
in_tvl->get_mkldnn_md().data.FORMAT_KIND));
ss << " out=" << runtime::cpu::mkldnn_utils::get_mkldnn_format_string(
static_cast<mkldnn::memory::FORMAT_KIND>(
out_tvl->get_mkldnn_md().data.FORMAT_KIND));
#endif
ss << " ";
}
catch (...)
......
This diff is collapsed.
This diff is collapsed.
......@@ -19,20 +19,22 @@
#include <mkldnn.hpp>
#include "mkldnn_invoke.hpp"
#include "ngraph/runtime/aligned_buffer.hpp"
#include "ngraph/runtime/cpu/cpu_executor.hpp"
#include "ngraph/runtime/cpu/cpu_runtime_context.hpp"
#include "ngraph/runtime/cpu/mkldnn_utils.hpp"
#if MKLDNN_VERSION_MAJOR < 1
extern "C" void ngraph::runtime::cpu::mkldnn_utils::set_memory_ptr(CPURuntimeContext* ctx,
size_t primitive_index,
size_t index,
void* ptr)
{
auto primitive = static_cast<mkldnn::memory*>(ctx->mkldnn_primitives[primitive_index]);
auto primitive = static_cast<mkldnn::memory*>(ctx->mkldnn_primitives[index]);
primitive->set_data_handle(ptr);
}
extern "C" void ngraph::runtime::cpu::mkldnn_utils::mkldnn_invoke_primitive(CPURuntimeContext* ctx,
size_t primitive_index)
extern "C" void ngraph::runtime::cpu::mkldnn_utils::mkldnn_invoke_primitive(
CPURuntimeContext* ctx, size_t primitive_index, std::vector<size_t>& deps, OpType type)
{
mkldnn::stream s(mkldnn::stream::kind::eager);
try
......@@ -41,6 +43,175 @@ extern "C" void ngraph::runtime::cpu::mkldnn_utils::mkldnn_invoke_primitive(CPUR
}
catch (const mkldnn::error& e)
{
throw ngraph_error("Could not run mkdnn primitive " + e.message);
throw ngraph_error("Could not run mkdnn primitive " + MKLDNN_ERROR_MESSAGE);
}
}
#else
// Rebind the mkldnn (v1.x) memory object in slot `index` of the runtime
// context to a new data buffer. Called before primitive execution so the
// cached memory objects point at the current tensor storage.
extern "C" void ngraph::runtime::cpu::mkldnn_utils::set_memory_ptr(CPURuntimeContext* ctx,
                                                                   size_t index,
                                                                   void* ptr)
{
    ctx->mkldnn_memories[index]->set_data_handle(ptr);
}
// Execute the mkldnn (v1.x) primitive stored at `primitive_index`.
//
// mkldnn v1.0 primitives receive their memories at execute() time rather
// than at construction time, so `deps` carries indices into
// ctx->mkldnn_memories and `type` selects how those indices map onto the
// primitive's MKLDNN_ARG_* execution arguments.
//
// A user-mode scratchpad is always attached: primitives are built with
// scratchpad_mode::user, and ctx->scratchpad_buffer is presumably sized to
// the maximum scratchpad requirement queried at primitive-build time
// (see QUERY_SCRATCHPAD / GET_SIZE in mkldnn_utils) — confirm against the
// emitter if reusing this outside the CPU backend.
//
// Fix: the exception message previously misspelled the library name as
// "mkdnn".
extern "C" void ngraph::runtime::cpu::mkldnn_utils::mkldnn_invoke_primitive(
    CPURuntimeContext* ctx, size_t primitive_index, std::vector<size_t>& deps, OpType type)
{
    std::unordered_map<int, mkldnn::memory> exec_args;
    size_t nargs;
    switch (type)
    {
    case OpType::ADD:
        exec_args = {{MKLDNN_ARG_MULTIPLE_SRC, *ctx->mkldnn_memories[deps[0]]},
                     {MKLDNN_ARG_MULTIPLE_SRC + 1, *ctx->mkldnn_memories[deps[1]]},
                     {MKLDNN_ARG_DST, *ctx->mkldnn_memories[deps[2]]}};
        break;
    // Single-input/single-output ops: deps = {src, dst}.
    case OpType::AVGPOOL:
    case OpType::BOUNDEDRELU:
    case OpType::CONVERTLAYOUT:
    case OpType::LEAKYRELU:
    case OpType::LRN:
    case OpType::MAXPOOL:
    case OpType::QUANTIZE:
    case OpType::DEQUANTIZE:
    case OpType::QUANTIZEDAVGPOOL:
    case OpType::QUANTIZEDMAXPOOL:
    case OpType::RELU:
    case OpType::SIGMOID:
    case OpType::SLICE:
    case OpType::SOFTMAX:
        exec_args = {{MKLDNN_ARG_SRC, *ctx->mkldnn_memories[deps[0]]},
                     {MKLDNN_ARG_DST, *ctx->mkldnn_memories[deps[1]]}};
        break;
    case OpType::AVGPOOLBACKPROP:
        exec_args = {{MKLDNN_ARG_DIFF_DST, *ctx->mkldnn_memories[deps[0]]},
                     {MKLDNN_ARG_DIFF_SRC, *ctx->mkldnn_memories[deps[1]]}};
        break;
    case OpType::BATCHNORM3ARGS:
        exec_args = {{MKLDNN_ARG_SRC, *ctx->mkldnn_memories[deps[0]]},
                     {MKLDNN_ARG_WEIGHTS, *ctx->mkldnn_memories[deps[1]]},
                     {MKLDNN_ARG_DST, *ctx->mkldnn_memories[deps[2]]},
                     {MKLDNN_ARG_MEAN, *ctx->mkldnn_memories[deps[3]]},
                     {MKLDNN_ARG_VARIANCE, *ctx->mkldnn_memories[deps[4]]}};
        break;
    case OpType::BATCHNORM5ARGS:
        exec_args = {{MKLDNN_ARG_SRC, *ctx->mkldnn_memories[deps[0]]},
                     {MKLDNN_ARG_MEAN, *ctx->mkldnn_memories[deps[1]]},
                     {MKLDNN_ARG_VARIANCE, *ctx->mkldnn_memories[deps[2]]},
                     {MKLDNN_ARG_WEIGHTS, *ctx->mkldnn_memories[deps[3]]},
                     {MKLDNN_ARG_DST, *ctx->mkldnn_memories[deps[4]]}};
        break;
    case OpType::BATCHNORMBACKPROP:
        exec_args = {{MKLDNN_ARG_WEIGHTS, *ctx->mkldnn_memories[deps[0]]},
                     {MKLDNN_ARG_SRC, *ctx->mkldnn_memories[deps[1]]},
                     {MKLDNN_ARG_MEAN, *ctx->mkldnn_memories[deps[2]]},
                     {MKLDNN_ARG_VARIANCE, *ctx->mkldnn_memories[deps[3]]},
                     {MKLDNN_ARG_DIFF_DST, *ctx->mkldnn_memories[deps[4]]},
                     {MKLDNN_ARG_DIFF_SRC, *ctx->mkldnn_memories[deps[5]]},
                     {MKLDNN_ARG_DIFF_WEIGHTS, *ctx->mkldnn_memories[deps[6]]}};
        break;
    // Variadic-input ops: deps = {src_0 ... src_{n-1}, dst}.
    case OpType::CONCAT:
    case OpType::QUANTIZEDCONCAT:
        nargs = deps.size() - 1;
        for (size_t i = 0; i < nargs; i++)
        {
            exec_args.insert({MKLDNN_ARG_MULTIPLE_SRC + i, *ctx->mkldnn_memories[deps[i]]});
        }
        exec_args.insert({MKLDNN_ARG_DST, *ctx->mkldnn_memories[deps[nargs]]});
        break;
    case OpType::CONVOLUTION:
    case OpType::CONVOLUTIONRELU:
    case OpType::CONVOLUTIONADD:
    case OpType::GROUPCONVOLUTION:
    case OpType::QUANTIZEDMATMUL:
    case OpType::QUANTIZEDCONVOLUTION:
    case OpType::QUANTIZEDCONVOLUTIONRELU:
        exec_args = {{MKLDNN_ARG_SRC, *ctx->mkldnn_memories[deps[0]]},
                     {MKLDNN_ARG_WEIGHTS, *ctx->mkldnn_memories[deps[1]]},
                     {MKLDNN_ARG_DST, *ctx->mkldnn_memories[deps[2]]}};
        break;
    case OpType::CONVOLUTIONBIAS:
    case OpType::CONVOLUTIONBIASADD:
    case OpType::GROUPCONVOLUTIONBIAS:
    case OpType::QUANTIZEDDOTBIAS:
    case OpType::QUANTIZEDCONVOLUTIONBIAS:
    case OpType::QUANTIZEDCONVOLUTIONBIASADD:
    case OpType::QUANTIZEDCONVOLUTIONBIASSIGNEDADD:
        exec_args = {{MKLDNN_ARG_SRC, *ctx->mkldnn_memories[deps[0]]},
                     {MKLDNN_ARG_WEIGHTS, *ctx->mkldnn_memories[deps[1]]},
                     {MKLDNN_ARG_BIAS, *ctx->mkldnn_memories[deps[2]]},
                     {MKLDNN_ARG_DST, *ctx->mkldnn_memories[deps[3]]}};
        break;
    // Note: deps order here is {weights, diff_dst, diff_src}.
    case OpType::CONVOLUTIONBACKPROPDATA:
        exec_args = {{MKLDNN_ARG_DIFF_DST, *ctx->mkldnn_memories[deps[1]]},
                     {MKLDNN_ARG_WEIGHTS, *ctx->mkldnn_memories[deps[0]]},
                     {MKLDNN_ARG_DIFF_SRC, *ctx->mkldnn_memories[deps[2]]}};
        break;
    case OpType::CONVOLUTIONBACKPROPWEIGHTS:
        exec_args = {{MKLDNN_ARG_SRC, *ctx->mkldnn_memories[deps[0]]},
                     {MKLDNN_ARG_DIFF_DST, *ctx->mkldnn_memories[deps[1]]},
                     {MKLDNN_ARG_DIFF_WEIGHTS, *ctx->mkldnn_memories[deps[2]]}};
        break;
    case OpType::CONVOLUTIONBACKPROPWEIGHTSBIAS:
        exec_args = {{MKLDNN_ARG_SRC, *ctx->mkldnn_memories[deps[0]]},
                     {MKLDNN_ARG_DIFF_DST, *ctx->mkldnn_memories[deps[1]]},
                     {MKLDNN_ARG_DIFF_WEIGHTS, *ctx->mkldnn_memories[deps[2]]},
                     {MKLDNN_ARG_DIFF_BIAS, *ctx->mkldnn_memories[deps[3]]}};
        break;
    case OpType::DECONVOLUTIONBIAS:
        exec_args = {{MKLDNN_ARG_WEIGHTS, *ctx->mkldnn_memories[deps[0]]},
                     {MKLDNN_ARG_SRC, *ctx->mkldnn_memories[deps[1]]},
                     {MKLDNN_ARG_BIAS, *ctx->mkldnn_memories[deps[2]]},
                     {MKLDNN_ARG_DST, *ctx->mkldnn_memories[deps[3]]}};
        break;
    // mkldnn v1.0 splits the recurrent state into separate h and c tensors
    // (SRC_ITER / SRC_ITER_C, DST_ITER / DST_ITER_C).
    case OpType::LSTM:
    case OpType::RNN:
        exec_args = {{MKLDNN_ARG_SRC_LAYER, *ctx->mkldnn_memories[deps[0]]},
                     {MKLDNN_ARG_SRC_ITER, *ctx->mkldnn_memories[deps[1]]},
                     {MKLDNN_ARG_SRC_ITER_C, *ctx->mkldnn_memories[deps[2]]},
                     {MKLDNN_ARG_WEIGHTS_LAYER, *ctx->mkldnn_memories[deps[3]]},
                     {MKLDNN_ARG_WEIGHTS_ITER, *ctx->mkldnn_memories[deps[4]]},
                     {MKLDNN_ARG_BIAS, *ctx->mkldnn_memories[deps[5]]},
                     {MKLDNN_ARG_DST_LAYER, *ctx->mkldnn_memories[deps[6]]},
                     {MKLDNN_ARG_DST_ITER, *ctx->mkldnn_memories[deps[7]]},
                     {MKLDNN_ARG_DST_ITER_C, *ctx->mkldnn_memories[deps[8]]},
                     {MKLDNN_ARG_WORKSPACE, *ctx->mkldnn_memories[deps[9]]}};
        break;
    case OpType::MAXPOOLBACKPROPFORWARD:
    case OpType::MAXPOOLWITHINDICES:
        exec_args = {{MKLDNN_ARG_SRC, *ctx->mkldnn_memories[deps[0]]},
                     {MKLDNN_ARG_WORKSPACE, *ctx->mkldnn_memories[deps[2]]},
                     {MKLDNN_ARG_DST, *ctx->mkldnn_memories[deps[1]]}};
        break;
    case OpType::MAXPOOLBACKPROPBACKWARD:
    case OpType::MAXPOOLWITHINDICESBACKPROP:
        exec_args = {{MKLDNN_ARG_DIFF_DST, *ctx->mkldnn_memories[deps[0]]},
                     {MKLDNN_ARG_WORKSPACE, *ctx->mkldnn_memories[deps[1]]},
                     {MKLDNN_ARG_DIFF_SRC, *ctx->mkldnn_memories[deps[2]]}};
        break;
    case OpType::RELUBACKPROP:
    case OpType::SIGMOIDBACKPROP:
        exec_args = {{MKLDNN_ARG_SRC, *ctx->mkldnn_memories[deps[0]]},
                     {MKLDNN_ARG_DIFF_DST, *ctx->mkldnn_memories[deps[1]]},
                     {MKLDNN_ARG_DIFF_SRC, *ctx->mkldnn_memories[deps[2]]}};
        break;
    }

    // Attach the user-mode scratchpad: a memory object over the shared,
    // preallocated scratchpad buffer, described by this primitive's
    // scratchpad memory descriptor.
    mkldnn::memory scratchpad(*ctx->mkldnn_scratchpad_mds[primitive_index],
                              executor::global_cpu_engine,
                              ctx->scratchpad_buffer->get_ptr());
    exec_args.insert({MKLDNN_ARG_SCRATCHPAD, scratchpad});

    mkldnn::stream s(executor::global_cpu_engine);
    try
    {
        (*ctx->mkldnn_primitives[primitive_index]).execute(s, exec_args);
        s.wait();
    }
    catch (const mkldnn::error& e)
    {
        throw ngraph_error("Could not run mkldnn primitive " + MKLDNN_ERROR_MESSAGE);
    }
}
#endif
......@@ -28,10 +28,61 @@ namespace ngraph
namespace mkldnn_utils
{
extern "C" void
set_memory_ptr(CPURuntimeContext* ctx, size_t primitive_index, void* ptr);
// Op kinds recognized by mkldnn_invoke_primitive (mkldnn v1.x path).
// The value tells the invoker how the `deps` memory indices map onto the
// primitive's MKLDNN_ARG_* execution arguments.
// NOTE: do not reorder — enumerator values are implicit and may be baked
// into generated/serialized code elsewhere; confirm before changing order.
enum class OpType
{
    ADD,
    AVGPOOL,
    AVGPOOLBACKPROP,
    BATCHNORM3ARGS,
    BATCHNORM5ARGS,
    BATCHNORMBACKPROP,
    BOUNDEDRELU,
    CONCAT,
    CONVERTLAYOUT,
    CONVOLUTION,
    CONVOLUTIONRELU,
    CONVOLUTIONADD,
    CONVOLUTIONBIAS,
    CONVOLUTIONBIASADD,
    CONVOLUTIONBACKPROPDATA,
    CONVOLUTIONBACKPROPWEIGHTS,
    CONVOLUTIONBACKPROPWEIGHTSBIAS,
    GROUPCONVOLUTION,
    GROUPCONVOLUTIONBIAS,
    DECONVOLUTIONBIAS,
    LEAKYRELU,
    LRN,
    LSTM,
    MAXPOOL,
    MAXPOOLBACKPROPFORWARD,
    MAXPOOLBACKPROPBACKWARD,
    MAXPOOLWITHINDICES,
    MAXPOOLWITHINDICESBACKPROP,
    QUANTIZE,
    DEQUANTIZE,
    QUANTIZEDAVGPOOL,
    QUANTIZEDMAXPOOL,
    QUANTIZEDCONCAT,
    QUANTIZEDDOTBIAS,
    QUANTIZEDMATMUL,
    QUANTIZEDCONVOLUTION,
    QUANTIZEDCONVOLUTIONBIAS,
    QUANTIZEDCONVOLUTIONBIASADD,
    QUANTIZEDCONVOLUTIONBIASSIGNEDADD,
    QUANTIZEDCONVOLUTIONRELU,
    RELU,
    RELUBACKPROP,
    RNN,
    SIGMOID,
    SIGMOIDBACKPROP,
    SLICE,
    SOFTMAX
};
extern "C" void set_memory_ptr(CPURuntimeContext* ctx, size_t index, void* ptr);
extern "C" void mkldnn_invoke_primitive(CPURuntimeContext* ctx,
size_t primitive_index);
size_t primitive_index,
std::vector<size_t>& deps,
OpType type);
}
}
}
......
This diff is collapsed.
......@@ -24,6 +24,73 @@
#include "ngraph/runtime/cpu/op/batch_norm_relu.hpp"
#include "ngraph/type/element_type.hpp"
// Compatibility shims so the same backend source compiles against mkldnn
// v0.x and v1.x, whose memory-format / attribute APIs differ.
#if MKLDNN_VERSION_MAJOR < 1
// v0.x: a single `format` enum covers both the format kind and the tag.
#define FORMAT format
#define FORMAT_KIND format
// Fix: was misspelled `mkdnn_format_undef`; the v0.x C enum value is
// `mkldnn_format_undef`.
#define FORMAT_KIND_UNDEF mkldnn_format_undef
#define FORMAT_ANY mkldnn_any
// NOTE(review): verify `mkldnn_undef` resolves in the v0.x headers — the
// undefined memory format there is usually spelled `mkldnn_format_undef`.
#define FORMAT_UNDEF mkldnn_undef
#define DATA_UNDEF data_undef
// v0.x reports 4d/5d weight formats as nchw/ncdhw; rewrite them to the
// weight-oriented oihw/oidhw tags before use.
#define CHANGE_FORMAT \
    if (weights_desc.data.format == mkldnn_nchw) \
    { \
        weights_desc.data.format = mkldnn_oihw; \
    } \
    if (weights_desc.data.format == mkldnn_ncdhw) \
    { \
        weights_desc.data.format = mkldnn_oidhw; \
    }
#define BN_FLAG_CLASS batch_normalization_flag
// v0.x primitive descriptors take an explicit padding kind.
#define PADDING , mkldnn::padding_kind::zero
#define SET_ROUND_MODE attr.set_int_output_round_mode(mkldnn::round_mode::round_nearest);
// Scratchpad queries are a v1.x concept; no-ops on v0.x.
#define QUERY_SCRATCHPAD(op_name, x)
#define QUERY_SCRATCHPAD_2ARGS(op_name, x, y)
#define QUERY_SCRATCHPAD_3ARGS(op_name, x, y, z)
#define QUERY_SCRATCHPAD_4ARGS(op_name, x, y, z, u)
#define MKLDNN_ERROR_MESSAGE e.message
#else
#define TENSOR_MAX_DIMS MKLDNN_MAX_NDIMS
// v1.x: format kind and format tag are distinct enums.
#define FORMAT format_tag
#define FORMAT_KIND format_kind
#define FORMAT_KIND_UNDEF format_kind::undef
#define FORMAT_ANY static_cast<mkldnn_format_kind_t>(mkldnn::memory::format_kind::any)
#define FORMAT_UNDEF format_tag::undef
#define DATA_UNDEF undef
// v1.x already reports weight-oriented tags; nothing to rewrite.
#define CHANGE_FORMAT
#define BN_FLAG_CLASS normalization_flags
#define PADDING
#define SET_ROUND_MODE
// Query per-primitive scratchpad sizes so a single max-size user buffer
// can be shared by all primitives at execution time.
#define QUERY_SCRATCHPAD(op_name, x) mkldnn_emitter->query_scratchpad_##op_name(x)
#define QUERY_SCRATCHPAD_2ARGS(op_name, x, y) mkldnn_emitter->query_scratchpad_##op_name(x, y)
#define QUERY_SCRATCHPAD_3ARGS(op_name, x, y, z) mkldnn_emitter->query_scratchpad_##op_name(x, y, z)
#define QUERY_SCRATCHPAD_4ARGS(op_name, x, y, z, u) \
    mkldnn_emitter->query_scratchpad_##op_name(x, y, z, u)
// Request user-mode scratchpad on a primitive_attr.
#define ATTR_S \
    mkldnn::primitive_attr attr; \
    attr.set_scratchpad_mode(mkldnn::scratchpad_mode::user);
// Track the largest scratchpad requirement seen across all primitives.
#define GET_SIZE \
    mkldnn::memory::desc scratchpad_md = pd.scratchpad_desc(); \
    size_t size = scratchpad_md.get_size(); \
    m_max_scratchpad_size = size > m_max_scratchpad_size ? size : m_max_scratchpad_size;
#define MKLDNN_ERROR_MESSAGE std::string(e.message)
#endif
namespace ngraph
{
namespace runtime
......@@ -36,12 +103,12 @@ namespace ngraph
#ifndef _WIN32
extern "C" void mkl_serv_free_buffers();
#endif
mkldnn::memory::format
mkldnn::memory::FORMAT
CreateNativeDataFormat(const ngraph::runtime::cpu::LayoutDescriptor& layout);
mkldnn::memory::format CreateNativeDataFormat(const Shape& shape);
mkldnn::memory::FORMAT CreateNativeDataFormat(const Shape& shape);
const std::string& get_mkldnn_data_type_string(const ngraph::element::Type& type);
mkldnn::memory::data_type get_mkldnn_data_type(const ngraph::element::Type& type);
const std::string& get_mkldnn_format_string(mkldnn::memory::format fmt);
const std::string& get_mkldnn_format_string(mkldnn::memory::FORMAT fmt);
const mkldnn::memory::desc& get_input_mkldnn_md(const Node* node, size_t index);
const mkldnn::memory::desc& get_output_mkldnn_md(const Node* node, size_t index);
......@@ -49,7 +116,7 @@ namespace ngraph
mkldnn::memory::desc create_default_mkldnn_md(const Node* node,
size_t index,
bool is_output,
mkldnn::memory::format format);
mkldnn::memory::FORMAT format);
bool is_perm_sorted(const Strides& a, const AxisVector& perm);
bool can_create_mkldnn_md(const ngraph::element::Type type);
bool can_create_mkldnn_md(const Shape& dims,
......@@ -58,6 +125,11 @@ namespace ngraph
mkldnn::memory::desc create_blocked_mkldnn_md(const Shape& dims,
const Strides& strides,
const ngraph::element::Type type);
mkldnn::memory::desc
create_blocked_mkldnn_md_helper(const mkldnn::memory::dims& dim,
const Strides& strides,
const mkldnn::memory::dims& stride,
const mkldnn::memory::data_type dtype);
mkldnn::memory::desc try_get_named_md(const mkldnn_memory_desc_t& md);
mkldnn::memory::desc rotate_blocked_md(const mkldnn::memory::desc& in,
const AxisVector& axis_order);
......@@ -66,13 +138,13 @@ namespace ngraph
mkldnn::memory::desc expand_blocked_md(const mkldnn::memory::desc& in,
AxisVector& axis_list);
bool compare_mkldnn_formats(mkldnn::memory::format lhs, mkldnn::memory::format rhs);
bool compare_mkldnn_formats(mkldnn::memory::FORMAT lhs, mkldnn::memory::FORMAT rhs);
bool compare_mkldnn_mds(const mkldnn::memory::desc& lhs,
const mkldnn::memory::desc& rhs);
bool is_mkldnn_padded_layout(const mkldnn::memory::desc& in,
const AxisVector& axis_list);
bool is_mkldnn_filter_format(mkldnn::memory::format fmt);
bool is_mkldnn_blocked_data_format(mkldnn::memory::format fmt);
bool is_mkldnn_filter_format(mkldnn::memory::FORMAT fmt);
bool is_mkldnn_blocked_data_format(mkldnn::memory::FORMAT fmt);
bool can_use_mkldnn_batchnorm_fprop(const ngraph::Node* node);
bool can_use_mkldnn_batchnorm_bprop(const ngraph::Node* node);
......@@ -95,8 +167,8 @@ namespace ngraph
std::map<element::Type, const mkldnn::memory::data_type>&
get_mkldnn_data_type_map();
std::map<element::Type, const std::string>& get_mkldnn_data_type_string_map();
std::map<mkldnn::memory::format, const std::string>& get_mkldnn_format_string_map();
std::set<mkldnn::memory::format>& get_filter_formats();
std::map<mkldnn::memory::FORMAT, const std::string>& get_mkldnn_format_string_map();
std::set<mkldnn::memory::FORMAT>& get_filter_formats();
template <typename T>
bool can_use_mkldnn_conv(ngraph::Node* node)
{
......@@ -152,6 +224,26 @@ namespace ngraph
}
return true;
}
#if MKLDNN_VERSION_MAJOR >= 1
std::map<mkldnn::memory::format_kind, const std::string>&
get_mkldnn_format_kind_string_map();
const std::string&
get_mkldnn_format_kind_string(mkldnn::memory::format_kind fmt_kind);
bool inline compare_mkldnn_dims(mkldnn_dims_t& arr1,
mkldnn_dims_t& arr2,
size_t size);
bool compare_mkldnn_strides_order(mkldnn_dims_t& stride1,
mkldnn_dims_t& stride2,
size_t size);
bool compare_mkldnn_md_formats(const mkldnn::memory::desc& lhs,
const mkldnn::memory::desc& rhs);
bool mkldnn_md_matches_format_tag(const mkldnn::memory::desc&,
const mkldnn::memory::format_tag&);
mkldnn::memory::desc create_default_mkldnn_md_with_strides(
const Node* node, size_t index, mkldnn::memory::dims& strides, bool is_output);
bool is_mkldnn_desc_blocked_data_format(const mkldnn::memory::desc& desc);
#endif
}
}
}
......
This diff is collapsed.
......@@ -16,6 +16,7 @@
#pragma once
#include <mkldnn.hpp>
#include "ngraph/op/op.hpp"
#include "ngraph/runtime/cpu/op/rnn_utils.hpp"
#include "ngraph/util.hpp"
......@@ -29,22 +30,23 @@ namespace ngraph
public:
static const std::string type_name;
const std::string& description() const override { return type_name; }
// INPUTS:
// [0] - {Xt} input tensor of layout TNC, Shape{sequence length*batch_size,
// feature_size}
// [1] - recurrent state tensors {ht_1 | ct_1} of Shape{sequence length*batch_size,
// feature_size}
// [2] - initializer for the input weights matrix, used for the linear transformation of
// the inputs.
// [3] - initializer for the recurrent weights matrix, used for the linear
// transformation of the recurrent state.
// [4] - Initializer for the bias vector w.r.to inputs + hidden state (ibh_bias +
// hbh_bias)
// INPUTS:
// [0] - {Xt} input tensor of layout TNC, Shape{sequence length*batch_size,
// feature_size}
// [1] - recurrent state tensors {ht_1 | ct_1} of Shape{sequence length*batch_size,
// feature_size}
// [2] - initializer for the input weights matrix, used for the linear transformation of
// the inputs.
// [3] - initializer for the recurrent weights matrix, used for the linear
// transformation of the recurrent state.
// [4] - Initializer for the bias vector w.r.to inputs + hidden state (ibh_bias +
// hbh_bias)
// OUTPUT VALUE: A tuple with the following structure:
// [0] - ht, output tensor with shape (sequence_length*batch_size, num_hidden) .
// [1] - {ht | ct} output recurrent state tensor with the same shape as states
// OUTPUT VALUE: A tuple with the following structure:
// [0] - ht, output tensor with shape (sequence_length*batch_size, num_hidden) .
// [1] - {ht | ct} output recurrent state tensor with the same shape as states
#if MKLDNN_VERSION_MAJOR < 1
// This version of the LSTM op supports MKLDNN emitter code, this can be used standalone
// for computing RNN
// without fusing RNN cell (LSTM)'s across time steps.
......@@ -54,6 +56,15 @@ namespace ngraph
const Output<Node>& weights_iter,
const Output<Node>& bias,
ngraph::runtime::cpu::rnn_utils::rnntype rnn_type);
#else
Lstm(const Output<Node>& src_layer,
const Output<Node>& src_iter,
const Output<Node>& src_iter_c,
const Output<Node>& weights_layer,
const Output<Node>& weights_iter,
const Output<Node>& bias,
ngraph::runtime::cpu::rnn_utils::rnntype rnn_type);
#endif
Shape get_output_tensor_shape() const { return m_output_tensor_shape; }
Shape get_output_cell_shape() const { return m_output_cell_shape; }
ngraph::runtime::cpu::rnn_utils::rnntype get_rnn_type() const { return m_rnntype; }
......
This diff is collapsed.
This diff is collapsed.
......@@ -1961,6 +1961,8 @@ void ngraph::runtime::cpu::pass::CPUFusion::construct_deconvolution_affine_foldi
prelu, "CPUFusion.deconvolution_affine_folding_relu");
this->add_matcher(m, callback);
}
#if MKLDNN_VERSION_MAJOR < 1
void ngraph::runtime::cpu::pass::CPUFusion::construct_fuse_lstm_recurrent_state()
{
auto src_layer_label = std::make_shared<pattern::op::Label>(element::f32, Shape{30, 100});
......@@ -2009,6 +2011,7 @@ void ngraph::runtime::cpu::pass::CPUFusion::construct_fuse_lstm_recurrent_state(
"CPUFusion.fuse_lstm_recurrent_state");
this->add_matcher(m, callback);
}
#endif
void ngraph::runtime::cpu::pass::CPUFusion::construct_update_slice()
{
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment