Unverified Commit 4abb06df authored by Scott Cyphers, committed by GitHub

Merge pull request #3013 from NervanaSystems/mlir

MLIR Integration into CPU backend
parents e7d0117e a2de30f3
@@ -162,6 +162,7 @@ ngraph_var(NGRAPH_USE_PREBUILT_LLVM DEFAULT "FALSE")
option(NGRAPH_UNIT_TEST_ENABLE "Control the building of unit tests" TRUE)
option(NGRAPH_TOOLS_ENABLE "Control the building of tools" TRUE)
option(NGRAPH_CPU_ENABLE "Control the building of the CPU backend" TRUE)
option(NGRAPH_MLIR_ENABLE "Control the building of the MLIR backend" FALSE)
option(NGRAPH_INTELGPU_ENABLE "Control the building of the Intel GPU backend with clDNN" FALSE)
option(NGRAPH_GPU_ENABLE "Control the building of the GPU backend" FALSE)
option(NGRAPH_INTERPRETER_ENABLE "Control the building of the INTERPRETER backend" TRUE)
@@ -190,6 +191,12 @@ if (NGRAPH_CPU_ENABLE
set(NGRAPH_INTEL_CPU_ONLY_ENABLE ON)
endif()
if (NGRAPH_MLIR_ENABLE AND (NOT NGRAPH_DEX_ONLY))
# Disable code-gen due to symbol collision with LLVM
message(FATAL_ERROR "Cannot build with both MLIR and code-gen ON.\n"
"Use -DNGRAPH_DEX_ONLY=ON and try again.\n")
endif()
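# Illustrative configure line for an MLIR-enabled build (flag names taken from the
# options above; adjust paths and generator for your setup):
#   cmake .. -DNGRAPH_MLIR_ENABLE=ON -DNGRAPH_DEX_ONLY=ON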
if (NGRAPH_DISTRIBUTED_ENABLE)
if ("${NGRAPH_DISTRIBUTED_ENABLE}" STREQUAL "MLSL")
if (NGRAPH_INTEL_CPU_ONLY_ENABLE)
@@ -232,6 +239,7 @@ endmacro()
NORMALIZE_BOOL(NGRAPH_UNIT_TEST_ENABLE)
NORMALIZE_BOOL(NGRAPH_TOOLS_ENABLE)
NORMALIZE_BOOL(NGRAPH_CPU_ENABLE)
NORMALIZE_BOOL(NGRAPH_MLIR_ENABLE)
NORMALIZE_BOOL(NGRAPH_INTELGPU_ENABLE)
NORMALIZE_BOOL(NGRAPH_GPU_ENABLE)
NORMALIZE_BOOL(NGRAPH_INTERPRETER_ENABLE)
@@ -253,6 +261,7 @@ NORMALIZE_BOOL(NGRAPH_JSON_ENABLE)
message(STATUS "NGRAPH_UNIT_TEST_ENABLE: ${NGRAPH_UNIT_TEST_ENABLE}")
message(STATUS "NGRAPH_TOOLS_ENABLE: ${NGRAPH_TOOLS_ENABLE}")
message(STATUS "NGRAPH_CPU_ENABLE: ${NGRAPH_CPU_ENABLE}")
message(STATUS "NGRAPH_MLIR_ENABLE: ${NGRAPH_MLIR_ENABLE}")
message(STATUS "NGRAPH_INTELGPU_ENABLE: ${NGRAPH_INTELGPU_ENABLE}")
message(STATUS "NGRAPH_GPU_ENABLE: ${NGRAPH_GPU_ENABLE}")
message(STATUS "NGRAPH_INTERPRETER_ENABLE: ${NGRAPH_INTERPRETER_ENABLE}")
@@ -375,6 +384,11 @@ if (NGRAPH_CPU_ENABLE)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DNGRAPH_CPU_ENABLE")
endif()
if (NGRAPH_MLIR_ENABLE)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DNGRAPH_MLIR_ENABLE")
set(NGRAPH_MLIR_SOURCE_DIR ${CMAKE_SOURCE_DIR}/src/contrib/mlir)
endif()
if (NGRAPH_PLAIDML_ENABLE)
find_package(PlaidML CONFIG)
if (NOT PLAIDML_FOUND)
@@ -493,6 +507,9 @@ endif()
if(NGRAPH_CPU_ENABLE)
include(cmake/external_mkldnn.cmake)
endif()
if (NGRAPH_MLIR_ENABLE)
include(cmake/external_mlir.cmake)
endif()
if(NGRAPH_INTELGPU_ENABLE)
include(cmake/external_cldnn.cmake)
endif()
@@ -48,7 +48,8 @@ project/doc-contributor-README.rst @indie
/src/ngraph/pattern/ @jbobba @aprocter
/src/ngraph/runtime/ @rkimballn1 @jbobba
/src/ngraph/runtime/cpu/ @jbobba
/src/ngraph/runtime/cpu/builder/allreduce.*pp @wenzhe-nrv @jbobba
/src/contrib/mlir/ @nmostafa @dcaballe
/src/ngraph/runtime/cpu/builder/allreduce.*pp @wenzhe-nrv @jbobba @avijit-nervana
/src/ngraph/runtime/dynamic/ @aprocter
/src/ngraph/runtime/gpu/ @rkimballn1
/src/ngraph/runtime/hybrid/ @rkimballn1
# ******************************************************************************
# Copyright 2017-2019 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ******************************************************************************
include(ExternalProject)
set(MLIR_LLVM_REPO_URL https://github.com/llvm/llvm-project.git)
set(MLIR_REPO_URL https://github.com/tensorflow/mlir.git)
# Change these commit IDs to move to the latest stable versions
set(MLIR_LLVM_COMMIT_ID bb2b527)
set(MLIR_COMMIT_ID 49f7efc)
set(MLIR_PROJECT_ROOT ${CMAKE_CURRENT_BINARY_DIR}/mlir_project)
set(MLIR_LLVM_ROOT ${MLIR_PROJECT_ROOT}/llvm-projects)
set(MLIR_SOURCE_DIR ${MLIR_LLVM_ROOT}/llvm/projects/mlir)
set(MLIR_BUILD_DIR ${MLIR_LLVM_ROOT}/build)
# MLIR has to be pre-built before the nGraph build starts;
# the following clones and builds MLIR during CMake configuration instead.
configure_file(${CMAKE_SOURCE_DIR}/cmake/mlir_fetch.cmake.in ${MLIR_PROJECT_ROOT}/CMakeLists.txt)
execute_process(COMMAND "${CMAKE_COMMAND}" -G "${CMAKE_GENERATOR}" .
WORKING_DIRECTORY "${MLIR_PROJECT_ROOT}")
# clone and build llvm
execute_process(COMMAND "${CMAKE_COMMAND}" --build . --target ext_mlir_llvm
WORKING_DIRECTORY "${MLIR_PROJECT_ROOT}")
# clone and build mlir
execute_process(COMMAND "${CMAKE_COMMAND}" --build . --target ext_mlir
WORKING_DIRECTORY "${MLIR_PROJECT_ROOT}")
# point find_package to the pre-built libs
set(LLVM_DIR ${MLIR_LLVM_ROOT}/build/lib/cmake/llvm)
set(MLIR_SRC_INCLUDE_PATH ${MLIR_SOURCE_DIR}/include)
set(MLIR_BIN_INCLUDE_PATH ${MLIR_BUILD_DIR}/projects/mlir/include)
set(MLIR_INCLUDE_PATHS ${MLIR_SRC_INCLUDE_PATH};${MLIR_BIN_INCLUDE_PATH})
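# Note: the MLIR backend's own CMakeLists (under src/contrib/mlir) later runs
# find_package(LLVM REQUIRED CONFIG), which resolves against the LLVM_DIR set above.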
# ******************************************************************************
# Copyright 2017-2019 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ******************************************************************************
cmake_minimum_required(VERSION 3.1)
include(ExternalProject)
project(mlir-fetch NONE)
ExternalProject_Add(
ext_mlir_llvm
PREFIX mlir_llvm
GIT_REPOSITORY ${MLIR_LLVM_REPO_URL}
GIT_TAG ${MLIR_LLVM_COMMIT_ID}
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
INSTALL_COMMAND ""
UPDATE_COMMAND ""
SOURCE_DIR ${MLIR_LLVM_ROOT}
DOWNLOAD_NO_PROGRESS TRUE
EXCLUDE_FROM_ALL TRUE
)
ExternalProject_Add(
ext_mlir
PREFIX mlir
DEPENDS ${MLIR_DEPENDS}
GIT_REPOSITORY ${MLIR_REPO_URL}
GIT_TAG ${MLIR_COMMIT_ID}
CONFIGURE_COMMAND ""
CMAKE_GENERATOR ${CMAKE_GENERATOR}
CMAKE_GENERATOR_PLATFORM ${CMAKE_GENERATOR_PLATFORM}
CMAKE_GENERATOR_TOOLSET ${CMAKE_GENERATOR_TOOLSET}
BUILD_COMMAND ${CMAKE_COMMAND} ../llvm -DLLVM_BUILD_EXAMPLES=OFF -DLLVM_ENABLE_CXX1Y=Y -DLLVM_TARGETS_TO_BUILD=host -DLLVM_ENABLE_RTTI=ON -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
COMMAND ${CMAKE_COMMAND} --build . --target check-mlir -- -j8
INSTALL_COMMAND ""
UPDATE_COMMAND ""
SOURCE_DIR ${MLIR_SOURCE_DIR}
BINARY_DIR ${MLIR_BUILD_DIR}
STAMP_DIR "${MLIR_PROJECT_ROOT}/mlir/stamp"
DOWNLOAD_NO_PROGRESS TRUE
EXCLUDE_FROM_ALL TRUE
)
==============================================================================
LLVM Release License
==============================================================================
University of Illinois/NCSA
Open Source License
Copyright (c) 2003-2018 University of Illinois at Urbana-Champaign.
All rights reserved.
Developed by:
LLVM Team
University of Illinois at Urbana-Champaign
http://llvm.org
Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal with
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:
* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimers.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimers in the
documentation and/or other materials provided with the distribution.
* Neither the names of the LLVM Team, University of Illinois at
Urbana-Champaign, nor the names of its contributors may be used to
endorse or promote products derived from this Software without specific
prior written permission.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
SOFTWARE.
@@ -20,6 +20,11 @@ include_directories(ngraph)
add_subdirectory(resource)
# This must be added before any backend that uses MLIR
if (NGRAPH_MLIR_ENABLE)
add_subdirectory(${NGRAPH_MLIR_SOURCE_DIR})
endif()
add_subdirectory(ngraph)
if (NGRAPH_TOOLS_ENABLE)
# ******************************************************************************
# Copyright 2017-2019 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ******************************************************************************
set(SRC
dialect/dialect.cpp
dialect/type.cpp
dialect/ops.cpp
compiler.cpp
lowerer.cpp
memory_manager.cpp
pass/mlir_subgraph_extraction.cpp
pass/mlir_subgraph_extraction.hpp
)
if (NGRAPH_MLIR_ENABLE)
add_library(mlir_backend SHARED ${SRC})
message(STATUS "LLVM Directory: ${LLVM_DIR}")
# Link LLVM and MLIR
find_package(LLVM REQUIRED CONFIG)
set(MLIR_LLVM_INCLUDEPATH ${LLVM_INCLUDE_DIRS})
message(STATUS "Found LLVM ${LLVM_PACKAGE_VERSION}")
message(STATUS "Using LLVMConfig.cmake in: ${LLVM_DIR}")
message(STATUS "LLVM RTTI is ${LLVM_ENABLE_RTTI}")
add_definitions(${LLVM_DEFINITIONS})
target_include_directories(mlir_backend PRIVATE ${LLVM_INCLUDE_DIRS})
message(STATUS "MLIR Headers at : ${MLIR_INCLUDE_PATHS}")
message(STATUS "LLVM Headers at : ${MLIR_LLVM_INCLUDEPATH}")
target_include_directories(mlir_backend PRIVATE ${MLIR_INCLUDE_PATHS})
llvm_map_components_to_libnames(llvm_libs support core irreader)
# Link MLIR libs
target_link_libraries(
mlir_backend PRIVATE
MLIRAnalysis
MLIREDSC
MLIRExecutionEngine
MLIRIR
MLIRLLVMIR
MLIRParser
MLIRPass
MLIRTargetLLVMIR
MLIRTransforms
MLIRSupport
)
# Some libs need whole-archive linkage because of static initialization of globals
function(whole_archive_link target)
if("${CMAKE_SYSTEM_NAME}" STREQUAL "Darwin")
set(link_flags "-Llib -Wl,-all_load ")
FOREACH(LIB ${ARGN})
string(CONCAT link_flags ${link_flags} "${LIB}")
ENDFOREACH(LIB)
else()
set(link_flags "-Llib -Wl,--whole-archive,")
FOREACH(LIB ${ARGN})
string(CONCAT link_flags ${link_flags} "${LIB},")
ENDFOREACH(LIB)
string(CONCAT link_flags ${link_flags} "--no-whole-archive")
endif()
message(STATUS "MLIR Ops link flag: ${link_flags}" )
set_target_properties(${target} PROPERTIES LINK_FLAGS ${link_flags})
endfunction(whole_archive_link)
whole_archive_link(mlir_backend
${LLVM_BUILD_LIBRARY_DIR}/libMLIRAffineOps.a
${LLVM_BUILD_LIBRARY_DIR}/libMLIRStandardOps.a
)
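# For illustration, on Linux the call above sets LINK_FLAGS roughly to:
#   -Llib -Wl,--whole-archive,<LLVM_BUILD_LIBRARY_DIR>/libMLIRAffineOps.a,<LLVM_BUILD_LIBRARY_DIR>/libMLIRStandardOps.a,--no-whole-archive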
# Link LLVM libs
target_link_libraries(
mlir_backend PRIVATE
${llvm_libs}
)
# Link ngraph
target_link_libraries(mlir_backend PUBLIC ngraph)
# table-gen dialect ops
# include table-gen helpers
include(${LLVM_DIR}/TableGen.cmake)
function(ngraph_tablegen ofn)
tablegen(MLIR ${ARGV} "-I${MLIR_SRC_INCLUDE_PATH}" "-I${MLIR_BIN_INCLUDE_PATH}")
set(TABLEGEN_OUTPUT ${TABLEGEN_OUTPUT} ${CMAKE_CURRENT_BINARY_DIR}/${ofn} PARENT_SCOPE)
endfunction()
set(MLIR_TABLEGEN_EXE mlir-tblgen)
set(LLVM_TARGET_DEFINITIONS dialect/ops.td)
ngraph_tablegen(ops.h.inc -gen-op-decls)
ngraph_tablegen(ops.cpp.inc -gen-op-defs)
add_public_tablegen_target(ngraph_ops_gen)
add_dependencies(mlir_backend ngraph_ops_gen)
target_include_directories(mlir_backend PRIVATE ${CMAKE_CURRENT_BINARY_DIR})
endif()
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#pragma once
#include "memory_manager.hpp"
#include "ngraph/node.hpp"
#include <mlir/ExecutionEngine/ExecutionEngine.h>
#include <mlir/ExecutionEngine/MemRefUtils.h>
#include <mlir/IR/Builders.h>
#include <mlir/IR/Module.h>
#include <mlir/IR/Types.h>
#include <typeindex>
#include <unordered_map>
#include <vector>
namespace ngraph
{
namespace descriptor
{
class Tensor;
}
namespace element
{
class Type;
}
namespace op
{
class CompiledKernel;
}
namespace runtime
{
namespace ngmlir
{
/// This class is the entry point to MLIR from nGraph. It drives the conversion of
/// nGraph sub-graphs (represented as CompiledKernel nodes) to the MLIR nGraph dialect,
/// and their lowering, optimization, and execution via the LLVM-based MLIR execution engine.
class MLIRCompiler
{
public:
/// Initializes MLIR environment. It must be called only once per execution.
static void init_mlir();
public:
using TensorList = std::vector<descriptor::Tensor*>;
using TypeList = llvm::SmallVector<mlir::Type, 4>;
MLIRCompiler(const ngraph::op::CompiledKernel* compiled_kernel,
const std::vector<void*>& external_tensors);
/// Compiles and runs a subgraph in MLIR.
void compile_and_run();
/// Returns the memory manager used by this sub-graph compiler.
MLIRMemMgr& get_mem_mgr() { return m_mem_mgr; }
/// Returns memory manager pointer argument ID in call interface.
unsigned get_mem_mgr_arg_id(mlir::Function* func)
{
return func->getNumArguments() - 1;
}
private:
struct TensorInfo
{
// MLIR value this tensor maps to.
mlir::Value* m_value;
};
private:
void build_ng_dialect_module();
void lower_ng_dialect();
void optimize();
void bind_arguments();
void execute();
void cleanup();
mlir::Type get_mlir_type(const descriptor::Tensor* tensor);
mlir::Type get_mlir_type(const element::Type& type);
TensorInfo get_tensor_value(descriptor::Tensor* tensor);
void update_tensor_value(descriptor::Tensor* tensor, mlir::Value* value);
void build_ng_dialect();
template <typename OP>
static mlir::Value* create_op(MLIRCompiler& compiler, const ngraph::Node* ng_node)
{
throw std::runtime_error("Unimplemented op '" + ng_node->description() +
"' in MLIR Compiler");
}
template <typename BinOp>
mlir::Value* create_binary_op(const ngraph::Node* ng_node);
void create_return();
/// Helper to create memref arguments for MLIR function signature
llvm::SmallVector<void*, 8> allocate_memref_args(mlir::Function* func);
/// Helper to allocate a mem ref object. Handles static shapes only for now.
mlir::StaticFloatMemRef* allocate_memref_descriptor(mlir::Type type);
/// Helper to dump MLIR module into llvm::dbgs prepended by the message \p msg.
void dump_mlir_module(const std::string msg);
private:
// Sub-graph to be compiled and executed with MLIR.
const ngraph::op::CompiledKernel* m_compiled_kernel;
// Pointers to externally allocated memory for sub-graph's input and output tensors.
const std::vector<void*>& m_external_tensors;
// Arguments for the MLIR function generated for the nGraph sub-graph.
llvm::SmallVector<void*, 8> m_invoke_args;
// MLIR context that holds all the MLIR information related to the sub-graph
// compilation.
mlir::MLIRContext m_context;
std::unique_ptr<mlir::Module> m_module;
std::unique_ptr<mlir::FuncBuilder> m_builder;
std::unique_ptr<mlir::ExecutionEngine> m_engine;
using TensorToInfo = std::pair<descriptor::Tensor*, TensorInfo>;
using TensorToInfoMap = std::unordered_map<descriptor::Tensor*, TensorInfo>;
using MLIRCompOpFunction =
std::function<mlir::Value*(MLIRCompiler& compiler, const ngraph::Node*)>;
using MLIRCompOpMap = std::unordered_map<std::type_index, MLIRCompOpFunction>;
// Maps tensor to the value it represents in the IR
// use for MLIR dialect gen
TensorToInfoMap m_tensor_to_value_map;
static const MLIRCompOpMap op_dispatcher;
// Memory manager for temp allocations inside JIT'ed code
MLIRMemMgr m_mem_mgr;
};
}
}
}
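// A minimal usage sketch (hypothetical call site; the actual driver lives in the CPU
// backend's CompiledKernel builder, and 'ck'/'external_tensors' are placeholders):
//
//   ngraph::runtime::ngmlir::MLIRCompiler::init_mlir(); // once per execution
//   ngraph::runtime::ngmlir::MLIRCompiler compiler(ck, external_tensors);
//   compiler.compile_and_run();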
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#include "dialect.hpp"
#include "ngraph/check.hpp"
#include "ops.hpp"
#include "type.hpp"
using namespace mlir;
NGDialect::NGDialect(mlir::MLIRContext* ctx)
: mlir::Dialect("ng", ctx)
{
addTypes<NGTensorType>();
addTypes<NGIntegerType>();
addTypes<NGBoolType>();
addOperations<
#define GET_OP_LIST
#include "ops.cpp.inc"
>();
}
void NGDialect::printType(mlir::Type type, raw_ostream& os) const
{
switch (type.getKind())
{
case NG_TENSOR_TYPE_ID:
{
os << "tensor<";
auto tensor_ty = type.cast<NGTensorType>();
for (auto dim : tensor_ty.getShape())
{
os << dim << 'x';
}
os << tensor_ty.getElementType() << '>';
return;
}
case NG_I8_TYPE_ID:
case NG_I16_TYPE_ID:
case NG_I32_TYPE_ID:
case NG_I64_TYPE_ID:
case NG_U8_TYPE_ID:
case NG_U16_TYPE_ID:
case NG_U32_TYPE_ID:
case NG_U64_TYPE_ID:
{
auto int_ty = type.cast<NGIntegerType>();
os << "i" << int_ty.getWidth();
return;
}
case NG_BOOL_TYPE_ID:
{
os << "bool";
return;
}
default: NGRAPH_CHECK(false, "Unsupported type to print");
}
}
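// For example, given the cases above, an NGTensorType of shape {2, 3} with an f32
// element type prints as "tensor<2x3xf32>", and both NG_I16_TYPE_ID and NG_U16_TYPE_ID
// print as "i16" (signedness is not encoded in the printed form).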
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#pragma once
#include "mlir/IR/Dialect.h"
#include "mlir/IR/Function.h"
#include "mlir/IR/OpDefinition.h"
#include "mlir/IR/OpImplementation.h"
#include "mlir/IR/StandardTypes.h"
#include "mlir/IR/TypeSupport.h"
#include "mlir/IR/Types.h"
#include "ngraph/check.hpp"
namespace mlir
{
class NGDialect : public mlir::Dialect
{
public:
explicit NGDialect(mlir::MLIRContext* ctx);
mlir::Type parseType(llvm::StringRef tyData, mlir::Location loc) const override
{
NGRAPH_CHECK(false, "Unsupported type parsing.");
return mlir::Type();
}
void printType(mlir::Type type, llvm::raw_ostream& os) const override;
};
}
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#include "ops.hpp"
#include "assertion.hpp"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Regex.h"
#include "llvm/Support/raw_ostream.h"
#include "type.hpp"
using llvm::ArrayRef;
using llvm::raw_ostream;
using llvm::raw_string_ostream;
using llvm::SmallVector;
using llvm::StringRef;
using llvm::Twine;
using namespace mlir;
// TODO:
// - Move verifiers and other OP helpers (e.g. getSomeAttribute()) to separate files
//
// - Op helpers: Since it is not possible to add arbitrary code to Op classes (and doing so
//   would complicate the .td file), we will add helper classes with static methods for each
//   Op that needs them
// Additional verification methods
// Tensor type checks are already verified by the caller of these methods
/// Checks if all operands and results are of compatible shapes
template <typename T>
static mlir::LogicalResult verifyCompatibleOperandsAndResults(T* op, bool checkResult = true)
{
mlir::Type t0 = op->getOperation()->getOperand(0)->getType();
mlir::NGTensorType opType0 = t0.cast<NGTensorType>();
Operation* opr = op->getOperation();
// Skip the first operand; all others are checked against it for compatibility.
auto i = 0;
for (auto operand : opr->getOperands())
{
if (i++ == 0)
continue;
mlir::Type t = operand->getType();
mlir::NGTensorType opType = t.cast<NGTensorType>();
if (!opType.isCompatible(opType0))
return op->emitOpError("Incompatible operand shape");
}
if (checkResult)
{
for (auto result : opr->getResults())
{
mlir::Type t = result->getType();
mlir::NGTensorType resType = t.cast<NGTensorType>();
if (!resType.isCompatible(opType0))
return op->emitOpError("Incompatible operand shape");
}
}
return mlir::success();
}
template <typename T>
static mlir::LogicalResult verifyUnaryArithOp(T* op)
{
return verifyCompatibleOperandsAndResults(op);
}
template <typename T>
static mlir::LogicalResult verifyBinaryArithOp(T* op)
{
return verifyCompatibleOperandsAndResults(op);
}
template <typename T>
static mlir::LogicalResult verifyAxisReductionOp(T* op)
{
return mlir::failure();
}
template <typename T>
static mlir::LogicalResult verifyLogicalReductionOp(T* op)
{
// TODO: verifyAxisReductionOp(op) + input and return element type.
return mlir::failure();
}
template <typename T>
static mlir::LogicalResult verifyIndexReductionOp(T* op)
{
// TODO: verifyAxisReductionOp(op) + return element type + single axis.
return mlir::failure();
}
template <typename T>
static mlir::LogicalResult verifyOp(T* op)
{
return op->emitOpError("Unsupported verifier for this operation");
}
template <>
mlir::LogicalResult verifyOp(NGDotOp* op)
{
// TODO(dcab): Improve verification: proper shapes, etc.
return mlir::success();
}
template <>
mlir::LogicalResult verifyOp(NGSelectOp* op)
{
mlir::Type t0 = op->getOperation()->getOperand(0)->getType();
mlir::Type t1 = op->getOperation()->getOperand(1)->getType();
mlir::Type t2 = op->getOperation()->getOperand(2)->getType();
mlir::Type r0 = op->getOperation()->getResult(0)->getType();
NGTensorType opType0 = t0.cast<NGTensorType>();
NGTensorType opType1 = t1.cast<NGTensorType>();
NGTensorType opType2 = t2.cast<NGTensorType>();
NGTensorType resType = r0.cast<NGTensorType>();
// arg1 and arg2 must have the same shape and element type
if (!opType1.isCompatible(opType2))
return op->emitOpError("Incompatible operand shapes or types for select op");
// arg0 must have the same shape, and its element type must be bool
if (!opType0.isCompatibleShape(opType1) || !opType0.getElementType().isa<NGBoolType>())
return op->emitOpError("Incompatible shape for arg0 of select op");
// result must have the same shape and element type as arg1/arg2
if (!resType.isCompatible(opType1))
return op->emitOpError("Incompatible result shape or type for select op");
return mlir::success();
}
template <typename T>
static mlir::LogicalResult verifyCmpOp(T* op)
{
mlir::LogicalResult result = verifyCompatibleOperandsAndResults(op, false /*checkResult*/);
if (failed(result))
{
return result;
}
mlir::Type t0 = op->getOperation()->getOperand(0)->getType();
mlir::NGTensorType opType0 = t0.cast<NGTensorType>();
mlir::Type r0 = op->getOperation()->getResult(0)->getType();
NGTensorType resType = r0.cast<NGTensorType>();
// result must have the same shape as the input and a bool element type
if (!resType.isCompatibleShape(opType0) || !resType.getElementType().isa<NGBoolType>())
return op->emitOpError("Incompatible result shape or type for comparison op");
return mlir::success();
}
namespace mlir
{
#define GET_OP_CLASSES
#include "ops.cpp.inc"
}
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#pragma once
#include <cstdarg>
#include "mlir/IR/Builders.h"
#include "mlir/IR/OpDefinition.h"
#include "mlir/IR/OperationSupport.h"
#include "mlir/IR/StandardTypes.h"
#include "mlir/Support/STLExtras.h"
namespace mlir
{
#define GET_OP_CLASSES
#include "ops.h.inc"
#undef GET_OP_CLASSES
}
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
//
// This is the nGraph Dialect operation definition file.
//
//===----------------------------------------------------------------------===//
include "mlir/IR/OpBase.td"
// nGraph Dialect operations definitions
//
// This file declares the nGraph operations that table-gen uses to generate C++ code.
// For more information about TableGen, see https://llvm.org/docs/TableGen/index.html
//
// The output files are ops.h.inc and ops.cpp.inc and are generated at build time
// The file declares base classes to ease opcode definitions and hoist common parts out.
// Each class fixes a set of attributes. For example:
// class NG_Unary_Arith_Op defines a base class for all unary arithmetic ops without side-effects
//
// An opcode is a record definition of the form
// def AbsOp : NG_Unary_Arith_Op<"abs">;
//
// Each def corresponds to a generated C++ class
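// For instance, 'def NGAddOp : NG_Binary_Arith_Op<"add", [Commutative]>;' below yields
// a C++ class mlir::NGAddOp in the generated ops.h.inc/ops.cpp.inc, with its verifier
// wired to verifyBinaryArithOp().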
def NG_Dialect : Dialect {
let name = "ng";
// TODO: Have the dialect under its own mlir::ngraph namespace
// At mlir top-level for now
let cppNamespace = "";
}
// nGraph Types
// This defines records equivalent to nGraph types. It doesn't generate code.
// This is used as a type in the DAG input/outputs.
// Constraints (CPred) are used to type-check args/results of that type during op verification
def NG_TensorType : Type<CPred<"$_self.isa<mlir::NGTensorType>()">,
"nGraph Tensor Type">;
// A generic un-typed MemRef. Used for Fake instructions inserted during dialect lowering
def NG_MemRefType : Type<IsMemRefTypePred, "MemRef Type">;
// nGraph operation base class.
// Prepends "ng." to operation name
class NG_Op<string mnemonic, list<OpTrait> traits = []> :
Op<NG_Dialect, mnemonic, traits> {}
// Operations producing single result.
// Will set OneResult trait based on Results out dag.
class NG_OneResult_Op<string mnemonic, list<OpTrait> traits = []> :
NG_Op<mnemonic, traits>, Results<(outs NG_TensorType:$res)> {}
// Base for fake instructions defining MemRef values
class NG_MemRefDef_Op<string mnemonic, list<OpTrait> traits = []> :
NG_Op<mnemonic, traits>, Results<(outs NG_MemRefType:$res)> {}
// Operations producing no results
class NG_ZeroResult_Op<string mnemonic, list<OpTrait> traits = []> :
NG_Op<mnemonic, traits>, Results<(outs)> {}
// Base class for arithmetic unary operations without side effects.
class NG_Unary_Arith_Op<string mnemonic, list<OpTrait> traits = []> :
NG_OneResult_Op<mnemonic, !listconcat([NoSideEffect], traits)>,
Arguments<(ins NG_TensorType:$arg)>
{
// TODO: Implement
let parser = [{ NGRAPH_CHECK(false, "No parser support"); return mlir::failure(); }];
let verifier = [{ return verifyUnaryArithOp(this); }];
}
// Base class for arithmetic binary operations without side effects.
class NG_Binary_Op<string mnemonic, list<OpTrait> traits = []> :
NG_OneResult_Op<mnemonic, !listconcat([NoSideEffect], traits)>,
Arguments<(ins NG_TensorType:$lhs, NG_TensorType:$rhs)>
{
// TODO: Implement
let parser = [{ NGRAPH_CHECK(false, "No parser support"); return mlir::failure(); }];
}
// Base class for arithmetic binary operations with verifier.
class NG_Binary_Arith_Op<string mnemonic, list<OpTrait> traits = []> :
NG_OneResult_Op<mnemonic, traits>,
Arguments<(ins NG_TensorType:$lhs, NG_TensorType:$rhs)>
{
// TODO: Implement
let parser = [{ NGRAPH_CHECK(false, "No parser support"); return mlir::failure(); }];
let verifier = [{ return verifyBinaryArithOp(this); }];
}
// Base class for comparison operations with verifier.
class NG_Cmp_Op<string mnemonic, list<OpTrait> traits = []> :
NG_OneResult_Op<mnemonic, traits>,
Arguments<(ins NG_TensorType:$lhs, NG_TensorType:$rhs)>
{
// TODO: Implement
let parser = [{ NGRAPH_CHECK(false, "No parser support"); return mlir::failure(); }];
let verifier = [{ return verifyCmpOp(this); }];
}
// Base class for ternary operations without side effects.
class NG_Ternary_Op<string mnemonic, list<OpTrait> traits = []> :
NG_OneResult_Op<mnemonic, !listconcat([NoSideEffect], traits)>,
Arguments<(ins NG_TensorType:$op0, NG_TensorType:$op1, NG_TensorType:$op2)>
{
// TODO: Implement
let parser = [{ NGRAPH_CHECK(false, "No parser support"); return mlir::failure(); }];
}
// Base class for terminator operations.
class NG_Terminator_Op<string mnemonic, list<OpTrait> traits = []> :
NG_Op<mnemonic, !listconcat(traits, [Terminator])>,
Arguments<(ins Variadic<NG_TensorType>:$args)>, Results<(outs)> {}
// Unary Operations
def NGAbsOp : NG_Unary_Arith_Op<"abs">;
def NGACosOp : NG_Unary_Arith_Op<"acos">;
def NGASinOp : NG_Unary_Arith_Op<"asin">;
def NGATanOp : NG_Unary_Arith_Op<"atan">;
def NGCeilOp : NG_Unary_Arith_Op<"ceil">;
def NGConvertOp : NG_Unary_Arith_Op<"conv">;
def NGCosOp : NG_Unary_Arith_Op<"cos">;
def NGCoshOp : NG_Unary_Arith_Op<"cosh">;
def NGExpOp : NG_Unary_Arith_Op<"exp">;
def NGFloorOp : NG_Unary_Arith_Op<"floor">;
def NGLogOp : NG_Unary_Arith_Op<"log">;
def NGNegOp : NG_Unary_Arith_Op<"neg">;
def NGNotOp : NG_Unary_Arith_Op<"not">;
def NGSignOp : NG_Unary_Arith_Op<"sign">;
def NGSinOp : NG_Unary_Arith_Op<"sin">;
def NGSinhOp : NG_Unary_Arith_Op<"sinh">;
def NGTanOp : NG_Unary_Arith_Op<"tan">;
def NGTanhOp : NG_Unary_Arith_Op<"tanh">;
def NGSqrtOp : NG_Unary_Arith_Op<"sqrt">;
// Binary Operations
def NGAddOp : NG_Binary_Arith_Op<"add", [Commutative]>;
def NGAndOp : NG_Binary_Arith_Op<"and", [Commutative]>;
def NGSubOp : NG_Binary_Arith_Op<"sub">;
def NGDivOp : NG_Binary_Arith_Op<"div">;
def NGMaxOp : NG_Binary_Arith_Op<"max", [Commutative]>;
def NGMinOp : NG_Binary_Arith_Op<"min", [Commutative]>;
def NGMulOp : NG_Binary_Arith_Op<"mul", [Commutative]>;
def NGPowOp : NG_Binary_Arith_Op<"pow">;
// Comparison
def NGEqOp : NG_Cmp_Op<"equal">;
def NGGreaterOp : NG_Cmp_Op<"greater">;
def NGGreaterEqOp : NG_Cmp_Op<"greater.eq">;
def NGLessOp : NG_Cmp_Op<"less">;
def NGLessEqOp : NG_Cmp_Op<"less.eq">;
def NGNotEqOp : NG_Cmp_Op<"not.equal">;
// Other
def NGSelectOp : NG_Ternary_Op<"select">
{
let verifier = [{ return verifyOp(this); }];
}
// Dot Product
def NGDotOp : NG_Binary_Op<"dot">
{
// TODO: Add reduction axis attribute when needed.
let verifier = [{ return verifyOp(this); }];
}
class NG_Axis_Reduction_Op<string mnemonic, list<OpTrait> traits = []> :
NG_OneResult_Op<mnemonic, !listconcat([NoSideEffect], traits)>,
Arguments<(ins NG_TensorType:$operand, I64ArrayAttr:$axes)>
{
let summary = "Base class for reduction operations that perform a reduction "
"across the axes of a single tensor.";
let description = "Axes are represented as an array of I64 attributes.";
let parser = [{ NGRAPH_CHECK(false, "No parser support"); return mlir::failure(); }];
// TODO
let verifier = [{ return verifyAxisReductionOp(this); }];
}
// Axis reduction operations.
def NGSumRedOp : NG_Axis_Reduction_Op<"sum.red">
{
let summary = "Axis sum reduction of a tensor.";
let verifier = [{ return verifyAxisReductionOp(this); }];
}
def NGProdRedOp : NG_Axis_Reduction_Op<"prod.red">
{
let summary = "Axis product reduction of a tensor.";
let verifier = [{ return verifyAxisReductionOp(this); }];
}
def NGMinRedOp : NG_Axis_Reduction_Op<"min.red">
{
let summary = "Axis minimum reduction of a tensor.";
let verifier = [{ return verifyAxisReductionOp(this); }];
}
def NGMaxRedOp : NG_Axis_Reduction_Op<"max.red">
{
let summary = "Axis maximum reduction of a tensor.";
let verifier = [{ return verifyAxisReductionOp(this); }];
}
def NGArgMinRedOp : NG_Axis_Reduction_Op<"argmin.red">
{
let summary = "Axis minimum index reduction of a tensor.";
let verifier = [{ return verifyIndexReductionOp(this); }];
}
def NGArgMaxRedOp : NG_Axis_Reduction_Op<"argmax.red">
{
let summary = "Axis maximum index reduction of a tensor.";
let verifier = [{ return verifyIndexReductionOp(this); }];
}
def NGAllRedOp : NG_Axis_Reduction_Op<"all.red">
{
let summary = "Axis logical AND reduction of a boolean tensor.";
let verifier = [{ return verifyLogicalReductionOp(this); }];
}
def NGAnyRedOp : NG_Axis_Reduction_Op<"any.red">
{
let summary = "Axis logical OR reduction of a boolean tensor.";
let verifier = [{ return verifyLogicalReductionOp(this); }];
}
// Terminator Ops
def NGReturnOp : NG_Terminator_Op<"return">;
// Fake ops
def NGFakeInputOp : NG_MemRefDef_Op<"fake.input", [NoSideEffect]>;
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#include "type.hpp"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Regex.h"
#include "llvm/Support/raw_ostream.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/StandardTypes.h"
#include "mlir/Support/STLExtras.h"
#include "ngraph/assertion.hpp"
using llvm::ArrayRef;
using llvm::raw_ostream;
using llvm::raw_string_ostream;
using llvm::SmallVector;
using llvm::StringRef;
using llvm::Twine;
using namespace mlir;
unsigned NGIntegerType::getWidth() const
{
switch (getKind())
{
case NG_I8_TYPE_ID:
case NG_U8_TYPE_ID: return 8;
case NG_I16_TYPE_ID:
case NG_U16_TYPE_ID: return 16;
case NG_I32_TYPE_ID:
case NG_U32_TYPE_ID: return 32;
case NG_I64_TYPE_ID:
case NG_U64_TYPE_ID: return 64;
default: NGRAPH_CHECK(false, "Invalid type ID");
}
return 0;
}
bool NGIntegerType::isSigned() const
{
switch (getKind())
{
case NG_I8_TYPE_ID:
case NG_I16_TYPE_ID:
case NG_I32_TYPE_ID:
case NG_I64_TYPE_ID: return true;
case NG_U8_TYPE_ID:
case NG_U16_TYPE_ID:
case NG_U32_TYPE_ID:
case NG_U64_TYPE_ID: return false;
default: NGRAPH_CHECK(false, "Invalid type ID");
}
return false;
}
/// Creates TensorType objects. They all point to the same storage if
/// element type and shape are the same.
NGTensorType NGTensorType::get(MLIRContext* context, EltType eltType, Shape shape)
{
return Base::get(context, NGTypeKind::NG_TENSOR_TYPE_ID, eltType, shape);
}
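// Consequently, e.g. NGTensorType::get(ctx, f32, {2, 3}) returns the same uniqued
// instance on every call with identical arguments ('ctx' and 'f32' are placeholders
// for any MLIRContext and element type).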
bool NGTensorType::isCompatible(NGTensorType& other) const
{
// Exact same tensor
if (this == &other)
return true;
// different tensors, check if of same element type and compatible shapes
if (getElementType() != other.getElementType())
return false;
// TODO: Handle dynamic ranks
// MLIR MemRefType doesn't seem to support it at the moment.
return isCompatibleShape(other);
}
bool NGTensorType::isCompatibleShape(NGTensorType& other) const
{
auto shape = getShape();
auto otherShape = other.getShape();
if (shape.size() != otherShape.size())
return false;
for (size_t i = 0; i < shape.size(); i++)
{
NGRAPH_CHECK(shape[i] >= -1, "Invalid tensor shape", shape[i]);
NGRAPH_CHECK(otherShape[i] >= -1, "Invalid tensor shape", otherShape[i]);
if (shape[i] == -1 || otherShape[i] == -1 || shape[i] == otherShape[i])
continue;
return false;
}
return true;
}
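// Illustration: shapes {2, -1} and {2, 3} are compatible (-1 is a dynamic dimension
// that matches anything), while {2, 3} vs {3, 2} or shapes of different rank are not.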
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#pragma once
#include "mlir/IR/Dialect.h"
#include "mlir/IR/Function.h"
#include "mlir/IR/OpDefinition.h"
#include "mlir/IR/OpImplementation.h"
#include "mlir/IR/StandardTypes.h"
#include "mlir/IR/TypeSupport.h"
#include "mlir/IR/Types.h"
#include "ngraph/check.hpp"
namespace mlir
{
using llvm::raw_ostream;
enum NGTypeKind
{
// The enum starts at the range reserved for this dialect.
// These values are pre-defined in MLIR lib and not configurable from here.
NG_TYPE = mlir::Type::Kind::FIRST_PRIVATE_EXPERIMENTAL_0_TYPE,
// Element types that are added by the dialect.
// Other types are just re-use of std dialect types.
NG_FIRST_INT_TYPE_ID,
NG_I8_TYPE_ID = NG_FIRST_INT_TYPE_ID,
NG_I16_TYPE_ID,
NG_I32_TYPE_ID,
NG_I64_TYPE_ID,
NG_U8_TYPE_ID,
NG_U16_TYPE_ID,
NG_U32_TYPE_ID,
NG_U64_TYPE_ID,
NG_LAST_INT_TYPE_ID = NG_U64_TYPE_ID,
NG_BOOL_TYPE_ID,
// Tensor type
NG_TENSOR_TYPE_ID
};
// reuse std float types as-is
using NGFloatType = mlir::FloatType;
/// Integer type. It represents an integer of width 8, 16, 32, or 64, signed or unsigned.
class NGIntegerType : public mlir::Type::TypeBase<NGIntegerType, mlir::Type>
{
public:
using Base::Base;
static NGIntegerType get(NGTypeKind kind, mlir::MLIRContext* context)
{
NGRAPH_CHECK(kindof(kind), "Not an integer kind.");
return Base::get(context, kind);
}
/// Create signed Int8
static NGIntegerType getInt8(mlir::MLIRContext* ctx)
{
return get(NGTypeKind::NG_I8_TYPE_ID, ctx);
}
/// Create signed Int16
static NGIntegerType getInt16(mlir::MLIRContext* ctx)
{
return get(NGTypeKind::NG_I16_TYPE_ID, ctx);
}
/// Create signed Int32
static NGIntegerType getInt32(mlir::MLIRContext* ctx)
{
return get(NGTypeKind::NG_I32_TYPE_ID, ctx);
}
/// Create signed Int64
static NGIntegerType getInt64(mlir::MLIRContext* ctx)
{
return get(NGTypeKind::NG_I64_TYPE_ID, ctx);
}
/// Create unsigned Int8
static NGIntegerType getUInt8(mlir::MLIRContext* ctx)
{
return get(NGTypeKind::NG_U8_TYPE_ID, ctx);
}
/// Create unsigned Int16
static NGIntegerType getUInt16(mlir::MLIRContext* ctx)
{
return get(NGTypeKind::NG_U16_TYPE_ID, ctx);
}
/// Create unsigned Int32
static NGIntegerType getUInt32(mlir::MLIRContext* ctx)
{
return get(NGTypeKind::NG_U32_TYPE_ID, ctx);
}
/// Create unsigned Int64
static NGIntegerType getUInt64(mlir::MLIRContext* ctx)
{
return get(NGTypeKind::NG_U64_TYPE_ID, ctx);
}
/// RTTI support. So we can do obj->isa<NGIntegerType>()
static bool kindof(unsigned kind)
{
return kind >= NGTypeKind::NG_FIRST_INT_TYPE_ID &&
kind <= NGTypeKind::NG_LAST_INT_TYPE_ID;
}
/// Return the bitwidth of this integer type.
unsigned getWidth() const;
/// Check if signed type
bool isSigned() const;
/// Check if Int8
bool isInt8() const { return getKind() == NG_I8_TYPE_ID; }
/// Check if UInt8
bool isUInt8() const { return getKind() == NG_U8_TYPE_ID; }
/// Check if Int16
bool isInt16() const { return getKind() == NG_I16_TYPE_ID; }
/// Check if UInt16
bool isUInt16() const { return getKind() == NG_U16_TYPE_ID; }
/// Check if Int32
bool isInt32() const { return getKind() == NG_I32_TYPE_ID; }
/// Check if UInt32
bool isUInt32() const { return getKind() == NG_U32_TYPE_ID; }
/// Check if Int64
bool isInt64() const { return getKind() == NG_I64_TYPE_ID; }
/// Check if UInt64
bool isUInt64() const { return getKind() == NG_U64_TYPE_ID; }
// Delete convenience methods inherited from the MLIR Type class.
// This avoids confusion: something like the following compiles but returns false,
//
// if (type->cast<NGIntegerType>()->isInteger(32)) {}
//
// because those helpers compare MLIR type IDs, and since we define our own integer
// type IDs, they don't apply.
bool isInteger(unsigned width) const = delete;
unsigned getIntOrFloatBitWidth() const = delete;
bool isIntOrIndex() const = delete;
bool isIntOrIndexOrFloat() const = delete;
bool isIntOrFloat() const = delete;
};
/// Boolean Type.
class NGBoolType : public mlir::Type::TypeBase<NGBoolType, mlir::Type>
{
public:
using Base::Base;
static NGBoolType get(NGTypeKind kind, mlir::MLIRContext* context)
{
NGRAPH_CHECK(kindof(kind), "Not a bool type.");
return Base::get(context, kind);
}
static bool kindof(unsigned kind) { return kind == NGTypeKind::NG_BOOL_TYPE_ID; }
static NGBoolType get(mlir::MLIRContext* ctx) { return get(NG_BOOL_TYPE_ID, ctx); }
};
// Note that dialect types don't add new data members, so it is always possible
// to use NG or std types here
using EltType = mlir::Type;
// TODO: Can we use ngraph::shape here (given the hashing requirements)?
using Shape = llvm::ArrayRef<int64_t>;
/// Tensor Type storage. There is a unique instance per type attributes.
/// Tensor Type is combination of the element type and shape. Each different
/// shape is a unique type.
struct NGTensorTypeStorage : public mlir::TypeStorage
{
// Tensor key is its type and shape.
// This is called when the user requests a specific tensor type
using KeyTy = std::tuple<EltType, Shape>;
static unsigned hashKey(const KeyTy& key)
{
return llvm::hash_combine(std::get<0>(key), std::get<1>(key));
}
bool operator==(const KeyTy& key) const
{
return key == KeyTy(getElementType(), getShape());
}
static NGTensorTypeStorage* construct(mlir::TypeStorageAllocator& allocator,
const KeyTy& key)
{
// Deep copy the type shape over to MLIR context
EltType eltType = std::get<0>(key);
Shape shape = allocator.copyInto(std::get<1>(key));
auto* storage = allocator.allocate<NGTensorTypeStorage>();
return new (storage) NGTensorTypeStorage(eltType, shape);
}
Shape getShape() const { return m_shape; }
EltType getElementType() const { return m_eltType; }
private:
NGTensorTypeStorage(EltType eltType, Shape shape)
: m_eltType(eltType)
, m_shape(shape)
{
}
private:
EltType m_eltType;
Shape m_shape;
};
/// NGraph Tensor Type
class NGTensorType : public mlir::Type::TypeBase<NGTensorType, mlir::Type, NGTensorTypeStorage>
{
public:
using Base::Base;
EltType getElementType() const { return getImpl()->getElementType(); }
Shape getShape() const { return getImpl()->getShape(); }
/// Tensor Rank. Static shape only for now
int getRank() { return getShape().size(); }
/// Computes tensor size in bytes
size_t getSizeInBytes()
{
size_t s = 1;
auto shape = getShape();
for (auto i = 0; i < getRank(); i++)
{
// Dynamic dims have no static size; -1 wraps to SIZE_MAX for the size_t return type.
if (shape[i] == -1)
return -1;
s *= shape[i];
}
// Multiply times element size
return s * llvm::divideCeil(getElementType().getIntOrFloatBitWidth(), 8);
}
/// Checks if two tensors are compatible. Compatible means:
/// Exactly same element types
/// Compatible shapes: see isCompatibleShape.
bool isCompatible(NGTensorType& other) const;
/// Check if Shapes are of same rank and matching dimensions unless one of them is dynamic.
bool isCompatibleShape(NGTensorType& other) const;
/// create a unique tensor type based on element type and shape.
static NGTensorType get(mlir::MLIRContext* context, EltType eltType, Shape shape);
/// for llvm RTTI
static bool kindof(unsigned kind) { return kind == NGTypeKind::NG_TENSOR_TYPE_ID; }
};
}
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#pragma once
#include "contrib/mlir/compiler.hpp"
#include <mlir/Pass/Pass.h>
namespace ngraph
{
namespace runtime
{
namespace ngmlir
{
class MLIRCompiler;
}
}
}
namespace mlir
{
mlir::Pass* createDialectLoweringPass(ngraph::runtime::ngmlir::MLIRCompiler* compiler);
}
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#include "memory_manager.hpp"
#include <memory>
#include "ngraph/ngraph_visibility.hpp"
using namespace ngraph::runtime::ngmlir;
/// Callback to allocate memory for temporaries from JIT'ed code
extern "C" NGRAPH_API void* __mlir_allocate(MLIRMemMgr* mem_mgr, size_t size)
{
return mem_mgr->allocate(size);
}
void* MLIRMemMgr::allocate(size_t size)
{
void* ptr = malloc(size);
ptrList.push_back(ptr);
return ptr;
}
void MLIRMemMgr::freeAll()
{
for (auto p : ptrList)
{
free(p);
}
}
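// Sketch of the intended flow, as assumed from the interfaces above and in compiler.hpp:
// the lowered sub-graph receives a pointer to its MLIRMemMgr as its last argument (see
// MLIRCompiler::get_mem_mgr_arg_id), JIT'ed code calls __mlir_allocate(mem_mgr, size)
// for each temporary, and freeAll() reclaims the allocations after execution.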
@@ -16,35 +16,32 @@
#pragma once
#include "ngraph/op/op.hpp"
#include "ngraph/util.hpp"
#include <stdint.h>
#include <stdlib.h>
#include <vector>
namespace ngraph
{
namespace runtime
{
namespace cpu
namespace ngmlir
{
namespace op
/// Memory manager for temporaries in an MLIR-compiled sub-graph.
/// It handles callbacks from the generated code, returning pointers to allocated memory,
/// and frees all allocations during cleanup.
class MLIRMemMgr
{
/// \brief LoopKernel represents graphs consisting
/// of arithmetic operations that can be executed in the same loop
class LoopKernel : public ngraph::op::Op
{
public:
LoopKernel(const NodeVector& node_list,
const NodeVector& outputs,
const NodeVector& args);
virtual std::shared_ptr<Node>
copy_with_new_args(const NodeVector& new_args) const override;
public:
/// Allocates data for a temporary tensor. Currently called for each
/// temp tensor definition. Keeps track of each pointer and frees them all during cleanup.
// TODO: Use pre-allocation from framework memory manager
void* allocate(size_t size);
/// Frees all allocated pointers
void freeAll();
const NodeVector& get_node_list() const { return m_node_list; }
const NodeVector& get_kernel_outputs() const { return m_output_nodes; }
private:
NodeVector m_node_list;
NodeVector m_output_nodes;
};
}
private:
std::vector<void*> ptrList;
};
}
}
}
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
// Add new dialect op lowerers to this file
#define DECL_OP_CONV(OP) \
class OP##Conversion : public mlir::DialectConversionPattern \
{\
public:\
explicit OP##Conversion(mlir::MLIRContext *context, DialectLoweringPass& pass)\
: mlir::DialectConversionPattern(mlir::OP::getOperationName(), 1, context),\
m_pass(pass)\
{} \
void rewrite(Operation *op, ArrayRef<Value *> operands, PatternRewriter &rewriter) const override; \
DialectLoweringPass& m_pass;\
};
DECL_OP_CONV(NGAddOp)
DECL_OP_CONV(NGDotOp)
DECL_OP_CONV(NGReturnOp)
#undef DECL_OP_CONV
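// For reference, DECL_OP_CONV(NGAddOp) above expands to roughly:
//
//   class NGAddOpConversion : public mlir::DialectConversionPattern
//   {
//   public:
//       explicit NGAddOpConversion(mlir::MLIRContext* context, DialectLoweringPass& pass)
//           : mlir::DialectConversionPattern(mlir::NGAddOp::getOperationName(), 1, context)
//           , m_pass(pass)
//       {
//       }
//       void rewrite(Operation* op, ArrayRef<Value*> operands,
//                    PatternRewriter& rewriter) const override;
//       DialectLoweringPass& m_pass;
//   };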
// List of all ops supported by the MLIR backend end-to-end
#ifndef MLIR_OP
#define MLIR_OP
#endif
MLIR_OP(Add)
MLIR_OP(Dot)
// Add new supported ops here
#undef MLIR_OP
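// Consumers define MLIR_OP before including this file; for example, the sub-graph
// extraction pass expands it into the set of supported type indices:
//
//   #define MLIR_OP(OP) TI(ngraph::op::OP),
//   #include "contrib/mlir/ops_supported.inc"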
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#include "mlir_subgraph_extraction.hpp"
#include "ngraph/assertion.hpp"
#include "ngraph/graph_util.hpp"
#include "ngraph/op/add.hpp"
#include "ngraph/op/dot.hpp"
#include "ngraph/op/experimental/compiled_kernel.hpp"
#include "ngraph/op/get_output_element.hpp"
using namespace ngraph::descriptor;
using namespace ngraph::op;
using namespace ngraph::pass;
#define TI(x) std::type_index(typeid(x))
bool MLIRSubgraphExtractionPass::run_on_function(std::shared_ptr<Function> func)
{
// Create a CompiledKernel for all the ops in the function, except Parameters and Results.
NodeVector ck_ops;
for (auto op : func->get_ordered_ops())
{
// All ops must be supported by MLIR compiler
if (!is_supported_mlir_op(op))
{
return false;
}
if (TI(Parameter) != TI(*op) && TI(Result) != TI(*op))
{
ck_ops.push_back(op);
}
}
NodeVector ck_args;
for (auto& param : func->get_parameters())
{
ck_args.push_back(param);
}
NodeVector ck_outputs = get_subgraph_outputs(ck_ops, {} /*exclusions*/);
if (ck_outputs.size() != 1)
{
return false;
}
auto ck = std::make_shared<CompiledKernel>(ck_ops, ck_outputs, ck_args);
// Connect CompiledKernel to output nodes by replacing the output descriptors of the output
// nodes.
for (size_t i = 0, end = ck_outputs.size(); i < end; ++i)
{
auto& output_descs = ck_outputs[i]->get_outputs();
NGRAPH_CHECK(output_descs.size() == 1, "Unexpected multiple output descriptors");
auto& out_desc = output_descs[0];
// 'replace_output' invalidates iterator of the original container. Use a copy instead.
const std::set<descriptor::Input*> input_descs = out_desc.get_inputs();
for (descriptor::Input* in_desc : input_descs)
{
in_desc->replace_output(ck, i);
}
}
return true;
}
#define TI(x) std::type_index(typeid(x))
bool MLIRSubgraphExtractionPass::is_supported_mlir_op(std::shared_ptr<Node> node)
{
if (TI(Parameter) == TI(*node) || TI(Result) == TI(*node))
{
return true;
}
// Is the op supported by the backend?
if (m_supported_ops.find(TI(*node)) == m_supported_ops.end())
{
return false;
}
// Check invariants expected by the MLIR backend.
// Dot is 2D only.
if (TI(ngraph::op::Dot) == TI(*node))
{
if (node->get_input_shape(0).size() != 2 || node->get_input_shape(1).size() != 2)
{
return false;
}
}
return true;
}
const std::set<std::type_index> MLIRSubgraphExtractionPass::m_supported_ops{
#define MLIR_OP(OP) TI(ngraph::op::OP),
#include "contrib/mlir/ops_supported.inc"
};
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#pragma once
#include "ngraph/pass/pass.hpp"
namespace ngraph
{
namespace pass
{
/// This pass creates CompiledKernel ops enclosing sub-graphs that will be compiled and
/// executed by MLIR.
// TODO: WIP. Currently we only create a single CompiledKernel op for the whole function
// body.
class MLIRSubgraphExtractionPass : public ngraph::pass::FunctionPass
{
public:
MLIRSubgraphExtractionPass() {}
bool run_on_function(std::shared_ptr<Function> func) override;
/// Checks if an ngraph node is supported by MLIR backend
bool is_supported_mlir_op(std::shared_ptr<Node> node);
private:
static const std::set<std::type_index> m_supported_ops;
};
}
}
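// A usage sketch (assumed from nGraph's pass manager API; the actual registration
// happens in the CPU backend pipeline):
//
//   ngraph::pass::Manager pass_manager;
//   pass_manager.register_pass<ngraph::pass::MLIRSubgraphExtractionPass>();
//   pass_manager.run_passes(func);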
@@ -170,6 +170,8 @@ set (SRC
op/experimental/quantized_dot.hpp
op/experimental/quantized_dot_bias.cpp
op/experimental/quantized_dot_bias.hpp
op/experimental/compiled_kernel.cpp
op/experimental/compiled_kernel.hpp
op/experimental/transpose.cpp
op/experimental/transpose.hpp
op/experimental/layers/ctc_greedy_decoder.cpp
@@ -14,15 +14,16 @@
// limitations under the License.
//*****************************************************************************
#include "ngraph/runtime/cpu/op/loop_kernel.hpp"
#include "ngraph/op/experimental/compiled_kernel.hpp"
#include "ngraph/graph_util.hpp"
#include "ngraph/log.hpp"
#include "ngraph/util.hpp"
using namespace std;
using namespace ngraph;
shared_ptr<Node>
ngraph::runtime::cpu::op::LoopKernel::copy_with_new_args(const NodeVector& new_args) const
shared_ptr<Node> ngraph::op::CompiledKernel::copy_with_new_args(const NodeVector& new_args) const
{
auto args = get_arguments();
if (new_args.size() != args.size())
@@ -56,13 +57,13 @@ shared_ptr<Node>
new_outputs.push_back(nm.at(o.get()));
}
return std::make_shared<LoopKernel>(new_node_list, new_outputs, new_args);
return std::make_shared<CompiledKernel>(new_node_list, new_outputs, new_args);
}
ngraph::runtime::cpu::op::LoopKernel::LoopKernel(const NodeVector& node_list,
const NodeVector& outputs,
const NodeVector& args)
: Op("LoopKernel", check_single_output_args({args}))
ngraph::op::CompiledKernel::CompiledKernel(const NodeVector& node_list,
const NodeVector& outputs,
const NodeVector& args)
: Op("CompiledKernel", check_single_output_args({args}))
, m_node_list(node_list)
, m_output_nodes(outputs)
{
......
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#pragma once
#include "ngraph/op/op.hpp"
#include "ngraph/util.hpp"
namespace ngraph
{
namespace op
{
/// \brief CompiledKernel represents a sub-graph that can be compiled and executed
/// independently.
///
        /// This op can be used to delimit sub-graphs with special compilation requirements
        /// within a function. For example, we currently use it to delimit sub-graphs that will be
        /// independently compiled and executed by the MLIR backend.
class CompiledKernel : public ngraph::op::Op
{
public:
CompiledKernel(const NodeVector& node_list,
const NodeVector& outputs,
const NodeVector& args);
virtual std::shared_ptr<Node>
copy_with_new_args(const NodeVector& new_args) const override;
const NodeVector& get_node_list() const { return m_node_list; }
const NodeVector& get_kernel_outputs() const { return m_output_nodes; }
private:
NodeVector m_node_list;
NodeVector m_output_nodes;
};
}
}
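For illustration, a minimal sketch (not part of this diff) of enclosing a single Add node in a CompiledKernel via the constructor declared above:

#include "ngraph/ngraph.hpp"
#include "ngraph/op/experimental/compiled_kernel.hpp"
using namespace ngraph;

// Hedged sketch: wrap a two-input add in a CompiledKernel sub-graph.
std::shared_ptr<op::CompiledKernel> make_add_kernel()
{
    auto a = std::make_shared<op::Parameter>(element::f32, Shape{2, 2});
    auto b = std::make_shared<op::Parameter>(element::f32, Shape{2, 2});
    auto sum = std::make_shared<op::Add>(a, b);
    return std::make_shared<op::CompiledKernel>(NodeVector{sum},   // enclosed nodes
                                                NodeVector{sum},   // kernel outputs
                                                NodeVector{a, b}); // external args
}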
......@@ -104,7 +104,6 @@ set(SRC
op/group_conv_bias.cpp
op/halide_op.cpp
op/leaky_relu.cpp
op/loop_kernel.cpp
op/lstm.cpp
op/matmul_bias.cpp
op/max_pool_with_indices.cpp
......@@ -114,10 +113,10 @@ set(SRC
op/update_slice.cpp
pass/cpu_assignment.cpp
pass/cpu_collapse_dims.cpp
pass/cpu_compiled_kernel_fusion.cpp
pass/cpu_fusion.cpp
pass/cpu_horizontal_fusion.cpp
pass/cpu_layout.cpp
pass/cpu_loop_kernel_fusion.cpp
pass/cpu_mat_fusion.cpp
pass/cpu_memory_assignment.cpp
pass/cpu_memory_optimization.cpp
......@@ -140,13 +139,20 @@ endif()
if (NGRAPH_HALIDE)
set(SRC
${SRC}
builder/compiled_kernel.cpp
builder/halide_op.cpp
builder/loop_kernel.cpp
builder/halide_generators.cpp
pass/halide_subgraph_extraction.cpp
)
endif()
if (NGRAPH_MLIR_ENABLE)
set(SRC
${SRC}
builder/mlir_cpu_compiled_kernel.cpp
)
endif()
if (NGRAPH_CPU_ENABLE)
set(NGRAPH_CPU_DEBUGINFO_ENABLE 0 CACHE STRING "Enable debuginfo in the CPU backend")
......@@ -203,6 +209,7 @@ if (NGRAPH_CPU_ENABLE)
target_link_libraries(cpu_backend PUBLIC codegen)
endif()
target_include_directories(cpu_backend SYSTEM PUBLIC libmkldnn)
    if (NOT APPLE AND NOT MSVC)
# CPU backend uses third-party libraries like Eigen that might be linked in and
# exported by other DSOs as well. In the absence of versioning, this could lead to the
......@@ -212,5 +219,23 @@ if (NGRAPH_CPU_ENABLE)
set_property(TARGET cpu_backend APPEND PROPERTY LINK_FLAGS "-Wl,-Bsymbolic-functions -Wl,--exclude-libs=ALL")
endif()
if (NGRAPH_MLIR_ENABLE)
        # TODO: Can we get away without the LLVM/MLIR include paths?
        # Currently the MLIR backend's compiler.hpp includes LLVM/MLIR headers.
get_directory_property(MLIR_LLVM_INCLUDEPATH
DIRECTORY ${NGRAPH_MLIR_SOURCE_DIR}
DEFINITION MLIR_LLVM_INCLUDEPATH)
message(STATUS "Building CPU backend with MLIR")
message(STATUS "MLIR INCLUDE DIRS: ${MLIR_INCLUDE_PATHS}")
message(STATUS "LLVM INCLUDE DIRS: ${MLIR_LLVM_INCLUDEPATH}")
add_dependencies(cpu_backend mlir_backend)
target_include_directories(cpu_backend PUBLIC ${MLIR_INCLUDE_PATHS} ${MLIR_LLVM_INCLUDEPATH})
target_link_libraries(cpu_backend PUBLIC mlir_backend)
        # TODO: Get rid of the compile-time definition, move all MLIR code to separate
        # source files, and add them to cpu_backend here instead.
target_compile_definitions(cpu_backend PRIVATE "NGRAPH_MLIR_ENABLE")
endif()
install(TARGETS cpu_backend DESTINATION ${NGRAPH_INSTALL_LIB})
endif()
......@@ -35,7 +35,7 @@
#include "halide_generators.hpp"
#include "ngraph/runtime/cpu/cpu_builder.hpp"
#include "ngraph/runtime/cpu/op/loop_kernel.hpp"
#include "ngraph/runtime/cpu/op/compiled_kernel.hpp"
using namespace std;
using namespace ngraph;
......@@ -49,10 +49,10 @@ namespace ngraph
namespace cpu
{
template <>
void Builder::BUILDER_DECL(ngraph::runtime::cpu::op::LoopKernel)
void Builder::BUILDER_DECL(ngraph::op::CompiledKernel)
{
const ngraph::runtime::cpu::op::LoopKernel* hs =
static_cast<const ngraph::runtime::cpu::op::LoopKernel*>(node);
const ngraph::op::CompiledKernel* hs =
static_cast<const ngraph::op::CompiledKernel*>(node);
const auto& generators = ngraph::runtime::cpu::halide::get_halide_generators();
......@@ -99,7 +99,7 @@ namespace ngraph
                    // a subgraph
if (op->get_outputs().size() > 1)
{
throw ngraph_error("no multi-output ops in a LoopKernel");
throw ngraph_error("no multi-output ops in a CompiledKernel");
}
halide_functions[op->get_output_tensor_ptr()->get_name()] =
generators.at(TI(*op))(inputs);
......
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#include "ngraph/runtime/cpu/cpu_builder.hpp"
#include "contrib/mlir/compiler.hpp"
#include "ngraph/op/experimental/compiled_kernel.hpp"
#include "ngraph/runtime/cpu/cpu_runtime_context.hpp"
using namespace ngraph;
using namespace ngraph::op;
using namespace ngraph::runtime::cpu;
using namespace ngraph::runtime::ngmlir;
#define TI(x) type_index(typeid(x))
namespace ngraph
{
namespace runtime
{
namespace cpu
{
template <>
void Builder::BUILDER_DECL(CompiledKernel)
{
auto& functors = external_function->get_functors();
                // Tensors haven't been allocated yet, so we record buffer indices that
                // will be resolved to the actual memory addresses at execution time.
std::vector<size_t> buffer_indices;
for (const TensorViewWrapper& arg : args)
{
auto buffer_index = external_function->get_buffer_index(arg.get_name());
buffer_indices.push_back(buffer_index);
}
for (const TensorViewWrapper& result : out)
{
auto buffer_index = external_function->get_buffer_index(result.get_name());
buffer_indices.push_back(buffer_index);
}
                // Create a functor that will be executed to compile and run this
                // CompiledKernel. Note that 'buffer_indices' must be captured by value
                // since it's a local variable.
auto functor = [node, buffer_indices](CPURuntimeContext* ctx,
CPUExecutionContext* ectx) {
                    // MLIR requires a list of type-erased pointers to the arguments.
                    // Tensors must have been allocated at this point, so the buffer
                    // indices can be resolved to real addresses.
std::vector<void*> ptr_args;
for (auto& buffer_index : buffer_indices)
{
ptr_args.push_back(ctx->buffer_data[buffer_index]);
}
// Compile nodes within the CompiledKernel op.
auto* compiled_kernel = static_cast<const CompiledKernel*>(node);
MLIRCompiler mlir_compiler(compiled_kernel, ptr_args);
// TODO: Decouple 'compile' and 'run' APIs. We want to be able to run the same
// jitted code on different arguments.
mlir_compiler.compile_and_run();
};
functors.emplace_back(functor);
}
}
}
}
#undef TI
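The call sequence the functor above performs can be summarized in a short sketch, using only the MLIRCompiler entry points visible in this diff (init_mlir, the (kernel, args) constructor, and compile_and_run); treat it as a sketch of the wiring, not the backend's definitive control flow:

#include <vector>
#include "contrib/mlir/compiler.hpp"
#include "ngraph/op/experimental/compiled_kernel.hpp"

// Hedged sketch of the MLIR execution path wired up above.
void run_compiled_kernel(const ngraph::op::CompiledKernel* ck,
                         std::vector<void*>& ptr_args)
{
    using ngraph::runtime::ngmlir::MLIRCompiler;
    MLIRCompiler::init_mlir();           // once per process (see cpu_backend.cpp)
    MLIRCompiler compiler(ck, ptr_args); // bind kernel and type-erased args
    compiler.compile_and_run();          // JIT-compile and execute
}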
......@@ -25,6 +25,10 @@
#include "ngraph/runtime/cpu/cpu_tensor_view.hpp"
#include "ngraph/util.hpp"
#ifdef NGRAPH_MLIR_ENABLE
#include "contrib/mlir/compiler.hpp"
#endif
using namespace ngraph;
using namespace std;
......@@ -90,6 +94,14 @@ shared_ptr<runtime::Executable>
ngraph::pass::PassConfig& pass_config,
bool performance_counters_enabled)
{
#ifdef NGRAPH_MLIR_ENABLE
if (std::getenv("NGRAPH_MLIR") != nullptr)
{
// Initialize MLIR compiler
ngmlir::MLIRCompiler::init_mlir();
}
#endif
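    // Usage note (sketch, not part of this diff): MLIR execution is opt-in at
    // runtime. Set NGRAPH_MLIR=1 in the environment before compiling a function
    // so that init_mlir() above runs and the MLIR passes are registered.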
shared_ptr<runtime::Executable> rc;
auto it = m_exec_map.find(func);
if (it != m_exec_map.end())
......
......@@ -38,6 +38,7 @@
#include "ngraph/op/divide.hpp"
#include "ngraph/op/equal.hpp"
#include "ngraph/op/exp.hpp"
#include "ngraph/op/experimental/compiled_kernel.hpp"
#include "ngraph/op/floor.hpp"
#include "ngraph/op/get_output_element.hpp"
#include "ngraph/op/greater.hpp"
......@@ -104,10 +105,13 @@
#include "ngraph/runtime/cpu/kernel/tanh.hpp"
#include "ngraph/runtime/cpu/op/convert_layout.hpp"
#include "ngraph/runtime/cpu/op/halide_op.hpp"
#include "ngraph/runtime/cpu/op/loop_kernel.hpp"
#include "ngraph/type/element_type.hpp"
#include "ngraph/util.hpp"
#ifdef NGRAPH_MLIR_ENABLE
#include "contrib/mlir/compiler.hpp"
#endif
using namespace std;
using namespace ngraph;
......@@ -472,8 +476,8 @@ namespace ngraph
{
static BuildOpMap build_dispatcher{
{TI(ngraph::op::Parameter), &runtime::cpu::Builder::nop},
{TI(ngraph::runtime::cpu::op::LoopKernel),
&runtime::cpu::Builder::build<ngraph::runtime::cpu::op::LoopKernel>},
{TI(ngraph::op::CompiledKernel),
&runtime::cpu::Builder::build<ngraph::op::CompiledKernel>},
{TI(ngraph::runtime::cpu::op::HalideOp),
&runtime::cpu::Builder::build<ngraph::runtime::cpu::op::HalideOp>}};
......
......@@ -53,6 +53,7 @@
#include "ngraph/op/erf.hpp"
#include "ngraph/op/exp.hpp"
#include "ngraph/op/experimental/batch_mat_mul.hpp"
#include "ngraph/op/experimental/compiled_kernel.hpp"
#include "ngraph/op/experimental/generate_mask.hpp"
#include "ngraph/op/experimental/quantized_avg_pool.hpp"
#include "ngraph/op/experimental/quantized_concat.hpp"
......@@ -125,7 +126,6 @@
#include "ngraph/runtime/cpu/op/dropout.hpp"
#include "ngraph/runtime/cpu/op/group_conv_bias.hpp"
#include "ngraph/runtime/cpu/op/leaky_relu.hpp"
#include "ngraph/runtime/cpu/op/loop_kernel.hpp"
#include "ngraph/runtime/cpu/op/lstm.hpp"
#include "ngraph/runtime/cpu/op/matmul_bias.hpp"
#include "ngraph/runtime/cpu/op/max_pool_with_indices.hpp"
......@@ -3883,7 +3883,7 @@ namespace ngraph
std::function<std::string(const std::vector<std::string>&)>>
inline_emitters = initialize_inline_emitters();
// GOEE doesn't see GOEs in subgraphs that are hidden inside LoopKernels
// GOEE doesn't see GOEs in subgraphs that are hidden inside CompiledKernels
// we have to manually propagate the source output
static const ngraph::descriptor::Output*
get_goe_input_output(ngraph::descriptor::Output* output)
......@@ -3898,22 +3898,22 @@ namespace ngraph
}
template <>
void CPU_Emitter::EMITTER_DECL(ngraph::runtime::cpu::op::LoopKernel)
void CPU_Emitter::EMITTER_DECL(ngraph::op::CompiledKernel)
{
std::unordered_map<const ngraph::descriptor::Output*, std::string>
loop_symbol_table;
// pre-fill symbol table with inputs
const ngraph::runtime::cpu::op::LoopKernel* clk =
static_cast<const ngraph::runtime::cpu::op::LoopKernel*>(node);
const ngraph::op::CompiledKernel* ck =
static_cast<const ngraph::op::CompiledKernel*>(node);
NodeVector output_nodes = clk->get_kernel_outputs();
NodeVector node_list = clk->get_node_list();
NodeVector output_nodes = ck->get_kernel_outputs();
NodeVector node_list = ck->get_node_list();
for (size_t i = 0; i < args.size(); i++)
{
std::string sname = std::string(args[i].get_name()) + "[i]";
auto entry = std::make_pair(&clk->get_inputs().at(i).get_output(), sname);
auto entry = std::make_pair(&ck->get_inputs().at(i).get_output(), sname);
loop_symbol_table.insert(entry);
}
......
......@@ -33,6 +33,7 @@
#include "ngraph/codegen/execution_engine.hpp"
#endif
#include "contrib/mlir/pass/mlir_subgraph_extraction.hpp"
#include "ngraph/descriptor/input.hpp"
#include "ngraph/descriptor/output.hpp"
#include "ngraph/file_util.hpp"
......@@ -69,6 +70,7 @@
#include "ngraph/op/erf.hpp"
#include "ngraph/op/exp.hpp"
#include "ngraph/op/experimental/batch_mat_mul.hpp"
#include "ngraph/op/experimental/compiled_kernel.hpp"
#include "ngraph/op/experimental/generate_mask.hpp"
#include "ngraph/op/experimental/quantized_avg_pool.hpp"
#include "ngraph/op/experimental/quantized_concat.hpp"
......@@ -171,7 +173,6 @@
#include "ngraph/runtime/cpu/op/dropout.hpp"
#include "ngraph/runtime/cpu/op/group_conv_bias.hpp"
#include "ngraph/runtime/cpu/op/leaky_relu.hpp"
#include "ngraph/runtime/cpu/op/loop_kernel.hpp"
#include "ngraph/runtime/cpu/op/lstm.hpp"
#include "ngraph/runtime/cpu/op/matmul_bias.hpp"
#include "ngraph/runtime/cpu/op/max_pool_with_indices.hpp"
......@@ -427,8 +428,7 @@ static const runtime::cpu::OpMap dispatcher{
{TI(ngraph::op::And), &runtime::cpu::CPU_Emitter::emit<op::And>},
{TI(ngraph::op::Or), &runtime::cpu::CPU_Emitter::emit<op::Or>},
{TI(ngraph::op::CPULeakyRelu), &runtime::cpu::CPU_Emitter::emit<op::CPULeakyRelu>},
{TI(ngraph::runtime::cpu::op::LoopKernel),
&runtime::cpu::CPU_Emitter::emit<runtime::cpu::op::LoopKernel>},
{TI(ngraph::op::CompiledKernel), &runtime::cpu::CPU_Emitter::emit<op::CompiledKernel>},
{TI(ngraph::op::LRN), &runtime::cpu::CPU_Emitter::emit<ngraph::op::LRN>},
{TI(ngraph::op::GenerateMask), &runtime::cpu::CPU_Emitter::emit<ngraph::op::GenerateMask>},
{TI(ngraph::op::ConvolutionAdd), &runtime::cpu::CPU_Emitter::emit<op::ConvolutionAdd>},
......@@ -1181,7 +1181,12 @@ void runtime::cpu::CPU_ExternalFunction::register_common_passes(
REGISTER_KNOBBED_PASS(RecurrentReshapeElimination, false, ngraph::pass);
REGISTER_KNOBBED_PASS_WITH_ARGS(
CoreFusion, true, ngraph::pass, ngraph::pass::FusionType::ALL_FUSIONS);
REGISTER_KNOBBED_PASS(CPUFusion, true, runtime::cpu::pass);
// Disable CPUFusion if MLIR is enabled to preserve core ops.
if (std::getenv("NGRAPH_MLIR") == nullptr)
{
REGISTER_KNOBBED_PASS(CPUFusion, true, runtime::cpu::pass);
}
REGISTER_KNOBBED_PASS(CPUQuantFusion, true, runtime::cpu::pass);
REGISTER_KNOBBED_PASS(CPUHorizontalFusion, true, runtime::cpu::pass);
REGISTER_KNOBBED_PASS(CPUCollapseDims, true, runtime::cpu::pass);
......@@ -1189,6 +1194,13 @@ void runtime::cpu::CPU_ExternalFunction::register_common_passes(
REGISTER_KNOBBED_PASS(HalideSubgraphExtraction, true, ngraph::runtime::cpu::pass);
#endif
#ifdef NGRAPH_MLIR_ENABLE
if (std::getenv("NGRAPH_MLIR") != nullptr)
{
REGISTER_KNOBBED_PASS(MLIRSubgraphExtractionPass, /*enable by default*/ true, ngraph::pass);
}
#endif
    NodeVector nv_cwi; // We don't need CPUWorkspaceInsertion to return a list of indices
REGISTER_KNOBBED_PASS_WITH_ARGS(CPUWorkspaceInsertion, true, runtime::cpu::pass, nv_cwi, false);
REGISTER_KNOBBED_PASS_WITH_ARGS(CPUAssignment, true, runtime::cpu::pass, this);
......
......@@ -23,6 +23,7 @@
#include "ngraph/log.hpp"
#include "ngraph/op/abs.hpp"
#include "ngraph/op/add.hpp"
#include "ngraph/op/experimental/compiled_kernel.hpp"
#include "ngraph/op/get_output_element.hpp"
#include "ngraph/op/maximum.hpp"
#include "ngraph/op/minimum.hpp"
......@@ -31,8 +32,7 @@
#include "ngraph/op/subtract.hpp"
#include "ngraph/op/util/binary_elementwise_arithmetic.hpp"
#include "ngraph/op/util/unary_elementwise_arithmetic.hpp"
#include "ngraph/runtime/cpu/op/loop_kernel.hpp"
#include "ngraph/runtime/cpu/pass/cpu_loop_kernel_fusion.hpp"
#include "ngraph/runtime/cpu/pass/cpu_compiled_kernel_fusion.hpp"
#define TI(x) std::type_index(typeid(x))
......@@ -49,10 +49,10 @@ struct LKGraph
NodeVector m_nodes;
};
class LoopKernelCollector
class CompiledKernelCollector
{
public:
LoopKernelCollector(std::shared_ptr<Function> f, size_t min_nodes_to_fuse)
CompiledKernelCollector(std::shared_ptr<Function> f, size_t min_nodes_to_fuse)
{
for (auto n : f->get_ordered_ops())
{
......@@ -70,13 +70,13 @@ public:
else
{
auto smallest_head = m_heads.at(arg_from_fusible_group);
auto& lkgraph = m_graphs.at(smallest_head);
lkgraph.m_nodes.push_back(n);
auto& ckgraph = m_graphs.at(smallest_head);
ckgraph.m_nodes.push_back(n);
for (auto arg : n->get_arguments())
{
if (is_leaf(arg))
{
lkgraph.m_inputs.push_back(arg);
ckgraph.m_inputs.push_back(arg);
}
}
m_heads.insert(std::make_pair(n, smallest_head));
......@@ -88,18 +88,18 @@ public:
prune_graphs(min_nodes_to_fuse);
}
const std::vector<std::shared_ptr<runtime::cpu::op::LoopKernel>> get_loop_kernels() const
const std::vector<std::shared_ptr<op::CompiledKernel>> get_compiled_kernels() const
{
std::vector<std::shared_ptr<runtime::cpu::op::LoopKernel>> lks;
std::vector<std::shared_ptr<op::CompiledKernel>> cks;
for (auto e : m_graphs)
{
auto& lkg = e.second;
NodeVector member_outputs = ngraph::get_subgraph_outputs(lkg.m_nodes, NodeVector{});
auto lk = std::make_shared<runtime::cpu::op::LoopKernel>(
lkg.m_nodes, member_outputs, lkg.m_inputs);
lks.push_back(lk);
auto& ckg = e.second;
NodeVector member_outputs = ngraph::get_subgraph_outputs(ckg.m_nodes, NodeVector{});
auto ck =
std::make_shared<op::CompiledKernel>(ckg.m_nodes, member_outputs, ckg.m_inputs);
cks.push_back(ck);
}
return lks;
return cks;
}
private:
......@@ -172,20 +172,20 @@ private:
std::unordered_map<std::shared_ptr<Node>, std::shared_ptr<Node>> m_heads;
};
bool ngraph::runtime::cpu::pass::CPULoopKernelFusion::run_on_function(
bool ngraph::runtime::cpu::pass::CPUCompiledKernelFusion::run_on_function(
std::shared_ptr<ngraph::Function> function)
{
LoopKernelCollector lkc(function, m_min_kernel_size);
auto loop_kernels = lkc.get_loop_kernels();
CompiledKernelCollector ckc(function, m_min_kernel_size);
auto compiled_kernels = ckc.get_compiled_kernels();
for (auto lk : loop_kernels)
for (auto ck : compiled_kernels)
{
auto outputs = lk->get_kernel_outputs();
std::set<std::shared_ptr<Node>> lk_nodes_set(lk->get_node_list().begin(),
lk->get_node_list().end());
auto outputs = ck->get_kernel_outputs();
std::set<std::shared_ptr<Node>> ck_nodes_set(ck->get_node_list().begin(),
ck->get_node_list().end());
for (size_t i = 0; i < outputs.size(); i++)
{
auto ith_goe = std::make_shared<ngraph::op::GetOutputElement>(lk, i);
auto ith_goe = std::make_shared<ngraph::op::GetOutputElement>(ck, i);
auto& ith_output = ith_goe->get_outputs().at(0);
if (outputs.at(i)->get_outputs().size() > 1)
......@@ -203,8 +203,8 @@ bool ngraph::runtime::cpu::pass::CPULoopKernelFusion::run_on_function(
for (auto input : inputs_copy)
{
                // this user is NOT internal to this compiled kernel
// so it needs to be replaced with corresponding lk's GOE
if (lk_nodes_set.count(input->get_node()) == 0)
// so it needs to be replaced with corresponding ck's GOE
if (ck_nodes_set.count(input->get_node()) == 0)
{
input->replace_output(ith_output);
}
......@@ -212,5 +212,5 @@ bool ngraph::runtime::cpu::pass::CPULoopKernelFusion::run_on_function(
}
}
return !loop_kernels.empty();
return !compiled_kernels.empty();
}
......@@ -26,10 +26,10 @@ namespace ngraph
{
namespace pass
{
class CPULoopKernelFusion : public ngraph::pass::FunctionPass
class CPUCompiledKernelFusion : public ngraph::pass::FunctionPass
{
public:
CPULoopKernelFusion(size_t min_kernel_size = 2)
CPUCompiledKernelFusion(size_t min_kernel_size = 2)
: FunctionPass()
, m_min_kernel_size(min_kernel_size)
{
......
......@@ -52,6 +52,7 @@
#include "ngraph/op/erf.hpp"
#include "ngraph/op/exp.hpp"
#include "ngraph/op/experimental/batch_mat_mul.hpp"
#include "ngraph/op/experimental/compiled_kernel.hpp"
#include "ngraph/op/experimental/dyn_broadcast.hpp"
#include "ngraph/op/experimental/dyn_pad.hpp"
#include "ngraph/op/experimental/dyn_reshape.hpp"
......
......@@ -57,6 +57,36 @@ NGRAPH_TEST(${BACKEND_NAME}, add)
(test::NDArray<float, 2>({{6, 8}, {10, 12}})).get_vector()));
}
NGRAPH_TEST(${BACKEND_NAME}, dot_add)
{
Shape shape_in1{2, 3};
Shape shape_in2{3, 3};
Shape shape_out{2, 3};
auto A = make_shared<op::Parameter>(element::f32, shape_in1);
auto B = make_shared<op::Parameter>(element::f32, shape_in2);
auto dot = make_shared<op::Dot>(A, B);
auto C = make_shared<op::Parameter>(element::f32, shape_out);
auto add = make_shared<op::Add>(dot, C);
auto f = make_shared<Function>(add, ParameterVector{A, B, C});
auto backend = runtime::Backend::create("${BACKEND_NAME}");
// Create some tensors for input/output
shared_ptr<runtime::Tensor> a = backend->create_tensor(element::f32, shape_in1);
shared_ptr<runtime::Tensor> b = backend->create_tensor(element::f32, shape_in2);
shared_ptr<runtime::Tensor> c = backend->create_tensor(element::f32, shape_out);
shared_ptr<runtime::Tensor> result = backend->create_tensor(element::f32, shape_out);
copy_data(a, vector<float>{1.f, 2.f, 3.f, 4.f, 5.f, 6.f});
copy_data(b, vector<float>{1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f});
copy_data(c, vector<float>{5.f, 4.f, 3.f, 2.f, 1.f, 0.f});
auto handle = backend->compile(f);
handle->call_with_validate({result}, {a, b, c});
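    // Expected: dot(A, B) = {{30, 36, 42}, {66, 81, 96}}; adding C gives
    // {{35, 40, 45}, {68, 82, 96}}, i.e. the flattened vector checked below.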
EXPECT_TRUE(test::all_close_f(read_vector<float>(result),
vector<float>{35.f, 40.f, 45.f, 68.f, 82.f, 96.f}));
}
NGRAPH_TEST(${BACKEND_NAME}, add_overload)
{
Shape shape{2, 2};
......
......@@ -421,6 +421,32 @@ NGRAPH_TEST(${BACKEND_NAME}, dot2d)
EXPECT_TRUE(test::all_close_f((vector<float>{19, 22, 43, 50}), read_vector<float>(result)));
}
NGRAPH_TEST(${BACKEND_NAME}, dot2d_non_square)
{
Shape shape_in1{2, 3};
Shape shape_in2{3, 3};
Shape shape_out{2, 3};
auto A = make_shared<op::Parameter>(element::f32, shape_in1);
auto B = make_shared<op::Parameter>(element::f32, shape_in2);
auto dot = make_shared<op::Dot>(A, B);
auto f = make_shared<Function>(dot, ParameterVector{A, B});
auto backend = runtime::Backend::create("${BACKEND_NAME}");
// Create some tensors for input/output
shared_ptr<runtime::Tensor> a = backend->create_tensor(element::f32, shape_in1);
shared_ptr<runtime::Tensor> b = backend->create_tensor(element::f32, shape_in2);
shared_ptr<runtime::Tensor> result = backend->create_tensor(element::f32, shape_out);
copy_data(a, vector<float>{1.f, 2.f, 3.f, 4.f, 5.f, 6.f});
copy_data(b, vector<float>{1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f});
auto handle = backend->compile(f);
handle->call_with_validate({result}, {a, b});
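    // Expected: dot(A, B) = {{30, 36, 42}, {66, 81, 96}}, i.e. the flattened
    // vector checked below.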
EXPECT_TRUE(test::all_close_f(read_vector<float>(result),
vector<float>{30.f, 36.f, 42.f, 66.f, 81.f, 96.f}));
}
//
// Here is what numpy does:
//
......