Commit c737a573 authored by Amy Zhuang, committed by Scott Cyphers

[MLIR] Use call back for MatMul. (#3838)

* [MLIR] Use call back for MatMul.

* Use callback for Gemm.

* Use mkldnn callback for Softmax.

* Address PR feedback.

* Fix merge errors.

* Change to tail allocation struct.

* Use mkldnn callback for AvgPool.

* Add callbacks for AvgPoolBackprop, MaxPool, and MaxPoolBackprop.

* Fix merge errors.

* Use UnrankedMemRefType for callbacks.

* Address PR feedback.

* Cleanup.

* Address PR feedback.

* Fix a bug.

* Use global variable to hold attributes.

* Convert layout if needed for pooling.

* Address PR feedback.

* Add header.

* Address PR feedback.

* Update Copyright to 2017-2020.

* Address PR feedback.
Co-authored-by: Scott Cyphers <diyessi@users.noreply.github.com>
parent e8c0282c
......@@ -36,6 +36,7 @@ set(SRC
core/pass/ng_dialect_builder.hpp
runtime/cpu/memory_manager.cpp
runtime/cpu/cpu_runtime.cpp
runtime/cpu/cpu_callbacks.cpp
utils.cpp
)
......@@ -90,7 +91,8 @@ target_link_libraries(
)
# Link ngraph
target_link_libraries(mlir_backend PUBLIC ngraph)
target_link_libraries(mlir_backend PUBLIC ngraph libmkl libmkldnn)
target_include_directories(mlir_backend SYSTEM PUBLIC libmkldnn)
# table-gen dialect ops
# include table-gen helpers
......
......@@ -33,6 +33,7 @@
#include <mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h>
#include <mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h>
#include <mlir/Dialect/LLVMIR/LLVMDialect.h>
#include <mlir/IR/StandardTypes.h>
#include <mlir/Pass/PassManager.h>
#include <mlir/Target/LLVMIR.h>
#include <mlir/Transforms/DialectConversion.h>
......
......@@ -22,6 +22,8 @@
#include "contrib/mlir/backend/analysis/memory_analysis.hpp"
#include "contrib/mlir/core/ngraph_dialect/ops.hpp"
#include "contrib/mlir/core/ngraph_dialect/type.hpp"
#include "contrib/mlir/runtime/cpu/callback_utils.hpp"
#include "contrib/mlir/utils.hpp"
#include "ngraph/assertion.hpp"
#include <llvm/ADT/DenseSet.h>
......@@ -30,6 +32,7 @@
#include <mlir/EDSC/Helpers.h>
#include <mlir/EDSC/Intrinsics.h>
#include <mlir/IR/AffineExpr.h>
#include <mlir/IR/Function.h>
#include <mlir/IR/IntegerSet.h>
#include <mlir/IR/MLIRContext.h>
#include <mlir/IR/StandardTypes.h>
......@@ -40,6 +43,8 @@
#define PASS_NAME "convert-ngraph-to-affine"
#define DEBUG_TYPE PASS_NAME
std::vector<ngraph::runtime::ngmlir::opAttrs> opAttrsVec;
// anonymous namespace
// no need to expose any of the following outside of this file
namespace
......@@ -164,6 +169,12 @@ namespace
PatternRewriter& rewriter,
DialectLoweringPass& pass);
template <typename OP>
void lowerPooling(Operation* op,
ArrayRef<Value*> operands,
PatternRewriter& rewriter,
DialectLoweringPass& pass);
ValueHandle createZeroConstant(mlir::Type type);
ValueHandle createOneConstant(mlir::Type type);
......@@ -203,6 +214,13 @@ namespace
/// Inserts dealloc Ops for each temporary allocated by AllocOp
void insertDeallocs(PatternRewriter& rewriter);
NGraphTypeConverter& getTypeConverter() { return typeConverter; }
FuncOp getCallDecl(StringRef name,
ArrayRef<Type> args,
ArrayRef<Type> output,
PatternRewriter& rewriter);
inline size_t insertAttrs(opAttrs attrs);
MemoryAnalysis* getMemAnalysis() const { return m_memAnalysis; }
private:
/// Collect a set of patterns to convert from the nGraph dialect to Affine dialect.
......@@ -222,6 +240,9 @@ namespace
MemoryAnalysis* m_memAnalysis;
// TODO: Workaround for findOutputValues and buildOutputDefs. See NGCPU-470.
std::string funcName;
// Store the attributes needed by the callbacks
std::vector<opAttrs> m_attrsVec;
};
void DialectLoweringPass::runOnModule()
......@@ -271,6 +292,8 @@ namespace
// separate rewrite pattern. Retrieve new function after signature conversion.
insertNoAliasArgAttrs();
}
opAttrsVec = m_attrsVec;
}
void DialectLoweringPass::populateNGraphToAffineConversionPatterns(
......@@ -467,6 +490,33 @@ namespace
}
}
mlir::FuncOp DialectLoweringPass::getCallDecl(StringRef name,
ArrayRef<Type> args,
ArrayRef<Type> output,
PatternRewriter& rewriter)
{
auto module = getModule();
auto* context = getModule().getContext();
auto callBackFunc = module.lookupSymbol<mlir::FuncOp>(name);
if (!callBackFunc)
{
// Create a function declaration and insert to the module.
auto callBackType = rewriter.getFunctionType(args, output);
PatternRewriter::InsertionGuard insertGuard(rewriter);
rewriter.setInsertionPointToStart(module.getBody());
SmallVector<NamedAttribute, 4> attributes;
rewriter.create<mlir::FuncOp>(rewriter.getUnknownLoc(), name, callBackType, attributes);
callBackFunc = module.lookupSymbol<mlir::FuncOp>(name);
}
return callBackFunc;
}
inline size_t DialectLoweringPass::insertAttrs(opAttrs attrs)
{
m_attrsVec.push_back(attrs);
return m_attrsVec.size() - 1;
}
// NGDialect converters
Type NGraphTypeConverter::convertType(Type type)
{
......@@ -1198,6 +1248,368 @@ namespace
return matchSuccess();
}
// Use callback: Pooling, MatMul, Gemm, Softmax
static void castMemRef(SmallVector<mlir::Value*, 4> inputs,
SmallVector<mlir::Value*, 4>& outputs,
PatternRewriter& rewriter,
UnrankedMemRefType type)
{
for (auto in : inputs)
{
auto out = rewriter.create<mlir::MemRefCastOp>(rewriter.getUnknownLoc(), in, type);
outputs.push_back(out);
}
}
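// Shared recipe used by every callback-based rewriter below (condensed summary;
// the exact code for each op follows):
//   1. buildOutputDefs() materializes the result memref.
//   2. castMemRef() casts ranked operands and results to UnrankedMemRefType so a
//      single callback signature covers all shapes and ranks.
//   3. pass.insertAttrs(attrs) stores the op's attributes in the pass and returns
//      their index; the index and an OpType tag are materialized as i64 constants.
//   4. pass.getCallDecl("__mlir_callback_N_inputs", ...) declares the callback once
//      per module, a CallOp is emitted, and replaceOp() finishes the rewrite.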
REWRITER(NGAvgPoolOp)
{
lowerPooling<mlir::NGAvgPoolOp>(op, operands, rewriter, pass);
return matchSuccess();
}
REWRITER(NGAvgPoolBackpropOp)
{
lowerPooling<mlir::NGAvgPoolBackpropOp>(op, operands, rewriter, pass);
return matchSuccess();
}
REWRITER(NGMaxPoolOp)
{
lowerPooling<mlir::NGMaxPoolOp>(op, operands, rewriter, pass);
return matchSuccess();
}
REWRITER(NGMaxPoolBackpropOp)
{
auto pooling = cast<NGMaxPoolBackpropOp>(op);
auto loc = pooling.getLoc();
// Retrieve/generate Values for operands and result.
ScopedContext scope(rewriter, loc);
Value* src = operands[0];
Value* delta = operands[1];
ArrayRef<Attribute> windowShape = pooling.windowShape().getValue();
ArrayRef<Attribute> windowStrides = pooling.windowMovementStrides().getValue();
ArrayRef<Attribute> padBelow = pooling.padBelow().getValue();
ArrayRef<Attribute> padAbove = pooling.padAbove().getValue();
Value* result = pass.buildOutputDefs(op, rewriter)[0];
NGRAPH_CHECK(src && delta && result, "Unexpected null values in MaxPoolBackprop Op");
auto resultTy = result->getType().dyn_cast<MemRefType>();
auto resultShape = resultTy.getShape();
auto srcTy = src->getType().dyn_cast<MemRefType>();
auto srcShape = srcTy.getShape();
auto deltaTy = delta->getType().dyn_cast<MemRefType>();
auto deltaShape = deltaTy.getShape();
NGRAPH_CHECK(resultTy, "Unexpected non-memref result type");
NGRAPH_CHECK(srcTy, "Unexpected non-memref src type");
NGRAPH_CHECK(deltaTy, "Unexpected non-memref delta type");
Type elemTy = resultTy.getElementType();
NGRAPH_CHECK(elemTy == srcTy.getElementType() && elemTy == deltaTy.getElementType(),
"Types mismatch in MaxPoolBackprop");
NGRAPH_CHECK((srcShape.size() == 4 && resultShape.size() == 4) ||
(srcShape.size() == 5 && resultShape.size() == 5),
"MKLDNN pooling operation is only supported for 3D and 5D tensors");
auto int64Ty = rewriter.getIntegerType(64);
auto unrankedMemrefTy = UnrankedMemRefType::get(elemTy, 0);
SmallVector<mlir::Value*, 4> inputs = {src, delta, result};
SmallVector<mlir::Value*, 4> outputs;
castMemRef(inputs, outputs, rewriter, unrankedMemrefTy);
FuncOp callBackFunc = pass.getCallDecl(
"__mlir_callback_2_inputs",
{unrankedMemrefTy, unrankedMemrefTy, unrankedMemrefTy, int64Ty, int64Ty},
{},
rewriter);
opAttrs attrs;
if (srcShape.size() == 4)
{
attrs.poolAttrs2d.includePaddingInAvgComputation = false;
for (auto i = 0; i < 2; i++)
{
attrs.poolAttrs2d.windowShape[i] = windowShape[i].cast<IntegerAttr>().getInt();
attrs.poolAttrs2d.windowStrides[i] = windowStrides[i].cast<IntegerAttr>().getInt();
attrs.poolAttrs2d.padBelow[i] = padBelow[i].cast<IntegerAttr>().getInt();
attrs.poolAttrs2d.padAbove[i] = padAbove[i].cast<IntegerAttr>().getInt();
}
}
else if (srcShape.size() == 5)
{
attrs.poolAttrs3d.includePaddingInAvgComputation = false;
for (auto i = 0; i < 3; i++)
{
attrs.poolAttrs3d.windowShape[i] = windowShape[i].cast<IntegerAttr>().getInt();
attrs.poolAttrs3d.windowStrides[i] = windowStrides[i].cast<IntegerAttr>().getInt();
attrs.poolAttrs3d.padBelow[i] = padBelow[i].cast<IntegerAttr>().getInt();
attrs.poolAttrs3d.padAbove[i] = padAbove[i].cast<IntegerAttr>().getInt();
}
}
auto index = pass.insertAttrs(attrs);
auto attrsIndexArg =
rewriter.create<mlir::ConstantIntOp>(rewriter.getUnknownLoc(), index, 64);
auto opTypeArg = rewriter.create<mlir::ConstantIntOp>(
rewriter.getUnknownLoc(), static_cast<int64_t>(OpType::MAXPOOLBACKPROP), 64);
SmallVector<mlir::Value*, 4> args = {
outputs[0], outputs[1], outputs[2], attrsIndexArg, opTypeArg};
rewriter.create<mlir::CallOp>(rewriter.getUnknownLoc(), callBackFunc, args);
rewriter.replaceOp(op, result);
return matchSuccess();
}
REWRITER(NGMatMulOp)
{
auto matmul = cast<NGMatMulOp>(op);
auto loc = matmul.getLoc();
// Retrieve/generate Values for operands and result.
ScopedContext scope(rewriter, loc);
Value* lhs = operands[0];
Value* rhs = operands[1];
Value* result = pass.buildOutputDefs(op, rewriter)[0];
NGRAPH_CHECK(lhs && rhs && result, "Unexpected null values in MatMulOp");
auto resultTy = result->getType().dyn_cast<MemRefType>();
auto resultShape = resultTy.getShape();
auto lhsTy = lhs->getType().dyn_cast<MemRefType>();
auto lhsShape = lhsTy.getShape();
auto rhsTy = rhs->getType().dyn_cast<MemRefType>();
auto rhsShape = rhsTy.getShape();
NGRAPH_CHECK(resultTy, "Unexpected non-memref result type");
NGRAPH_CHECK(lhsTy, "Unexpected non-memref LHS type");
NGRAPH_CHECK(rhsTy, "Unexpected non-memref RHS type");
Type elemTy = resultTy.getElementType();
NGRAPH_CHECK(elemTy == lhsTy.getElementType() && elemTy == rhsTy.getElementType(),
"Types mismatch in MatMulOp");
NGRAPH_CHECK(lhsShape.size() == 2 && rhsShape.size() == 2 && resultShape.size() == 2,
"MatMul operation is only supported for 2D tensors");
opAttrs attrs;
attrs.gemmAttrs2d.transposeA = matmul.transposeA();
attrs.gemmAttrs2d.transposeB = matmul.transposeB();
attrs.gemmAttrs2d.m = lhsShape[0];
attrs.gemmAttrs2d.k = lhsShape[1];
attrs.gemmAttrs2d.n = rhsShape[1];
attrs.gemmAttrs2d.lda = lhsShape[1];
attrs.gemmAttrs2d.ldb = rhsShape[1];
if (matmul.transposeA())
{
attrs.gemmAttrs2d.m = lhsShape[1];
attrs.gemmAttrs2d.k = lhsShape[0];
}
if (matmul.transposeB())
{
attrs.gemmAttrs2d.n = rhsShape[0];
}
attrs.gemmAttrs2d.ldc = attrs.gemmAttrs2d.n;
auto int64Ty = rewriter.getIntegerType(64);
auto unrankedMemrefTy = UnrankedMemRefType::get(elemTy, 0);
auto callBackFunc = pass.getCallDecl(
"__mlir_callback_2_inputs",
{unrankedMemrefTy, unrankedMemrefTy, unrankedMemrefTy, int64Ty, int64Ty},
{},
rewriter);
auto index = pass.insertAttrs(attrs);
auto attrsIndexArg =
rewriter.create<mlir::ConstantIntOp>(rewriter.getUnknownLoc(), index, 64);
auto opTypeArg = rewriter.create<mlir::ConstantIntOp>(
rewriter.getUnknownLoc(), static_cast<int64_t>(OpType::MATMUL), 64);
SmallVector<mlir::Value*, 4> inputs = {lhs, rhs, result};
SmallVector<mlir::Value*, 4> outputs;
castMemRef(inputs, outputs, rewriter, unrankedMemrefTy);
SmallVector<mlir::Value*, 4> args = {
outputs[0], outputs[1], outputs[2], attrsIndexArg, opTypeArg};
rewriter.create<mlir::CallOp>(rewriter.getUnknownLoc(), callBackFunc, args);
rewriter.replaceOp(op, result);
return matchSuccess();
}
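// Quick numeric check of the attribute arithmetic above (illustrative only): for a
// 4x8 LHS and an 8x16 RHS with no transposes, m = 4, k = 8, n = 16, lda = 8,
// ldb = 16 and ldc = n = 16. With transposeA and the LHS stored as 8x4, m and k are
// re-read from the swapped dimensions (m = 4, k = 8) while lda = lhsShape[1] = 4,
// i.e. the stored row length.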
REWRITER(NGGemmOp)
{
auto gemm = cast<NGGemmOp>(op);
auto loc = gemm.getLoc();
// Retrieve/generate Values for operands and result.
ScopedContext scope(rewriter, loc);
Value* lhs = operands[0];
Value* rhs = operands[1];
Value* bias = operands[2];
Value* result = pass.buildOutputDefs(op, rewriter)[0];
NGRAPH_CHECK(lhs && rhs && bias && result, "Unexpected null values in GemmOp");
auto resultTy = result->getType().dyn_cast<MemRefType>();
auto lhsTy = lhs->getType().dyn_cast<MemRefType>();
auto lhsShape = lhsTy.getShape();
auto rhsTy = rhs->getType().dyn_cast<MemRefType>();
auto rhsShape = rhsTy.getShape();
auto biasTy = bias->getType().dyn_cast<MemRefType>();
auto biasShape = biasTy.getShape();
NGRAPH_CHECK(resultTy, "Unexpected non-memref result type");
NGRAPH_CHECK(lhsTy, "Unexpected non-memref LHS type");
NGRAPH_CHECK(rhsTy, "Unexpected non-memref RHS type");
NGRAPH_CHECK(biasTy, "Unexpected non-memref bias type");
Type elemTy = resultTy.getElementType();
NGRAPH_CHECK(elemTy == lhsTy.getElementType() && elemTy == rhsTy.getElementType() &&
elemTy == biasTy.getElementType(),
"Types mismatch in GemmOp");
MemRefView vRes(result), vLhs(lhs), vRhs(rhs), vBias(bias);
NGRAPH_CHECK(vLhs.rank() == 2 && vRhs.rank() == 2 && vRes.rank() == 2 && vBias.rank() <= 2,
"Gemm operation is only supported for 2D tensors");
opAttrs attrs;
attrs.gemmAttrs2d.transposeA = gemm.transA();
attrs.gemmAttrs2d.transposeB = gemm.transB();
attrs.gemmAttrs2d.alpha = gemm.alpha().convertToFloat();
attrs.gemmAttrs2d.beta = gemm.beta().convertToFloat();
attrs.gemmAttrs2d.m = lhsShape[0];
attrs.gemmAttrs2d.k = lhsShape[1];
attrs.gemmAttrs2d.n = rhsShape[1];
attrs.gemmAttrs2d.lda = lhsShape[1];
attrs.gemmAttrs2d.ldb = rhsShape[1];
if (gemm.transA())
{
attrs.gemmAttrs2d.m = lhsShape[1];
attrs.gemmAttrs2d.k = lhsShape[0];
}
if (gemm.transB())
{
attrs.gemmAttrs2d.n = rhsShape[0];
}
attrs.gemmAttrs2d.ldc = attrs.gemmAttrs2d.n;
int broadcastHint = -1;
if (vBias.rank() == 0)
{
// Scalar
broadcastHint = 2;
}
else if (vBias.rank() == 2)
{
if (biasShape[0] == attrs.gemmAttrs2d.m && biasShape[1] == 1)
{
broadcastHint = 1;
}
else if (biasShape[0] == 1 && biasShape[1] == attrs.gemmAttrs2d.n)
{
broadcastHint = 0;
}
else
{
broadcastHint = -1;
}
}
else
{
if (biasShape[0] == attrs.gemmAttrs2d.m)
{
broadcastHint = 1;
}
else if (biasShape[0] == attrs.gemmAttrs2d.n)
{
broadcastHint = 0;
}
}
attrs.gemmAttrs2d.broadcastHint = broadcastHint;
auto int64Ty = rewriter.getIntegerType(64);
auto unrankedMemrefTy = UnrankedMemRefType::get(elemTy, 0);
auto callBackFunc = pass.getCallDecl("__mlir_callback_3_inputs",
{unrankedMemrefTy,
unrankedMemrefTy,
unrankedMemrefTy,
unrankedMemrefTy,
int64Ty,
int64Ty},
{},
rewriter);
auto index = pass.insertAttrs(attrs);
auto attrsIndexArg =
rewriter.create<mlir::ConstantIntOp>(rewriter.getUnknownLoc(), index, 64);
auto opTypeArg = rewriter.create<mlir::ConstantIntOp>(
rewriter.getUnknownLoc(), static_cast<int64_t>(OpType::GEMM), 64);
SmallVector<mlir::Value*, 4> inputs = {lhs, rhs, bias, result};
SmallVector<mlir::Value*, 4> outputs;
castMemRef(inputs, outputs, rewriter, unrankedMemrefTy);
SmallVector<mlir::Value*, 4> args = {
outputs[0], outputs[1], outputs[2], outputs[3], attrsIndexArg, opTypeArg};
rewriter.create<mlir::CallOp>(rewriter.getUnknownLoc(), callBackFunc, args);
rewriter.replaceOp(op, result);
return matchSuccess();
}
REWRITER(NGSoftMaxOp)
{
auto softmax = cast<NGSoftMaxOp>(op);
auto loc = softmax.getLoc();
// Retrieve/generate Values for operands and result.
ScopedContext scope(rewriter, loc);
Value* lhs = operands[0];
Value* result = pass.buildOutputDefs(op, rewriter)[0];
NGRAPH_CHECK(lhs && result, "Unexpected null values in SoftmaxOp");
auto resultTy = result->getType().dyn_cast<MemRefType>();
auto resultShape = resultTy.getShape();
auto lhsTy = lhs->getType().dyn_cast<MemRefType>();
auto lhsShape = lhsTy.getShape();
NGRAPH_CHECK(resultTy, "Unexpected non-memref result type");
NGRAPH_CHECK(lhsTy, "Unexpected non-memref LHS type");
Type elemTy = resultTy.getElementType();
NGRAPH_CHECK(elemTy == lhsTy.getElementType(), "Types mismatch in SoftmaxOp");
NGRAPH_CHECK((lhsShape.size() == 2 && resultShape.size() == 2) ||
(lhsShape.size() == 4 && resultShape.size() == 4),
"MKLDNN Softmax operation is only supported for 2D and 4D tensors");
auto int64Ty = rewriter.getIntegerType(64);
auto unrankedMemrefTy = UnrankedMemRefType::get(elemTy, 0);
auto axes = softmax.axes().getValue();
opAttrs attrs;
attrs.intAttr = axes[0].cast<IntegerAttr>().getInt();
auto index = pass.insertAttrs(attrs);
auto attrsIndexArg =
rewriter.create<mlir::ConstantIntOp>(rewriter.getUnknownLoc(), index, 64);
auto opTypeArg = rewriter.create<mlir::ConstantIntOp>(
rewriter.getUnknownLoc(), static_cast<int64_t>(OpType::SOFTMAX), 64);
FuncOp callBackFunc =
pass.getCallDecl("__mlir_callback_1_input",
{unrankedMemrefTy, unrankedMemrefTy, int64Ty, int64Ty},
{},
rewriter);
SmallVector<mlir::Value*, 4> inputs = {lhs, result};
SmallVector<mlir::Value*, 4> outputs;
castMemRef(inputs, outputs, rewriter, unrankedMemrefTy);
SmallVector<mlir::Value*, 4> args = {outputs[0], outputs[1], attrsIndexArg, opTypeArg};
rewriter.create<mlir::CallOp>(rewriter.getUnknownLoc(), callBackFunc, args);
rewriter.replaceOp(op, result);
return matchSuccess();
}
#undef REWRITER
/// End of pattern matchers
template <typename OP>
......@@ -1458,6 +1870,103 @@ namespace
rewriter.replaceOp(op, result);
}
template <typename OP>
void lowerPooling(Operation* op,
ArrayRef<Value*> operands,
PatternRewriter& rewriter,
DialectLoweringPass& pass)
{
auto pooling = cast<OP>(op);
auto loc = pooling.getLoc();
// Retrieve/generate Values for operands and result.
ScopedContext scope(rewriter, loc);
Value* lhs = operands[0];
ArrayRef<Attribute> windowShape = pooling.windowShape().getValue();
ArrayRef<Attribute> windowStrides = pooling.windowMovementStrides().getValue();
ArrayRef<Attribute> padBelow = pooling.padBelow().getValue();
ArrayRef<Attribute> padAbove = pooling.padAbove().getValue();
Value* result = pass.buildOutputDefs(op, rewriter)[0];
NGRAPH_CHECK(lhs && result, "Unexpected null values in Pooling Op");
auto resultTy = result->getType().dyn_cast<MemRefType>();
auto resultShape = resultTy.getShape();
auto lhsTy = lhs->getType().dyn_cast<MemRefType>();
auto lhsShape = lhsTy.getShape();
NGRAPH_CHECK(resultTy, "Unexpected non-memref result type");
NGRAPH_CHECK(lhsTy, "Unexpected non-memref LHS type");
Type elemTy = resultTy.getElementType();
NGRAPH_CHECK(elemTy == lhsTy.getElementType(), "Types mismatch in Pooling");
NGRAPH_CHECK((lhsShape.size() == 4 && resultShape.size() == 4) ||
(lhsShape.size() == 5 && resultShape.size() == 5),
"MKLDNN pooling operation is only supported for 3D and 5D tensors");
auto int64Ty = rewriter.getIntegerType(64);
OpType ty;
bool includePadding = false;
if (auto avgPool = dyn_cast<NGAvgPoolOp>(op))
{
ty = OpType::AVGPOOL;
includePadding = avgPool.includePadding();
}
else if (auto avgPoolBprop = dyn_cast<NGAvgPoolBackpropOp>(op))
{
ty = OpType::AVGPOOLBACKPROP;
includePadding = avgPoolBprop.includePadding();
}
else if (isa<NGMaxPoolOp>(op))
{
ty = OpType::MAXPOOL;
}
auto unrankedMemrefTy = UnrankedMemRefType::get(elemTy, 0);
SmallVector<mlir::Value*, 4> inputs = {lhs, result};
SmallVector<mlir::Value*, 4> outputs;
castMemRef(inputs, outputs, rewriter, unrankedMemrefTy);
FuncOp callBackFunc =
pass.getCallDecl("__mlir_callback_1_input",
{unrankedMemrefTy, unrankedMemrefTy, int64Ty, int64Ty},
{},
rewriter);
opAttrs attrs;
if (lhsShape.size() == 4)
{
attrs.poolAttrs2d.includePaddingInAvgComputation = includePadding;
for (auto i = 0; i < 2; i++)
{
attrs.poolAttrs2d.windowShape[i] = windowShape[i].cast<IntegerAttr>().getInt();
attrs.poolAttrs2d.windowStrides[i] = windowStrides[i].cast<IntegerAttr>().getInt();
attrs.poolAttrs2d.padBelow[i] = padBelow[i].cast<IntegerAttr>().getInt();
attrs.poolAttrs2d.padAbove[i] = padAbove[i].cast<IntegerAttr>().getInt();
}
}
else if (lhsShape.size() == 5)
{
attrs.poolAttrs3d.includePaddingInAvgComputation = includePadding;
for (auto i = 0; i < 3; i++)
{
attrs.poolAttrs3d.windowShape[i] = windowShape[i].cast<IntegerAttr>().getInt();
attrs.poolAttrs3d.windowStrides[i] = windowStrides[i].cast<IntegerAttr>().getInt();
attrs.poolAttrs3d.padBelow[i] = padBelow[i].cast<IntegerAttr>().getInt();
attrs.poolAttrs3d.padAbove[i] = padAbove[i].cast<IntegerAttr>().getInt();
}
}
auto index = pass.insertAttrs(attrs);
auto attrsIndexArg =
rewriter.create<mlir::ConstantIntOp>(rewriter.getUnknownLoc(), index, 64);
auto opTypeArg = rewriter.create<mlir::ConstantIntOp>(
rewriter.getUnknownLoc(), static_cast<int64_t>(ty), 64);
SmallVector<mlir::Value*, 4> args = {outputs[0], outputs[1], attrsIndexArg, opTypeArg};
rewriter.create<mlir::CallOp>(rewriter.getUnknownLoc(), callBackFunc, args);
rewriter.replaceOp(op, result);
}
ValueHandle createZeroConstant(mlir::Type type)
{
if (auto floatTy = type.dyn_cast<FloatType>())
......
......@@ -27,22 +27,29 @@
MLIR_OP(NGAddOp , true )
MLIR_OP(NGArgMaxRedOp , false )
MLIR_OP(NGArgMinRedOp , false )
MLIR_OP(NGAvgPoolOp , false )
MLIR_OP(NGAvgPoolBackpropOp , false )
MLIR_OP(NGConcatOp , true )
MLIR_OP(NGConvolutionOp , false )
MLIR_OP(NGDivOp , true )
MLIR_OP(NGDotOp , false )
MLIR_OP(NGGatherOp , false )
MLIR_OP(NGGemmOp , false )
MLIR_OP(NGGreaterOp , true )
MLIR_OP(NGLessOp , true )
MLIR_OP(NGGreaterEqOp , true )
MLIR_OP(NGLessEqOp , true )
MLIR_OP(NGEqOp , true )
MLIR_OP(NGNotEqOp , true )
MLIR_OP(NGMatMulOp , false )
MLIR_OP(NGMulOp , true )
MLIR_OP(NGMaxOp , true )
MLIR_OP(NGMaxPoolOp , false )
MLIR_OP(NGMaxPoolBackpropOp , false )
MLIR_OP(NGMinOp , true )
MLIR_OP(NGNegOp , true )
MLIR_OP(NGReluOp , true )
MLIR_OP(NGSoftMaxOp , false )
MLIR_OP(NGSubOp , true )
MLIR_LAST_OP(NGReturnOp , false )
......
......@@ -28,24 +28,7 @@
#include "ngraph/descriptor/tensor.hpp"
#include "ngraph/graph_util.hpp"
#include "ngraph/node.hpp"
#include "ngraph/op/add.hpp"
#include "ngraph/op/argmax.hpp"
#include "ngraph/op/argmin.hpp"
#include "ngraph/op/concat.hpp"
#include "ngraph/op/convolution.hpp"
#include "ngraph/op/divide.hpp"
#include "ngraph/op/dot.hpp"
#include "ngraph/op/experimental/compiled_kernel.hpp"
#include "ngraph/op/gather.hpp"
#include "ngraph/op/greater.hpp"
#include "ngraph/op/less.hpp"
#include "ngraph/op/maximum.hpp"
#include "ngraph/op/minimum.hpp"
#include "ngraph/op/multiply.hpp"
#include "ngraph/op/negative.hpp"
#include "ngraph/op/relu.hpp"
#include "ngraph/op/subtract.hpp"
#include "ngraph/op/util/index_reduction.hpp"
#include "ngraph/ops.hpp"
#include "ngraph/type/element_type.hpp"
#include "contrib/mlir/utils.hpp"
......
......@@ -282,7 +282,7 @@ def NGMVN :
}
// MatMul Op
def NGMatMul :
def NGMatMulOp :
NG_OneResult_Op<"matmul", [NoSideEffect, DeclareOpInterfaceMethods<FusedOp>]>,
Arguments<(ins NG_TensorType:$A, NG_TensorType:$B,
DefaultValuedAttr<BoolAttr, "false">:$transposeA,
......
......@@ -309,6 +309,55 @@ mlir::LogicalResult verifyOp(NGConvolutionOp* op)
return mlir::success();
}
template <>
mlir::LogicalResult verifyOp(NGMatMulOp* op)
{
// TODO(ayzhuang): Improve verification: proper shapes, etc.
return mlir::success();
}
template <>
mlir::LogicalResult verifyOp(NGGemmOp* op)
{
// TODO(ayzhuang): Improve verification: proper shapes, etc.
return mlir::success();
}
template <>
mlir::LogicalResult verifyOp(NGSoftMaxOp* op)
{
// TODO(ayzhuang): Improve verification: proper shapes, etc.
return mlir::success();
}
template <>
mlir::LogicalResult verifyOp(NGAvgPoolOp* op)
{
// TODO(ayzhuang): Improve verification: proper shapes, etc.
return mlir::success();
}
template <>
mlir::LogicalResult verifyOp(NGAvgPoolBackpropOp* op)
{
// TODO(ayzhuang): Improve verification: proper shapes, etc.
return mlir::success();
}
template <>
mlir::LogicalResult verifyOp(NGMaxPoolOp* op)
{
// TODO(ayzhuang): Improve verification: proper shapes, etc.
return mlir::success();
}
template <>
mlir::LogicalResult verifyOp(NGMaxPoolBackpropOp* op)
{
// TODO(ayzhuang): Improve verification: proper shapes, etc.
return mlir::success();
}
namespace mlir
{
#include "ops_interfaces.cpp.inc"
......@@ -401,7 +450,7 @@ void mlir::NGLSTMCellOp::decompose()
void mlir::NGLSTMSequenceOp::decompose()
{
}
void mlir::NGMatMul::decompose()
void mlir::NGMatMulOp::decompose()
{
}
void mlir::NGLayerNormOp::decompose()
......
......@@ -252,8 +252,8 @@ def NGAvgPoolOp :
}
// AvgPool for back prop
def NGAvgPoolBackPropOp :
NG_OneResult_Op<"avgPoolBackProp", [NoSideEffect, OpVersion0]>,
def NGAvgPoolBackpropOp :
NG_OneResult_Op<"avgPoolBackprop", [NoSideEffect, OpVersion0]>,
Arguments<(ins I64ArrayAttr :$forwardArgShape,
NG_TensorType :$delta,
I64ArrayAttr :$windowShape,
......@@ -455,11 +455,10 @@ def NGMaxPoolOp :
}
// MaxPool for back prop
def NGMaxPoolBackPropOp :
NG_OneResult_Op<"maxPoolBackProp", [NoSideEffect, OpVersion0]>,
def NGMaxPoolBackpropOp :
NG_OneResult_Op<"maxPoolBackprop", [NoSideEffect, OpVersion0]>,
Arguments<(ins NG_TensorType :$argForward,
NG_TensorType :$delta,
NG_TensorType :$resultForward,
I64ArrayAttr :$windowShape,
I64ArrayAttr :$windowMovementStrides,
I64ArrayAttr :$padBelow,
......@@ -473,24 +472,7 @@ def NGMaxPoolBackPropOp :
let parser = [{ NGRAPH_CHECK(false, "No parser support"); return mlir::failure(); }];
let verifier = [{ return verifyOp(this); }];
let builders = [
// Builder without resultForward
OpBuilder<
"Builder *builder, OperationState &tblgen_state, Type res, "
"Value *argForward, Value *delta, "
"ArrayAttr windowShape, ArrayAttr windowMovementStrides, "
"ArrayAttr padBelow, ArrayAttr padAbove", [{
tblgen_state.addOperands(argForward);
tblgen_state.addOperands(delta);
tblgen_state.addOperands(nullptr);
tblgen_state.addAttribute("windowShape", windowShape);
tblgen_state.addAttribute("windowMovementStrides", windowMovementStrides);
tblgen_state.addAttribute("padBelow", padBelow);
tblgen_state.addAttribute("padAbove", padAbove);
tblgen_state.addTypes(res);
}]>
];
let extraClassDeclaration = [{
void setWindowShape(const ArrayAttr& arrayAttr) { this->setAttr("windowShape", arrayAttr); }
void setWindowMovementStrides(const ArrayAttr& arrayAttr) { this->setAttr("windowMovementStrides", arrayAttr);}
......
......@@ -6,23 +6,31 @@
MLIR_OP(Add)
MLIR_OP(ArgMin)
MLIR_OP(ArgMax)
MLIR_OP(AvgPool)
MLIR_OP(AvgPoolBackprop)
MLIR_OP(Divide)
MLIR_OP(Dot)
MLIR_OP(Concat)
MLIR_OP(Convolution)
MLIR_OP(Gather)
MLIR_OP(Gemm)
MLIR_OP(Greater)
MLIR_OP(Less)
MLIR_OP(GreaterEq)
MLIR_OP(LessEq)
MLIR_OP(Equal)
MLIR_OP(NotEqual)
MLIR_OP(MatMul)
MLIR_OP(Maximum)
MLIR_OP(MaxPool)
MLIR_OP(MaxPoolBackprop)
MLIR_OP(Minimum)
MLIR_OP(Multiply)
MLIR_OP(Negative)
MLIR_OP(Softmax)
MLIR_OP(Subtract)
MLIR_OP(Relu)
// Add new supported ops here
#undef MLIR_OP
......@@ -21,28 +21,7 @@
#include "ngraph/assertion.hpp"
#include "ngraph/graph_util.hpp"
#include "ngraph/node.hpp"
#include "ngraph/op/add.hpp"
#include "ngraph/op/argmax.hpp"
#include "ngraph/op/argmin.hpp"
#include "ngraph/op/concat.hpp"
#include "ngraph/op/convolution.hpp"
#include "ngraph/op/divide.hpp"
#include "ngraph/op/dot.hpp"
#include "ngraph/op/equal.hpp"
#include "ngraph/op/experimental/compiled_kernel.hpp"
#include "ngraph/op/gather.hpp"
#include "ngraph/op/get_output_element.hpp"
#include "ngraph/op/greater.hpp"
#include "ngraph/op/greater_eq.hpp"
#include "ngraph/op/less.hpp"
#include "ngraph/op/less_eq.hpp"
#include "ngraph/op/maximum.hpp"
#include "ngraph/op/minimum.hpp"
#include "ngraph/op/multiply.hpp"
#include "ngraph/op/negative.hpp"
#include "ngraph/op/not_equal.hpp"
#include "ngraph/op/relu.hpp"
#include "ngraph/op/subtract.hpp"
#include "ngraph/ops.hpp"
using namespace ngraph::descriptor;
using namespace ngraph::op;
......@@ -498,6 +477,104 @@ bool MLIRSubgraphExtractionPass::is_supported_mlir_op(std::shared_ptr<Node> node
std::all_of(window_dilation.begin(), window_dilation.end(), is_one);
}
// MKLDNN only supports softmax across a single axis
if (TI(ngraph::op::Softmax) == TI(*node))
{
// Softmax is only supported through callback
if (std::getenv("NGRAPH_MLIR_CALLBACK") == nullptr)
{
return false;
}
auto softmax = static_cast<ngraph::op::Softmax*>(node.get());
auto arg0_shape = node->get_input_shape(0);
auto arg0_rank = arg0_shape.size();
return (arg0_rank == 4 || arg0_rank == 2) &&
node->get_input_element_type(0) == element::f32 && softmax->get_axes().size() == 1;
}
if (TI(ngraph::op::AvgPool) == TI(*node))
{
// AvgPool is only supported through callback
if (std::getenv("NGRAPH_MLIR_CALLBACK") == nullptr)
{
return false;
}
auto avg_pool = static_cast<ngraph::op::AvgPool*>(node.get());
auto arg0_shape = node->get_input_shape(0);
auto arg0_rank = arg0_shape.size();
return ((arg0_rank == 4 && avg_pool->get_window_shape().size() == 2) ||
(arg0_rank == 5 && avg_pool->get_window_shape().size() == 3)) &&
node->get_input_element_type(0) == element::f32;
}
if (TI(ngraph::op::AvgPoolBackprop) == TI(*node))
{
// AvgPoolBackprop is only supported through callback
if (std::getenv("NGRAPH_MLIR_CALLBACK") == nullptr)
{
return false;
}
auto avg_pool_backprop = static_cast<ngraph::op::AvgPoolBackprop*>(node.get());
auto arg0_shape = node->get_input_shape(0);
auto arg0_rank = arg0_shape.size();
return ((arg0_rank == 4 && avg_pool_backprop->get_window_shape().size() == 2) ||
(arg0_rank == 5 && avg_pool_backprop->get_window_shape().size() == 3)) &&
node->get_input_element_type(0) == element::f32;
}
if (TI(ngraph::op::MaxPoolBackprop) == TI(*node))
{
// MaxPoolBackprop is only supported through callback
if (std::getenv("NGRAPH_MLIR_CALLBACK") == nullptr)
{
return false;
}
auto max_pool_backprop = static_cast<ngraph::op::MaxPoolBackprop*>(node.get());
auto arg0_shape = node->get_input_shape(0);
auto arg0_rank = arg0_shape.size();
return ((arg0_rank == 4 && max_pool_backprop->get_window_shape().size() == 2) ||
(arg0_rank == 5 && max_pool_backprop->get_window_shape().size() == 3)) &&
node->get_input_element_type(0) == element::f32;
}
if (TI(ngraph::op::MaxPool) == TI(*node))
{
// MaxPool is only supported through callback
if (std::getenv("NGRAPH_MLIR_CALLBACK") == nullptr)
{
return false;
}
auto max_pool = static_cast<ngraph::op::MaxPool*>(node.get());
auto arg0_shape = node->get_input_shape(0);
auto arg0_rank = arg0_shape.size();
return ((arg0_rank == 4 && max_pool->get_window_shape().size() == 2) ||
(arg0_rank == 5 && max_pool->get_window_shape().size() == 3)) &&
node->get_input_element_type(0) == element::f32;
}
if (TI(ngraph::op::MatMul) == TI(*node))
{
// MatMul is only supported through callback
if (std::getenv("NGRAPH_MLIR_CALLBACK") == nullptr)
{
return false;
}
}
if (TI(ngraph::op::Gemm) == TI(*node))
{
// Gemm is only supported through callback
if (std::getenv("NGRAPH_MLIR_CALLBACK") == nullptr)
{
return false;
}
}
return true;
}
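// The new checks above all share one gate; a minimal sketch of it factored into a
// helper (the helper name is hypothetical, the variable name comes from this change):
#include <cstdlib>
// Hypothetical helper: callback-only ops are admitted into an MLIR subgraph
// only when NGRAPH_MLIR_CALLBACK is set in the environment.
static bool mlir_callbacks_enabled()
{
    return std::getenv("NGRAPH_MLIR_CALLBACK") != nullptr;
}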
......
......@@ -26,28 +26,7 @@
#include "ngraph/descriptor/tensor.hpp"
#include "ngraph/graph_util.hpp"
#include "ngraph/node.hpp"
#include "ngraph/op/add.hpp"
#include "ngraph/op/argmax.hpp"
#include "ngraph/op/argmin.hpp"
#include "ngraph/op/concat.hpp"
#include "ngraph/op/convolution.hpp"
#include "ngraph/op/divide.hpp"
#include "ngraph/op/dot.hpp"
#include "ngraph/op/equal.hpp"
#include "ngraph/op/experimental/compiled_kernel.hpp"
#include "ngraph/op/gather.hpp"
#include "ngraph/op/greater.hpp"
#include "ngraph/op/greater_eq.hpp"
#include "ngraph/op/less.hpp"
#include "ngraph/op/less_eq.hpp"
#include "ngraph/op/maximum.hpp"
#include "ngraph/op/minimum.hpp"
#include "ngraph/op/multiply.hpp"
#include "ngraph/op/negative.hpp"
#include "ngraph/op/not_equal.hpp"
#include "ngraph/op/relu.hpp"
#include "ngraph/op/subtract.hpp"
#include "ngraph/op/util/index_reduction.hpp"
#include "ngraph/ops.hpp"
#include "ngraph/type/element_type.hpp"
// Defines a new LLVM debug type for this file to be used by LLVM_DEBUG macro.
......@@ -117,8 +96,9 @@ namespace
// Generic op lowerer to ng dialect.
// Simply maps ngraph tensors to values and generate an OP. No op-specific logic.
// Use inNum when the mlir OP needs fewer inputs than its corresponding ngraph OP.
template <typename Op>
mlir::Operation* createGenericOp(const ngraph::Node* ngNode);
mlir::Operation* createGenericOp(const ngraph::Node* ngNode, int inNum = -1);
template <typename RedOp>
mlir::Operation* createIndexReduction(const ngraph::Node* ngNode);
......@@ -133,6 +113,9 @@ namespace
template <typename T>
mlir::ArrayAttr getShapeAsAttr(T ngShape);
/// Return the real input node corresponding to the fake node
ngraph::Node* getOriginArg(ngraph::Node* node) const;
private:
// Sub-graph to be compiled and executed with MLIR.
const ngraph::op::CompiledKernel* m_compiledKernel;
......@@ -220,6 +203,14 @@ mlir::ArrayAttr NgDialectConversionPass::getShapeAsAttr(T ngShape)
return m_builder.getI64ArrayAttr(mlirShape);
}
ngraph::Node* NgDialectConversionPass::getOriginArg(ngraph::Node* node) const
{
auto inputMap = m_compiledKernel->get_input_map();
auto it = inputMap.find(node->shared_from_this());
NGRAPH_CHECK(it != inputMap.end(), "Parameter not in CK input map");
return m_compiledKernel->input_values().at(it->second).get_node();
}
// Converts an nGraph Tensor into an MLIR tensor type, including the conversion of the Tensor's
// element type.
mlir::Type NgDialectConversionPass::getMlirType(const descriptor::Tensor* tensor)
......@@ -464,17 +455,157 @@ mlir::Operation* NgDialectConversionPass::COMPILE_OP_DECL(ngraph::op::Convolutio
return op;
}
template <>
mlir::Operation* NgDialectConversionPass::COMPILE_OP_DECL(ngraph::op::AvgPool)
{
mlir::Operation* op = NgDialectObj.createGenericOp<mlir::NGAvgPoolOp>(ngNode);
auto avgPoolNode = static_cast<const ngraph::op::AvgPool*>(ngNode);
auto avgPoolOp = llvm::cast<mlir::NGAvgPoolOp>(op);
mlir::BoolAttr boolAttr =
NgDialectObj.m_builder.getBoolAttr(avgPoolNode->get_include_padding_in_avg_computation());
avgPoolOp.setIncludePadding(boolAttr);
mlir::ArrayAttr attr = NgDialectObj.getShapeAsAttr(avgPoolNode->get_window_shape());
avgPoolOp.setWindowShape(attr);
attr = NgDialectObj.getShapeAsAttr(avgPoolNode->get_window_movement_strides());
avgPoolOp.setWindowMovementStrides(attr);
attr = NgDialectObj.getShapeAsAttr(avgPoolNode->get_padding_below());
avgPoolOp.setPadBelow(attr);
attr = NgDialectObj.getShapeAsAttr(avgPoolNode->get_padding_above());
avgPoolOp.setPadAbove(attr);
return op;
}
template <>
mlir::Operation* NgDialectConversionPass::COMPILE_OP_DECL(ngraph::op::AvgPoolBackprop)
{
mlir::Operation* op = NgDialectObj.createGenericOp<mlir::NGAvgPoolBackpropOp>(ngNode);
auto avgPoolBackpropNode = static_cast<const ngraph::op::AvgPoolBackprop*>(ngNode);
auto avgPoolBackpropOp = llvm::cast<mlir::NGAvgPoolBackpropOp>(op);
mlir::BoolAttr boolAttr = NgDialectObj.m_builder.getBoolAttr(
avgPoolBackpropNode->get_include_padding_in_avg_computation());
avgPoolBackpropOp.setIncludePadding(boolAttr);
mlir::ArrayAttr attr = NgDialectObj.getShapeAsAttr(avgPoolBackpropNode->get_window_shape());
avgPoolBackpropOp.setWindowShape(attr);
attr = NgDialectObj.getShapeAsAttr(avgPoolBackpropNode->get_window_movement_strides());
avgPoolBackpropOp.setWindowMovementStrides(attr);
attr = NgDialectObj.getShapeAsAttr(avgPoolBackpropNode->get_padding_below());
avgPoolBackpropOp.setPadBelow(attr);
attr = NgDialectObj.getShapeAsAttr(avgPoolBackpropNode->get_padding_above());
avgPoolBackpropOp.setPadAbove(attr);
attr = NgDialectObj.getShapeAsAttr(avgPoolBackpropNode->get_forward_arg_shape());
avgPoolBackpropOp.setForwardArgShape(attr);
return op;
}
template <>
mlir::Operation* NgDialectConversionPass::COMPILE_OP_DECL(ngraph::op::MaxPool)
{
mlir::Operation* op = NgDialectObj.createGenericOp<mlir::NGMaxPoolOp>(ngNode);
auto maxPoolNode = static_cast<const ngraph::op::MaxPool*>(ngNode);
auto maxPoolOp = llvm::cast<mlir::NGMaxPoolOp>(op);
mlir::ArrayAttr attr = NgDialectObj.getShapeAsAttr(maxPoolNode->get_window_shape());
maxPoolOp.setWindowShape(attr);
attr = NgDialectObj.getShapeAsAttr(maxPoolNode->get_window_movement_strides());
maxPoolOp.setWindowMovementStrides(attr);
attr = NgDialectObj.getShapeAsAttr(maxPoolNode->get_padding_below());
maxPoolOp.setPadBelow(attr);
attr = NgDialectObj.getShapeAsAttr(maxPoolNode->get_padding_above());
maxPoolOp.setPadAbove(attr);
return op;
}
template <>
mlir::Operation* NgDialectConversionPass::COMPILE_OP_DECL(ngraph::op::MaxPoolBackprop)
{
mlir::Operation* op = NgDialectObj.createGenericOp<mlir::NGMaxPoolBackpropOp>(ngNode, 2);
auto maxPoolBackpropNode = static_cast<const ngraph::op::MaxPoolBackprop*>(ngNode);
auto maxPoolBackpropOp = llvm::cast<mlir::NGMaxPoolBackpropOp>(op);
mlir::ArrayAttr attr = NgDialectObj.getShapeAsAttr(maxPoolBackpropNode->get_window_shape());
maxPoolBackpropOp.setWindowShape(attr);
attr = NgDialectObj.getShapeAsAttr(maxPoolBackpropNode->get_window_movement_strides());
maxPoolBackpropOp.setWindowMovementStrides(attr);
attr = NgDialectObj.getShapeAsAttr(maxPoolBackpropNode->get_padding_below());
maxPoolBackpropOp.setPadBelow(attr);
attr = NgDialectObj.getShapeAsAttr(maxPoolBackpropNode->get_padding_above());
maxPoolBackpropOp.setPadAbove(attr);
return op;
}
template <>
mlir::Operation* NgDialectConversionPass::COMPILE_OP_DECL(ngraph::op::MatMul)
{
auto matmulNode = static_cast<const ngraph::op::MatMul*>(ngNode);
auto op = NgDialectObj.createGenericOp<mlir::NGMatMulOp>(ngNode);
auto matmulOp = llvm::cast<mlir::NGMatMulOp>(op);
matmulOp.setTransposeA(NgDialectObj.m_builder.getBoolAttr(matmulNode->get_transpose_a()));
matmulOp.setTransposeB(NgDialectObj.m_builder.getBoolAttr(matmulNode->get_transpose_b()));
return op;
}
template <>
mlir::Operation* NgDialectConversionPass::COMPILE_OP_DECL(ngraph::op::Gemm)
{
auto gemmNode = static_cast<const ngraph::op::Gemm*>(ngNode);
auto op = NgDialectObj.createGenericOp<mlir::NGGemmOp>(ngNode);
auto gemmOp = llvm::cast<mlir::NGGemmOp>(op);
gemmOp.setTransA(NgDialectObj.m_builder.getBoolAttr(gemmNode->get_transA()));
gemmOp.setTransB(NgDialectObj.m_builder.getBoolAttr(gemmNode->get_transB()));
gemmOp.setAlpha(NgDialectObj.m_builder.getF32FloatAttr(gemmNode->get_alpha()));
gemmOp.setBeta(NgDialectObj.m_builder.getF32FloatAttr(gemmNode->get_beta()));
return op;
}
template <>
mlir::Operation* NgDialectConversionPass::COMPILE_OP_DECL(ngraph::op::Softmax)
{
mlir::Operation* op = NgDialectObj.createGenericOp<mlir::NGSoftMaxOp>(ngNode, 1);
auto softmaxNode = static_cast<const ngraph::op::Softmax*>(ngNode);
auto softmaxOp = llvm::cast<mlir::NGSoftMaxOp>(op);
auto originArg = NgDialectObj.getOriginArg(ngNode->input_value(1).get_node());
auto const_op = static_cast<ngraph::op::Constant*>(originArg);
AxisSet axes = const_op->get_axis_set_val();
mlir::ArrayAttr attr = NgDialectObj.getShapeAsAttr(axes);
softmaxOp.setAxes(attr);
return op;
}
template <typename Op>
mlir::Operation* NgDialectConversionPass::createGenericOp(const ngraph::Node* ngNode)
mlir::Operation* NgDialectConversionPass::createGenericOp(const ngraph::Node* ngNode, int inNum)
{
std::vector<mlir::Value*> argValues;
std::vector<mlir::Type> resTypes;
auto inputMap = m_compiledKernel->get_input_map();
std::shared_ptr<descriptor::Tensor> argTensor;
int i = 0;
for (auto& argOutput : ngNode->input_values())
{
if (inNum != -1 && i == inNum)
{
break;
}
auto argOutputNode = argOutput.get_node();
if (as_type<op::Parameter>(argOutputNode))
if (is_type<op::Parameter>(argOutputNode))
{
auto it = inputMap.find(argOutputNode->shared_from_this());
NGRAPH_CHECK(it != inputMap.end(), "Parameter not in CK input map");
......@@ -488,6 +619,7 @@ mlir::Operation* NgDialectConversionPass::createGenericOp(const ngraph::Node* ng
auto argV = getTensorValue(argTensor.get()).m_value;
argValues.push_back(argV);
i++;
}
for (auto& output : ngNode->outputs())
......
//*****************************************************************************
// Copyright 2017-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#pragma once
#include <cstdint>
namespace ngraph
{
namespace runtime
{
namespace ngmlir
{
// The OpType enum class is used for callbacks.
// We pass an OpType to the generic callback functions,
// which call the real implementation based on it.
// TODO: remove the entries that are not needed once all callbacks are implemented.
enum class OpType
{
ADD = 0,
AVGPOOL,
AVGPOOLBACKPROP,
BATCHNORM3ARGS,
BATCHNORM5ARGS,
BATCHNORMBACKPROP,
BOUNDEDRELU,
CONCAT,
CONVERTLAYOUT,
CONVOLUTION,
CONVOLUTIONRELU,
CONVOLUTIONADD,
CONVOLUTIONBIAS,
CONVOLUTIONBIASADD,
CONVOLUTIONBACKPROPDATA,
CONVOLUTIONBACKPROPWEIGHTS,
CONVOLUTIONBACKPROPWEIGHTSBIAS,
GELU,
GELUBACKPROP,
GEMM,
GROUPCONVOLUTION,
GROUPCONVOLUTIONBIAS,
DECONVOLUTIONBIAS,
LEAKYRELU,
LRN,
LSTM,
MATMUL,
MAXPOOL,
MAXPOOLBACKPROP,
MAXPOOLBACKPROPFORWARD,
MAXPOOLBACKPROPBACKWARD,
MAXPOOLWITHINDICES,
MAXPOOLWITHINDICESBACKPROP,
QUANTIZE,
DEQUANTIZE,
QUANTIZEDAVGPOOL,
QUANTIZEDMAXPOOL,
QUANTIZEDCONCAT,
QUANTIZEDDOTBIAS,
QUANTIZEDMATMUL,
QUANTIZEDCONVOLUTION,
QUANTIZEDCONVOLUTIONBIAS,
QUANTIZEDCONVOLUTIONBIASADD,
QUANTIZEDCONVOLUTIONBIASSIGNEDADD,
QUANTIZEDCONVOLUTIONRELU,
RELU,
RELUBACKPROP,
RNN,
SIGMOID,
SIGMOIDBACKPROP,
SLICE,
SOFTMAX
};
// These structs and union are used to pass attributes to callbacks.
template <int N>
struct poolAttrs
{
bool includePaddingInAvgComputation;
int64_t windowShape[N];
int64_t windowStrides[N];
int64_t padBelow[N];
int64_t padAbove[N];
};
struct gemmAttrs
{
bool transposeA;
bool transposeB;
int64_t m;
int64_t n;
int64_t k;
int64_t lda;
int64_t ldb;
int64_t ldc;
float alpha;
float beta;
int64_t broadcastHint;
};
union opAttrs {
int intAttr;
poolAttrs<2> poolAttrs2d;
poolAttrs<3> poolAttrs3d;
gemmAttrs gemmAttrs2d;
};
} // namespace ngmlir
} // namespace runtime
} // namespace ngraph
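// A minimal sketch of how the pieces above are consumed at runtime, assuming the
// global attribute vector populated by the lowering pass (the dispatcher below is
// hypothetical; the real generic callbacks are defined in cpu_callbacks.cpp):
#include <vector>
extern std::vector<ngraph::runtime::ngmlir::opAttrs> opAttrsVec;
static void dispatch_sketch(size_t attrsIndex, ngraph::runtime::ngmlir::OpType type)
{
    using namespace ngraph::runtime::ngmlir;
    const opAttrs& attrs = opAttrsVec[attrsIndex];
    switch (type)
    {
    case OpType::SOFTMAX: /* attrs.intAttr is the softmax axis */ break;
    case OpType::MATMUL:
    case OpType::GEMM: /* attrs.gemmAttrs2d carries m, n, k, ld*, alpha, beta */ break;
    case OpType::AVGPOOL:
    case OpType::MAXPOOL: /* attrs.poolAttrs2d or poolAttrs3d carries window, strides, pads */ break;
    default: break;
    }
}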
//*****************************************************************************
// Copyright 2017-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
// NOTE: This file follows nGraph format style.
// Follows nGraph naming convention for public APIs only, else MLIR naming convention.
#include "callback_utils.hpp"
#include "contrib/mlir/backend/cpu/cpu_backend.hpp"
#include "cpu_runtime.hpp"
#include "ngraph/check.hpp"
#include "ngraph/runtime/cpu/cpu_kernels.hpp"
#include "ngraph/runtime/cpu/mkldnn_utils.hpp"
using namespace ngraph;
using namespace ngraph::runtime::ngmlir;
extern std::vector<opAttrs> opAttrsVec;
static inline opAttrs getAttrs(size_t index)
{
return opAttrsVec[index];
}
static inline bool compare_mkldnn_dims(mkldnn_dims_t& arr1, mkldnn_dims_t& arr2, size_t size)
{
for (auto i = 0; i < size; i++)
{
if (arr1[i] != arr2[i])
{
return false;
}
}
return true;
}
static bool
compare_mkldnn_strides_order(mkldnn_dims_t& strides1, mkldnn_dims_t& strides2, size_t size)
{
std::vector<size_t> indices1(size, 0), indices2(size, 0);
for (size_t i = 0; i < size; i++)
{
indices1[i] = i;
indices2[i] = i;
}
std::sort(indices1.begin(), indices1.end(), [&](const size_t& n1, const size_t& n2) {
return strides1[n1] < strides1[n2];
});
std::sort(indices2.begin(), indices2.end(), [&](const size_t& n1, const size_t& n2) {
return strides2[n1] < strides2[n2];
});
for (auto i = 0; i < size; i++)
{
if (indices1[i] != indices2[i])
{
return false;
}
}
return true;
}
static bool compare_mkldnn_md_formats(const mkldnn::memory::desc& lhs,
const mkldnn::memory::desc& rhs)
{
mkldnn_memory_desc_t md1 = lhs.data, md2 = rhs.data;
if (md1.format_kind != md2.format_kind)
{
return false;
}
if (md1.format_kind != static_cast<mkldnn_format_kind_t>(mkldnn::memory::format_kind::blocked))
{
// mkldnn not implemented yet
return false;
}
if (md1.ndims != md2.ndims)
{
return false;
}
auto blk1 = md1.format_desc.blocking;
auto blk2 = md2.format_desc.blocking;
if (blk1.inner_nblks != blk2.inner_nblks ||
!compare_mkldnn_dims(blk1.inner_blks, blk2.inner_blks, blk1.inner_nblks) ||
!compare_mkldnn_dims(blk1.inner_idxs, blk2.inner_idxs, blk1.inner_nblks))
{
return false;
}
return compare_mkldnn_strides_order(blk1.strides, blk2.strides, md1.ndims);
}
static mkldnn::memory convert_layout_if_diff(const mkldnn::memory::desc& lhs,
const mkldnn::memory::desc& rhs,
void* ptr,
mkldnn::engine cpu_engine)
{
if (!compare_mkldnn_md_formats(lhs, rhs))
{
mkldnn::memory reorder_in = {lhs, cpu_engine, ptr};
mkldnn::memory reorder_out = {rhs, cpu_engine};
mkldnn::reorder convert(reorder_in, reorder_out);
std::unordered_map<int, mkldnn::memory> exec_args = {{MKLDNN_ARG_SRC, reorder_in},
{MKLDNN_ARG_DST, reorder_out}};
mkldnn::stream s(cpu_engine);
try
{
convert.execute(s, exec_args);
s.wait();
}
catch (const mkldnn::error& e)
{
throw ngraph_error("Could not run mkdnn primitive " + std::string(e.message));
}
return reorder_out;
}
else
{
return mkldnn::memory{lhs, cpu_engine, ptr};
}
}
static void convert_output_layout(const mkldnn::memory::desc& lhs,
const mkldnn::memory::desc& rhs,
void* ptr,
mkldnn::engine cpu_engine)
{
mkldnn::memory reorder_in = {rhs, cpu_engine};
mkldnn::memory reorder_out = {lhs, cpu_engine, ptr};
mkldnn::reorder convert(reorder_in, reorder_out);
std::unordered_map<int, mkldnn::memory> exec_args = {{MKLDNN_ARG_SRC, reorder_in},
{MKLDNN_ARG_DST, reorder_out}};
mkldnn::stream s(cpu_engine);
try
{
convert.execute(s, exec_args);
s.wait();
}
catch (const mkldnn::error& e)
{
throw ngraph_error("Could not run mkdnn primitive " + std::string(e.message));
}
}
/// Callback for MaxPoolBackprop
static void __mlir_mkldnn_maxpoolbackprop(size_t rank,
StaticMemRef* memRefSrc,
StaticMemRef* memRefDelta,
StaticMemRef* memRefOutput,
size_t index)
{
mkldnn::memory::dims srcDims(rank);
mkldnn::memory::dims srcStrides(rank);
mkldnn::memory::dims deltaDims(rank);
mkldnn::memory::dims deltaStrides(rank);
mkldnn::memory::dims outDims(rank);
mkldnn::memory::dims outStrides(rank);
for (auto i = 0; i < rank; i++)
{
srcDims[i] = memRefSrc->shapeAndStrides[i];
srcStrides[i] = memRefSrc->shapeAndStrides[rank + i];
deltaDims[i] = memRefDelta->shapeAndStrides[i];
deltaStrides[i] = memRefDelta->shapeAndStrides[rank + i];
outDims[i] = memRefOutput->shapeAndStrides[i];
outStrides[i] = memRefOutput->shapeAndStrides[rank + i];
}
// build mkldnn primitive and execute
auto required_format = rank == 4 ? mkldnn::memory::FORMAT::nchw : mkldnn::memory::FORMAT::ncdhw;
mkldnn::memory::data_type dtype = mkldnn::memory::data_type::f32;
auto diff_dst_desc = mkldnn::memory::desc(deltaDims, dtype, required_format);
auto diff_src_desc = mkldnn::memory::desc(outDims, dtype, required_format);
auto src_desc_origin = mkldnn::memory::desc(srcDims, dtype, srcStrides);
auto diff_dst_desc_origin = mkldnn::memory::desc(deltaDims, dtype, deltaStrides);
auto diff_src_desc_origin = mkldnn::memory::desc(outDims, dtype, outStrides);
mkldnn::primitive_attr attr;
mkldnn::engine cpu_engine(mkldnn::engine::kind::cpu, 0);
mkldnn::pooling_forward::primitive_desc maxpool_pd_f;
mkldnn::pooling_backward::primitive_desc maxpool_pd_b;
if (rank == 4)
{
poolAttrs<2> pAttrs = getAttrs(index).poolAttrs2d;
auto maxpool_desc_f = mkldnn::pooling_forward::desc(
mkldnn::prop_kind::forward_training,
mkldnn::algorithm::pooling_max,
diff_src_desc,
diff_dst_desc,
mkldnn::memory::dims{pAttrs.windowStrides[0], pAttrs.windowStrides[1]},
mkldnn::memory::dims{pAttrs.windowShape[0], pAttrs.windowShape[1]},
mkldnn::memory::dims{pAttrs.padBelow[0], pAttrs.padBelow[1]},
mkldnn::memory::dims{pAttrs.padAbove[0], pAttrs.padAbove[1]});
auto maxpool_desc_b = mkldnn::pooling_backward::desc(
mkldnn::algorithm::pooling_max,
diff_src_desc,
diff_dst_desc,
mkldnn::memory::dims{pAttrs.windowStrides[0], pAttrs.windowStrides[1]},
mkldnn::memory::dims{pAttrs.windowShape[0], pAttrs.windowShape[1]},
mkldnn::memory::dims{pAttrs.padBelow[0], pAttrs.padBelow[1]},
mkldnn::memory::dims{pAttrs.padAbove[0], pAttrs.padAbove[1]});
maxpool_pd_f = mkldnn::pooling_forward::primitive_desc(maxpool_desc_f, attr, cpu_engine);
maxpool_pd_b = mkldnn::pooling_backward::primitive_desc(
maxpool_desc_b, attr, cpu_engine, maxpool_pd_f);
}
else if (rank == 5)
{
poolAttrs<3> pAttrs = getAttrs(index).poolAttrs3d;
auto maxpool_desc_f = mkldnn::pooling_forward::desc(
mkldnn::prop_kind::forward_training,
mkldnn::algorithm::pooling_max,
diff_src_desc,
diff_dst_desc,
mkldnn::memory::dims{
pAttrs.windowStrides[0], pAttrs.windowStrides[1], pAttrs.windowStrides[2]},
mkldnn::memory::dims{
pAttrs.windowShape[0], pAttrs.windowShape[1], pAttrs.windowShape[2]},
mkldnn::memory::dims{pAttrs.padBelow[0], pAttrs.padBelow[1], pAttrs.padBelow[2]},
mkldnn::memory::dims{pAttrs.padAbove[0], pAttrs.padAbove[1], pAttrs.padAbove[2]});
auto maxpool_desc_b = mkldnn::pooling_backward::desc(
mkldnn::algorithm::pooling_max,
diff_src_desc,
diff_dst_desc,
mkldnn::memory::dims{
pAttrs.windowStrides[0], pAttrs.windowStrides[1], pAttrs.windowStrides[2]},
mkldnn::memory::dims{
pAttrs.windowShape[0], pAttrs.windowShape[1], pAttrs.windowShape[2]},
mkldnn::memory::dims{pAttrs.padBelow[0], pAttrs.padBelow[1], pAttrs.padBelow[2]},
mkldnn::memory::dims{pAttrs.padAbove[0], pAttrs.padAbove[1], pAttrs.padAbove[2]});
maxpool_pd_f = mkldnn::pooling_forward::primitive_desc(maxpool_desc_f, attr, cpu_engine);
maxpool_pd_b = mkldnn::pooling_backward::primitive_desc(
maxpool_desc_b, attr, cpu_engine, maxpool_pd_f);
}
mkldnn::pooling_forward maxpool_f(maxpool_pd_f);
mkldnn::memory src_mem = convert_layout_if_diff(
src_desc_origin, maxpool_pd_b.diff_src_desc(), memRefSrc->allocatedPtr, cpu_engine);
mkldnn::memory dst_mem{maxpool_pd_b.diff_dst_desc(), cpu_engine};
mkldnn::memory workspace{maxpool_pd_f.workspace_desc(), cpu_engine};
mkldnn::pooling_backward maxpool_b(maxpool_pd_b);
mkldnn::memory diff_dst = convert_layout_if_diff(
diff_dst_desc_origin, maxpool_pd_b.diff_dst_desc(), memRefDelta->allocatedPtr, cpu_engine);
mkldnn::memory diff_src;
bool need_convert = false;
if (!compare_mkldnn_md_formats(diff_src_desc_origin, maxpool_pd_b.diff_src_desc()))
{
diff_src = mkldnn::memory(maxpool_pd_b.diff_src_desc(), cpu_engine);
need_convert = true;
}
else
{
diff_src =
mkldnn::memory(maxpool_pd_b.diff_src_desc(), cpu_engine, memRefOutput->allocatedPtr);
}
std::unordered_map<int, mkldnn::memory> exec_args_f = {
{MKLDNN_ARG_SRC, src_mem}, {MKLDNN_ARG_WORKSPACE, workspace}, {MKLDNN_ARG_DST, dst_mem}};
std::unordered_map<int, mkldnn::memory> exec_args_b = {{MKLDNN_ARG_DIFF_DST, diff_dst},
{MKLDNN_ARG_WORKSPACE, workspace},
{MKLDNN_ARG_DIFF_SRC, diff_src}};
mkldnn::stream s(cpu_engine);
try
{
maxpool_f.execute(s, exec_args_f);
s.wait();
maxpool_b.execute(s, exec_args_b);
s.wait();
}
catch (const mkldnn::error& e)
{
throw ngraph_error("Could not run mkldnn primitive " + std::string(e.message));
}
if (need_convert)
{
convert_output_layout(diff_dst_desc_origin,
maxpool_pd_b.diff_dst_desc(),
memRefOutput->allocatedPtr,
cpu_engine);
}
}
/// Callback for AvgPoolBackprop
static void __mlir_mkldnn_avgpoolbackprop(size_t rank,
StaticMemRef* memRefInput,
StaticMemRef* memRefOutput,
size_t index)
{
mkldnn::memory::dims dims(rank);
mkldnn::memory::dims strides(rank);
mkldnn::memory::dims outDims(rank);
mkldnn::memory::dims outStrides(rank);
for (auto i = 0; i < rank; i++)
{
dims[i] = memRefInput->shapeAndStrides[i];
strides[i] = memRefInput->shapeAndStrides[rank + i];
outDims[i] = memRefOutput->shapeAndStrides[i];
outStrides[i] = memRefOutput->shapeAndStrides[rank + i];
}
// build mkldnn primitive and execute
auto required_format = rank == 4 ? mkldnn::memory::FORMAT::nchw : mkldnn::memory::FORMAT::ncdhw;
mkldnn::memory::data_type dtype = mkldnn::memory::data_type::f32;
auto diff_dst_desc = mkldnn::memory::desc(dims, dtype, required_format);
auto diff_src_desc = mkldnn::memory::desc(outDims, dtype, required_format);
auto diff_dst_desc_origin = mkldnn::memory::desc(dims, dtype, strides);
auto diff_src_desc_origin = mkldnn::memory::desc(outDims, dtype, outStrides);
mkldnn::primitive_attr attr;
mkldnn::engine cpu_engine(mkldnn::engine::kind::cpu, 0);
mkldnn::pooling_backward::primitive_desc avgpool_pd_b;
if (rank == 4)
{
poolAttrs<2> pAttrs = getAttrs(index).poolAttrs2d;
auto avgpool_desc_f = mkldnn::pooling_forward::desc(
mkldnn::prop_kind::forward_training,
(pAttrs.includePaddingInAvgComputation
? mkldnn::algorithm::pooling_avg_include_padding
: mkldnn::algorithm::pooling_avg_exclude_padding),
diff_src_desc,
diff_dst_desc,
mkldnn::memory::dims{pAttrs.windowStrides[0], pAttrs.windowStrides[1]},
mkldnn::memory::dims{pAttrs.windowShape[0], pAttrs.windowShape[1]},
mkldnn::memory::dims{pAttrs.padBelow[0], pAttrs.padBelow[1]},
mkldnn::memory::dims{pAttrs.padAbove[0], pAttrs.padAbove[1]});
auto avgpool_desc_b = mkldnn::pooling_backward::desc(
(pAttrs.includePaddingInAvgComputation
? mkldnn::algorithm::pooling_avg_include_padding
: mkldnn::algorithm::pooling_avg_exclude_padding),
diff_src_desc,
diff_dst_desc,
mkldnn::memory::dims{pAttrs.windowStrides[0], pAttrs.windowStrides[1]},
mkldnn::memory::dims{pAttrs.windowShape[0], pAttrs.windowShape[1]},
mkldnn::memory::dims{pAttrs.padBelow[0], pAttrs.padBelow[1]},
mkldnn::memory::dims{pAttrs.padAbove[0], pAttrs.padAbove[1]});
auto avgpool_pd_f =
mkldnn::pooling_forward::primitive_desc(avgpool_desc_f, attr, cpu_engine);
avgpool_pd_b = mkldnn::pooling_backward::primitive_desc(
avgpool_desc_b, attr, cpu_engine, avgpool_pd_f);
}
else if (rank == 5)
{
poolAttrs<3> pAttrs = getAttrs(index).poolAttrs3d;
auto avgpool_desc_f = mkldnn::pooling_forward::desc(
mkldnn::prop_kind::forward_training,
(pAttrs.includePaddingInAvgComputation
? mkldnn::algorithm::pooling_avg_include_padding
: mkldnn::algorithm::pooling_avg_exclude_padding),
diff_src_desc,
diff_dst_desc,
mkldnn::memory::dims{
pAttrs.windowStrides[0], pAttrs.windowStrides[1], pAttrs.windowStrides[2]},
mkldnn::memory::dims{
pAttrs.windowShape[0], pAttrs.windowShape[1], pAttrs.windowShape[2]},
mkldnn::memory::dims{pAttrs.padBelow[0], pAttrs.padBelow[1], pAttrs.padBelow[2]},
mkldnn::memory::dims{pAttrs.padAbove[0], pAttrs.padAbove[1], pAttrs.padAbove[2]});
auto avgpool_desc_b = mkldnn::pooling_backward::desc(
(pAttrs.includePaddingInAvgComputation
? mkldnn::algorithm::pooling_avg_include_padding
: mkldnn::algorithm::pooling_avg_exclude_padding),
diff_src_desc,
diff_dst_desc,
mkldnn::memory::dims{
pAttrs.windowStrides[0], pAttrs.windowStrides[1], pAttrs.windowStrides[2]},
mkldnn::memory::dims{
pAttrs.windowShape[0], pAttrs.windowShape[1], pAttrs.windowShape[2]},
mkldnn::memory::dims{pAttrs.padBelow[0], pAttrs.padBelow[1], pAttrs.padBelow[2]},
mkldnn::memory::dims{pAttrs.padAbove[0], pAttrs.padAbove[1], pAttrs.padAbove[2]});
auto avgpool_pd_f =
mkldnn::pooling_forward::primitive_desc(avgpool_desc_f, attr, cpu_engine);
avgpool_pd_b = mkldnn::pooling_backward::primitive_desc(
avgpool_desc_b, attr, cpu_engine, avgpool_pd_f);
}
mkldnn::pooling_backward avgpool(avgpool_pd_b);
mkldnn::memory in = convert_layout_if_diff(
diff_dst_desc_origin, avgpool_pd_b.diff_dst_desc(), memRefInput->allocatedPtr, cpu_engine);
mkldnn::memory out;
bool need_convert = false;
if (!compare_mkldnn_md_formats(diff_src_desc_origin, avgpool_pd_b.diff_src_desc()))
{
out = mkldnn::memory(avgpool_pd_b.diff_src_desc(), cpu_engine);
need_convert = true;
}
else
{
out = mkldnn::memory(avgpool_pd_b.diff_src_desc(), cpu_engine, memRefOutput->allocatedPtr);
}
std::unordered_map<int, mkldnn::memory> exec_args = {{MKLDNN_ARG_DIFF_DST, in},
{MKLDNN_ARG_DIFF_SRC, out}};
mkldnn::stream s(cpu_engine);
try
{
avgpool.execute(s, exec_args);
s.wait();
}
catch (const mkldnn::error& e)
{
throw ngraph_error("Could not run mkldnn primitive " + std::string(e.message));
}
if (need_convert)
{
convert_output_layout(diff_dst_desc_origin,
avgpool_pd_b.diff_dst_desc(),
memRefOutput->allocatedPtr,
cpu_engine);
}
}
/// Callback for AvgPool and MaxPool
static void __mlir_mkldnn_pooling(
size_t rank, StaticMemRef* memRefInput, StaticMemRef* memRefOutput, size_t index, OpType type)
{
mkldnn::memory::dims dims(rank);
mkldnn::memory::dims strides(rank);
mkldnn::memory::dims outDims(rank);
mkldnn::memory::dims outStrides(rank);
for (auto i = 0; i < rank; i++)
{
dims[i] = memRefInput->shapeAndStrides[i];
strides[i] = memRefInput->shapeAndStrides[rank + i];
outDims[i] = memRefOutput->shapeAndStrides[i];
outStrides[i] = memRefOutput->shapeAndStrides[rank + i];
}
// build mkldnn primitive and execute
auto required_format = rank == 4 ? mkldnn::memory::FORMAT::nchw : mkldnn::memory::FORMAT::ncdhw;
mkldnn::memory::data_type dtype = mkldnn::memory::data_type::f32;
auto input_desc = mkldnn::memory::desc(dims, dtype, required_format);
auto result_desc = mkldnn::memory::desc(outDims, dtype, required_format);
auto input_desc_origin = mkldnn::memory::desc(dims, dtype, strides);
auto result_desc_origin = mkldnn::memory::desc(outDims, dtype, outStrides);
mkldnn::primitive_attr attr;
mkldnn::engine cpu_engine(mkldnn::engine::kind::cpu, 0);
mkldnn::pooling_forward::primitive_desc pool_pd;
if (rank == 4)
{
poolAttrs<2> pAttrs = getAttrs(index).poolAttrs2d;
mkldnn::algorithm alg = type == OpType::MAXPOOL
? mkldnn::algorithm::pooling_max
: (pAttrs.includePaddingInAvgComputation
? mkldnn::algorithm::pooling_avg_include_padding
: mkldnn::algorithm::pooling_avg_exclude_padding);
auto pool_desc = mkldnn::pooling_forward::desc(
mkldnn::prop_kind::forward_inference,
alg,
input_desc,
result_desc,
mkldnn::memory::dims{pAttrs.windowStrides[0], pAttrs.windowStrides[1]},
mkldnn::memory::dims{pAttrs.windowShape[0], pAttrs.windowShape[1]},
mkldnn::memory::dims{pAttrs.padBelow[0], pAttrs.padBelow[1]},
mkldnn::memory::dims{pAttrs.padAbove[0], pAttrs.padAbove[1]});
pool_pd = mkldnn::pooling_forward::primitive_desc(pool_desc, attr, cpu_engine);
}
else if (rank == 5)
{
poolAttrs<3> pAttrs = getAttrs(index).poolAttrs3d;
mkldnn::algorithm alg = type == OpType::MAXPOOL
? mkldnn::algorithm::pooling_max
: (pAttrs.includePaddingInAvgComputation
? mkldnn::algorithm::pooling_avg_include_padding
: mkldnn::algorithm::pooling_avg_exclude_padding);
auto pool_desc = mkldnn::pooling_forward::desc(
mkldnn::prop_kind::forward_inference,
alg,
input_desc,
result_desc,
mkldnn::memory::dims{
pAttrs.windowStrides[0], pAttrs.windowStrides[1], pAttrs.windowStrides[2]},
mkldnn::memory::dims{
pAttrs.windowShape[0], pAttrs.windowShape[1], pAttrs.windowShape[2]},
mkldnn::memory::dims{pAttrs.padBelow[0], pAttrs.padBelow[1], pAttrs.padBelow[2]},
mkldnn::memory::dims{pAttrs.padAbove[0], pAttrs.padAbove[1], pAttrs.padAbove[2]});
pool_pd = mkldnn::pooling_forward::primitive_desc(pool_desc, attr, cpu_engine);
}
mkldnn::pooling_forward pool(pool_pd);
mkldnn::memory in = convert_layout_if_diff(
input_desc_origin, pool_pd.src_desc(), memRefInput->allocatedPtr, cpu_engine);
mkldnn::memory out;
bool need_convert = false;
if (!compare_mkldnn_md_formats(result_desc_origin, pool_pd.dst_desc()))
{
out = mkldnn::memory(pool_pd.dst_desc(), cpu_engine);
need_convert = true;
}
else
{
out = mkldnn::memory(pool_pd.dst_desc(), cpu_engine, memRefOutput->allocatedPtr);
}
std::unordered_map<int, mkldnn::memory> exec_args = {{MKLDNN_ARG_SRC, in},
{MKLDNN_ARG_DST, out}};
mkldnn::stream s(cpu_engine);
try
{
pool.execute(s, exec_args);
s.wait();
}
catch (const mkldnn::error& e)
{
throw ngraph_error("Could not run mkldnn primitive " + std::string(e.message));
}
if (need_convert)
{
convert_output_layout(
result_desc_origin, pool_pd.dst_desc(), memRefOutput->allocatedPtr, cpu_engine);
}
}
/// Callback for Softmax
static void __mlir_mkldnn_softmax(size_t rank,
StaticMemRef* memRefInput,
StaticMemRef* memRefOutput,
int index)
{
mkldnn::memory::dims dims(rank);
mkldnn::memory::dims strides(rank);
for (auto i = 0; i < rank; i++)
{
dims[i] = memRefInput->shapeAndStrides[i];
strides[i] = memRefInput->shapeAndStrides[rank + i];
}
auto softmax_axis = getAttrs(index).intAttr;
// build mkldnn primitive and execute
mkldnn::memory::data_type dtype = mkldnn::memory::data_type::f32;
auto input_desc = mkldnn::memory::desc(dims, dtype, strides);
auto softmax_desc =
mkldnn::softmax_forward::desc(mkldnn::prop_kind::forward_scoring, input_desc, softmax_axis);
mkldnn::primitive_attr attr;
mkldnn::engine cpu_engine(mkldnn::engine::kind::cpu, 0);
auto softmax_pd = mkldnn::softmax_forward::primitive_desc(softmax_desc, attr, cpu_engine);
mkldnn::softmax_forward softmax(softmax_pd);
mkldnn::memory in{softmax_pd.src_desc(), cpu_engine, memRefInput->allocatedPtr};
mkldnn::memory out{softmax_pd.dst_desc(), cpu_engine, memRefOutput->allocatedPtr};
std::unordered_map<int, mkldnn::memory> exec_args = {{MKLDNN_ARG_SRC, in},
{MKLDNN_ARG_DST, out}};
mkldnn::stream s(cpu_engine);
try
{
softmax.execute(s, exec_args);
s.wait();
}
catch (const mkldnn::error& e)
{
throw ngraph_error("Could not run mkldnn primitive " + std::string(e.message));
}
}
/// Callback for MatMul
static void __mlir_cblas_sgemm(StaticMemRef* memRefmatA,
StaticMemRef* memRefmatB,
StaticMemRef* memRefmatC,
size_t index)
{
gemmAttrs gAttrs = getAttrs(index).gemmAttrs2d;
cblas::cblas_sgemm(cblas::Layout::RowMajor,
gAttrs.transposeA ? cblas::Transpose::Transpose : cblas::Transpose::None,
gAttrs.transposeB ? cblas::Transpose::Transpose : cblas::Transpose::None,
gAttrs.m,
gAttrs.n,
gAttrs.k,
1.0f,
reinterpret_cast<float*>(memRefmatA->allocatedPtr),
std::max<size_t>(1, gAttrs.lda),
reinterpret_cast<float*>(memRefmatB->allocatedPtr),
std::max<size_t>(1, gAttrs.ldb),
0.0f,
reinterpret_cast<float*>(memRefmatC->allocatedPtr),
std::max<size_t>(1, gAttrs.ldc));
}
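For orientation: with a row-major layout, the CBLAS leading dimensions lda, ldb, and ldc are the row lengths (column counts) of A, B, and C, and the std::max<size_t>(1, ...) guards keep them legal when a dimension is zero. A minimal sketch with hypothetical data (sgemm_rowmajor_example is illustrative and not part of the callback):

static void sgemm_rowmajor_example()
{
    // A is 2x3, B is 3x4 (identity-like), C is 2x4; leading dims equal the row lengths.
    float a[2 * 3] = {1, 2, 3, 4, 5, 6};
    float b[3 * 4] = {1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0};
    float c[2 * 4] = {0};
    cblas::cblas_sgemm(cblas::Layout::RowMajor,
                       cblas::Transpose::None,
                       cblas::Transpose::None,
                       /*m=*/2,
                       /*n=*/4,
                       /*k=*/3,
                       1.0f,
                       a,
                       /*lda=*/3,
                       b,
                       /*ldb=*/4,
                       0.0f,
                       c,
                       /*ldc=*/4);
    // c now holds {1, 2, 3, 0, 4, 5, 6, 0}.
}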
/// Callback for Gemm
static void __mlir_cblas_sgemm_with_bias(StaticMemRef* memRefmatA,
StaticMemRef* memRefmatB,
StaticMemRef* memRefmatC,
StaticMemRef* memRefmatOut,
size_t index)
{
gemmAttrs gAttrs = getAttrs(index).gemmAttrs2d;
auto transposeA = gAttrs.transposeA;
auto transposeB = gAttrs.transposeB;
auto m = gAttrs.m;
auto n = gAttrs.n;
auto k = gAttrs.k;
auto lda = gAttrs.lda;
auto ldb = gAttrs.ldb;
auto ldc = gAttrs.ldc;
auto alpha = gAttrs.alpha;
auto beta = gAttrs.beta;
auto broadcastHint = gAttrs.broadcastHint;
auto matA = reinterpret_cast<float*>(memRefmatA->allocatedPtr);
auto matB = reinterpret_cast<float*>(memRefmatB->allocatedPtr);
auto matC = reinterpret_cast<float*>(memRefmatC->allocatedPtr);
auto matOut = reinterpret_cast<float*>(memRefmatOut->allocatedPtr);
cblas::cblas_sgemm(cblas::Layout::RowMajor,
transposeA ? cblas::Transpose::Transpose : cblas::Transpose::None,
transposeB ? cblas::Transpose::Transpose : cblas::Transpose::None,
m,
n,
k,
alpha,
matA,
std::max<size_t>(1, lda),
matB,
std::max<size_t>(1, ldb),
0.0f,
matOut,
std::max<size_t>(1, ldc));
if (broadcastHint == 0)
{
std::vector<float> ones(m, 1.0f);
cblas::cblas_sgemm(cblas::Layout::RowMajor,
cblas::Transpose::None,
cblas::Transpose::None,
m,
n,
1,
beta,
ones.data(),
1,
matC,
std::max<size_t>(1, n),
1.0f,
matOut,
std::max<size_t>(1, ldc));
}
else if (broadcastHint == 1)
{
std::vector<float> ones(n, 1.0f);
cblas::cblas_sgemm(cblas::Layout::RowMajor,
cblas::Transpose::None,
cblas::Transpose::None,
m,
n,
1,
beta,
matC,
1,
ones.data(),
std::max<size_t>(1, n),
1.0f,
matOut,
std::max<size_t>(1, ldc));
}
else if (broadcastHint == 2)
{
std::vector<float> ones(m, 1.0f);
std::vector<float> bias(n, *matC);
cblas::cblas_sgemm(cblas::Layout::RowMajor,
cblas::Transpose::None,
cblas::Transpose::None,
m,
n,
1,
beta,
ones.data(),
1,
bias.data(),
std::max<size_t>(1, n),
1.0f,
matOut,
std::max<size_t>(1, ldc));
}
else
{
std::vector<float> identity(n * n, 0.0f);
for (auto i = 0; i < n * n; i += n + 1)
{
identity[i] = 1.0;
}
cblas::cblas_sgemm(cblas::Layout::RowMajor,
cblas::Transpose::None,
cblas::Transpose::None,
m,
n,
n,
beta,
matC,
std::max<size_t>(1, n),
identity.data(),
std::max<size_t>(1, n),
1.0f,
matOut,
std::max<size_t>(1, ldc));
}
}
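The extra GEMMs against a vector of ones (or an identity matrix) realize the bias broadcast as a rank-1 or identity update on top of alpha * A * B. As an illustrative reference for the broadcastHint == 0 case (bias C of shape {1, n} broadcast across rows), the hypothetical helper below spells out the same semantics with plain loops:

static void add_broadcast_bias_rows_sketch(
    float* matOut, const float* matC, float beta, size_t m, size_t n, size_t ldc)
{
    // matOut[i][j] += beta * C[0][j] for every row i: exactly what the ones-vector GEMM
    // computes when broadcastHint == 0.
    for (size_t i = 0; i < m; i++)
    {
        for (size_t j = 0; j < n; j++)
        {
            matOut[i * ldc + j] += beta * matC[j];
        }
    }
}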
extern "C" void __mlir_callback_1_input(void* input, void* output, size_t index, OpType type)
{
auto unrankedMemRefInput = reinterpret_cast<UnrankedMemRef*>(input);
auto unrankedMemRefOutput = reinterpret_cast<UnrankedMemRef*>(output);
if (type == OpType::SOFTMAX)
{
__mlir_mkldnn_softmax(unrankedMemRefInput->rank,
unrankedMemRefInput->memRefDescPtr,
unrankedMemRefOutput->memRefDescPtr,
index);
}
else if (type == OpType::AVGPOOL || type == OpType::MAXPOOL)
{
__mlir_mkldnn_pooling(unrankedMemRefInput->rank,
unrankedMemRefInput->memRefDescPtr,
unrankedMemRefOutput->memRefDescPtr,
index,
type);
}
else if (type == OpType::AVGPOOLBACKPROP)
{
__mlir_mkldnn_avgpoolbackprop(unrankedMemRefInput->rank,
unrankedMemRefInput->memRefDescPtr,
unrankedMemRefOutput->memRefDescPtr,
index);
}
else
{
NGRAPH_UNREACHABLE("Unsupported type");
}
}
extern "C" void
__mlir_callback_2_inputs(void* input0, void* input1, void* output, size_t index, OpType type)
{
auto unrankedMemRefInput0 = reinterpret_cast<UnrankedMemRef*>(input0);
auto unrankedMemRefInput1 = reinterpret_cast<UnrankedMemRef*>(input1);
auto unrankedMemRefOutput = reinterpret_cast<UnrankedMemRef*>(output);
if (type == OpType::MAXPOOLBACKPROP)
{
__mlir_mkldnn_maxpoolbackprop(unrankedMemRefInput0->rank,
unrankedMemRefInput0->memRefDescPtr,
unrankedMemRefInput1->memRefDescPtr,
unrankedMemRefOutput->memRefDescPtr,
index);
}
else if (type == OpType::MATMUL)
{
__mlir_cblas_sgemm(unrankedMemRefInput0->memRefDescPtr,
unrankedMemRefInput1->memRefDescPtr,
unrankedMemRefOutput->memRefDescPtr,
index);
}
else
{
NGRAPH_UNREACHABLE("Unsupported type");
}
}
extern "C" void __mlir_callback_3_inputs(
void* input0, void* input1, void* input2, void* output, size_t index, OpType type)
{
auto unrankedMemRefInput0 = reinterpret_cast<UnrankedMemRef*>(input0);
auto unrankedMemRefInput1 = reinterpret_cast<UnrankedMemRef*>(input1);
auto unrankedMemRefInput2 = reinterpret_cast<UnrankedMemRef*>(input2);
auto unrankedMemRefOutput = reinterpret_cast<UnrankedMemRef*>(output);
if (type == OpType::GEMM)
{
__mlir_cblas_sgemm_with_bias(unrankedMemRefInput0->memRefDescPtr,
unrankedMemRefInput1->memRefDescPtr,
unrankedMemRefInput2->memRefDescPtr,
unrankedMemRefOutput->memRefDescPtr,
index);
}
else
{
NGRAPH_UNREACHABLE("Unsupported type");
}
}
......@@ -53,16 +53,18 @@ static llvm::cl::opt<std::string>
clObjectFilename("ngraph-mlir-object-filename",
llvm::cl::desc("Dump MLIR JITted-compiled object to file jitted_mlir.o"));
void MLIRCPURuntime::run(void* args)
void MLIRCPURuntime::run(const std::vector<MemRefArg>& args)
{
run_internal(*reinterpret_cast<std::vector<void*>*>(args));
run_internal(args);
}
void MLIRCPURuntime::run_internal(std::vector<void*>& externalTensors)
void MLIRCPURuntime::run_internal(const std::vector<MemRefArg>& args)
{
// Create an MLIR execution engine. We use a null MLIR pass manager for now to make sure we
// don't run MLIR passes that were already run. We also pass a default transformer created with
// the default or user-provided optimization level.
auto llvmTransformer = mlir::makeOptimizingTransformer(
MLIRCPUBackend::mlirOptLevel, /*sizeLevel=*/0, MLIRCPUBackend::targetMachine.get());
auto maybeEngine = mlir::ExecutionEngine::create(
......@@ -70,14 +72,14 @@ void MLIRCPURuntime::run_internal(std::vector<void*>& externalTensors)
NGRAPH_CHECK(maybeEngine, "failed to construct an execution engine");
m_engine = std::move(maybeEngine.get());
bindArguments(externalTensors);
bindArguments(args);
execute();
cleanup();
}
// Binds MLIR function arguments to the proper values. This includes externally allocated
// tensors and helpers to be used inside the function.
void MLIRCPURuntime::bindArguments(std::vector<void*>& externalTensors)
void MLIRCPURuntime::bindArguments(const std::vector<MemRefArg>& args)
{
NGRAPH_CHECK(m_module, "MLIR module is not ready.");
......@@ -85,13 +87,17 @@ void MLIRCPURuntime::bindArguments(std::vector<void*>& externalTensors)
NGRAPH_CHECK(func && !func.getBlocks().empty(), "Function not found");
// Set external arguments
m_externalTensors = &externalTensors;
m_externalTensors = &args;
// Create a list with a type-erased double pointer for each invocation argument.
// We currently use 'allocateMemrefArgs', which creates the arguments list per call ABI (see
// comment below).
// StaticMemRef is just a struct with the actual pointer to the data.
for (auto i = 0; i < m_externalTensors->size(); i++)
{
m_ranks.push_back((*m_externalTensors)[i].m_shape.size());
}
auto expectedArguments = allocateMemrefArgs();
NGRAPH_CHECK(expectedArguments.size(), "Arguments can't be created");
m_invokeArgs = std::move(expectedArguments);
......@@ -103,8 +109,14 @@ void MLIRCPURuntime::bindArguments(std::vector<void*>& externalTensors)
for (size_t i = 0, numArgs = m_invokeArgs.size(); i < numArgs; ++i)
{
auto* memRefArg = *(reinterpret_cast<StaticMemRef**>(m_invokeArgs[i]));
memRefArg->allocatedPtr = (*m_externalTensors)[i];
memRefArg->alignedPtr = (*m_externalTensors)[i];
memRefArg->allocatedPtr = (*m_externalTensors)[i].m_tensor;
memRefArg->alignedPtr = (*m_externalTensors)[i].m_tensor;
auto rank = m_ranks[i];
for (auto j = 0; j < rank; j++)
{
memRefArg->shapeAndStrides[j] = (*m_externalTensors)[i].m_shape[j];
memRefArg->shapeAndStrides[rank + j] = (*m_externalTensors)[i].m_strides[j];
}
}
}
......@@ -128,6 +140,7 @@ void MLIRCPURuntime::execute()
void MLIRCPURuntime::cleanup()
{
// Free void double pointer arguments without freeing external tensor data.
int i = 0;
for (auto* arg : m_invokeArgs)
{
auto* memRefArg = *(reinterpret_cast<StaticMemRef**>(arg));
......@@ -148,7 +161,7 @@ SmallVector<void*, 8> MLIRCPURuntime::allocateMemrefArgs()
SmallVector<void*, 8> args;
for (auto i = 0; i < m_externalTensors->size(); i++)
{
auto descriptor = allocateMemrefDescriptor();
auto descriptor = allocateMemrefDescriptor(m_ranks[i]);
StaticMemRef** arg = reinterpret_cast<StaticMemRef**>(malloc(sizeof(StaticMemRef*)));
*arg = descriptor;
args.push_back(arg);
......@@ -156,13 +169,17 @@ SmallVector<void*, 8> MLIRCPURuntime::allocateMemrefArgs()
return args;
}
StaticMemRef* MLIRCPURuntime::allocateMemrefDescriptor()
StaticMemRef* MLIRCPURuntime::allocateMemrefDescriptor(size_t rank)
{
// We only use StaticMemRef because that's what MLIR currently offers.
// We should expand this with different types and dynamic MemRefs
auto* descriptor = reinterpret_cast<StaticMemRef*>(malloc(sizeof(StaticMemRef)));
// We allocate 2 * rank * sizeof(int64_t) for the last element "int64_t shapeAndStrides[]"
// in StaticMemRef because the shape and the strides each need rank * sizeof(int64_t).
auto* descriptor =
reinterpret_cast<StaticMemRef*>(malloc(sizeof(StaticMemRef) + 2 * rank * sizeof(int64_t)));
NGRAPH_CHECK(descriptor != nullptr, "NULL MemRef descriptor");
descriptor->allocatedPtr = nullptr;
descriptor->alignedPtr = nullptr;
descriptor->offset = 0;
return descriptor;
}
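StaticMemRef thus ends in a C-style flexible array member: the fixed fields are followed directly by 2 * rank int64_t slots, shape first and strides second. A minimal sketch for an assumed rank of 2 (make_rank2_descriptor_sketch is hypothetical and for illustration only):

static StaticMemRef* make_rank2_descriptor_sketch()
{
    // Fixed fields plus shape[2] and strides[2] trailing slots.
    auto* d = reinterpret_cast<StaticMemRef*>(
        malloc(sizeof(StaticMemRef) + 2 * 2 * sizeof(int64_t)));
    d->allocatedPtr = nullptr;
    d->alignedPtr = nullptr;
    d->offset = 0;
    d->shapeAndStrides[0] = 3; // shape[0]
    d->shapeAndStrides[1] = 4; // shape[1]
    d->shapeAndStrides[2] = 4; // stride[0] (row-major contiguous: equals shape[1])
    d->shapeAndStrides[3] = 1; // stride[1]
    return d;
}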
......@@ -37,7 +37,16 @@ namespace ngraph
{
void* allocatedPtr;
void* alignedPtr;
int64_t offset;
int64_t shapeAndStrides[];
};
struct UnrankedMemRef
{
int64_t rank;
StaticMemRef* memRefDescPtr;
};
/// A CPU Runtime is an MLIR runtime that owns an MLIR context and a module
/// The module should be in LLVM dialect and ready to be lowered via an MLIR
/// ExecutionEngine. The runtime owns the context and must out-live any MLIR
......@@ -46,12 +55,12 @@ namespace ngraph
{
public:
/// Executes a pre-compiled subgraph
void run(void* args) override;
void run(const std::vector<MemRefArg>& args) override;
private:
void run_internal(std::vector<void*>& externalTensors);
void run_internal(const std::vector<MemRefArg>& args);
// Bind external tensors to MLIR module entry point
void bindArguments(std::vector<void*>& externalTensors);
void bindArguments(const std::vector<MemRefArg>& args);
// Invokes an MLIR module entry point with bound arguments
void execute();
// Cleans up allocated args
......@@ -61,14 +70,15 @@ namespace ngraph
llvm::SmallVector<void*, 8> allocateMemrefArgs();
/// Helper to allocate a mem ref object. Handles static shapes only for now.
StaticMemRef* allocateMemrefDescriptor();
StaticMemRef* allocateMemrefDescriptor(size_t);
private:
// Pointers to externally allocated memory for sub-graph's input and output tensors.
std::vector<void*>* m_externalTensors;
const std::vector<MemRefArg>* m_externalTensors;
// Arguments for the MLIR function generated for the nGraph sub-graph.
llvm::SmallVector<void*, 8> m_invokeArgs;
std::unique_ptr<mlir::ExecutionEngine> m_engine;
std::vector<size_t> m_ranks;
};
}
}
......
......@@ -33,6 +33,13 @@ namespace ngraph
{
namespace ngmlir
{
struct MemRefArg
{
void* m_tensor;
std::vector<size_t> m_shape;
std::vector<size_t> m_strides;
};
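For illustration, a MemRefArg describing a row-major 2x3 float tensor carries the raw data pointer plus per-dimension shape and strides; the sketch below is hypothetical and assumes strides are expressed in elements, as in the rest of this patch:

inline MemRefArg make_example_memref_arg(float* data)
{
    MemRefArg arg;
    arg.m_tensor = data;    // raw pointer to the tensor buffer
    arg.m_shape = {2, 3};   // a 2x3 tensor
    arg.m_strides = {3, 1}; // row-major strides, in elements
    return arg;
}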
/// Base class for an MLIR runtime. An MLIR runtime owns the MLIR Context and owns
/// the final compiled module. It supports invoking the module with specific arguments
class MLIRRuntime
......@@ -43,7 +50,7 @@ namespace ngraph
/// Overload with module op
void set_module(mlir::ModuleOp& module) { m_module = module; }
/// Executes a pre-compiled subgraph
virtual void run(void* args) = 0;
virtual void run(const std::vector<MemRefArg>& args) = 0;
/// Get the MLIR module that this runtime owns
mlir::OwningModuleRef& get_module() { return m_module; }
......@@ -54,4 +61,4 @@ namespace ngraph
};
}
}
}
\ No newline at end of file
}
......@@ -19,6 +19,7 @@
#include "ngraph/op/add.hpp"
#include "ngraph/op/constant.hpp"
#include "ngraph/op/dot.hpp"
#include "ngraph/op/fused/matmul.hpp"
#include "ngraph/op/multiply.hpp"
#include "ngraph/op/util/broadcasting.hpp"
......
......@@ -43,30 +43,69 @@ namespace ngraph
// Tensors haven't been allocated yet so we have to keep a pointer to the pointer
// that will hold the future memory address.
std::vector<size_t> buffer_indices;
std::vector<std::vector<size_t>> shape_vec;
std::vector<std::vector<size_t>> strides_vec;
for (const TensorViewWrapper& arg : args)
{
auto buffer_index = external_function->get_buffer_index(arg.get_name());
buffer_indices.push_back(buffer_index);
// Get shape and strides
auto tensor_shape = arg.get_shape();
std::vector<size_t> shape(tensor_shape.size());
for (auto i = 0; i < tensor_shape.size(); i++)
{
shape[i] = tensor_shape[i];
}
shape_vec.push_back(shape);
auto tensor_strides = arg.get_strides();
std::vector<size_t> strides(tensor_strides.size());
for (auto i = 0; i < tensor_strides.size(); i++)
{
strides[i] = tensor_strides[i];
}
strides_vec.push_back(strides);
}
for (const TensorViewWrapper& result : out)
{
auto buffer_index = external_function->get_buffer_index(result.get_name());
buffer_indices.push_back(buffer_index);
// Get shape and strides
auto tensor_shape = result.get_shape();
std::vector<size_t> shape(tensor_shape.size());
for (auto i = 0; i < tensor_shape.size(); i++)
{
shape[i] = tensor_shape[i];
}
shape_vec.push_back(shape);
auto tensor_strides = result.get_strides();
std::vector<size_t> strides(tensor_strides.size());
for (auto i = 0; i < tensor_strides.size(); i++)
{
strides[i] = tensor_strides[i];
}
strides_vec.push_back(strides);
}
// Create functor that will be executed to compile and run this CompiledKernel.
// Note that 'buffer_indices', 'shape_vec', and 'strides_vec' must be captured by value
// since they are local vars.
auto functor = [node, buffer_indices](CPURuntimeContext* ctx,
CPUExecutionContext* ectx) {
auto functor = [node, buffer_indices, shape_vec, strides_vec](
CPURuntimeContext* ctx, CPUExecutionContext* ectx) {
// MLIR requires a list of type-erased pointers to arguments. Tensors must have
// been allocated at this point so we can get rid of the extra reference.
std::vector<void*> ptr_args;
std::vector<MemRefArg> mem_ref_arg_vec;
int i = 0;
for (auto& buffer_index : buffer_indices)
{
ptr_args.push_back(ctx->buffer_data[buffer_index]);
MemRefArg mem_ref_arg;
mem_ref_arg.m_tensor = ctx->buffer_data[buffer_index];
mem_ref_arg.m_shape = shape_vec[i];
mem_ref_arg.m_strides = strides_vec[i];
mem_ref_arg_vec.push_back(mem_ref_arg);
i++;
}
// Compile nodes within the CompiledKernel op.
CompiledKernel* compiled_kernel =
static_cast<CompiledKernel*>(const_cast<Node*>(node));
......@@ -97,13 +136,13 @@ namespace ngraph
mlir_backend.codegen();
// Store module into runtime, and invoke.
mlir_runtime.set_module(mlir_backend.get_module());
mlir_runtime.run(&ptr_args);
mlir_runtime.run(mem_ref_arg_vec);
}
else
{
// We have found a cached runtime, just invoke.
MLIRCPURuntime& mlir_runtime = it->second;
mlir_runtime.run(&ptr_args);
mlir_runtime.run(mem_ref_arg_vec);
}
};
......
......@@ -87,8 +87,10 @@
#include "ngraph/op/floor.hpp"
#include "ngraph/op/fused/conv_fused.hpp"
#include "ngraph/op/fused/gelu.hpp"
#include "ngraph/op/fused/gemm.hpp"
#include "ngraph/op/fused/group_conv.hpp"
#include "ngraph/op/fused/lstm_cell.hpp"
#include "ngraph/op/fused/matmul.hpp"
#include "ngraph/op/fused/softmax_crossentropy.hpp"
#include "ngraph/op/gather.hpp"
#include "ngraph/op/gather_nd.hpp"
......@@ -1187,7 +1189,22 @@ void runtime::cpu::CPU_ExternalFunction::register_common_passes(
auto dex = is_direct_execution();
auto is_supported = [dex](const Node& node) {
#ifdef NGRAPH_MLIR_ENABLE
if (std::getenv("NGRAPH_MLIR") != nullptr && std::getenv("NGRAPH_MLIR_CALLBACK") != nullptr)
{
if (typeid(ngraph::op::MatMul) == typeid(node) &&
node.get_input_element_type(0) == element::f32)
{
return true;
}
if (typeid(ngraph::op::Gemm) == typeid(node) &&
node.get_input_element_type(0) == element::f32)
{
return true;
}
}
#endif
// This check averts the decomposition of LSTMCell;
// we will map LSTMCell to the LSTM CPU op in a later
// graph pass.
......
......@@ -321,6 +321,7 @@ set(MULTI_TEST_SRC
backend/logical_or.in.cpp
backend/logical_xor.in.cpp
backend/lrn.in.cpp
backend/matmul.in.cpp
backend/max.in.cpp
backend/maximum.in.cpp
backend/min.in.cpp
......
......@@ -1024,6 +1024,26 @@ NGRAPH_TEST(${BACKEND_NAME}, gemm)
test_case.run();
}
NGRAPH_TEST(${BACKEND_NAME}, gemm_C)
{
auto A = make_shared<op::Parameter>(element::f32, Shape{3, 6});
auto B = make_shared<op::Parameter>(element::f32, Shape{6, 4});
auto C = make_shared<op::Parameter>(element::f32, Shape{3, 4});
auto gemm_func = make_shared<op::Gemm>(A, B, C);
auto function = make_shared<Function>(NodeVector{gemm_func}, ParameterVector{A, B, C});
auto test_case = test::NgraphTestCase(function, "${BACKEND_NAME}");
// A
test_case.add_input<float>(vector<float>(18, 1));
// B
test_case.add_input<float>(vector<float>(24, 2));
// C
test_case.add_input<float>(vector<float>(12, 1));
// output
test_case.add_expected_output<float>(Shape{3, 4}, vector<float>(12, 13));
test_case.run();
}
NGRAPH_TEST(${BACKEND_NAME}, gemm_broadcast_input_C)
{
auto A = make_shared<op::Parameter>(element::f32, Shape{3, 6});
......@@ -1041,6 +1061,48 @@ NGRAPH_TEST(${BACKEND_NAME}, gemm_broadcast_input_C)
test_case.add_input<float>(vector<float>{1});
// output
test_case.add_expected_output<float>(Shape{3, 4}, vector<float>(12, 7));
test_case.run();
}
NGRAPH_TEST(${BACKEND_NAME}, gemm_broadcast_axes_0_input_C)
{
auto A = make_shared<op::Parameter>(element::f32, Shape{3, 6});
auto B = make_shared<op::Parameter>(element::f32, Shape{6, 4});
auto C = make_shared<op::Parameter>(element::f32, Shape{1, 4});
auto gemm_func = make_shared<op::Gemm>(A, B, C, 0.5);
auto function = make_shared<Function>(NodeVector{gemm_func}, ParameterVector{A, B, C});
auto test_case = test::NgraphTestCase(function, "${BACKEND_NAME}");
// A
test_case.add_input<float>(vector<float>(18, 1));
// B
test_case.add_input<float>(vector<float>(24, 2));
// C
test_case.add_input<float>(vector<float>{1, 2, 3, 4});
// output
test_case.add_expected_output<float>(Shape{3, 4},
vector<float>{7, 8, 9, 10, 7, 8, 9, 10, 7, 8, 9, 10});
test_case.run();
}
NGRAPH_TEST(${BACKEND_NAME}, gemm_broadcast_axes_1_input_C)
{
auto A = make_shared<op::Parameter>(element::f32, Shape{3, 6});
auto B = make_shared<op::Parameter>(element::f32, Shape{6, 4});
auto C = make_shared<op::Parameter>(element::f32, Shape{3, 1});
auto gemm_func = make_shared<op::Gemm>(A, B, C, 0.5);
auto function = make_shared<Function>(NodeVector{gemm_func}, ParameterVector{A, B, C});
auto test_case = test::NgraphTestCase(function, "${BACKEND_NAME}");
// A
test_case.add_input<float>(vector<float>(18, 1));
// B
test_case.add_input<float>(vector<float>(24, 2));
// C
test_case.add_input<float>(vector<float>(3, 1));
// output
test_case.add_expected_output<float>(Shape{3, 4}, vector<float>(12, 7));
test_case.run();
}
NGRAPH_TEST(${BACKEND_NAME}, fused_clamp)
......
//*****************************************************************************
// Copyright 2017-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#include <algorithm>
#include <cinttypes>
#include <cmath>
#include <cstdlib>
#include <random>
#include <string>
#include "gtest/gtest.h"
#include "ngraph/ngraph.hpp"
#include "util/all_close.hpp"
#include "util/all_close_f.hpp"
#include "util/ndarray.hpp"
#include "util/test_control.hpp"
#include "util/test_tools.hpp"
using namespace std;
using namespace ngraph;
static string s_manifest = "${MANIFEST}";
NGRAPH_TEST(${BACKEND_NAME}, matmul_2x0_0x2)
{
Shape shape_a{2, 0};
Shape shape_b{0, 2};
Shape shape_r{2, 2};
auto A = make_shared<op::Parameter>(element::f32, shape_a);
auto B = make_shared<op::Parameter>(element::f32, shape_b);
auto f = make_shared<Function>(make_shared<op::MatMul>(A, B), ParameterVector{A, B});
auto backend = runtime::Backend::create("${BACKEND_NAME}");
// Create some tensors for input/output
auto a = backend->create_tensor(element::f32, shape_a);
copy_data(a, vector<float>{});
auto b = backend->create_tensor(element::f32, shape_b);
copy_data(b, vector<float>{});
auto result = backend->create_tensor(element::f32, shape_r);
// Overwrite the initial result vector to make sure we're not just coincidentally getting the
// right value.
copy_data(result, vector<float>{2112, 2112, 2112, 2112});
auto handle = backend->compile(f);
handle->call_with_validate({result}, {a, b});
EXPECT_TRUE(test::all_close_f((vector<float>{0, 0, 0, 0}), read_vector<float>(result)));
}
NGRAPH_TEST(${BACKEND_NAME}, matmul_0x2_2x0)
{
Shape shape_a{0, 2};
auto A = make_shared<op::Parameter>(element::f32, shape_a);
Shape shape_b{2, 0};
auto B = make_shared<op::Parameter>(element::f32, shape_b);
Shape shape_r{0, 0};
auto f = make_shared<Function>(make_shared<op::MatMul>(A, B), ParameterVector{A, B});
auto backend = runtime::Backend::create("${BACKEND_NAME}");
// Create some tensors for input/output
auto a = backend->create_tensor(element::f32, shape_a);
copy_data(a, vector<float>{});
auto b = backend->create_tensor(element::f32, shape_b);
copy_data(b, vector<float>{});
auto result = backend->create_tensor(element::f32, shape_r);
auto handle = backend->compile(f);
handle->call_with_validate({result}, {a, b});
EXPECT_TRUE(test::all_close_f((vector<float>{}), read_vector<float>(result)));
}
NGRAPH_TEST(${BACKEND_NAME}, matmul_3x2_2x0)
{
Shape shape_a{3, 2};
auto A = make_shared<op::Parameter>(element::f32, shape_a);
Shape shape_b{2, 0};
auto B = make_shared<op::Parameter>(element::f32, shape_b);
Shape shape_r{3, 0};
auto f = make_shared<Function>(make_shared<op::MatMul>(A, B), ParameterVector{A, B});
auto backend = runtime::Backend::create("${BACKEND_NAME}");
// Create some tensors for input/output
auto a = backend->create_tensor(element::f32, shape_a);
copy_data(a, vector<float>{1, 2, 3, 4, 5, 6});
auto b = backend->create_tensor(element::f32, shape_b);
copy_data(b, vector<float>{});
auto result = backend->create_tensor(element::f32, shape_r);
auto handle = backend->compile(f);
handle->call_with_validate({result}, {a, b});
EXPECT_TRUE(test::all_close_f((vector<float>{}), read_vector<float>(result)));
}
NGRAPH_TEST(${BACKEND_NAME}, matmul_2x2_2x2)
{
Shape shape{2, 2};
auto A = make_shared<op::Parameter>(element::f32, shape);
auto B = make_shared<op::Parameter>(element::f32, shape);
Shape shape_r{2, 2};
auto f = make_shared<Function>(make_shared<op::MatMul>(A, B), ParameterVector{A, B});
auto backend = runtime::Backend::create("${BACKEND_NAME}");
// Create some tensors for input/output
auto a = backend->create_tensor(element::f32, shape);
copy_data(a, vector<float>{1, 2, 3, 4});
auto b = backend->create_tensor(element::f32, shape);
copy_data(b, vector<float>{5, 6, 7, 8});
auto result = backend->create_tensor(element::f32, shape_r);
auto handle = backend->compile(f);
handle->call_with_validate({result}, {a, b});
EXPECT_TRUE(test::all_close_f((vector<float>{19, 22, 43, 50}), read_vector<float>(result)));
}
NGRAPH_TEST(${BACKEND_NAME}, matmul_2x3_3x3)
{
Shape shape_in1{2, 3};
Shape shape_in2{3, 3};
Shape shape_out{2, 3};
auto A = make_shared<op::Parameter>(element::f32, shape_in1);
auto B = make_shared<op::Parameter>(element::f32, shape_in2);
auto matmul = make_shared<op::MatMul>(A, B, false, false);
auto f = make_shared<Function>(matmul, ParameterVector{A, B});
auto backend = runtime::Backend::create("${BACKEND_NAME}");
// Create some tensors for input/output
shared_ptr<runtime::Tensor> a = backend->create_tensor(element::f32, shape_in1);
shared_ptr<runtime::Tensor> b = backend->create_tensor(element::f32, shape_in2);
shared_ptr<runtime::Tensor> result = backend->create_tensor(element::f32, shape_out);
copy_data(a, vector<float>{1.f, 2.f, 3.f, 4.f, 5.f, 6.f});
copy_data(b, vector<float>{1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f});
auto handle = backend->compile(f);
handle->call_with_validate({result}, {a, b});
EXPECT_TRUE(test::all_close_f(read_vector<float>(result),
vector<float>{30.f, 36.f, 42.f, 66.f, 81.f, 96.f}));
}
NGRAPH_TEST(${BACKEND_NAME}, matmul_2x3_3x3_int64)
{
Shape shape_in1{2, 3};
Shape shape_in2{3, 3};
Shape shape_out{2, 3};
auto A = make_shared<op::Parameter>(element::i64, shape_in1);
auto B = make_shared<op::Parameter>(element::i64, shape_in2);
auto matmul = make_shared<op::MatMul>(A, B, false, false);
auto f = make_shared<Function>(matmul, ParameterVector{A, B});
auto backend = runtime::Backend::create("${BACKEND_NAME}");
// Create some tensors for input/output
shared_ptr<runtime::Tensor> a = backend->create_tensor(element::i64, shape_in1);
shared_ptr<runtime::Tensor> b = backend->create_tensor(element::i64, shape_in2);
shared_ptr<runtime::Tensor> result = backend->create_tensor(element::i64, shape_out);
copy_data(a, vector<int64_t>{1, 2, 3, 4, 5, 6});
copy_data(b, vector<int64_t>{1, 2, 3, 4, 5, 6, 7, 8, 9});
auto handle = backend->compile(f);
handle->call_with_validate({result}, {a, b});
EXPECT_TRUE(
test::all_close(read_vector<int64_t>(result), vector<int64_t>{30, 36, 42, 66, 81, 96}));
}
NGRAPH_TEST(${BACKEND_NAME}, matmul_3x2_3x3_transpose)
{
Shape shape_in1{3, 2};
Shape shape_in2{3, 3};
Shape shape_out{2, 3};
auto A = make_shared<op::Parameter>(element::f32, shape_in1);
auto B = make_shared<op::Parameter>(element::f32, shape_in2);
auto matmul = make_shared<op::MatMul>(A, B, true, false);
auto f = make_shared<Function>(matmul, ParameterVector{A, B});
auto backend = runtime::Backend::create("${BACKEND_NAME}");
// Create some tensors for input/output
shared_ptr<runtime::Tensor> a = backend->create_tensor(element::f32, shape_in1);
shared_ptr<runtime::Tensor> b = backend->create_tensor(element::f32, shape_in2);
shared_ptr<runtime::Tensor> result = backend->create_tensor(element::f32, shape_out);
copy_data(a, vector<float>{1.f, 4.f, 2.f, 5.f, 3.f, 6.f});
copy_data(b, vector<float>{1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f});
auto handle = backend->compile(f);
handle->call_with_validate({result}, {a, b});
EXPECT_TRUE(test::all_close_f(read_vector<float>(result),
vector<float>{30.f, 36.f, 42.f, 66.f, 81.f, 96.f}));
}
NGRAPH_TEST(${BACKEND_NAME}, matmul_3x2_2x3_transpose)
{
Shape shape_in1{3, 2};
Shape shape_in2{2, 3};
Shape shape_out{2, 2};
auto A = make_shared<op::Parameter>(element::f32, shape_in1);
auto B = make_shared<op::Parameter>(element::f32, shape_in2);
auto matmul = make_shared<op::MatMul>(A, B, true, true);
auto f = make_shared<Function>(matmul, ParameterVector{A, B});
auto backend = runtime::Backend::create("${BACKEND_NAME}");
// Create some tensors for input/output
shared_ptr<runtime::Tensor> a = backend->create_tensor(element::f32, shape_in1);
shared_ptr<runtime::Tensor> b = backend->create_tensor(element::f32, shape_in2);
shared_ptr<runtime::Tensor> result = backend->create_tensor(element::f32, shape_out);
copy_data(a, vector<float>{1.f, 4.f, 2.f, 5.f, 3.f, 6.f});
copy_data(b, vector<float>{1.f, 3.f, 5.f, 2.f, 4.f, 6.f});
auto handle = backend->compile(f);
handle->call_with_validate({result}, {a, b});
EXPECT_TRUE(
test::all_close_f(read_vector<float>(result), vector<float>{22.f, 28.f, 49.f, 64.f}));
}
// RUN: ngraph-opt %s -convert-ngraph-to-affine -split-input-file | FileCheck %s
// Verify that operations using callbacks are properly converted to standard calls.
// -----
// Softmax Op
// CHECK-LABEL: func @simple_softmax
// CHECK: %[[C1:.*]] = constant 0 : i64
// CHECK: %[[C2:.*]] = constant {{[0-9]+}} : i64
// CHECK: %0 = memref_cast %arg0 : memref<2x3xf32> to memref<*xf32>
// CHECK: %1 = memref_cast %arg2 : memref<2x3xf32> to memref<*xf32>
// CHECK: call @__mlir_callback_1_input(%0, %1, %[[C1]], %[[C2]]) : (memref<*xf32>, memref<*xf32>, i64, i64) -> ()
func @simple_softmax(%arg0: !ng.tensor<2x3xf32>, %arg1: !ng.tensor<1x!ng.i64>) -> !ng.tensor<2x3xf32> {
%0 = "ng.softmax"(%arg0) {axes = [0]} : (!ng.tensor<2x3xf32>) -> !ng.tensor<2x3xf32>
"ng.return"(%0) : (!ng.tensor<2x3xf32>) -> ()
}
// -----
// Gemm Op
// CHECK-LABEL: func @simple_gemm
// CHECK: %[[C1:.*]] = constant 0 : i64
// CHECK: %[[C2:.*]] = constant {{[0-9]+}} : i64
// CHECK: %0 = memref_cast %arg0 : memref<3x6xf32> to memref<*xf32>
// CHECK: %1 = memref_cast %arg1 : memref<6x4xf32> to memref<*xf32>
// CHECK: %2 = memref_cast %arg2 : memref<3x4xf32> to memref<*xf32>
// CHECK: %3 = memref_cast %arg3 : memref<3x4xf32> to memref<*xf32>
// CHECK: call @__mlir_callback_3_inputs(%0, %1, %2, %3, %[[C1]], %[[C2]]) : (memref<*xf32>, memref<*xf32>, memref<*xf32>, memref<*xf32>, i64, i64) -> ()
func @simple_gemm(%arg0: !ng.tensor<3x6xf32>, %arg1: !ng.tensor<6x4xf32>, %arg2: !ng.tensor<3x4xf32>) -> !ng.tensor<3x4xf32> {
%0 = "ng.gemm"(%arg0, %arg1, %arg2) {alpha = 1.000000e+00 : f32, beta = 1.000000e+00 : f32, transA = false, transB = false} : (!ng.tensor<3x6xf32>, !ng.tensor<6x4xf32>, !ng.tensor<3x4xf32>) -> !ng.tensor<3x4xf32>
"ng.return"(%0) : (!ng.tensor<3x4xf32>) -> ()
}
// -----
// MatMul Op
// CHECK-LABEL: func @simple_matmul
// CHECK: %[[C1:.*]] = constant 0 : i64
// CHECK: %[[C2:.*]] = constant {{[0-9]+}} : i64
// CHECK: %0 = memref_cast %arg0 : memref<3x2xf32> to memref<*xf32>
// CHECK: %1 = memref_cast %arg1 : memref<2x3xf32> to memref<*xf32>
// CHECK: %2 = memref_cast %arg2 : memref<2x2xf32> to memref<*xf32>
// CHECK: call @__mlir_callback_2_inputs(%0, %1, %2, %[[C1]], %[[C2]]) : (memref<*xf32>, memref<*xf32>, memref<*xf32>, i64, i64) -> ()
func @simple_matmul(%arg0: !ng.tensor<3x2xf32>, %arg1: !ng.tensor<2x3xf32>) -> !ng.tensor<2x2xf32> {
%0 = "ng.matmul"(%arg0, %arg1) {transposeA = true, transposeB = true} : (!ng.tensor<3x2xf32>, !ng.tensor<2x3xf32>) -> !ng.tensor<2x2xf32>
"ng.return"(%0) : (!ng.tensor<2x2xf32>) -> ()
}
// -----
// AvgPool Op
// CHECK-LABEL: func @simple_avgpool
// CHECK: %0 = memref_cast %arg0 : memref<2x1x3x3xf32> to memref<*xf32>
// CHECK: %1 = memref_cast %arg1 : memref<2x1x3x3xf32> to memref<*xf32>
// CHECK: %[[C1:.*]] = constant 0 : i64
// CHECK: %[[C2:.*]] = constant {{[0-9]+}} : i64
// CHECK: call @__mlir_callback_1_input(%0, %1, %[[C1]], %[[C2]]) : (memref<*xf32>, memref<*xf32>, i64, i64) -> ()
func @simple_avgpool(%arg0: !ng.tensor<2x1x3x3xf32>) -> !ng.tensor<2x1x3x3xf32> {
%0 = "ng.avgPool"(%arg0) {includePadding = true, padAbove = [1, 1], padBelow = [0, 0], windowMovementStrides = [1, 1], windowShape = [2, 2]} : (!ng.tensor<2x1x3x3xf32>) -> !ng.tensor<2x1x3x3xf32>
"ng.return"(%0) : (!ng.tensor<2x1x3x3xf32>) -> ()
}
// -----
// AvgPoolBackprop Op
// CHECK-LABEL: func @simple_avgpoolbackprop
// CHECK: %0 = memref_cast %arg0 : memref<2x2x2x2xf32> to memref<*xf32>
// CHECK: %1 = memref_cast %arg1 : memref<2x2x3x3xf32> to memref<*xf32>
// CHECK: %[[C1:.*]] = constant 0 : i64
// CHECK: %[[C2:.*]] = constant {{[0-9]+}} : i64
// CHECK: call @__mlir_callback_1_input(%0, %1, %[[C1]], %[[C2]]) : (memref<*xf32>, memref<*xf32>, i64, i64) -> ()
func @simple_avgpoolbackprop(%arg0: !ng.tensor<2x2x2x2xf32>) -> !ng.tensor<2x2x3x3xf32> {
%0 = "ng.avgPoolBackprop"(%arg0) {forwardArgShape = [2, 2, 3, 3], includePadding = false, padAbove = [0, 0], padBelow = [0, 0], windowMovementStrides = [1, 1], windowShape = [2, 2]} : (!ng.tensor<2x2x2x2xf32>) -> !ng.tensor<2x2x3x3xf32>
"ng.return"(%0) : (!ng.tensor<2x2x3x3xf32>) -> ()
}
// -----
// MaxPool Op
// CHECK-LABEL: func @simple_maxpool
// CHECK: %0 = memref_cast %arg0 : memref<64x3x7x8x10xf32> to memref<*xf32>
// CHECK: %1 = memref_cast %arg1 : memref<64x3x9x6x5xf32> to memref<*xf32>
// CHECK: %[[C1:.*]] = constant 0 : i64
// CHECK: %[[C2:.*]] = constant {{[0-9]+}} : i64
// CHECK: call @__mlir_callback_1_input(%0, %1, %[[C1]], %[[C2]]) : (memref<*xf32>, memref<*xf32>, i64, i64) -> ()
func @simple_maxpool(%arg0: !ng.tensor<64x3x7x8x10xf32>) -> !ng.tensor<64x3x9x6x5xf32> {
%0 = "ng.maxPool"(%arg0) {padAbove = [6, 4, 5], padBelow = [5, 6, 4], windowMovementStrides = [2, 3, 4], windowShape = [2, 3, 2]} : (!ng.tensor<64x3x7x8x10xf32>) -> !ng.tensor<64x3x9x6x5xf32>
"ng.return"(%0) : (!ng.tensor<64x3x9x6x5xf32>) -> ()
}
// -----
// MaxPoolBackprop Op
// CHECK-LABEL: func @simple_maxpoolbackprop
// CHECK: %0 = memref_cast %arg0 : memref<2x2x5x5xf32> to memref<*xf32>
// CHECK: %1 = memref_cast %arg1 : memref<2x2x4x3xf32> to memref<*xf32>
// CHECK: %2 = memref_cast %arg2 : memref<2x2x5x5xf32> to memref<*xf32>
// CHECK: %[[C1:.*]] = constant 0 : i64
// CHECK: %[[C2:.*]] = constant {{[0-9]+}} : i64
// CHECK: call @__mlir_callback_2_inputs(%0, %1, %2, %[[C1]], %[[C2]]) : (memref<*xf32>, memref<*xf32>, memref<*xf32>, i64, i64) -> ()
func @simple_maxpoolbackprop(%arg0: !ng.tensor<2x2x5x5xf32>, %arg1: !ng.tensor<2x2x4x3xf32>) -> !ng.tensor<2x2x5x5xf32> {
%0 = "ng.maxPoolBackprop"(%arg0, %arg1) {padAbove = [0, 0], padBelow = [0, 0], windowMovementStrides = [1, 1], windowShape = [2, 3]} : (!ng.tensor<2x2x5x5xf32>, !ng.tensor<2x2x4x3xf32>) -> !ng.tensor<2x2x5x5xf32>
"ng.return"(%0) : (!ng.tensor<2x2x5x5xf32>) -> ()
}