Commit 6de4893b authored by Nagy Mostafa, committed by Sang Ik Lee

[MLIR] In-place memory optimization for elt-wise and concat ops. (#3832)

* AliasMap WIP

* Added liveness info

* WIP

* WIP: Tests

* WIP: LIT tests

* Added knobs for mem optimization pass. More LIT tests

* Revert affine_lowerer change

* More elaborate comment

* Minor fixes

* style-apply

* Rename liveness

* Switch to Analysis framework

* Fix optimization conditions

* Remove LIT test

* style

* Switch to equivalence relationship impl of non-alias relationship

* refined comment

* Switch non-alias to equivalence relationship

* Fix bad merge

* Adding tests. WIP

* Added buffer size tracking and unit-tests

* Added LIT and unit-tests

* Turn optimization ON

* style

* fix unit-tests

* Fix useCount

* Fix copyright and typo

* Refine few comments, remove new lines

* style fix
Co-authored-by: Scott Cyphers <diyessi@users.noreply.github.com>
Co-authored-by: Sang Ik Lee <sang.ik.lee@intel.com>
parent b3db038e
@@ -25,7 +25,7 @@ add_subdirectory(tools/ngraph-opt)
set(SRC
backend/cpu/cpu_backend.cpp
backend/pass/affine_lowerer.cpp
backend/pass/memory_optimization.cpp
backend/analysis/memory_analysis.cpp
core/compiler.cpp
core/ngraph_dialect/dialect.cpp
core/ngraph_dialect/type.cpp
......
This diff is collapsed.
@@ -20,8 +20,60 @@
#pragma once
#include <mlir/Pass/Pass.h>
#include <unordered_map>
#include "ngraph/check.hpp"
namespace mlir
{
std::unique_ptr<Pass> createMemoryOptimizationPass();
// BufferInfo
struct BufferInfo
{
// Buffer Id. If -1 then invalid buffer.
int m_bufferId;
// Offset into the buffer
int m_offset;
bool isValid() const { return m_bufferId != -1; }
};
struct MemoryAnalysis
{
using BufferInfoMap = std::unordered_map<Operation*, BufferInfo>;
using BufferSizeMap = std::unordered_map<unsigned, unsigned>;
// Compute this analysis with the provided operation.
MemoryAnalysis(Operation* op);
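// Returns the buffer assignment for op, or an invalid BufferInfo if none was recorded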
BufferInfo getBufferInfo(Operation* op)
{
auto it = m_bufferInfo.find(op);
if (it == m_bufferInfo.end())
{
return {-1, -1};
}
return it->second;
}
void setBufferInfo(Operation* op, BufferInfo bufferInfo) { m_bufferInfo[op] = bufferInfo; }
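// Records the size in bytes needed for bufferId; if a size was already recorded, keeps the maximum of the two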
void setBufferSize(unsigned bufferId, unsigned size)
{
auto it = m_bufferSize.find(bufferId);
if (it != m_bufferSize.end())
{
it->second = (size > it->second) ? size : it->second;
}
else
{
m_bufferSize[bufferId] = size;
}
}
unsigned getBufferSize(unsigned bufferId)
{
auto it = m_bufferSize.find(bufferId);
NGRAPH_CHECK(it != m_bufferSize.end(), "Buffer has no size!");
return it->second;
}
private:
// Records assignment of BufferInfo to each inplace op
BufferInfoMap m_bufferInfo;
// Records buffer size required for each buffer id in bytes
BufferSizeMap m_bufferSize;
};
}
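For context, here is a minimal sketch (not part of this commit) of how a client can query the analysis. The helper name assignBuffers and its body are illustrative only; MemoryAnalysis, BufferInfo, getBufferInfo and getBufferSize are the APIs declared above, and the header itself is assumed to be included.

#include <mlir/IR/Function.h>

// Illustrative only: walk a function and inspect the buffer assignments
// produced by MemoryAnalysis.
void assignBuffers(mlir::FuncOp f, mlir::MemoryAnalysis& analysis)
{
    f.walk([&](mlir::Operation* op) {
        mlir::BufferInfo info = analysis.getBufferInfo(op);
        if (!info.isValid())
        {
            // No in-place assignment; this op gets its own allocation.
            return;
        }
        // Ops sharing info.m_bufferId are lowered as views over one linear
        // i8 buffer of at least getBufferSize(m_bufferId) bytes, each view
        // starting at its own element offset info.m_offset.
        unsigned bytes = analysis.getBufferSize(info.m_bufferId);
        (void)bytes; // a real client would create/size the shared buffer here
    });
}

In the commit itself, DialectLoweringPass obtains the analysis with getAnalysis<MemoryAnalysis>() and consumes it in buildOutputDefs and createTempBuffer, as shown in the affine_lowerer.cpp diff below.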
@@ -19,7 +19,6 @@
#include "cpu_backend.hpp"
#include "contrib/mlir/backend/pass/affine_lowerer.hpp"
#include "contrib/mlir/backend/pass/memory_optimization.hpp"
#include "contrib/mlir/utils.hpp"
#include "ngraph/check.hpp"
@@ -160,7 +159,6 @@ void MLIRCPUBackend::init()
void MLIRCPUBackend::codegen()
{
optimizeNgDialect();
lowerNgDialect();
}
@@ -261,18 +259,3 @@ void MLIRCPUBackend::optimizeAffineDialect()
// Run Std dialect optimizations.
// TODO
}
void MLIRCPUBackend::optimizeNgDialect()
{
mlir::PassManager pm(&m_context);
mlir::applyPassManagerCLOptions(pm);
if (clEnableNgInPlaceMemoryOpt)
{
pm.addPass(mlir::createMemoryOptimizationPass());
}
if (failed(pm.run(m_module.get())))
{
NGRAPH_CHECK(false, "MLIR pass manager failed");
}
}
@@ -19,11 +19,13 @@
#include "affine_lowerer.hpp"
#include "contrib/mlir/backend/analysis/memory_analysis.hpp"
#include "contrib/mlir/core/ngraph_dialect/ops.hpp"
#include "contrib/mlir/core/ngraph_dialect/type.hpp"
#include "ngraph/assertion.hpp"
#include <llvm/ADT/DenseSet.h>
#include <llvm/Support/Debug.h>
#include <mlir/EDSC/Builders.h>
#include <mlir/EDSC/Helpers.h>
#include <mlir/EDSC/Intrinsics.h>
@@ -165,6 +167,8 @@ namespace
ValueHandle createZeroConstant(mlir::Type type);
ValueHandle createOneConstant(mlir::Type type);
bool isInPlaceConcat(mlir::Operation* op, DialectLoweringPass& pass);
/// Conversion from types in the nGraph dialect to the Standard dialect.
class NGraphTypeConverter : public TypeConverter
{
@@ -184,29 +188,25 @@ namespace
void runOnModule() override;
SmallVector<Value*, 4> buildOutputDefs(Operation* op, PatternRewriter& rewriter);
/// Allocates a linear buffer for a temporary tensor
Value* createTempBuffer(Type type, PatternRewriter& rewriter);
/// Allocates a linear byte buffer whose underlying memory can be shared by
/// multiple temporary memrefs. Used in conjunction with createTempMemref
Value* createTempBuffer(int bufferId, PatternRewriter& rewriter);
/// Creates an allocation or view of a memref.
/// type MemRef Type
/// buffer Optional buffer value to create view over
/// offset Optional offset into the buffer this view starts at
///
/// If buffer is null, a new allocation of a memref is created.
/// Offset is ignored. If buffer is non-null, then we create a temp
/// view over a pre-allocated buffer (see createTempBuffer)
/// If buffer is null, it allocates a memref directly and offset is ignored.
/// Otherwise, it creates a view over the pre-allocated buffer at the given offset.
Value*
createTempMemref(Type type, Value* buffer, unsigned offset, PatternRewriter& rewriter);
/// Inserts dealloc Ops for each temporary allocated by AllocOp
void insertDeallocs(PatternRewriter& rewriter);
NGraphTypeConverter& getTypeConverter() { return typeConverter; }
MemoryAnalysis* getMemAnalysis() const { return m_memAnalysis; }
private:
/// Collect a set of patterns to convert from the nGraph dialect to Affine dialect.
void populateNGraphToAffineConversionPatterns(OwningRewritePatternList& patterns);
void findOutputValues();
void insertNoAliasArgAttrs();
@@ -219,7 +219,7 @@ namespace
// Track pre-assigned buffers for each Value and re-use it if one is available.
using IdToMemRefMap = std::unordered_map<unsigned, Value*>;
IdToMemRefMap m_id_to_memref;
MemoryAnalysis* m_memAnalysis;
// TODO: Workaround for findOutputValues and buildOutputDefs. See NGCPU-470.
std::string funcName;
};
@@ -232,6 +232,9 @@ namespace
populateNGraphToAffineConversionPatterns(patterns);
// Get Memory analysis for in-place memory optimizations
m_memAnalysis = &getAnalysis<MemoryAnalysis>();
// Create target that defines legal ops for nGraph dialect to be lowered to.
ConversionTarget target(getContext());
@@ -336,24 +339,25 @@ namespace
// will re-use the same buffer.
auto tensorType = origResult->getType().cast<NGTensorType>();
Value* newResult = nullptr;
Attribute bufferIdAttr = getBufferId(op);
auto bufferInfo = m_memAnalysis->getBufferInfo(op);
Type memRefType = typeConverter.convertType(tensorType);
Value* bufferValue = nullptr;
if (!bufferIdAttr)
if (!bufferInfo.isValid())
{
// Allocate new memref
newResult = createTempMemref(memRefType, nullptr, 0, rewriter);
}
else
{
unsigned bufferId = bufferIdAttr.cast<IntegerAttr>().getInt();
unsigned bufferId = bufferInfo.m_bufferId;
unsigned offset = bufferInfo.m_offset;
// Re-use a buffer if it exists, else create a new one and update the map
IdToMemRefMap::iterator it = m_id_to_memref.find(bufferId);
if (it == m_id_to_memref.end())
{
// create a new buffer
bufferValue = createTempBuffer(memRefType, rewriter);
bufferValue = createTempBuffer(bufferId, rewriter);
m_id_to_memref[bufferId] = bufferValue;
}
else
@@ -361,7 +365,7 @@ namespace
bufferValue = it->second;
}
// Create a temp view over the linear buffer
newResult = createTempMemref(memRefType, bufferValue, 0, rewriter);
newResult = createTempMemref(memRefType, bufferValue, offset, rewriter);
}
NGRAPH_CHECK(newResult != nullptr, "Temp memref value is not set");
newResults.push_back(newResult);
@@ -370,18 +374,17 @@ namespace
return newResults;
}
Value* DialectLoweringPass::createTempBuffer(Type type, PatternRewriter& rewriter)
Value* DialectLoweringPass::createTempBuffer(int bufferId, PatternRewriter& rewriter)
{
MemRefType memRefType = type.cast<MemRefType>();
NGRAPH_CHECK(memRefType.hasStaticShape(), "Dynamic shapes are not supported");
// deduce linear buffer shape
unsigned sizeInBytes = memRefType.getSizeInBits() / 8;
unsigned sizeInBytes = getMemAnalysis()->getBufferSize(bufferId);
NGRAPH_CHECK(bufferId >= 0, "Invalid buffer id to allocate");
NGRAPH_CHECK(sizeInBytes > 0, "Zero buffer allocation?");
LLVM_DEBUG(llvm::dbgs() << "Allocating buffer of size " << sizeInBytes << " bytes\n");
MemRefType bufferType =
MemRefType::get({sizeInBytes}, IntegerType::get(8, type.getContext()), {});
MemRefType::get({sizeInBytes}, IntegerType::get(8, rewriter.getContext()), {});
// TODO: Set alignment
Value* alloc = rewriter.create<mlir::AllocOp>(rewriter.getUnknownLoc(), bufferType);
memRefsToDealloc.push_back(alloc);
@@ -404,7 +407,6 @@ namespace
unsigned offset,
PatternRewriter& rewriter)
{
NGRAPH_CHECK(offset == 0, "Only zero offset is supported");
MemRefType memRefType = type.cast<MemRefType>();
if (buffer)
{
@@ -414,7 +416,7 @@ namespace
// linear
// buffer
// This is simply (d0, d1, d2, .. dN-1) --> d0 * S0 + d1 * S1 ... + dN-1 * SN-1
// Where Si is the stride along the i_th dimension
// Where Si is the stride along the i_th dimension in elements
auto shape = memRefType.getShape();
SmallVector<int64_t, 4> strides(shape.size(), 0);
strides[shape.size() - 1] = 1;
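To make this map concrete (an illustrative instance, matching the LIT checks at the end of this diff): a memref<2x2xf32> view placed at element offset 4 in the shared buffer, i.e. 16 bytes in, right after another 2x2xf32, gets the index map (d0, d1) -> (d0 * 2 + d1 + 4), while the view at offset 0 keeps (d0, d1) -> (d0 * 2 + d1); these are exactly the #MAP1 and #MAP0 patterns checked in the memory-optimization LIT test below.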
@@ -1503,6 +1505,71 @@ namespace
}
NGRAPH_UNREACHABLE("Unsupported type");
}
// Given a concat op, check whether the destination and the operands have
// a valid buffer/offset assignment that makes this op a valid
// in-place concat
bool isInPlaceConcat(mlir::Operation* op, DialectLoweringPass& pass)
{
NGRAPH_CHECK(isa<NGConcatOp>(op), "Expecting concat operation");
auto concat = cast<NGConcatOp>(op);
auto concatAxis = concat.concatenation_axis();
auto result = concat.getResult();
auto shape = (result->getType().cast<NGTensorType>()).getShape();
auto memAnalysis = pass.getMemAnalysis();
BufferInfo bufferInfo = memAnalysis->getBufferInfo(op);
if (!bufferInfo.isValid())
{
// no buffer assignment to dst, nothing to do
return false;
}
auto dstBufferId = bufferInfo.m_bufferId;
auto dstOffset = bufferInfo.m_offset;
LLVM_DEBUG(llvm::dbgs() << ">> Check in-place concat\n");
LLVM_DEBUG(op->dump());
for (auto i = 0; i < shape.size(); i++)
{
if (i == concatAxis)
{
break;
}
if (shape[i] != 1)
{
LLVM_DEBUG(llvm::dbgs() << "Axis FAIL. Skipping instruction\n");
return false;
}
}
LLVM_DEBUG(llvm::dbgs() << "Axis OK\n");
// Check if the buffer id and offsets are consistent with what's expected
LLVM_DEBUG(llvm::dbgs() << "Dst (id, offset) = (" << dstBufferId << ", " << dstOffset
<< ")\n");
// relative offset in the buffer
int opndOffset = 0;
for (auto opnd : op->getOperands())
{
bufferInfo = memAnalysis->getBufferInfo(opnd->getDefiningOp());
auto srcBufferId = bufferInfo.m_bufferId;
auto srcOffset = bufferInfo.m_offset;
LLVM_DEBUG(llvm::dbgs() << "Src (id, offset) = (" << srcBufferId << ", " << srcOffset
<< ")\n");
if (!bufferInfo.isValid() || srcBufferId != dstBufferId ||
srcOffset != (opndOffset + dstOffset))
{
// mismatch in buffer IDs or offsets
LLVM_DEBUG(llvm::dbgs() << "Buffer ID and Offsets FAIL. Skipping instruction\n");
return false;
}
auto tensorType = opnd->getType().cast<NGTensorType>();
opndOffset += tensorType.getNumElements();
}
LLVM_DEBUG(llvm::dbgs() << "Buffer ID and Offsets OK\n");
return true;
}
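A worked instance of this check (illustrative numbers): concatenating two 1x2x2xf32 operands along axis 1 yields a 1x4x2 destination. If the destination is assigned (bufferId, offset) = (b, 0), the check passes only when the first operand is assigned (b, 0) and the second (b, 4), since each operand contributes getNumElements() = 4 to opndOffset. Any other assignment, or a missing one, means the concat is not done in place and the operands are copied into the destination instead. This is the situation exercised by test2 in the LIT file at the end of this diff.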
} // namespace
namespace mlir
......
//*****************************************************************************
// Copyright 2017-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
// NOTE: This file follows nGraph format style and MLIR naming convention since it does
// not expose public API to the rest of nGraph codebase and heavily depends on MLIR API.
#include "contrib/mlir/core/compiler.hpp"
#include "contrib/mlir/core/ngraph_dialect/ops.hpp"
#include "contrib/mlir/core/ngraph_dialect/type.hpp"
#include "ngraph/assertion.hpp"
#include <llvm/ADT/DenseSet.h>
#include <map>
#include <mlir/EDSC/Builders.h>
#include <mlir/EDSC/Helpers.h>
#include <mlir/EDSC/Intrinsics.h>
#include <mlir/IR/AffineExpr.h>
#include <mlir/IR/IntegerSet.h>
#include <mlir/IR/MLIRContext.h>
#include <mlir/IR/StandardTypes.h>
#include <mlir/Pass/Pass.h>
#include <mlir/Transforms/DialectConversion.h>
// anonymous namespace
// no need to expose any of the following outside of this file
namespace
{
using namespace ngraph::runtime;
using namespace ngraph::runtime::ngmlir;
using namespace mlir;
/// Memory Optimization pass
/// - Tries to perform operations in place where applicable by assigning a virtual buffer ID
/// to values. Those are used later in affine lowering pass to create or re-use memrefs
class MemoryOptimizationPass : public mlir::FunctionPass<MemoryOptimizationPass>
{
public:
MemoryOptimizationPass()
{
m_inplaceOps = {
#define MLIR_OP(OP, INPLACE) {OP::getOperationName().str(), INPLACE},
#include "contrib/mlir/backend/pass/op_lowerers.inc"
};
}
void runOnFunction() override;
private:
bool isSafeInPlace(mlir::Operation* op);
std::unordered_map<std::string, bool> m_inplaceOps;
static unsigned bufferId;
};
unsigned MemoryOptimizationPass::bufferId = 0;
void MemoryOptimizationPass::runOnFunction()
{
auto f = getFunction();
f.walk([&](mlir::Operation* op) {
if (!isSafeInPlace(op))
{
return;
}
if (op->getNumResults() > 1)
{
return;
}
auto defVal = op->getResult(0);
// If the defined value is an output of the sub-graph, cannot do it in place
for (auto use = defVal->use_begin(); use != defVal->use_end(); use++)
{
auto useOp = use->getOwner();
if (isa<NGReturnOp>(useOp))
{
return;
}
}
// Check if we can re-use the buffer of any of the inputs. Conjunction of the following:
// - single use value or all uses in the current op
// - not an input argument
// TODO: Check instead if last post-dominating (dataflow-wise) use.
for (auto opnd = op->operand_begin(); opnd != op->operand_end(); opnd++)
{
auto val = *opnd;
// we optimize if the val has one use or if all uses are in the current op
bool optimize;
optimize = val->hasOneUse();
if (!optimize)
{
optimize = true;
// check if all uses are in the current op
for (auto use = val->use_begin(); use != val->use_end(); use++)
{
if (use->getOwner() != op)
{
optimize = false;
}
}
}
if (optimize)
{
// do we have a buffer id attached to this value
auto defOp = val->getDefiningOp();
// If no defining op, then this is a block arg, skip operand
if (!defOp)
{
continue;
}
IntegerAttr attr = getBufferId(defOp);
if (!attr)
{
// attach a new buffer id
attr = setBufferId(defOp, this->bufferId++);
}
// propagate attribute to dst, and we are done
setBufferId(op, attr);
return;
}
}
});
}
bool MemoryOptimizationPass::isSafeInPlace(mlir::Operation* op)
{
auto it = m_inplaceOps.find(op->getName().getStringRef().str());
return it != m_inplaceOps.end() ? it->second : false;
}
}
namespace mlir
{
std::unique_ptr<Pass> createMemoryOptimizationPass()
{
return std::make_unique<MemoryOptimizationPass>();
}
} // namespace mlir
@@ -27,7 +27,7 @@
MLIR_OP(NGAddOp , true )
MLIR_OP(NGArgMaxRedOp , false )
MLIR_OP(NGArgMinRedOp , false )
MLIR_OP(NGConcatOp , false )
MLIR_OP(NGConcatOp , true )
MLIR_OP(NGConvolutionOp , false )
MLIR_OP(NGDivOp , true )
MLIR_OP(NGDotOp , false )
......
@@ -309,28 +309,6 @@ mlir::LogicalResult verifyOp(NGConvolutionOp* op)
return mlir::success();
}
static std::string getBufferIdAttrName()
{
return "ng.buffer_id";
}
void setBufferId(mlir::Operation* op, mlir::IntegerAttr attr)
{
op->setAttr(getBufferIdAttrName(), attr);
}
mlir::IntegerAttr setBufferId(mlir::Operation* op, unsigned val)
{
auto attr = mlir::IntegerAttr::get(IntegerType::get(32, op->getContext()), val);
setBufferId(op, attr);
return attr;
}
mlir::IntegerAttr getBufferId(mlir::Operation* op)
{
return op->getAttrOfType<mlir::IntegerAttr>(getBufferIdAttrName());
}
namespace mlir
{
#include "ops_interfaces.cpp.inc"
......
@@ -41,7 +41,3 @@ namespace mlir
#include "ops.h.inc"
#undef GET_OP_CLASSES
}
void setBufferId(mlir::Operation* op, mlir::IntegerAttr attr);
mlir::IntegerAttr setBufferId(mlir::Operation* op, unsigned val);
mlir::IntegerAttr getBufferId(mlir::Operation* op);
@@ -349,6 +349,143 @@ NGRAPH_TEST(${BACKEND_NAME}, concat_in_place_propagate_2d_tensor)
(vector<float>{3, 7, 2}), read_vector<float>(result), MIN_FLOAT_TOLERANCE_BITS));
}
NGRAPH_TEST(${BACKEND_NAME}, concat_in_place_tree_1)
{
Shape shape{1, 2, 2};
Shape shape_r{1, 4, 2};
auto A = make_shared<op::Parameter>(element::f32, shape);
auto B = make_shared<op::Parameter>(element::f32, shape);
auto add1 = make_shared<op::Add>(A, B);
auto add2 = make_shared<op::Add>(A, B);
auto concat = make_shared<op::Concat>(NodeVector{add1, add2}, 1);
auto f = make_shared<Function>(make_shared<op::Add>(concat, concat), ParameterVector{A, B});
auto backend = runtime::Backend::create("${BACKEND_NAME}");
// Create some tensors for input/output
auto a = backend->create_tensor(element::f32, shape);
copy_data(a, vector<float>{1, 1, 1, 1});
auto b = backend->create_tensor(element::f32, shape);
copy_data(b, vector<float>{1, 1, 1, 1});
auto result = backend->create_tensor(element::f32, shape_r);
auto handle = backend->compile(f);
handle->call_with_validate({result}, {a, b});
vector<float> expected;
expected.resize(8, 4);
EXPECT_TRUE(test::all_close_f(expected, read_vector<float>(result), MIN_FLOAT_TOLERANCE_BITS));
}
NGRAPH_TEST(${BACKEND_NAME}, concat_in_place_tree_2)
{
Shape shape{1, 2, 2};
Shape shape_r{1, 8, 2};
auto A = make_shared<op::Parameter>(element::f32, shape);
auto B = make_shared<op::Parameter>(element::f32, shape);
auto add1 = make_shared<op::Add>(A, B);
auto add2 = make_shared<op::Add>(A, B);
auto concat1 = make_shared<op::Concat>(NodeVector{add1, add2}, 1);
auto concat2 = make_shared<op::Concat>(NodeVector{add1, add2}, 1);
auto concat12 = make_shared<op::Concat>(NodeVector{concat1, concat2}, 1);
auto f = make_shared<Function>(make_shared<op::Add>(concat12, concat12), ParameterVector{A, B});
auto backend = runtime::Backend::create("${BACKEND_NAME}");
// Create some tensors for input/output
auto a = backend->create_tensor(element::f32, shape);
copy_data(a, vector<float>{1, 1, 1, 1});
auto b = backend->create_tensor(element::f32, shape);
copy_data(b, vector<float>{1, 1, 1, 1});
auto result = backend->create_tensor(element::f32, shape_r);
auto handle = backend->compile(f);
handle->call_with_validate({result}, {a, b});
vector<float> expected;
expected.resize(16, 4);
EXPECT_TRUE(test::all_close_f(expected, read_vector<float>(result), MIN_FLOAT_TOLERANCE_BITS));
}
NGRAPH_TEST(${BACKEND_NAME}, concat_in_place_tree_3)
{
Shape shape{1, 2, 2};
Shape shape_r{1, 16, 2};
auto A = make_shared<op::Parameter>(element::f32, shape);
auto B = make_shared<op::Parameter>(element::f32, shape);
auto concat1 = make_shared<op::Concat>(NodeVector{A, B}, 1);
auto concat2 = make_shared<op::Concat>(NodeVector{A, B}, 1);
auto concat3 = make_shared<op::Concat>(NodeVector{A, B}, 1);
auto concat4 = make_shared<op::Concat>(NodeVector{A, B}, 1);
auto concat12 = make_shared<op::Concat>(NodeVector{concat1, concat2}, 1);
auto concat34 = make_shared<op::Concat>(NodeVector{concat3, concat4}, 1);
auto concat14 = make_shared<op::Concat>(NodeVector{concat12, concat34}, 1);
auto f = make_shared<Function>(make_shared<op::Add>(concat14, concat14), ParameterVector{A, B});
auto backend = runtime::Backend::create("${BACKEND_NAME}");
// Create some tensors for input/output
auto a = backend->create_tensor(element::f32, shape);
copy_data(a, vector<float>{1, 1, 1, 1});
auto b = backend->create_tensor(element::f32, shape);
copy_data(b, vector<float>{1, 1, 1, 1});
auto result = backend->create_tensor(element::f32, shape_r);
auto handle = backend->compile(f);
handle->call_with_validate({result}, {a, b});
vector<float> expected;
expected.resize(32, 2);
EXPECT_TRUE(test::all_close_f(expected, read_vector<float>(result), MIN_FLOAT_TOLERANCE_BITS));
}
NGRAPH_TEST(${BACKEND_NAME}, concat_in_place_add_concat)
{
Shape shape{2, 2};
Shape shape_r{4, 2};
auto A = make_shared<op::Parameter>(element::f32, shape);
auto B = make_shared<op::Parameter>(element::f32, shape);
auto add1 = make_shared<op::Add>(A, B);
auto add2 = make_shared<op::Add>(add1, add1);
auto concat = make_shared<op::Concat>(NodeVector{add1, add2}, 0);
auto add3 = make_shared<op::Add>(concat, concat);
auto f = make_shared<Function>(add3, ParameterVector{A, B});
auto backend = runtime::Backend::create("${BACKEND_NAME}");
auto a = backend->create_tensor(element::f32, shape);
copy_data(a, vector<float>{1, 1, 1, 1});
auto b = backend->create_tensor(element::f32, shape);
copy_data(b, vector<float>{1, 1, 1, 1});
auto result = backend->create_tensor(element::f32, shape_r);
auto handle = backend->compile(f);
handle->call_with_validate({result}, {a, b});
vector<float> expected = {4, 4, 4, 4, 8, 8, 8, 8};
EXPECT_TRUE(test::all_close_f(expected, read_vector<float>(result), MIN_FLOAT_TOLERANCE_BITS));
}
NGRAPH_TEST(${BACKEND_NAME}, concat_in_place_add_concat_2)
{
Shape shape{1, 2, 2};
Shape shape_r{1, 6, 2};
auto A = make_shared<op::Parameter>(element::f32, shape);
auto B = make_shared<op::Parameter>(element::f32, shape);
auto add1 = make_shared<op::Add>(A, B);
auto add2 = make_shared<op::Add>(A, B);
auto add3 = make_shared<op::Add>(A, B);
auto add4 = make_shared<op::Add>(A, B);
auto add5 = make_shared<op::Add>(A, B);
auto concat1 = make_shared<op::Concat>(NodeVector{add1, add2, add3}, 1);
auto concat2 = make_shared<op::Concat>(NodeVector{add4, add2, add5}, 1);
auto add6 = make_shared<op::Add>(concat1, concat2);
auto f = make_shared<Function>(add6, ParameterVector{A, B});
auto backend = runtime::Backend::create("${BACKEND_NAME}");
auto a = backend->create_tensor(element::f32, shape);
copy_data(a, vector<float>{1, 1, 1, 1});
auto b = backend->create_tensor(element::f32, shape);
copy_data(b, vector<float>{1, 1, 1, 1});
auto result = backend->create_tensor(element::f32, shape_r);
auto handle = backend->compile(f);
handle->call_with_validate({result}, {a, b});
vector<float> expected = {4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4};
EXPECT_TRUE(test::all_close_f(expected, read_vector<float>(result), MIN_FLOAT_TOLERANCE_BITS));
}
// from numpy import *
// a=linspace(1,2*3*4*3*2,2*3*4*3*2)
// b=linspace(1000+1,1000+2*3*3*3*2,2*3*3*3*2)
......
// RUN: ngraph-opt %s --split-input-file --ngraph-memory-opt --ngraph-memory-opt-concat --ngraph-memory-opt-eltwise -convert-ngraph-to-affine | FileCheck %s
// CHECK-DAG: #[[MAP0:[a-zA-Z0-9]+]] = (d0, d1) -> (d0 * 2 + d1)
// CHECK-LABEL: test0
// CHECK: %[[B:.*]] = alloc() : memref<16xi8>
// CHECK: std.view %[[B]][][] : memref<16xi8> to memref<2x2xf32, #[[MAP0]]>
// CHECK: std.view %[[B]][][] : memref<16xi8> to memref<2x2xf32, #[[MAP0]]>
// CHECK: std.view %[[B]][][] : memref<16xi8> to memref<2x2xf32, #[[MAP0]]>
// CHECK: dealloc %[[B]] : memref<16xi8>
func @test0(%arg0: !ng.tensor<2x2xf32>, %arg1: !ng.tensor<2x2xf32>) -> !ng.tensor<2x2xf32> {
%0 = "ng.add"(%arg0, %arg1) : (!ng.tensor<2x2xf32>, !ng.tensor<2x2xf32>) -> !ng.tensor<2x2xf32>
%1 = "ng.add"(%0, %0) : (!ng.tensor<2x2xf32>, !ng.tensor<2x2xf32>) -> !ng.tensor<2x2xf32>
%2 = "ng.add"(%1, %1) : (!ng.tensor<2x2xf32>, !ng.tensor<2x2xf32>) -> !ng.tensor<2x2xf32>
%3 = "ng.add"(%2, %2) : (!ng.tensor<2x2xf32>, !ng.tensor<2x2xf32>) -> !ng.tensor<2x2xf32>
"ng.return"(%3) : (!ng.tensor<2x2xf32>) -> ()
}
// -----
// CHECK-DAG: #[[MAP0:[a-zA-Z0-9]+]] = (d0, d1) -> (d0 * 2 + d1)
// CHECK-DAG: #[[MAP1:[a-zA-Z0-9]+]] = (d0, d1) -> (d0 * 2 + d1 + 4)
// CHECK-LABEL: test1
// CHECK: %[[B:.*]] = alloc() : memref<32xi8>
// CHECK: std.view %[[B]][][] : memref<32xi8> to memref<2x2xf32, #[[MAP0]]>
// CHECK: std.view %[[B]][][] : memref<32xi8> to memref<2x2xf32, #[[MAP1]]>
// CHECK: std.view %[[B]][][] : memref<32xi8> to memref<4x2xf32, #[[MAP0]]>
// CHECK: dealloc %[[B]] : memref<32xi8>
func @test1(%arg0: !ng.tensor<2x2xf32>, %arg1: !ng.tensor<2x2xf32>) -> !ng.tensor<4x2xf32> {
%0 = "ng.add"(%arg0, %arg1) : (!ng.tensor<2x2xf32>, !ng.tensor<2x2xf32>) -> !ng.tensor<2x2xf32>
%1 = "ng.add"(%0, %0) : (!ng.tensor<2x2xf32>, !ng.tensor<2x2xf32>) -> !ng.tensor<2x2xf32>
%2 = "ng.concat"(%0, %1) {concatenation_axis = 0} : (!ng.tensor<2x2xf32>, !ng.tensor<2x2xf32>) -> !ng.tensor<4x2xf32>
%3 = "ng.add"(%2, %2) : (!ng.tensor<4x2xf32>, !ng.tensor<4x2xf32>) -> !ng.tensor<4x2xf32>
"ng.return"(%3) : (!ng.tensor<4x2xf32>) -> ()
}
// -----
// CHECK-DAG: #[[MAP0:[a-zA-Z0-9]+]] = (d0, d1, d2) -> (d0 * 4 + d1 * 2 + d2)
// CHECK-DAG: #[[MAP1:[a-zA-Z0-9]+]] = (d0, d1, d2) -> (d0 * 4 + d1 * 2 + d2 + 4)
// CHECK-DAG: #[[MAP2:[a-zA-Z0-9]+]] = (d0, d1, d2) -> (d0 * 8 + d1 * 2 + d2)
// CHECK-DAG: #[[MAP3:[a-zA-Z0-9]+]] = (d0, d1, d2) -> (d0 * 16 + d1 * 2 + d2)
// CHECK-LABEL: test2
// CHECK: %[[B1:.*]] = alloc() : memref<32xi8>
// CHECK: std.view %[[B1]][][] : memref<32xi8> to memref<1x2x2xf32, #[[MAP0]]>
// CHECK: std.view %[[B1]][][] : memref<32xi8> to memref<1x2x2xf32, #[[MAP1]]>
// CHECK: std.view %[[B1]][][] : memref<32xi8> to memref<1x4x2xf32, #[[MAP2]]>
// CHECK: %[[B2:.*]] = alloc() : memref<64xi8>
// CHECK: std.view %[[B2]][][] : memref<64xi8> to memref<1x8x2xf32, #[[MAP3]]>
// CHECK: std.view %[[B2]][][] : memref<64xi8> to memref<1x8x2xf32, #[[MAP3]]>
func @test2(%arg0: !ng.tensor<1x2x2xf32>, %arg1: !ng.tensor<1x2x2xf32>) -> (!ng.tensor<1x4x2xf32>, !ng.tensor<1x8x2xf32>){
%0 = "ng.add"(%arg0, %arg1) : (!ng.tensor<1x2x2xf32>, !ng.tensor<1x2x2xf32>) -> !ng.tensor<1x2x2xf32>
%1 = "ng.add"(%arg0, %arg1) : (!ng.tensor<1x2x2xf32>, !ng.tensor<1x2x2xf32>) -> !ng.tensor<1x2x2xf32>
// inplace
%2 = "ng.concat"(%0, %1) {concatenation_axis = 1} : (!ng.tensor<1x2x2xf32>, !ng.tensor<1x2x2xf32>) -> !ng.tensor<1x4x2xf32>
// cannot be done inplace, %3 and %2 cannot alias
%3 = "ng.concat"(%0, %1, %2) {concatenation_axis = 1} : (!ng.tensor<1x2x2xf32>, !ng.tensor<1x2x2xf32>, !ng.tensor<1x4x2xf32>) -> !ng.tensor<1x8x2xf32>
// inplace destructive. %3 and %2 cannot alias
%4 = "ng.add"(%3, %3) : (!ng.tensor<1x8x2xf32>, !ng.tensor<1x8x2xf32>) -> !ng.tensor<1x8x2xf32>
// no inplace, result is output
%5 = "ng.add"(%2, %2) : (!ng.tensor<1x4x2xf32>, !ng.tensor<1x4x2xf32>) -> !ng.tensor<1x4x2xf32>
// no inplace, result is output
%6 = "ng.add"(%4, %4) : (!ng.tensor<1x8x2xf32>, !ng.tensor<1x8x2xf32>) -> !ng.tensor<1x8x2xf32>
"ng.return"(%5, %6) : (!ng.tensor<1x4x2xf32>, !ng.tensor<1x8x2xf32>) -> ()
}
// -----
// CHECK-DAG: #[[MAP0:[a-zA-Z0-9]+]] = (d0, d1, d2) -> (d0 * 8 + d1 * 2 + d2)
// CHECK-DAG: #[[MAP8:[a-zA-Z0-9]+]] = (d0, d1, d2) -> (d0 * 8 + d1 * 2 + d2 + 8)
// CHECK-DAG: #[[MAP9:[a-zA-Z0-9]+]] = (d0, d1, d2) -> (d0 * 8 + d1 * 2 + d2 + 16)
// CHECK-DAG: #[[MAP10:[a-zA-Z0-9]+]] = (d0, d1, d2) -> (d0 * 8 + d1 * 2 + d2 + 24)
// CHECK-DAG: #[[MAP11:[a-zA-Z0-9]+]] = (d0, d1, d2) -> (d0 * 16 + d1 * 2 + d2)
// CHECK-DAG: #[[MAP12:[a-zA-Z0-9]+]] = (d0, d1, d2) -> (d0 * 16 + d1 * 2 + d2 + 16)
// CHECK-DAG: #[[MAP13:[a-zA-Z0-9]+]] = (d0, d1, d2) -> (d0 * 32 + d1 * 2 + d2)
// CHECK-LABEL: test3
// CHECK: %[[B:.*]] = alloc() : memref<128xi8>
// CHECK: std.view %[[B]][][] : memref<128xi8> to memref<1x4x2xf32, #[[MAP0]]>
// CHECK: std.view %[[B]][][] : memref<128xi8> to memref<1x4x2xf32, #[[MAP8]]>
// CHECK: std.view %[[B]][][] : memref<128xi8> to memref<1x4x2xf32, #[[MAP9]]>
// CHECK: std.view %[[B]][][] : memref<128xi8> to memref<1x4x2xf32, #[[MAP10]]>
// CHECK: std.view %[[B]][][] : memref<128xi8> to memref<1x8x2xf32, #[[MAP11]]>
// CHECK: std.view %[[B]][][] : memref<128xi8> to memref<1x8x2xf32, #[[MAP12]]>
// CHECK: std.view %[[B]][][] : memref<128xi8> to memref<1x16x2xf32, #[[MAP13]]>
// CHECK: dealloc %[[B]] : memref<128xi8>
func @test3(%arg0: !ng.tensor<1x2x2xf32>, %arg1: !ng.tensor<1x2x2xf32>) -> !ng.tensor<1x16x2xf32> {
%0 = "ng.concat"(%arg0, %arg1) {concatenation_axis = 1} : (!ng.tensor<1x2x2xf32>, !ng.tensor<1x2x2xf32>) -> !ng.tensor<1x4x2xf32>
%1 = "ng.concat"(%arg0, %arg1) {concatenation_axis = 1} : (!ng.tensor<1x2x2xf32>, !ng.tensor<1x2x2xf32>) -> !ng.tensor<1x4x2xf32>
%2 = "ng.concat"(%arg0, %arg1) {concatenation_axis = 1} : (!ng.tensor<1x2x2xf32>, !ng.tensor<1x2x2xf32>) -> !ng.tensor<1x4x2xf32>
%3 = "ng.concat"(%arg0, %arg1) {concatenation_axis = 1} : (!ng.tensor<1x2x2xf32>, !ng.tensor<1x2x2xf32>) -> !ng.tensor<1x4x2xf32>
%4 = "ng.concat"(%0, %1) {concatenation_axis = 1} : (!ng.tensor<1x4x2xf32>, !ng.tensor<1x4x2xf32>) -> !ng.tensor<1x8x2xf32>
%5 = "ng.concat"(%2, %3) {concatenation_axis = 1} : (!ng.tensor<1x4x2xf32>, !ng.tensor<1x4x2xf32>) -> !ng.tensor<1x8x2xf32>
%6 = "ng.concat"(%4, %5) {concatenation_axis = 1} : (!ng.tensor<1x8x2xf32>, !ng.tensor<1x8x2xf32>) -> !ng.tensor<1x16x2xf32>
%7 = "ng.add"(%6, %6) : (!ng.tensor<1x16x2xf32>, !ng.tensor<1x16x2xf32>) -> !ng.tensor<1x16x2xf32>
"ng.return"(%7) : (!ng.tensor<1x16x2xf32>) -> ()
}
// -----
//CHECK-DAG: #[[MAP4:[a-zA-Z0-9]+]] = (d0, d1, d2) -> (d0 * 4 + d1 * 2 + d2 + 4)
//CHECK-DAG: #[[MAP5:[a-zA-Z0-9]+]] = (d0, d1, d2) -> (d0 * 4 + d1 * 2 + d2)
//CHECK-DAG: #[[MAP6:[a-zA-Z0-9]+]] = (d0, d1, d2) -> (d0 * 4 + d1 * 2 + d2 + 8)
//CHECK-DAG: #[[MAP12:[a-zA-Z0-9]+]] = (d0, d1, d2) -> (d0 * 12 + d1 * 2 + d2)
// CHECK-LABEL: test4
//CHECK: %[[B1:.*]] = alloc() : memref<1x2x2xf32>
//CHECK: %[[B2:.*]] = alloc() : memref<48xi8>
//CHECK: std.view %[[B2]][][] : memref<48xi8> to memref<1x2x2xf32, #[[MAP4]]>
//CHECK: %[[B3:.*]] = alloc() : memref<1x2x2xf32>
//CHECK: std.view %[[B2]][][] : memref<48xi8> to memref<1x2x2xf32, #[[MAP5]]>
//CHECK: std.view %[[B2]][][] : memref<48xi8> to memref<1x2x2xf32, #[[MAP6]]>
//CHECK: %[[B4:.*]] = alloc() : memref<1x6x2xf32>
//CHECK: std.view %1[][] : memref<48xi8> to memref<1x6x2xf32, #[[MAP12]]>
//CHECK: dealloc %[[B1]] : memref<1x2x2xf32>
//CHECK: dealloc %[[B2]] : memref<48xi8>
//CHECK: dealloc %[[B3]] : memref<1x2x2xf32>
//CHECK: dealloc %[[B4]] : memref<1x6x2xf32>
func @test4(%arg0: !ng.tensor<1x2x2xf32>, %arg1: !ng.tensor<1x2x2xf32>) -> !ng.tensor<1x6x2xf32> {
%S0 = "ng.add"(%arg0, %arg1) : (!ng.tensor<1x2x2xf32>, !ng.tensor<1x2x2xf32>) -> !ng.tensor<1x2x2xf32>
%S1 = "ng.add"(%arg0, %arg1) : (!ng.tensor<1x2x2xf32>, !ng.tensor<1x2x2xf32>) -> !ng.tensor<1x2x2xf32>
%S2 = "ng.add"(%arg0, %arg1) : (!ng.tensor<1x2x2xf32>, !ng.tensor<1x2x2xf32>) -> !ng.tensor<1x2x2xf32>
%R0 = "ng.add"(%arg0, %arg1) : (!ng.tensor<1x2x2xf32>, !ng.tensor<1x2x2xf32>) -> !ng.tensor<1x2x2xf32>
%R2 = "ng.add"(%arg0, %arg1) : (!ng.tensor<1x2x2xf32>, !ng.tensor<1x2x2xf32>) -> !ng.tensor<1x2x2xf32>
// pre-existing assignment of S1 in %D2 prevents assignment for %D1 concat
%D1 = "ng.concat"(%S0, %S1, %S2) {concatenation_axis = 1} : (!ng.tensor<1x2x2xf32>, !ng.tensor<1x2x2xf32>, !ng.tensor<1x2x2xf32>) -> !ng.tensor<1x6x2xf32>
%D2 = "ng.concat"(%R0, %S1, %R2) {concatenation_axis = 1} : (!ng.tensor<1x2x2xf32>, !ng.tensor<1x2x2xf32>, !ng.tensor<1x2x2xf32>) -> !ng.tensor<1x6x2xf32>
%D3 = "ng.add"(%D1, %D2) : (!ng.tensor<1x6x2xf32>, !ng.tensor<1x6x2xf32>) -> !ng.tensor<1x6x2xf32>
"ng.return"(%D3) : (!ng.tensor<1x6x2xf32>) -> ()
}