Commit 6de4893b authored by Nagy Mostafa, committed by Sang Ik Lee

[MLIR] In-place memory optimization for elt-wise and concat ops. (#3832)

* AliasMap WIP

* Added liveness info

* WIP

* WIP: Tests

* WIP: LIT tests

* Added knobs for mem optimization pass. More LIT tests

* Revert affine_lowerer change

* More elaborate comment

* Minor fixes

* style-apply

* Rename liveness

* Switch to Analysis framework

* Fix optimization conditions

* Remove LIT test

* style

* Switch to equivalence relationship impl of non-alias relationship

* refined comment

* Switch non-alias to equivalence relationship

* Fix bad merge

* Adding tests. WIP

* Added buffer size tracking and unit-tests

* Added LIT and unit-tests

* Turn optimization ON

* style

* fix unit-tests

* Fix useCount

* Fix copyright and typo

* Refine few comments, remove new lines

* style fix
Co-authored-by: Scott Cyphers <diyessi@users.noreply.github.com>
Co-authored-by: Sang Ik Lee <sang.ik.lee@intel.com>
parent b3db038e
@@ -25,7 +25,7 @@ add_subdirectory(tools/ngraph-opt)
set(SRC
backend/cpu/cpu_backend.cpp
backend/pass/affine_lowerer.cpp
-backend/pass/memory_optimization.cpp
+backend/analysis/memory_analysis.cpp
core/compiler.cpp
core/ngraph_dialect/dialect.cpp
core/ngraph_dialect/type.cpp
...
//*****************************************************************************
// Copyright 2017-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
// NOTE: This file follows nGraph format style and MLIR naming convention since it does
// not expose public API to the rest of nGraph codebase and heavily depends on MLIR API.
#include "memory_analysis.hpp"
#include "contrib/mlir/core/compiler.hpp"
#include "contrib/mlir/core/ngraph_dialect/ops.hpp"
#include "contrib/mlir/core/ngraph_dialect/type.hpp"
#include <llvm/ADT/BitVector.h>
#include <llvm/ADT/DenseSet.h>
#include <map>
#include <mlir/EDSC/Builders.h>
#include <mlir/EDSC/Helpers.h>
#include <mlir/EDSC/Intrinsics.h>
#include <mlir/IR/AffineExpr.h>
#include <mlir/IR/IntegerSet.h>
#include <mlir/IR/MLIRContext.h>
#include <mlir/IR/StandardTypes.h>
#include <mlir/Pass/Pass.h>
#include <mlir/Transforms/DialectConversion.h>
static llvm::cl::opt<bool> clEnableNgInPlaceMemory(
"ngraph-memory-opt",
llvm::cl::init(true),
llvm::cl::desc("Enable ngraph dialect in-place memory optimization pass"));
static llvm::cl::opt<bool>
clEnableNgInPlaceConcat("ngraph-memory-opt-concat",
llvm::cl::init(true),
llvm::cl::desc("Enable inplace concat optimization"));
static llvm::cl::opt<bool>
clEnableNgInPlaceEltWise("ngraph-memory-opt-eltwise",
llvm::cl::init(true),
llvm::cl::desc("Enable inplace element wise optimization"));
// anonymous namespace
// no need to expose any of the following outside of this file
namespace
{
using namespace ngraph::runtime;
using namespace ngraph::runtime::ngmlir;
using namespace mlir;
// A helper data-structure to track the cannot-alias relationship between
// tensor syms. If NoAlias[T] contains S, then T and S cannot alias.
// The relationship is an equivalence (transitive, symmetric, reflexive).
// Initially each sym is put in its own equivalence class (set).
// If two syms a and b are found to be non-aliasing, their equivalence
// classes are unioned.
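// Example (illustrative): for syms {a, b, c}, init() creates the singleton
// classes {a}, {b}, {c}. After insertNoAlias(a, b) the classes become
// {a, b} and {c}: canAlias(a, b) is now false, while canAlias(a, c) and
// canAlias(b, c) remain true.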
class AliasRelation
{
public:
/// Initialize the relationship for a number of syms
void init(std::unordered_set<Value*>& symbols);
/// Checks if values a and b can alias
bool canAlias(Value* a, Value* b);
void insertNoAlias(Value* a, Value* b);
private:
using BV = llvm::BitVector;
std::unordered_map<Value*, unsigned> m_valueToIdx;
std::unordered_map<unsigned, Value*> m_idxToValue;
std::unordered_map<Value*, BV*> m_valueToSet;
SmallVector<BV, 10> m_sets;
};
// Simple single basic block liveness analysis
// TODO: Replace with MLIR's liveness analysis
class LivenessAnalysis
{
public:
bool isLive(Value* v);
void setLive(Value* v);
void kill(Value* v);
void getLiveValues(llvm::SmallVectorImpl<Value*>& values);
void reset();
private:
unsigned m_maxIdx = 0;
SmallVector<bool, 10> m_liveness;
std::unordered_map<Value*, unsigned> m_valueToIdx;
};
// Memory Assignment analysis
// Tries to find operations that can be done in place where applicable
// by assigning a virtual buffer ID to values.
// The buffer assignment is used later in affine lowering pass to create
// or re-use memrefs
class MemoryAssignment
{
public:
MemoryAssignment(MemoryAnalysis* memAnalysis)
: m_memAnalysis(memAnalysis)
{
m_inplaceOps = {
#define MLIR_OP(OP, INPLACE) {OP::getOperationName().str(), INPLACE},
#include "contrib/mlir/backend/pass/op_lowerers.inc"
};
m_bufferId = 0;
}
void run(ModuleOp* module);
private:
void processDestructiveInPlace(mlir::Operation* op);
void processConcat(mlir::Operation* op);
bool isSafeInPlace(mlir::Operation* op);
bool isInputOrOutputValue(mlir::Value* value);
LivenessAnalysis m_liveness;
AliasRelation m_aliasRelation;
std::unordered_map<std::string, bool> m_inplaceOps;
int m_bufferId;
MemoryAnalysis* m_memAnalysis;
};
// helpers
// Determines the buffer size a value needs based on its type
// offset (in elements) is where that value starts in the buffer
static unsigned getBufferSizeForOperand(mlir::Value* value, int offset);
// Go backwards over instructions
//
// Re-use buffers if none of the dst/srcs are input/output of the sub-graph
//
// For destructive in-place ops (elt-wise):
// - Pick a src that is at its last use (i.e. dead) and not an input/output.
//   If several srcs are dead, pick the one with the fewest uses.
//   If no such src is found, bail out.
// - If dst has pre-assigned buffer/offset, then copy them to src.
// If not, assign new buffer to both dst and src.
// - Mark all live syms at this point to not alias src
//
// For non-Destructive in-place ops:
// Concat:
// - Reuse buffer if
// - Concat axis is most-significant non-one axis, and
// - all operands can alias dest.
// - If dst has an assignment, copy it over to srcs as long as
// there is no conflicting src pre-assignment
// - If dst has no assignment, and all srcs have no assignment,
// assign new buffer to dst and srcs
//
// Slice: TBD
// Reshape: TBD
//
// Update liveness info
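// Example (illustrative): scanning backwards over
//   %2 = "ng.add"(%0, %1)
// where %0 has no later uses and is not an input/output, %0 is picked as the
// in-place src: %2's {bufferId, offset} (pre-assigned or newly created) is
// copied to %0's defining op, and %0 is marked non-aliasing with every value
// that is live below this point.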
void MemoryAssignment::run(ModuleOp* module)
{
if (!clEnableNgInPlaceMemory)
{
// Optimization disabled
return;
}
SmallVector<FuncOp, 2> funcOps(module->getOps<FuncOp>());
if (funcOps.size() > 1 || funcOps.empty())
{
// single func for now
return;
}
auto f = funcOps.back();
auto& blocks = f.getBlocks();
if (blocks.size() != 1)
{
// single block func for now
return;
}
auto& block = *(blocks.begin());
// count number of syms in the code and initialize alias relationship
std::unordered_set<Value*> syms;
for (auto it = block.begin(); it != block.end(); it++)
{
Operation* op = &(*it);
for (auto it : op->getResults())
{
Value* v = it;
if (syms.find(v) == syms.end())
{
syms.insert(v);
}
}
for (auto it : op->getOperands())
{
Value* v = it;
if (syms.find(v) == syms.end())
{
syms.insert(v);
}
}
}
m_aliasRelation.init(syms);
// scan instructions backwards
for (auto it = block.rbegin(); it != block.rend(); it++)
{
Operation* op = &(*it);
if (isSafeInPlace(op))
{
// TODO: replace with Op Interface check
if (dyn_cast<NGConcatOp>(op))
{
if (clEnableNgInPlaceConcat)
processConcat(op);
}
else
{
if (clEnableNgInPlaceEltWise)
processDestructiveInPlace(op);
}
}
// update liveness info
for (auto dit : op->getResults())
{
m_liveness.kill(dit);
}
for (auto uit : op->getOperands())
{
m_liveness.setLive(uit);
}
}
}
void MemoryAssignment::processConcat(mlir::Operation* op)
{
auto concat = cast<mlir::NGConcatOp>(op);
{
// concat on the highest non-one axis
auto concatAxis = concat.concatenation_axis();
auto result = concat.getResult();
auto shape = (result->getType().cast<NGTensorType>()).getShape();
std::vector<int> opndOffsets;
BufferInfo bufferInfo;
int bufferId = -1, baseOffset = 0;
unsigned bufferSize = 0;
if (isInputOrOutputValue(op->getResult(0)))
{
// dst is output, bail out
return;
};
for (auto i = 0; i < shape.size(); i++)
{
if (i == concatAxis)
{
break;
}
if (shape[i] != 1)
{
return;
}
}
// check that all operands and dst can alias
// and that none is input or output
for (auto opnd : op->getOperands())
{
if (!m_aliasRelation.canAlias(result, opnd) || isInputOrOutputValue(opnd))
{
return;
}
}
// calculate relative offsets in the output buffer
int opndOffset = 0;
for (auto i = 0; i < op->getNumOperands(); i++)
{
if (i == 0)
{
opndOffsets.push_back(0);
}
else
{
auto opnd = op->getOperand(i - 1);
auto tensorType = opnd->getType().cast<NGTensorType>();
opndOffset += tensorType.getNumElements();
opndOffsets.push_back(opndOffset);
}
}
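// e.g. for a concat of (%a : 1x2x2, %b : 1x2x2) along axis 1, opndOffsets is
// {0, 4} (in elements): %b's data starts 4 elements into dst's buffer.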
// check for consistent pre-existing buffer assignments
bufferInfo = m_memAnalysis->getBufferInfo(op);
// if dest has an assignment
if (bufferInfo.isValid())
{
// set buffer ID and base offset to that of dest's
bufferId = bufferInfo.m_bufferId;
baseOffset = bufferInfo.m_offset;
// check if we can re-use it for all src operands
int bufferOffset = 0;
for (auto i = 0; i < op->getNumOperands(); i++)
{
auto opnd = op->getOperand(i);
auto defOp = opnd->getDefiningOp();
NGRAPH_CHECK(defOp != nullptr, "Defining operation expected");
// calculate expected absolute offset in the buffer
bufferOffset = baseOffset + opndOffsets[i];
bufferInfo = m_memAnalysis->getBufferInfo(defOp);
if (bufferInfo.isValid())
{
if (bufferInfo.m_bufferId != bufferId ||
bufferInfo.m_offset != bufferOffset)
{
// buffer ID or offset mismatch, bailout
return;
}
}
}
}
else
{
// dst has no buffer assignment
// TODO:
// We can re-use an existing assignment of a src operand if
// Every other src either:
// a. has a matching pre-assigned buffer ID and offset, or
// b. is unassigned a buffer/offset, and the computed offset is valid
// (non-negative), and no other live tensor aliases the chunk
// of the buffer we want to assign.
// To achieve this, we need to track buffer->{tensor,offset,size} and
// perform the check
//
// Example:
// V1 = Concat S0 (?), S1{0,16}, S2 (?)
// R0 = ...
// R2 = ...
// V2 = Concat R0{0, 0}, S1 {0,16}, R2{0,32}
//
// For the first concat, we could use the assignment of S1 (from second concat)
// to define assignments for S0 and S2, and since R0, R2 are dead, no live tensors
// alias into the buffer, and the assignment is valid.
//
// On the other hand, the following is invalid
// Example:
// R0 = ...
// V1 = Concat S0(?), S1(0,16), S2(?)
// R2 = ...
// V2 = Concat R0, S1{0,16}, R2
// Reusing assignment of S1 in the first concat will cause S0 and R0 to alias.
// And since R0 is alive the write to R0 will overwrite S0.
// For now, assign only if all srcs have no prior assignments
for (auto opnd : op->getOperands())
{
if (m_memAnalysis->getBufferInfo(opnd->getDefiningOp()).isValid())
{
return;
}
}
}
// We didn't find any pre-existing buffer assignment, create a new buffer
if (bufferId == -1)
{
bufferId = m_bufferId++;
baseOffset = 0;
}
// adjust the buffer size based on this instruction
// max size is determined from dst offset and type
bufferSize = getBufferSizeForOperand(op->getResult(0), baseOffset);
m_memAnalysis->setBufferSize(bufferId, bufferSize);
// Update analysis map. No need to check if we are over-writing previous entries
// since they should all match.
m_memAnalysis->setBufferInfo(op, {bufferId, baseOffset});
for (auto i = 0; i < op->getNumOperands(); i++)
{
auto opnd = op->getOperand(i);
auto defOp = opnd->getDefiningOp();
NGRAPH_CHECK(defOp != nullptr, "Defining operation expected");
auto opndOffset = baseOffset + opndOffsets[i];
m_memAnalysis->setBufferInfo(defOp, {bufferId, opndOffset});
}
}
}
void MemoryAssignment::processDestructiveInPlace(mlir::Operation* op)
{
NGRAPH_CHECK(op->getNumResults() == 1, "Destructive in-place with multi-def ?");
Value* use = nullptr;
int useCount = -1;
if (isInputOrOutputValue(op->getResult(0)))
{
// dst is output, bail out
return;
};
// pick a dead operand that is not an input or output, preferring the one with the fewest uses
for (auto opnd : op->getOperands())
{
if (!m_liveness.isLive(opnd) && !isInputOrOutputValue(opnd))
{
int uses = 0;
for (auto& i : opnd->getUses())
{
uses++;
}
if (useCount == -1 || uses < useCount)
{
use = opnd;
useCount = uses;
}
}
}
if (!use)
{
return;
}
// assign new buffer or copy buffer info from dst
auto bufferInfo = m_memAnalysis->getBufferInfo(op);
if (!bufferInfo.isValid())
{
// attach a new buffer id, and 0 offset on both src and result
bufferInfo = {m_bufferId++, 0};
m_memAnalysis->setBufferInfo(op, bufferInfo);
m_memAnalysis->setBufferInfo(use->getDefiningOp(), bufferInfo);
}
else
{
// copy result buffer id and offset to src
m_memAnalysis->setBufferInfo(use->getDefiningOp(), bufferInfo);
}
auto bufferSize = getBufferSizeForOperand(op->getResult(0), bufferInfo.m_offset);
m_memAnalysis->setBufferSize(bufferInfo.m_bufferId, bufferSize);
// update aliasing info
// use value cannot alias any live value
SmallVector<Value*, 10> liveValues;
m_liveness.getLiveValues(liveValues);
for (auto& value : liveValues)
{
m_aliasRelation.insertNoAlias(use, value);
}
}
bool MemoryAssignment::isInputOrOutputValue(mlir::Value* value)
{
auto defOp = value->getDefiningOp();
// If no defining op, then this is a block arg, i.e. a sub-graph input
//
// TODO: This check is assuming single BB function, improve to handle control-flow.
// In which case, we have to track block args to all pred branches that feed them,
// all the way up to the initial def, if any, or entry block arg. This is preferably
// done as a pre-pass to capture all inputs/output values.
if (!defOp)
{
return true;
}
// If the defined value is an output of the sub-graph, cannot do it in place
//
// TODO: Improve to support control flow. Track value use-chain along branches/block-args,
// if we hit a use in a return, it is an output value.
for (auto& use : value->getUses())
{
auto useOp = use.getOwner();
if (isa<NGReturnOp>(useOp))
{
return true;
}
}
return false;
}
// TODO Change this to use interfaces.
bool MemoryAssignment::isSafeInPlace(mlir::Operation* op)
{
auto it = m_inplaceOps.find(op->getName().getStringRef().str());
return it != m_inplaceOps.end() ? it->second : false;
}
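// The flags come from op_lowerers.inc; this commit flips
// MLIR_OP(NGConcatOp, false) to MLIR_OP(NGConcatOp, true), while ops such as
// NGDotOp remain false.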
void AliasRelation::init(std::unordered_set<Value*>& symbols)
{
unsigned numSyms = symbols.size();
m_sets.resize(numSyms);
for (auto& bv : m_sets)
{
bv.resize(numSyms);
}
// populate id->value and value->id maps
unsigned i = 0;
for (auto v : symbols)
{
m_idxToValue[i] = v;
m_valueToIdx[v] = i;
m_valueToSet[v] = &m_sets[i];
// set bit for that value
m_sets[i].set(i);
i++;
}
}
bool AliasRelation::canAlias(Value* a, Value* b)
{
// a and b can alias only if they belong to different equivalence classes
return m_valueToSet[a] != m_valueToSet[b];
}
void AliasRelation::insertNoAlias(Value* a, Value* b)
{
// union the two sets that a and b belong to
// update the maps accordingly
if (!canAlias(a, b))
{
// nothing to do
return;
}
// union the two sets of a and b
BV* aSet = m_valueToSet[a];
BV* bSet = m_valueToSet[b];
BV uSet = (*aSet);
uSet |= (*bSet);
// replace aSet with union
auto pSet = m_valueToSet[a];
*pSet = uSet;
// update value to set maps
for (auto it = pSet->set_bits_begin(); it != pSet->set_bits_end(); it++)
{
unsigned id = *it;
auto value = m_idxToValue[id];
m_valueToSet[value] = pSet;
}
}
void LivenessAnalysis::reset()
{
m_valueToIdx.clear();
m_liveness.clear();
m_maxIdx = 0;
}
void LivenessAnalysis::getLiveValues(llvm::SmallVectorImpl<Value*>& values)
{
for (auto& entry : m_valueToIdx)
{
if (m_liveness[entry.second])
{
values.push_back(entry.first);
}
}
}
bool LivenessAnalysis::isLive(Value* v)
{
auto it = m_valueToIdx.find(v);
if (it == m_valueToIdx.end())
{
return false;
}
return m_liveness[it->second];
}
void LivenessAnalysis::setLive(Value* v)
{
auto it = m_valueToIdx.find(v);
if (it == m_valueToIdx.end())
{
m_valueToIdx[v] = m_maxIdx++;
m_liveness.push_back(true);
NGRAPH_CHECK(m_liveness.size() == m_maxIdx);
}
else
{
m_liveness[it->second] = true;
}
}
void LivenessAnalysis::kill(Value* v)
{
auto it = m_valueToIdx.find(v);
if (it == m_valueToIdx.end())
{
// already dead
return;
}
m_liveness[it->second] = false;
}
// helpers
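// Example (illustrative): a 2x2xf32 value placed at element offset 4 needs
// 4 * 4 + 16 = 32 bytes, which matches the memref<32xi8> allocations in the
// LIT tests of this commit.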
unsigned getBufferSizeForOperand(mlir::Value* value, int offset)
{
auto tensorType = value->getType().dyn_cast<NGTensorType>();
NGRAPH_CHECK(tensorType, "Invalid type to find buffer size for");
// offset is in elements; convert to bytes, rounding the element width up
unsigned bufferSize = offset * ((tensorType.getElementBitWidth() + 7) / 8);
bufferSize += tensorType.getSizeInBytes();
return bufferSize;
}
}
namespace mlir
{
MemoryAnalysis::MemoryAnalysis(Operation* op)
{
MemoryAssignment memoryAssignment(this);
auto moduleOp = dyn_cast<ModuleOp>(op);
NGRAPH_CHECK(moduleOp != nullptr, "Expecting ModuleOp for analysis");
memoryAssignment.run(&moduleOp);
}
} // namespace mlir
@@ -20,8 +20,60 @@
#pragma once
#include <mlir/Pass/Pass.h>
+#include <unordered_map>
+#include "ngraph/check.hpp"
namespace mlir
{
-std::unique_ptr<Pass> createMemoryOptimizationPass();
+// BufferInfo
struct BufferInfo
{
// Buffer Id. If -1 then invalid buffer.
int m_bufferId;
// Offset into the buffer
int m_offset;
bool isValid() const { return m_bufferId != -1; }
};
struct MemoryAnalysis
{
using BufferInfoMap = std::unordered_map<Operation*, BufferInfo>;
using BufferSizeMap = std::unordered_map<unsigned, unsigned>;
// Compute this analysis with the provided operation.
MemoryAnalysis(Operation* op);
BufferInfo getBufferInfo(Operation* op)
{
auto it = m_bufferInfo.find(op);
if (it == m_bufferInfo.end())
{
return {-1, -1};
}
return it->second;
}
void setBufferInfo(Operation* op, BufferInfo bufferInfo) { m_bufferInfo[op] = bufferInfo; }
void setBufferSize(unsigned bufferId, unsigned size)
{
auto it = m_bufferSize.find(bufferId);
if (it != m_bufferSize.end())
{
it->second = (size > it->second) ? size : it->second;
}
else
{
m_bufferSize[bufferId] = size;
}
}
unsigned getBufferSize(unsigned bufferId)
{
auto it = m_bufferSize.find(bufferId);
NGRAPH_CHECK(it != m_bufferSize.end(), "Buffer has no size!");
return it->second;
}
private:
// Records assignment of BufferInfo to each inplace op
BufferInfoMap m_bufferInfo;
// Records buffer size required for each buffer id in bytes
BufferSizeMap m_bufferSize;
};
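// Typical use (sketch): the affine lowering pass obtains this analysis via
// getAnalysis<MemoryAnalysis>(), queries getBufferInfo(op) for each op, and
// for a valid assignment allocates a single i8 buffer of getBufferSize(id)
// bytes and creates memref views into it at the recorded offsets.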
}
@@ -19,7 +19,6 @@
#include "cpu_backend.hpp"
#include "contrib/mlir/backend/pass/affine_lowerer.hpp"
-#include "contrib/mlir/backend/pass/memory_optimization.hpp"
#include "contrib/mlir/utils.hpp"
#include "ngraph/check.hpp"
@@ -160,7 +159,6 @@ void MLIRCPUBackend::init()
void MLIRCPUBackend::codegen()
{
-optimizeNgDialect();
lowerNgDialect();
}
@@ -261,18 +259,3 @@ void MLIRCPUBackend::optimizeAffineDialect()
// Run Std dialect optimizations.
// TODO
}
-void MLIRCPUBackend::optimizeNgDialect()
-{
-mlir::PassManager pm(&m_context);
-mlir::applyPassManagerCLOptions(pm);
-if (clEnableNgInPlaceMemoryOpt)
-{
-pm.addPass(mlir::createMemoryOptimizationPass());
-}
-if (failed(pm.run(m_module.get())))
-{
-NGRAPH_CHECK(false, "MLIR pass manager failed");
-}
-}
@@ -19,11 +19,13 @@
#include "affine_lowerer.hpp"
+#include "contrib/mlir/backend/analysis/memory_analysis.hpp"
#include "contrib/mlir/core/ngraph_dialect/ops.hpp"
#include "contrib/mlir/core/ngraph_dialect/type.hpp"
#include "ngraph/assertion.hpp"
#include <llvm/ADT/DenseSet.h>
+#include <llvm/Support/Debug.h>
#include <mlir/EDSC/Builders.h>
#include <mlir/EDSC/Helpers.h>
#include <mlir/EDSC/Intrinsics.h>
@@ -165,6 +167,8 @@
ValueHandle createZeroConstant(mlir::Type type);
ValueHandle createOneConstant(mlir::Type type);
+bool isInPlaceConcat(mlir::Operation* op, DialectLoweringPass& pass);
/// Conversion from types in the nGraph dialect to the Standard dialect.
class NGraphTypeConverter : public TypeConverter
{
@@ -184,29 +188,25 @@
void runOnModule() override;
SmallVector<Value*, 4> buildOutputDefs(Operation* op, PatternRewriter& rewriter);
-/// Allocates a linear buffer for a temporary tensor
-Value* createTempBuffer(Type type, PatternRewriter& rewriter);
+/// Allocates a linear buffer for a temporary memref that shares its
+/// underlying memory. Used in conjunction with createTempMemref
+Value* createTempBuffer(int bufferId, PatternRewriter& rewriter);
/// Creates an allocation or view of a memref.
/// type MemRef Type
/// buffer Optional buffer value to create view over
/// offset Optional offset into the buffer this view starts at
///
-/// If buffer is null, a new allocation of a memref is created.
-/// Offset is ignored. If buffer is non-null, then we create a temp
-/// view over a pre-allocated buffer (see createTempBuffer)
+/// If buffer is null it allocates a Memref directly and Offset is ignored.
+/// If not, it creates a view over the pre-allocated buffer at the given offset.
Value*
createTempMemref(Type type, Value* buffer, unsigned offset, PatternRewriter& rewriter);
/// Inserts dealloc Ops for each temporary allocated by AllocOp
void insertDeallocs(PatternRewriter& rewriter);
NGraphTypeConverter& getTypeConverter() { return typeConverter; }
+MemoryAnalysis* getMemAnalysis() const { return m_memAnalysis; }
private:
/// Collect a set of patterns to convert from the nGraph dialect to Affine dialect.
void populateNGraphToAffineConversionPatterns(OwningRewritePatternList& patterns);
void findOutputValues();
void insertNoAliasArgAttrs();
@@ -219,7 +219,7 @@
// Track pre-assigned buffers for each Value and re-use it if one is available.
using IdToMemRefMap = std::unordered_map<unsigned, Value*>;
IdToMemRefMap m_id_to_memref;
+MemoryAnalysis* m_memAnalysis;
// TODO: Workaround for findOutputValues and buildOutputDefs. See NGCPU-470.
std::string funcName;
};
@@ -232,6 +232,9 @@
populateNGraphToAffineConversionPatterns(patterns);
+// Get Memory analysis for in-place memory optimizations
+m_memAnalysis = &getAnalysis<MemoryAnalysis>();
// Create target that defines legal ops for nGraph dialect to be lowered to.
ConversionTarget target(getContext());
@@ -336,24 +339,25 @@
// will re-use the same buffer.
auto tensorType = origResult->getType().cast<NGTensorType>();
Value* newResult = nullptr;
-Attribute bufferIdAttr = getBufferId(op);
+auto bufferInfo = m_memAnalysis->getBufferInfo(op);
Type memRefType = typeConverter.convertType(tensorType);
Value* bufferValue = nullptr;
-if (!bufferIdAttr)
+if (!bufferInfo.isValid())
{
// Allocate new memref
newResult = createTempMemref(memRefType, nullptr, 0, rewriter);
}
else
{
-unsigned bufferId = bufferIdAttr.cast<IntegerAttr>().getInt();
+unsigned bufferId = bufferInfo.m_bufferId;
+unsigned offset = bufferInfo.m_offset;
// Re-use a buffer if it exists, else create a new one and update the map
IdToMemRefMap::iterator it = m_id_to_memref.find(bufferId);
if (it == m_id_to_memref.end())
{
// create a new buffer
-bufferValue = createTempBuffer(memRefType, rewriter);
+bufferValue = createTempBuffer(bufferId, rewriter);
m_id_to_memref[bufferId] = bufferValue;
}
else
@@ -361,7 +365,7 @@
bufferValue = it->second;
}
// Create a temp view over the linear buffer
-newResult = createTempMemref(memRefType, bufferValue, 0, rewriter);
+newResult = createTempMemref(memRefType, bufferValue, offset, rewriter);
}
NGRAPH_CHECK(newResult != nullptr, "Temp memref value is not set");
newResults.push_back(newResult);
@@ -370,18 +374,17 @@
return newResults;
}
-Value* DialectLoweringPass::createTempBuffer(Type type, PatternRewriter& rewriter)
+Value* DialectLoweringPass::createTempBuffer(int bufferId, PatternRewriter& rewriter)
{
-MemRefType memRefType = type.cast<MemRefType>();
-NGRAPH_CHECK(memRefType.hasStaticShape(), "Dynamic shapes are not supported");
-// deduce linear buffer shape
-unsigned sizeInBytes = memRefType.getSizeInBits() / 8;
+unsigned sizeInBytes = getMemAnalysis()->getBufferSize(bufferId);
+NGRAPH_CHECK(bufferId >= 0, "Invalid buffer id to allocate");
+NGRAPH_CHECK(sizeInBytes > 0, "Zero buffer allocation?");
+LLVM_DEBUG(llvm::dbgs() << "Allocating buffer of size " << sizeInBytes << " bytes\n");
MemRefType bufferType =
-MemRefType::get({sizeInBytes}, IntegerType::get(8, type.getContext()), {});
+MemRefType::get({sizeInBytes}, IntegerType::get(8, rewriter.getContext()), {});
// TODO: Set alignment
Value* alloc = rewriter.create<mlir::AllocOp>(rewriter.getUnknownLoc(), bufferType);
memRefsToDealloc.push_back(alloc);
@@ -404,7 +407,6 @@
unsigned offset,
PatternRewriter& rewriter)
{
-NGRAPH_CHECK(offset == 0, "Only zero offset is supported");
MemRefType memRefType = type.cast<MemRefType>();
if (buffer)
{
@@ -414,7 +416,7 @@
// linear
// buffer
// This is simply (d0, d1, d2, .. dN-1) --> d0 * S0 + d1 * S1 ... + dN-1 * SN-1
-// Where Si is the stride along the i_th dimension
+// Where Si is the stride along the i_th dimension in elements
auto shape = memRefType.getShape();
SmallVector<int64_t, 4> strides(shape.size(), 0);
strides[shape.size() - 1] = 1;
@@ -1503,6 +1505,71 @@
}
NGRAPH_UNREACHABLE("Unsupported type");
}
// Given a concat op, it will check if dst and operands have
// a valid buffer/offset assignment that will make this op
// valid in-place
bool isInPlaceConcat(mlir::Operation* op, DialectLoweringPass& pass)
{
NGRAPH_CHECK(isa<NGConcatOp>(op), "Expecting concat operation");
auto concat = cast<NGConcatOp>(op);
auto concatAxis = concat.concatenation_axis();
auto result = concat.getResult();
auto shape = (result->getType().cast<NGTensorType>()).getShape();
auto memAnalysis = pass.getMemAnalysis();
BufferInfo bufferInfo = memAnalysis->getBufferInfo(op);
if (!bufferInfo.isValid())
{
// no buffer assignment to dst, nothing to do
return false;
}
auto dstBufferId = bufferInfo.m_bufferId;
auto dstOffset = bufferInfo.m_offset;
LLVM_DEBUG(llvm::dbgs() << ">> Check in-place concat\n");
LLVM_DEBUG(op->dump());
for (auto i = 0; i < shape.size(); i++)
{
if (i == concatAxis)
{
break;
}
if (shape[i] != 1)
{
LLVM_DEBUG(llvm::dbgs() << "Axis FAIL. Skipping instruction\n");
return false;
}
}
LLVM_DEBUG(llvm::dbgs() << "Axis OK\n");
// Check if the buffer id and offsets are consistent with what's expected
LLVM_DEBUG(llvm::dbgs() << "Dst (id, offset) = (" << dstBufferId << ", " << dstOffset
<< ")\n");
// relative offset in the buffer
int opndOffset = 0;
for (auto opnd : op->getOperands())
{
bufferInfo = memAnalysis->getBufferInfo(opnd->getDefiningOp());
auto srcBufferId = bufferInfo.m_bufferId;
auto srcOffset = bufferInfo.m_offset;
LLVM_DEBUG(llvm::dbgs() << "Src (id, offset) = (" << srcBufferId << ", " << srcOffset
<< ")\n");
if (!bufferInfo.isValid() || srcBufferId != dstBufferId ||
srcOffset != (opndOffset + dstOffset))
{
// mismatch in buffer IDs or offsets
LLVM_DEBUG(llvm::dbgs() << "Buffer ID and Offsets FAIL. Skipping instruction\n");
return false;
}
auto tensorType = opnd->getType().cast<NGTensorType>();
opndOffset += tensorType.getNumElements();
}
LLVM_DEBUG(llvm::dbgs() << "Buffer ID and Offsets OK\n");
return true;
}
} // namespace
namespace mlir
...
//*****************************************************************************
// Copyright 2017-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
// NOTE: This file follows nGraph format style and MLIR naming convention since it does
// not expose public API to the rest of nGraph codebase and heavily depends on MLIR API.
#include "contrib/mlir/core/compiler.hpp"
#include "contrib/mlir/core/ngraph_dialect/ops.hpp"
#include "contrib/mlir/core/ngraph_dialect/type.hpp"
#include "ngraph/assertion.hpp"
#include <llvm/ADT/DenseSet.h>
#include <map>
#include <mlir/EDSC/Builders.h>
#include <mlir/EDSC/Helpers.h>
#include <mlir/EDSC/Intrinsics.h>
#include <mlir/IR/AffineExpr.h>
#include <mlir/IR/IntegerSet.h>
#include <mlir/IR/MLIRContext.h>
#include <mlir/IR/StandardTypes.h>
#include <mlir/Pass/Pass.h>
#include <mlir/Transforms/DialectConversion.h>
// anonymous namespace
// no need to expose any of the following outside of this file
namespace
{
using namespace ngraph::runtime;
using namespace ngraph::runtime::ngmlir;
using namespace mlir;
/// Memory Optimization pass
/// - Tries to perform operations in place where applicable by assigning a virtual buffer ID
/// to values. Those are used later in affine lowering pass to create or re-use memrefs
class MemoryOptimizationPass : public mlir::FunctionPass<MemoryOptimizationPass>
{
public:
MemoryOptimizationPass()
{
m_inplaceOps = {
#define MLIR_OP(OP, INPLACE) {OP::getOperationName().str(), INPLACE},
#include "contrib/mlir/backend/pass/op_lowerers.inc"
};
}
void runOnFunction() override;
private:
bool isSafeInPlace(mlir::Operation* op);
std::unordered_map<std::string, bool> m_inplaceOps;
static unsigned bufferId;
};
unsigned MemoryOptimizationPass::bufferId = 0;
void MemoryOptimizationPass::runOnFunction()
{
auto f = getFunction();
f.walk([&](mlir::Operation* op) {
if (!isSafeInPlace(op))
{
return;
}
if (op->getNumResults() > 1)
{
return;
}
auto defVal = op->getResult(0);
// If the defined value is an output of the sub-graph, cannot do it in place
for (auto use = defVal->use_begin(); use != defVal->use_end(); use++)
{
auto useOp = use->getOwner();
if (isa<NGReturnOp>(useOp))
{
return;
}
}
// Check if we can re-use the buffer of any of the inputs. Conjunction of the following:
// - single use value or all uses in the current op
// - not an input argument
// TODO: Check instead if last post-dominating (dataflow-wise) use.
for (auto opnd = op->operand_begin(); opnd != op->operand_end(); opnd++)
{
auto val = *opnd;
// we optimize if the val has one use or if all uses are in the current op
bool optimize;
optimize = val->hasOneUse();
if (!optimize)
{
optimize = true;
// check if all uses are in the current op
for (auto use = val->use_begin(); use != val->use_end(); use++)
{
if (use->getOwner() != op)
{
optimize = false;
}
}
}
if (optimize)
{
// do we have a buffer id attached to this value
auto defOp = val->getDefiningOp();
// If no defining op, then this is a block arg, skip operand
if (!defOp)
{
continue;
}
IntegerAttr attr = getBufferId(defOp);
if (!attr)
{
// attach a new buffer id
attr = setBufferId(defOp, this->bufferId++);
}
// propagate attribute to dst, and we are done
setBufferId(op, attr);
return;
}
}
});
}
bool MemoryOptimizationPass::isSafeInPlace(mlir::Operation* op)
{
auto it = m_inplaceOps.find(op->getName().getStringRef().str());
return it != m_inplaceOps.end() ? it->second : false;
}
}
namespace mlir
{
std::unique_ptr<Pass> createMemoryOptimizationPass()
{
return std::make_unique<MemoryOptimizationPass>();
}
} // namespace mlir
@@ -27,7 +27,7 @@
MLIR_OP(NGAddOp , true )
MLIR_OP(NGArgMaxRedOp , false )
MLIR_OP(NGArgMinRedOp , false )
-MLIR_OP(NGConcatOp , false )
+MLIR_OP(NGConcatOp , true )
MLIR_OP(NGConvolutionOp , false )
MLIR_OP(NGDivOp , true )
MLIR_OP(NGDotOp , false )
...
@@ -309,28 +309,6 @@ mlir::LogicalResult verifyOp(NGConvolutionOp* op)
return mlir::success();
}
-static std::string getBufferIdAttrName()
-{
-return "ng.buffer_id";
-}
-void setBufferId(mlir::Operation* op, mlir::IntegerAttr attr)
-{
-op->setAttr(getBufferIdAttrName(), attr);
-}
-mlir::IntegerAttr setBufferId(mlir::Operation* op, unsigned val)
-{
-auto attr = mlir::IntegerAttr::get(IntegerType::get(32, op->getContext()), val);
-setBufferId(op, attr);
-return attr;
-}
-mlir::IntegerAttr getBufferId(mlir::Operation* op)
-{
-return op->getAttrOfType<mlir::IntegerAttr>(getBufferIdAttrName());
-}
namespace mlir
{
#include "ops_interfaces.cpp.inc"
...
@@ -41,7 +41,3 @@ namespace mlir
#include "ops.h.inc"
#undef GET_OP_CLASSES
}
-void setBufferId(mlir::Operation* op, mlir::IntegerAttr attr);
-mlir::IntegerAttr setBufferId(mlir::Operation* op, unsigned val);
-mlir::IntegerAttr getBufferId(mlir::Operation* op);
@@ -349,6 +349,143 @@ NGRAPH_TEST(${BACKEND_NAME}, concat_in_place_propagate_2d_tensor)
(vector<float>{3, 7, 2}), read_vector<float>(result), MIN_FLOAT_TOLERANCE_BITS));
}
NGRAPH_TEST(${BACKEND_NAME}, concat_in_place_tree_1)
{
Shape shape{1, 2, 2};
Shape shape_r{1, 4, 2};
auto A = make_shared<op::Parameter>(element::f32, shape);
auto B = make_shared<op::Parameter>(element::f32, shape);
auto add1 = make_shared<op::Add>(A, B);
auto add2 = make_shared<op::Add>(A, B);
auto concat = make_shared<op::Concat>(NodeVector{add1, add2}, 1);
auto f = make_shared<Function>(make_shared<op::Add>(concat, concat), ParameterVector{A, B});
auto backend = runtime::Backend::create("${BACKEND_NAME}");
// Create some tensors for input/output
auto a = backend->create_tensor(element::f32, shape);
copy_data(a, vector<float>{1, 1, 1, 1});
auto b = backend->create_tensor(element::f32, shape);
copy_data(b, vector<float>{1, 1, 1, 1});
auto result = backend->create_tensor(element::f32, shape_r);
auto handle = backend->compile(f);
handle->call_with_validate({result}, {a, b});
vector<float> expected;
expected.resize(8, 4);
EXPECT_TRUE(test::all_close_f(expected, read_vector<float>(result), MIN_FLOAT_TOLERANCE_BITS));
}
NGRAPH_TEST(${BACKEND_NAME}, concat_in_place_tree_2)
{
Shape shape{1, 2, 2};
Shape shape_r{1, 8, 2};
auto A = make_shared<op::Parameter>(element::f32, shape);
auto B = make_shared<op::Parameter>(element::f32, shape);
auto add1 = make_shared<op::Add>(A, B);
auto add2 = make_shared<op::Add>(A, B);
auto concat1 = make_shared<op::Concat>(NodeVector{add1, add2}, 1);
auto concat2 = make_shared<op::Concat>(NodeVector{add1, add2}, 1);
auto concat12 = make_shared<op::Concat>(NodeVector{concat1, concat2}, 1);
auto f = make_shared<Function>(make_shared<op::Add>(concat12, concat12), ParameterVector{A, B});
auto backend = runtime::Backend::create("${BACKEND_NAME}");
// Create some tensors for input/output
auto a = backend->create_tensor(element::f32, shape);
copy_data(a, vector<float>{1, 1, 1, 1});
auto b = backend->create_tensor(element::f32, shape);
copy_data(b, vector<float>{1, 1, 1, 1});
auto result = backend->create_tensor(element::f32, shape_r);
auto handle = backend->compile(f);
handle->call_with_validate({result}, {a, b});
vector<float> expected;
expected.resize(16, 4);
EXPECT_TRUE(test::all_close_f(expected, read_vector<float>(result), MIN_FLOAT_TOLERANCE_BITS));
}
NGRAPH_TEST(${BACKEND_NAME}, concat_in_place_tree_3)
{
Shape shape{1, 2, 2};
Shape shape_r{1, 16, 2};
auto A = make_shared<op::Parameter>(element::f32, shape);
auto B = make_shared<op::Parameter>(element::f32, shape);
auto concat1 = make_shared<op::Concat>(NodeVector{A, B}, 1);
auto concat2 = make_shared<op::Concat>(NodeVector{A, B}, 1);
auto concat3 = make_shared<op::Concat>(NodeVector{A, B}, 1);
auto concat4 = make_shared<op::Concat>(NodeVector{A, B}, 1);
auto concat12 = make_shared<op::Concat>(NodeVector{concat1, concat2}, 1);
auto concat34 = make_shared<op::Concat>(NodeVector{concat3, concat4}, 1);
auto concat14 = make_shared<op::Concat>(NodeVector{concat12, concat34}, 1);
auto f = make_shared<Function>(make_shared<op::Add>(concat14, concat14), ParameterVector{A, B});
auto backend = runtime::Backend::create("${BACKEND_NAME}");
// Create some tensors for input/output
auto a = backend->create_tensor(element::f32, shape);
copy_data(a, vector<float>{1, 1, 1, 1});
auto b = backend->create_tensor(element::f32, shape);
copy_data(b, vector<float>{1, 1, 1, 1});
auto result = backend->create_tensor(element::f32, shape_r);
auto handle = backend->compile(f);
handle->call_with_validate({result}, {a, b});
vector<float> expected;
expected.resize(32, 2);
EXPECT_TRUE(test::all_close_f(expected, read_vector<float>(result), MIN_FLOAT_TOLERANCE_BITS));
}
NGRAPH_TEST(${BACKEND_NAME}, concat_in_place_add_concat)
{
Shape shape{2, 2};
Shape shape_r{4, 2};
auto A = make_shared<op::Parameter>(element::f32, shape);
auto B = make_shared<op::Parameter>(element::f32, shape);
auto add1 = make_shared<op::Add>(A, B);
auto add2 = make_shared<op::Add>(add1, add1);
auto concat = make_shared<op::Concat>(NodeVector{add1, add2}, 0);
auto add3 = make_shared<op::Add>(concat, concat);
auto f = make_shared<Function>(add3, ParameterVector{A, B});
auto backend = runtime::Backend::create("${BACKEND_NAME}");
auto a = backend->create_tensor(element::f32, shape);
copy_data(a, vector<float>{1, 1, 1, 1});
auto b = backend->create_tensor(element::f32, shape);
copy_data(b, vector<float>{1, 1, 1, 1});
auto result = backend->create_tensor(element::f32, shape_r);
auto handle = backend->compile(f);
handle->call_with_validate({result}, {a, b});
vector<float> expected = {4, 4, 4, 4, 8, 8, 8, 8};
EXPECT_TRUE(test::all_close_f(expected, read_vector<float>(result), MIN_FLOAT_TOLERANCE_BITS));
}
NGRAPH_TEST(${BACKEND_NAME}, concat_in_place_add_concat_2)
{
Shape shape{1, 2, 2};
Shape shape_r{1, 6, 2};
auto A = make_shared<op::Parameter>(element::f32, shape);
auto B = make_shared<op::Parameter>(element::f32, shape);
auto add1 = make_shared<op::Add>(A, B);
auto add2 = make_shared<op::Add>(A, B);
auto add3 = make_shared<op::Add>(A, B);
auto add4 = make_shared<op::Add>(A, B);
auto add5 = make_shared<op::Add>(A, B);
auto concat1 = make_shared<op::Concat>(NodeVector{add1, add2, add3}, 1);
auto concat2 = make_shared<op::Concat>(NodeVector{add4, add2, add5}, 1);
auto add6 = make_shared<op::Add>(concat1, concat2);
auto f = make_shared<Function>(add6, ParameterVector{A, B});
auto backend = runtime::Backend::create("${BACKEND_NAME}");
auto a = backend->create_tensor(element::f32, shape);
copy_data(a, vector<float>{1, 1, 1, 1});
auto b = backend->create_tensor(element::f32, shape);
copy_data(b, vector<float>{1, 1, 1, 1});
auto result = backend->create_tensor(element::f32, shape_r);
auto handle = backend->compile(f);
handle->call_with_validate({result}, {a, b});
vector<float> expected = {4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4};
EXPECT_TRUE(test::all_close_f(expected, read_vector<float>(result), MIN_FLOAT_TOLERANCE_BITS));
}
// from numpy import *
// a=linspace(1,2*3*4*3*2,2*3*4*3*2)
// b=linspace(1000+1,1000+2*3*3*3*2,2*3*3*3*2)
...
// RUN: ngraph-opt %s --split-input-file --ngraph-memory-opt --ngraph-memory-opt-concat --ngraph-memory-opt-eltwise -convert-ngraph-to-affine | FileCheck %s
// CHECK-DAG: #[[MAP0:[a-zA-Z0-9]+]] = (d0, d1) -> (d0 * 2 + d1)
// CHECK-LABEL: test0
// CHECK: %[[B:.*]] = alloc() : memref<16xi8>
// CHECK: std.view %[[B]][][] : memref<16xi8> to memref<2x2xf32, #[[MAP0]]>
// CHECK: std.view %[[B]][][] : memref<16xi8> to memref<2x2xf32, #[[MAP0]]>
// CHECK: std.view %[[B]][][] : memref<16xi8> to memref<2x2xf32, #[[MAP0]]>
// CHECK: dealloc %[[B]] : memref<16xi8>
func @test0(%arg0: !ng.tensor<2x2xf32>, %arg1: !ng.tensor<2x2xf32>) -> !ng.tensor<2x2xf32> {
%0 = "ng.add"(%arg0, %arg1) : (!ng.tensor<2x2xf32>, !ng.tensor<2x2xf32>) -> !ng.tensor<2x2xf32>
%1 = "ng.add"(%0, %0) : (!ng.tensor<2x2xf32>, !ng.tensor<2x2xf32>) -> !ng.tensor<2x2xf32>
%2 = "ng.add"(%1, %1) : (!ng.tensor<2x2xf32>, !ng.tensor<2x2xf32>) -> !ng.tensor<2x2xf32>
%3 = "ng.add"(%2, %2) : (!ng.tensor<2x2xf32>, !ng.tensor<2x2xf32>) -> !ng.tensor<2x2xf32>
"ng.return"(%3) : (!ng.tensor<2x2xf32>) -> ()
}
// -----
// CHECK-DAG: #[[MAP0:[a-zA-Z0-9]+]] = (d0, d1) -> (d0 * 2 + d1)
// CHECK-DAG: #[[MAP1:[a-zA-Z0-9]+]] = (d0, d1) -> (d0 * 2 + d1 + 4)
// CHECK-LABEL: test1
// CHECK: %[[B:.*]] = alloc() : memref<32xi8>
// CHECK: std.view %[[B]][][] : memref<32xi8> to memref<2x2xf32, #[[MAP0]]>
// CHECK: std.view %[[B]][][] : memref<32xi8> to memref<2x2xf32, #[[MAP1]]>
// CHECK: std.view %[[B]][][] : memref<32xi8> to memref<4x2xf32, #[[MAP0]]>
// CHECK: dealloc %[[B]] : memref<32xi8>
func @test1(%arg0: !ng.tensor<2x2xf32>, %arg1: !ng.tensor<2x2xf32>) -> !ng.tensor<4x2xf32> {
%0 = "ng.add"(%arg0, %arg1) : (!ng.tensor<2x2xf32>, !ng.tensor<2x2xf32>) -> !ng.tensor<2x2xf32>
%1 = "ng.add"(%0, %0) : (!ng.tensor<2x2xf32>, !ng.tensor<2x2xf32>) -> !ng.tensor<2x2xf32>
%2 = "ng.concat"(%0, %1) {concatenation_axis = 0} : (!ng.tensor<2x2xf32>, !ng.tensor<2x2xf32>) -> !ng.tensor<4x2xf32>
%3 = "ng.add"(%2, %2) : (!ng.tensor<4x2xf32>, !ng.tensor<4x2xf32>) -> !ng.tensor<4x2xf32>
"ng.return"(%3) : (!ng.tensor<4x2xf32>) -> ()
}
// -----
// CHECK-DAG: #[[MAP0:[a-zA-Z0-9]+]] = (d0, d1, d2) -> (d0 * 4 + d1 * 2 + d2)
// CHECK-DAG: #[[MAP1:[a-zA-Z0-9]+]] = (d0, d1, d2) -> (d0 * 4 + d1 * 2 + d2 + 4)
// CHECK-DAG: #[[MAP2:[a-zA-Z0-9]+]] = (d0, d1, d2) -> (d0 * 8 + d1 * 2 + d2)
// CHECK-DAG: #[[MAP3:[a-zA-Z0-9]+]] = (d0, d1, d2) -> (d0 * 16 + d1 * 2 + d2)
// CHECK-LABEL: test2
// CHECK: %[[B1:.*]] = alloc() : memref<32xi8>
// CHECK: std.view %[[B1]][][] : memref<32xi8> to memref<1x2x2xf32, #[[MAP0]]>
// CHECK: std.view %[[B1]][][] : memref<32xi8> to memref<1x2x2xf32, #[[MAP1]]>
// CHECK: std.view %[[B1]][][] : memref<32xi8> to memref<1x4x2xf32, #[[MAP2]]>
// CHECK: %[[B2:.*]] = alloc() : memref<64xi8>
// CHECK: std.view %[[B2]][][] : memref<64xi8> to memref<1x8x2xf32, #[[MAP3]]>
// CHECK: std.view %[[B2]][][] : memref<64xi8> to memref<1x8x2xf32, #[[MAP3]]>
func @test2(%arg0: !ng.tensor<1x2x2xf32>, %arg1: !ng.tensor<1x2x2xf32>) -> (!ng.tensor<1x4x2xf32>, !ng.tensor<1x8x2xf32>){
%0 = "ng.add"(%arg0, %arg1) : (!ng.tensor<1x2x2xf32>, !ng.tensor<1x2x2xf32>) -> !ng.tensor<1x2x2xf32>
%1 = "ng.add"(%arg0, %arg1) : (!ng.tensor<1x2x2xf32>, !ng.tensor<1x2x2xf32>) -> !ng.tensor<1x2x2xf32>
// inplace
%2 = "ng.concat"(%0, %1) {concatenation_axis = 1} : (!ng.tensor<1x2x2xf32>, !ng.tensor<1x2x2xf32>) -> !ng.tensor<1x4x2xf32>
// cannot be done inplace, %3 and %2 cannot alias
%3 = "ng.concat"(%0, %1, %2) {concatenation_axis = 1} : (!ng.tensor<1x2x2xf32>, !ng.tensor<1x2x2xf32>, !ng.tensor<1x4x2xf32>) -> !ng.tensor<1x8x2xf32>
// inplace destructive. %3 and %2 cannot alias
%4 = "ng.add"(%3, %3) : (!ng.tensor<1x8x2xf32>, !ng.tensor<1x8x2xf32>) -> !ng.tensor<1x8x2xf32>
// no inplace, result is output
%5 = "ng.add"(%2, %2) : (!ng.tensor<1x4x2xf32>, !ng.tensor<1x4x2xf32>) -> !ng.tensor<1x4x2xf32>
// no inplace, result is output
%6 = "ng.add"(%4, %4) : (!ng.tensor<1x8x2xf32>, !ng.tensor<1x8x2xf32>) -> !ng.tensor<1x8x2xf32>
"ng.return"(%5, %6) : (!ng.tensor<1x4x2xf32>, !ng.tensor<1x8x2xf32>) -> ()
}
// -----
// CHECK-DAG: #[[MAP0:[a-zA-Z0-9]+]] = (d0, d1, d2) -> (d0 * 8 + d1 * 2 + d2)
// CHECK-DAG: #[[MAP8:[a-zA-Z0-9]+]] = (d0, d1, d2) -> (d0 * 8 + d1 * 2 + d2 + 8)
// CHECK-DAG: #[[MAP9:[a-zA-Z0-9]+]] = (d0, d1, d2) -> (d0 * 8 + d1 * 2 + d2 + 16)
// CHECK-DAG: #[[MAP10:[a-zA-Z0-9]+]] = (d0, d1, d2) -> (d0 * 8 + d1 * 2 + d2 + 24)
// CHECK-DAG: #[[MAP11:[a-zA-Z0-9]+]] = (d0, d1, d2) -> (d0 * 16 + d1 * 2 + d2)
// CHECK-DAG: #[[MAP12:[a-zA-Z0-9]+]] = (d0, d1, d2) -> (d0 * 16 + d1 * 2 + d2 + 16)
// CHECK-DAG: #[[MAP13:[a-zA-Z0-9]+]] = (d0, d1, d2) -> (d0 * 32 + d1 * 2 + d2)
// CHECK-LABEL: test3
// CHECK: %[[B:.*]] = alloc() : memref<128xi8>
// CHECK: std.view %[[B]][][] : memref<128xi8> to memref<1x4x2xf32, #[[MAP0]]>
// CHECK: std.view %[[B]][][] : memref<128xi8> to memref<1x4x2xf32, #[[MAP8]]>
// CHECK: std.view %[[B]][][] : memref<128xi8> to memref<1x4x2xf32, #[[MAP9]]>
// CHECK: std.view %[[B]][][] : memref<128xi8> to memref<1x4x2xf32, #[[MAP10]]>
// CHECK: std.view %[[B]][][] : memref<128xi8> to memref<1x8x2xf32, #[[MAP11]]>
// CHECK: std.view %[[B]][][] : memref<128xi8> to memref<1x8x2xf32, #[[MAP12]]>
// CHECK: std.view %[[B]][][] : memref<128xi8> to memref<1x16x2xf32, #[[MAP13]]>
// CHECK: dealloc %[[B]] : memref<128xi8>
func @test3(%arg0: !ng.tensor<1x2x2xf32>, %arg1: !ng.tensor<1x2x2xf32>) -> !ng.tensor<1x16x2xf32> {
%0 = "ng.concat"(%arg0, %arg1) {concatenation_axis = 1} : (!ng.tensor<1x2x2xf32>, !ng.tensor<1x2x2xf32>) -> !ng.tensor<1x4x2xf32>
%1 = "ng.concat"(%arg0, %arg1) {concatenation_axis = 1} : (!ng.tensor<1x2x2xf32>, !ng.tensor<1x2x2xf32>) -> !ng.tensor<1x4x2xf32>
%2 = "ng.concat"(%arg0, %arg1) {concatenation_axis = 1} : (!ng.tensor<1x2x2xf32>, !ng.tensor<1x2x2xf32>) -> !ng.tensor<1x4x2xf32>
%3 = "ng.concat"(%arg0, %arg1) {concatenation_axis = 1} : (!ng.tensor<1x2x2xf32>, !ng.tensor<1x2x2xf32>) -> !ng.tensor<1x4x2xf32>
%4 = "ng.concat"(%0, %1) {concatenation_axis = 1} : (!ng.tensor<1x4x2xf32>, !ng.tensor<1x4x2xf32>) -> !ng.tensor<1x8x2xf32>
%5 = "ng.concat"(%2, %3) {concatenation_axis = 1} : (!ng.tensor<1x4x2xf32>, !ng.tensor<1x4x2xf32>) -> !ng.tensor<1x8x2xf32>
%6 = "ng.concat"(%4, %5) {concatenation_axis = 1} : (!ng.tensor<1x8x2xf32>, !ng.tensor<1x8x2xf32>) -> !ng.tensor<1x16x2xf32>
%7 = "ng.add"(%6, %6) : (!ng.tensor<1x16x2xf32>, !ng.tensor<1x16x2xf32>) -> !ng.tensor<1x16x2xf32>
"ng.return"(%7) : (!ng.tensor<1x16x2xf32>) -> ()
}
// -----
// CHECK-DAG: #[[MAP4:[a-zA-Z0-9]+]] = (d0, d1, d2) -> (d0 * 4 + d1 * 2 + d2 + 4)
// CHECK-DAG: #[[MAP5:[a-zA-Z0-9]+]] = (d0, d1, d2) -> (d0 * 4 + d1 * 2 + d2)
// CHECK-DAG: #[[MAP6:[a-zA-Z0-9]+]] = (d0, d1, d2) -> (d0 * 4 + d1 * 2 + d2 + 8)
// CHECK-DAG: #[[MAP12:[a-zA-Z0-9]+]] = (d0, d1, d2) -> (d0 * 12 + d1 * 2 + d2)
// CHECK-LABEL: test4
// CHECK: %[[B1:.*]] = alloc() : memref<1x2x2xf32>
// CHECK: %[[B2:.*]] = alloc() : memref<48xi8>
// CHECK: std.view %[[B2]][][] : memref<48xi8> to memref<1x2x2xf32, #[[MAP4]]>
// CHECK: %[[B3:.*]] = alloc() : memref<1x2x2xf32>
// CHECK: std.view %[[B2]][][] : memref<48xi8> to memref<1x2x2xf32, #[[MAP5]]>
// CHECK: std.view %[[B2]][][] : memref<48xi8> to memref<1x2x2xf32, #[[MAP6]]>
// CHECK: %[[B4:.*]] = alloc() : memref<1x6x2xf32>
// CHECK: std.view %[[B2]][][] : memref<48xi8> to memref<1x6x2xf32, #[[MAP12]]>
// CHECK: dealloc %[[B1]] : memref<1x2x2xf32>
// CHECK: dealloc %[[B2]] : memref<48xi8>
// CHECK: dealloc %[[B3]] : memref<1x2x2xf32>
// CHECK: dealloc %[[B4]] : memref<1x6x2xf32>
func @test4(%arg0: !ng.tensor<1x2x2xf32>, %arg1: !ng.tensor<1x2x2xf32>) -> !ng.tensor<1x6x2xf32> {
%S0 = "ng.add"(%arg0, %arg1) : (!ng.tensor<1x2x2xf32>, !ng.tensor<1x2x2xf32>) -> !ng.tensor<1x2x2xf32>
%S1 = "ng.add"(%arg0, %arg1) : (!ng.tensor<1x2x2xf32>, !ng.tensor<1x2x2xf32>) -> !ng.tensor<1x2x2xf32>
%S2 = "ng.add"(%arg0, %arg1) : (!ng.tensor<1x2x2xf32>, !ng.tensor<1x2x2xf32>) -> !ng.tensor<1x2x2xf32>
%R0 = "ng.add"(%arg0, %arg1) : (!ng.tensor<1x2x2xf32>, !ng.tensor<1x2x2xf32>) -> !ng.tensor<1x2x2xf32>
%R2 = "ng.add"(%arg0, %arg1) : (!ng.tensor<1x2x2xf32>, !ng.tensor<1x2x2xf32>) -> !ng.tensor<1x2x2xf32>
// pre-existing assignment of S1 in %D2 prevents assignment for %D1 concat
%D1 = "ng.concat"(%S0, %S1, %S2) {concatenation_axis = 1} : (!ng.tensor<1x2x2xf32>, !ng.tensor<1x2x2xf32>, !ng.tensor<1x2x2xf32>) -> !ng.tensor<1x6x2xf32>
%D2 = "ng.concat"(%R0, %S1, %R2) {concatenation_axis = 1} : (!ng.tensor<1x2x2xf32>, !ng.tensor<1x2x2xf32>, !ng.tensor<1x2x2xf32>) -> !ng.tensor<1x6x2xf32>
%D3 = "ng.add"(%D1, %D2) : (!ng.tensor<1x6x2xf32>, !ng.tensor<1x6x2xf32>) -> !ng.tensor<1x6x2xf32>
"ng.return"(%D3) : (!ng.tensor<1x6x2xf32>) -> ()
}