diff --git a/src/contrib/mlir/compiler.cpp b/src/contrib/mlir/compiler.cpp index d94adc82486b8ef37e1e4d45062b9ef94ccb832b..c448e4a95bc3ac0c8cf316d581a8133e9a552bfa 100644 --- a/src/contrib/mlir/compiler.cpp +++ b/src/contrib/mlir/compiler.cpp @@ -23,6 +23,7 @@ #include "dialect/ops.hpp" #include "dialect/type.hpp" #include "lowerer.hpp" +#include "ngraph/check.hpp" #include "ngraph/descriptor/tensor.hpp" #include "ngraph/graph_util.hpp" #include "ngraph/node.hpp" @@ -45,11 +46,14 @@ #include "ngraph/type/element_type.hpp" #include <llvm/ADT/STLExtras.h> +#include <llvm/Analysis/TargetTransformInfo.h> +#include <llvm/ExecutionEngine/Orc/JITTargetMachineBuilder.h> #include <llvm/IR/Module.h> #include <llvm/Support/ErrorOr.h> #include <llvm/Support/MemoryBuffer.h> #include <llvm/Support/SourceMgr.h> #include <llvm/Support/TargetSelect.h> +#include <llvm/Target/TargetMachine.h> #include <mlir/Conversion/ControlFlowToCFG/ConvertControlFlowToCFG.h> #include <mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h> #include <mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h> @@ -65,17 +69,40 @@ #include <memory> #include <mutex> +// Defines a new LLVM debug type for this file to be used by LLVM_DEBUG macro. +#define DEBUG_TYPE "mlir-compiler" + using llvm::SmallVector; using llvm::StringRef; using llvm::make_unique; using llvm::ArrayRef; + +using namespace ngraph; using namespace ngraph::runtime::ngmlir; static llvm::cl::opt<bool> - clEnableAffineLoopFusion("enable-affine-loop-fusion", + clEnableAffineLoopFusion("affine-loop-fusion", llvm::cl::init(false), llvm::cl::desc("Enable loop fusion optimization in Affine dialect")); +static llvm::cl::opt<bool> + clEnableAffineLoopTiling("affine-loop-tile", + llvm::cl::init(false), + llvm::cl::desc("Enable loop tiling optimization in Affine dialect")); + +static llvm::cl::opt<unsigned> + clLoopTilingCacheLevel("affine-loop-tile-cache-level", + llvm::cl::init(2), + llvm::cl::desc("Cache level to which to apply affine loop tiling.")); + +static llvm::cl::opt<unsigned> clLoopTilingCacheSize( + "affine-loop-tile-cache-size", + llvm::cl::init(0), + llvm::cl::desc( + "Cache size to use in affine loop tiling. If not zero, it overrides the cache-size " + "inferred from the host CPU using for the cache level specified by " + "-loop-tile-cache-level.")); + #define COMPILE_OP_DECL(op_name) \ create_op<op_name>(MLIRCompiler & compiler, const ngraph::Node* ng_node) @@ -300,8 +327,85 @@ void MLIRCompiler::lower_ng_dialect() m_engine = std::move(maybeEngine.get()); } +/// Creates target machine for current host. +static llvm::Expected<std::unique_ptr<llvm::TargetMachine>> createDefaultTargetMachine() +{ + auto machineBuilder = llvm::orc::JITTargetMachineBuilder::detectHost(); + if (!machineBuilder) + { + return machineBuilder.takeError(); + } + + // Retrieve host CPU sub-target features. + llvm::SubtargetFeatures subtargetFeatures; + llvm::StringMap<bool> featureMap; + llvm::sys::getHostCPUFeatures(featureMap); + for (auto& feature : featureMap) + { + subtargetFeatures.AddFeature(feature.first(), feature.second); + } + + // Relocation model and code model are kept to default values. + machineBuilder->setCPU(llvm::sys::getHostCPUName()); + machineBuilder->setCodeGenOptLevel(llvm::CodeGenOpt::Aggressive); + machineBuilder->addFeatures(subtargetFeatures.getFeatures()); + + return machineBuilder->createTargetMachine(); +} + +/// Returns the cache level size from `targetInfo` for the `cacheLevel` provided. If `userCacheSize` +/// is not zero, it returns `userCacheSize`. +static unsigned getCacheLevelSize(llvm::TargetTransformInfo& targetInfo, + unsigned cacheLevel, + unsigned userCacheSize) +{ + if (userCacheSize) + { + return userCacheSize; + } + + llvm::Optional<unsigned> optCacheLevelSize; + switch (cacheLevel) + { + case 1: + optCacheLevelSize = targetInfo.getCacheSize(llvm::TargetTransformInfo::CacheLevel::L1D); + break; + case 2: + optCacheLevelSize = targetInfo.getCacheSize(llvm::TargetTransformInfo::CacheLevel::L2D); + break; + default: + NGRAPH_UNREACHABLE("Unsupported cache level: ", cacheLevel, ". Only 1 and 2 are supported"); + } + + NGRAPH_CHECK(optCacheLevelSize.hasValue() && "Cache level size is not available in TTI"); + return optCacheLevelSize.getValue(); +} + +// Receives affine dialect as input and applies affine and standard dialect based optimizations. +// Lowering from affine dialect to standard dialect happens along the way. Output consists of +// standard dialect only ops. void MLIRCompiler::optimize() { + // Create target machine with all the current host features. + llvm::InitializeNativeTarget(); + llvm::InitializeNativeTargetAsmPrinter(); + auto expectedTargetMachine = createDefaultTargetMachine(); + NGRAPH_CHECK(expectedTargetMachine, "Invalid target machine"); + auto targetMachine = std::move(*expectedTargetMachine); + + // Create target transform info to obtain some target information to be used in MLIR + // optimizations. This is a temporary attempt to retrieve some target information by reusing + // LLVM TTI infra while MLIR does not have target model. + llvm::LLVMContext llvmContext; + auto module = make_unique<llvm::Module>("test", llvmContext); + module->setDataLayout(targetMachine->createDataLayout()); + auto ttiSetupFunc = llvm::cast<llvm::Function>( + module + ->getOrInsertFunction("__ngraph_tti_setup", + llvm::FunctionType::get(llvm::Type::getVoidTy(llvmContext), {})) + .getCallee()); + auto targetInfo = targetMachine->getTargetTransformInfo(*ttiSetupFunc); + // Run Affine dialect optimizations. mlir::PassManager pm_opts; if (clEnableAffineLoopFusion) @@ -309,6 +413,18 @@ void MLIRCompiler::optimize() pm_opts.addPass(mlir::createLoopFusionPass()); } + if (clEnableAffineLoopTiling) + { + unsigned cacheLevelSize = + getCacheLevelSize(targetInfo, clLoopTilingCacheLevel, clLoopTilingCacheSize); + LLVM_DEBUG(llvm::dbgs() << "Enabling Affine Loop Tiling for cache level " + << clLoopTilingCacheLevel + << ": " + << cacheLevelSize + << " bytes.\n"); + pm_opts.addPass(mlir::createLoopTilingPass(cacheLevelSize)); + } + auto opt_res = pm_opts.run(m_module.get()); NGRAPH_CHECK(succeeded(opt_res), "Affine optimizations failed"); dump_mlir_module("Affine Dialect Dump (Post-Optimizations):");