Commit 14624c03 authored by Diego Caballero's avatar Diego Caballero Committed by Scott Cyphers

[MLIR] Enable affine loop tiling (#3397)

* [MLIR] Enable affine loop tiling

This PR enables loop tiling optimization in affine dialect. It
introduces the following flags for configuration.
  - affine-loop-tile: enables/disables the optimization. Disabled by
    default.
  - loop-tile-cache-level: provides the cache level to which apply loop
    tiling to. Cache level size is obtained from LLVM's TTI.
  - loop-tile-cache-size: provides a cache level size that overrides
    cache information from TTI.

The current use of TTI is a bit hacky since we have to pass a fake
LLVM's function to make it work. However, this should be enough to get
some basic target information until we have a target model in MLIR or
find a better approach.

* Address feedback

* Rename flags
parent e53d2e9a
......@@ -23,6 +23,7 @@
#include "dialect/ops.hpp"
#include "dialect/type.hpp"
#include "lowerer.hpp"
#include "ngraph/check.hpp"
#include "ngraph/descriptor/tensor.hpp"
#include "ngraph/graph_util.hpp"
#include "ngraph/node.hpp"
......@@ -45,11 +46,14 @@
#include "ngraph/type/element_type.hpp"
#include <llvm/ADT/STLExtras.h>
#include <llvm/Analysis/TargetTransformInfo.h>
#include <llvm/ExecutionEngine/Orc/JITTargetMachineBuilder.h>
#include <llvm/IR/Module.h>
#include <llvm/Support/ErrorOr.h>
#include <llvm/Support/MemoryBuffer.h>
#include <llvm/Support/SourceMgr.h>
#include <llvm/Support/TargetSelect.h>
#include <llvm/Target/TargetMachine.h>
#include <mlir/Conversion/ControlFlowToCFG/ConvertControlFlowToCFG.h>
#include <mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h>
#include <mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h>
......@@ -65,17 +69,40 @@
#include <memory>
#include <mutex>
// Defines a new LLVM debug type for this file to be used by LLVM_DEBUG macro.
#define DEBUG_TYPE "mlir-compiler"
using llvm::SmallVector;
using llvm::StringRef;
using llvm::make_unique;
using llvm::ArrayRef;
using namespace ngraph;
using namespace ngraph::runtime::ngmlir;
static llvm::cl::opt<bool>
clEnableAffineLoopFusion("enable-affine-loop-fusion",
clEnableAffineLoopFusion("affine-loop-fusion",
llvm::cl::init(false),
llvm::cl::desc("Enable loop fusion optimization in Affine dialect"));
static llvm::cl::opt<bool>
clEnableAffineLoopTiling("affine-loop-tile",
llvm::cl::init(false),
llvm::cl::desc("Enable loop tiling optimization in Affine dialect"));
static llvm::cl::opt<unsigned>
clLoopTilingCacheLevel("affine-loop-tile-cache-level",
llvm::cl::init(2),
llvm::cl::desc("Cache level to which to apply affine loop tiling."));
static llvm::cl::opt<unsigned> clLoopTilingCacheSize(
"affine-loop-tile-cache-size",
llvm::cl::init(0),
llvm::cl::desc(
"Cache size to use in affine loop tiling. If not zero, it overrides the cache-size "
"inferred from the host CPU using for the cache level specified by "
"-loop-tile-cache-level."));
#define COMPILE_OP_DECL(op_name) \
create_op<op_name>(MLIRCompiler & compiler, const ngraph::Node* ng_node)
......@@ -300,8 +327,85 @@ void MLIRCompiler::lower_ng_dialect()
m_engine = std::move(maybeEngine.get());
}
/// Creates target machine for current host.
static llvm::Expected<std::unique_ptr<llvm::TargetMachine>> createDefaultTargetMachine()
{
auto machineBuilder = llvm::orc::JITTargetMachineBuilder::detectHost();
if (!machineBuilder)
{
return machineBuilder.takeError();
}
// Retrieve host CPU sub-target features.
llvm::SubtargetFeatures subtargetFeatures;
llvm::StringMap<bool> featureMap;
llvm::sys::getHostCPUFeatures(featureMap);
for (auto& feature : featureMap)
{
subtargetFeatures.AddFeature(feature.first(), feature.second);
}
// Relocation model and code model are kept to default values.
machineBuilder->setCPU(llvm::sys::getHostCPUName());
machineBuilder->setCodeGenOptLevel(llvm::CodeGenOpt::Aggressive);
machineBuilder->addFeatures(subtargetFeatures.getFeatures());
return machineBuilder->createTargetMachine();
}
/// Returns the cache level size from `targetInfo` for the `cacheLevel` provided. If `userCacheSize`
/// is not zero, it returns `userCacheSize`.
static unsigned getCacheLevelSize(llvm::TargetTransformInfo& targetInfo,
unsigned cacheLevel,
unsigned userCacheSize)
{
if (userCacheSize)
{
return userCacheSize;
}
llvm::Optional<unsigned> optCacheLevelSize;
switch (cacheLevel)
{
case 1:
optCacheLevelSize = targetInfo.getCacheSize(llvm::TargetTransformInfo::CacheLevel::L1D);
break;
case 2:
optCacheLevelSize = targetInfo.getCacheSize(llvm::TargetTransformInfo::CacheLevel::L2D);
break;
default:
NGRAPH_UNREACHABLE("Unsupported cache level: ", cacheLevel, ". Only 1 and 2 are supported");
}
NGRAPH_CHECK(optCacheLevelSize.hasValue() && "Cache level size is not available in TTI");
return optCacheLevelSize.getValue();
}
// Receives affine dialect as input and applies affine and standard dialect based optimizations.
// Lowering from affine dialect to standard dialect happens along the way. Output consists of
// standard dialect only ops.
void MLIRCompiler::optimize()
{
// Create target machine with all the current host features.
llvm::InitializeNativeTarget();
llvm::InitializeNativeTargetAsmPrinter();
auto expectedTargetMachine = createDefaultTargetMachine();
NGRAPH_CHECK(expectedTargetMachine, "Invalid target machine");
auto targetMachine = std::move(*expectedTargetMachine);
// Create target transform info to obtain some target information to be used in MLIR
// optimizations. This is a temporary attempt to retrieve some target information by reusing
// LLVM TTI infra while MLIR does not have target model.
llvm::LLVMContext llvmContext;
auto module = make_unique<llvm::Module>("test", llvmContext);
module->setDataLayout(targetMachine->createDataLayout());
auto ttiSetupFunc = llvm::cast<llvm::Function>(
module
->getOrInsertFunction("__ngraph_tti_setup",
llvm::FunctionType::get(llvm::Type::getVoidTy(llvmContext), {}))
.getCallee());
auto targetInfo = targetMachine->getTargetTransformInfo(*ttiSetupFunc);
// Run Affine dialect optimizations.
mlir::PassManager pm_opts;
if (clEnableAffineLoopFusion)
......@@ -309,6 +413,18 @@ void MLIRCompiler::optimize()
pm_opts.addPass(mlir::createLoopFusionPass());
}
if (clEnableAffineLoopTiling)
{
unsigned cacheLevelSize =
getCacheLevelSize(targetInfo, clLoopTilingCacheLevel, clLoopTilingCacheSize);
LLVM_DEBUG(llvm::dbgs() << "Enabling Affine Loop Tiling for cache level "
<< clLoopTilingCacheLevel
<< ": "
<< cacheLevelSize
<< " bytes.\n");
pm_opts.addPass(mlir::createLoopTilingPass(cacheLevelSize));
}
auto opt_res = pm_opts.run(m_module.get());
NGRAPH_CHECK(succeeded(opt_res), "Affine optimizations failed");
dump_mlir_module("Affine Dialect Dump (Post-Optimizations):");
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment