Commit 31eb5c46 authored by Jaikrishnan Menon

CPU: Use transpose kernel from MKL in 2D Reshape

Also, add more codegen options
parent 0df3792f
@@ -143,10 +143,17 @@ std::unique_ptr<llvm::Module> execution_state::compile(const string& source, con
     LO->OpenMP = 1;
     LO->OpenMPUseTLS = 1;
-    if (debuginfo_enabled)
-    {
     // CodeGen options
     auto& CGO = Clang->getInvocation().getCodeGenOpts();
+    CGO.OptimizationLevel = 3;
+    CGO.RelocationModel = "static";
+    CGO.ThreadModel = "posix";
+    CGO.OmitLeafFramePointer = 1;
+    CGO.VectorizeLoop = 1;
+    CGO.VectorizeSLP = 1;
+    if (debuginfo_enabled)
+    {
         CGO.setDebugInfo(codegenoptions::FullDebugInfo);
     }
@@ -161,6 +168,8 @@ std::unique_ptr<llvm::Module> execution_state::compile(const string& source, con
     // Enable various target features
     // Most of these are for Eigen
     auto &TO = Clang->getInvocation().getTargetOpts();
+    // TODO: This needs to be configurable and selected carefully
+    TO.CPU = "broadwell";
     TO.FeaturesAsWritten.emplace_back("+sse4.1");
     TO.FeaturesAsWritten.emplace_back("+sse4.2");
     TO.FeaturesAsWritten.emplace_back("+avx");
...
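A note on the TODO in the hunk above: one possible way to stop hard-coding "broadwell" (not part of this commit, just a sketch) is to ask LLVM for the host CPU and feed that into the target options. detect_host_cpu is a hypothetical helper name.

#include <llvm/Support/Host.h>
#include <string>

// Hypothetical helper: query LLVM for the CPU we are running on so codegen
// can follow the host instead of a fixed "broadwell".
static std::string detect_host_cpu()
{
    // Returns names such as "broadwell" or "skylake", or "generic" when
    // LLVM cannot identify the host.
    return llvm::sys::getHostCPUName().str();
}

// Possible use at the site shown above (an assumption, not in this commit):
//     TO.CPU = detect_host_cpu();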
@@ -86,3 +86,15 @@ namespace cblas
                                 const ngraph::element::Int64::type ldc);
     }
 }
+namespace mkl
+{
+    extern "C" {
+        void MKL_Somatcopy(char ordering,
+                           char trans,
+                           size_t rows, size_t cols,
+                           const ngraph::element::Float32::type alpha,
+                           const ngraph::element::Float32::type* A, size_t lda,
+                           ngraph::element::Float32::type* B, size_t ldb);
+    }
+}
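For context, MKL_Somatcopy is MKL's out-of-place scaled copy/transpose for single-precision matrices; the extern "C" declaration above simply avoids pulling in the MKL header here. Below is a minimal standalone sketch of the same call pattern the emitter generates ('R' = row-major, 'T' = transpose, lda = source columns, ldb = source rows), using plain float and made-up sizes.

#include <mkl_trans.h> // MKL header declaring mkl_somatcopy / MKL_Somatcopy
#include <cstdio>
#include <vector>

int main()
{
    // 2x3 row-major source, transposed into a 3x2 row-major destination.
    std::vector<float> a = {1, 2, 3,
                            4, 5, 6};
    std::vector<float> b(6, 0.0f);

    // rows/cols describe the source; alpha scales every element (1.0f = plain copy).
    mkl_somatcopy('R', 'T', 2, 3, 1.0f, a.data(), 3, b.data(), 2);

    // Prints the 3x2 result: 1 4 / 2 5 / 3 6
    for (size_t i = 0; i < 3; ++i)
        std::printf("%g %g\n", b[2 * i], b[2 * i + 1]);
    return 0;
}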
@@ -979,7 +979,25 @@ void Emitter::EMITTER_DECL(EmitReshape)
         auto arg0_layout = inputs[0].get_layout<DenseTensorViewLayout>();
         auto out_layout = outputs[0].get_layout<DenseTensorViewLayout>();
-        TU += "    {\n"
+        // Emit an MKL transpose call if possible
+        if (result_element_type == ngraph::element::Float32::element_type())
+        {
+            TU +=
+                "    {\n"
+                "        auto arg0 = call_frame->get_tensor_view_data<" + element_type_names[TI(result_element_type)] +
+                ">(" + to_string(inputs[0].get_index()) + ");\n"
+                "        auto out = call_frame->get_tensor_view_data<" + element_type_names[TI(result_element_type)] +
+                ">(" + to_string(outputs[0].get_index()) + ");\n"
+                "        mkl::MKL_Somatcopy('R', 'T', " + to_string(arg_shape[0]) + ",\n"
+                "                           " + to_string(arg_shape[1]) + ", 1.0f,\n"
+                "                           arg0, " + to_string(arg_shape[1]) + ",\n"
+                "                           out, " + to_string(arg_shape[0]) + ");\n"
+                "    }\n";
+        }
+        else
+        {
+            TU +=
+                "    {\n"
                 "        auto arg0 = call_frame->get_tensor_view_data<" + element_type_names[TI(result_element_type)] +
                 ">(" + to_string(inputs[0].get_index()) + ");\n"
                 "        auto out = call_frame->get_tensor_view_data<" + element_type_names[TI(result_element_type)] +
@@ -990,6 +1008,7 @@ void Emitter::EMITTER_DECL(EmitReshape)
                 EIGEN_MATRIX_FORMAT(arg0_layout->get_shape(), arg0_layout->get_strides()) + ").transpose();\n"
                 "    }\n";
         }
+    }
     // Other cases (reordering of axes for tensors with rank>2) are not handled yet.
     else
     {
...
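To make the float32 branch above concrete, this is roughly the text it appends to TU for a hypothetical 2x3 Float32 reshape whose input and output tensor views sit at indices 0 and 1, assuming element_type_names[TI(...)] expands to ngraph::element::Float32. It is emitted source that the execution_state::compile path touched in the first file compiles later, not a standalone program.

    {
        auto arg0 = call_frame->get_tensor_view_data<ngraph::element::Float32>(0);
        auto out = call_frame->get_tensor_view_data<ngraph::element::Float32>(1);
        mkl::MKL_Somatcopy('R', 'T', 2,
                           3, 1.0f,
                           arg0, 3,
                           out, 2);
    }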