Unverified Commit f5768063 authored by Robert Kimball's avatar Robert Kimball Committed by GitHub

Merge pull request #222 from NervanaSystems/jmenon/cpu_kernels

CPU Backend: More ops and kernels
parents 69a2d4aa 792d3328
...@@ -99,18 +99,21 @@ include_directories( ...@@ -99,18 +99,21 @@ include_directories(
"${EIGEN_INCLUDE_DIR}" "${EIGEN_INCLUDE_DIR}"
) )
if(LLVM_INCLUDE_DIR) if (NGRAPH_CPU_ENABLE AND LLVM_INCLUDE_DIR AND
MKLDNN_INCLUDE_DIR)
find_package(ZLIB REQUIRED) find_package(ZLIB REQUIRED)
include_directories(SYSTEM ${LLVM_INCLUDE_DIR}) include_directories(SYSTEM ${LLVM_INCLUDE_DIR} ${MKLDNN_INCLUDE_DIR})
link_directories(${LLVM_LIB_DIR}) link_directories(${LLVM_LIB_DIR} ${MKLDNN_LIB_DIR})
# Add sources for the CPU backend # Add sources for the CPU backend
# and all its dependencies # and all its dependencies
set(SRC ${SRC} set(SRC ${SRC}
codegen/compiler.cpp codegen/compiler.cpp
runtime/cpu/call_frame.cpp runtime/cpu/call_frame.cpp
runtime/cpu/cpu_manager.cpp
runtime/cpu/cpu_backend.cpp runtime/cpu/cpu_backend.cpp
runtime/cpu/cpu_manager.cpp
runtime/cpu/cpu_kernels.cpp
runtime/cpu/emitter.cpp runtime/cpu/emitter.cpp
runtime/cpu/external_function.cpp runtime/cpu/external_function.cpp
) )
...@@ -129,7 +132,7 @@ endif() ...@@ -129,7 +132,7 @@ endif()
add_library(ngraph SHARED ${SRC}) add_library(ngraph SHARED ${SRC})
target_include_directories(ngraph PUBLIC "${NGRAPH_INCLUDE_PATH}") target_include_directories(ngraph PUBLIC "${NGRAPH_INCLUDE_PATH}")
if(LLVM_LINK_LIBS) if(NGRAPH_CPU_ENABLE AND LLVM_LINK_LIBS)
target_link_libraries(ngraph LINK_PRIVATE ${LLVM_LINK_LIBS}) target_link_libraries(ngraph LINK_PRIVATE ${LLVM_LINK_LIBS})
endif() endif()
...@@ -137,8 +140,10 @@ if (APPLE) ...@@ -137,8 +140,10 @@ if (APPLE)
set_property(TARGET ngraph PROPERTY PREFIX "lib") set_property(TARGET ngraph PROPERTY PREFIX "lib")
set_property(TARGET ngraph PROPERTY OUTPUT_NAME "ngraph.so") set_property(TARGET ngraph PROPERTY OUTPUT_NAME "ngraph.so")
set_property(TARGET ngraph PROPERTY SUFFIX "") set_property(TARGET ngraph PROPERTY SUFFIX "")
else() endif()
include_directories("${MKLDNN_INCLUDE_DIR}")
if(NGRAPH_CPU_ENABLE AND MKLDNN_LIB_DIR)
target_link_libraries(ngraph LINK_PRIVATE mkldnn)
endif() endif()
#----------------------------------------------------------------------------------------------- #-----------------------------------------------------------------------------------------------
...@@ -178,6 +183,10 @@ endif() ...@@ -178,6 +183,10 @@ endif()
add_dependencies(ngraph eigen) add_dependencies(ngraph eigen)
if(NOT LLVM_PACKAGED AND LLVM_INCLUDE_DIR) if(NGRAPH_CPU_ENABLE AND LLVM_INCLUDE_DIR)
add_dependencies(ngraph ext_llvm) add_dependencies(ngraph ext_llvm)
endif() endif()
if(NGRAPH_CPU_ENABLE AND MKLDNN_INCLUDE_DIR)
add_dependencies(ngraph ext_mkldnn)
endif()
...@@ -145,10 +145,19 @@ std::unique_ptr<llvm::Module> execution_state::compile(const string& source, con ...@@ -145,10 +145,19 @@ std::unique_ptr<llvm::Module> execution_state::compile(const string& source, con
LO->OpenMP = 1; LO->OpenMP = 1;
LO->OpenMPUseTLS = 1; LO->OpenMPUseTLS = 1;
// CodeGen options
auto& CGO = Clang->getInvocation().getCodeGenOpts();
CGO.OptimizationLevel = 3;
CGO.RelocationModel = "static";
CGO.ThreadModel = "posix";
CGO.FloatABI = "hard";
CGO.OmitLeafFramePointer = 1;
CGO.VectorizeLoop = 1;
CGO.VectorizeSLP = 1;
CGO.CXAAtExit = 0;
if (debuginfo_enabled) if (debuginfo_enabled)
{ {
// CodeGen options
auto& CGO = Clang->getInvocation().getCodeGenOpts();
CGO.setDebugInfo(codegenoptions::FullDebugInfo); CGO.setDebugInfo(codegenoptions::FullDebugInfo);
} }
...@@ -163,6 +172,12 @@ std::unique_ptr<llvm::Module> execution_state::compile(const string& source, con ...@@ -163,6 +172,12 @@ std::unique_ptr<llvm::Module> execution_state::compile(const string& source, con
// Enable various target features // Enable various target features
// Most of these are for Eigen // Most of these are for Eigen
auto& TO = Clang->getInvocation().getTargetOpts(); auto& TO = Clang->getInvocation().getTargetOpts();
// TODO: This needs to be configurable and selected carefully
TO.CPU = "broadwell";
TO.FeaturesAsWritten.emplace_back("+sse");
TO.FeaturesAsWritten.emplace_back("+sse2");
TO.FeaturesAsWritten.emplace_back("+sse3");
TO.FeaturesAsWritten.emplace_back("+ssse3");
TO.FeaturesAsWritten.emplace_back("+sse4.1"); TO.FeaturesAsWritten.emplace_back("+sse4.1");
TO.FeaturesAsWritten.emplace_back("+sse4.2"); TO.FeaturesAsWritten.emplace_back("+sse4.2");
TO.FeaturesAsWritten.emplace_back("+avx"); TO.FeaturesAsWritten.emplace_back("+avx");
......
...@@ -22,12 +22,13 @@ using namespace ngraph::runtime::cpu; ...@@ -22,12 +22,13 @@ using namespace ngraph::runtime::cpu;
CallFrame::CallFrame(EntryPoint compiled_function, CallFrame::CallFrame(EntryPoint compiled_function,
size_t n_outputs, size_t n_outputs,
size_t n_inputs, size_t n_inputs,
const TensorViewPtrs& temps) const TensorViewPtrs& temps,
const std::vector<std::shared_ptr<CallFrame>>& callees)
: m_n_outputs(n_outputs) : m_n_outputs(n_outputs)
, m_n_inputs(n_inputs) , m_n_inputs(n_inputs)
, m_tensor_views(n_inputs + n_outputs + temps.size()) , m_tensor_views(n_outputs + n_inputs + temps.size())
, m_compiled_function(compiled_function) , m_compiled_function(compiled_function)
, m_callees(callees)
{ {
copy(temps.begin(), temps.end(), m_tensor_views.begin() + m_n_outputs + m_n_inputs); copy(temps.begin(), temps.end(), m_tensor_views.begin() + m_n_outputs + m_n_inputs);
} }
...@@ -40,7 +41,7 @@ void CallFrame::tensor_call( ...@@ -40,7 +41,7 @@ void CallFrame::tensor_call(
copy(inputs.begin(), inputs.end(), m_tensor_views.begin() + m_n_outputs); copy(inputs.begin(), inputs.end(), m_tensor_views.begin() + m_n_outputs);
// Invoke compiled computation // Invoke compiled computation
m_compiled_function(this, m_tensor_views); m_compiled_function(this, m_tensor_views, m_callees);
// Don't hold onto inputs/outputs // Don't hold onto inputs/outputs
fill_n(m_tensor_views.begin(), m_n_outputs + m_n_inputs, nullptr); fill_n(m_tensor_views.begin(), m_n_outputs + m_n_inputs, nullptr);
......
...@@ -31,8 +31,10 @@ namespace ngraph ...@@ -31,8 +31,10 @@ namespace ngraph
namespace cpu namespace cpu
{ {
class CallFrame; class CallFrame;
using EntryPoint = std::function<void(ngraph::runtime::cpu::CallFrame*, using EntryPoint = std::function<void(ngraph::runtime::cpu::CallFrame*,
ngraph::runtime::TensorViewPtrs&)>; ngraph::runtime::TensorViewPtrs&,
const std::vector<std::shared_ptr<CallFrame>>&)>;
// Compile and execute graphs // Compile and execute graphs
class CallFrame : public ngraph::runtime::CallFrame class CallFrame : public ngraph::runtime::CallFrame
...@@ -41,7 +43,8 @@ namespace ngraph ...@@ -41,7 +43,8 @@ namespace ngraph
CallFrame(EntryPoint compiled_function, CallFrame(EntryPoint compiled_function,
size_t n_outputs, size_t n_outputs,
size_t n_inputs, size_t n_inputs,
const TensorViewPtrs& temps); const TensorViewPtrs& temps,
const std::vector<std::shared_ptr<CallFrame>>& callees);
/// @brief Invoke the function with values matching the signature of the function. /// @brief Invoke the function with values matching the signature of the function.
/// ///
...@@ -73,6 +76,7 @@ namespace ngraph ...@@ -73,6 +76,7 @@ namespace ngraph
TensorViewPtrs m_tensor_views; TensorViewPtrs m_tensor_views;
bool m_return; bool m_return;
EntryPoint m_compiled_function; EntryPoint m_compiled_function;
std::vector<std::shared_ptr<CallFrame>> m_callees;
}; };
} }
} }
......
// ----------------------------------------------------------------------------
// Copyright 2017 Nervana Systems Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// ----------------------------------------------------------------------------
#include "ngraph/runtime/cpu/cpu_kernels.hpp"
// ----------------------------------------------------------------------------
// Copyright 2017 Nervana Systems Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// ----------------------------------------------------------------------------
#pragma once
#include "ngraph/types/element_type.hpp"
// CBLAS types and wrappers
// Minimal local declarations of the CBLAS interface so that code emitted by
// the CPU backend can call sgemm without including the full MKL/CBLAS header.
// NOTE(review): enum values appear to mirror the standard CBLAS integer
// constants (e.g. CblasRowMajor == 101) — confirm against the linked BLAS.
namespace cblas
{
    // Matrix storage order.
    enum class Layout
    {
        RowMajor = 101,
        ColMajor = 102
    };

    // Transposition to apply to an operand.
    enum class Transpose
    {
        None = 111,
        Transpose = 112,
        ConjTrans = 113
    };

    // Which triangle of a matrix is referenced.
    enum class UpperLower
    {
        Upper = 121,
        Lower = 122
    };

    // Whether the diagonal is assumed to be all ones.
    enum class Diag
    {
        NonUnit = 131,
        Unit = 132
    };

    // Side on which a matrix operand is applied.
    enum class Side
    {
        Left = 141,
        Right = 142
    };

    // Packed-storage indicator.
    enum class Storage
    {
        Packed = 151
    };

    // Identifies which operand (A or B) a parameter refers to.
    enum class Ident
    {
        AMatrix = 161,
        BMatrix = 162
    };

    // Offset interpretation for offset-accepting GEMM variants.
    enum class Offset
    {
        RowOffset = 171,
        ColOffset = 172,
        FixOffset = 173
    };

    extern "C" {
    // Single-precision general matrix multiply: C = alpha*op(A)*op(B) + beta*C.
    // M/N/K are the result and inner dimensions; lda/ldb/ldc are leading dims.
    void cblas_sgemm(const Layout layout,
                     const Transpose TransA,
                     const Transpose TransB,
                     const ngraph::element::Int64::type M,
                     const ngraph::element::Int64::type N,
                     const ngraph::element::Int64::type K,
                     const ngraph::element::Float32::type alpha,
                     const ngraph::element::Float32::type* A,
                     const ngraph::element::Int64::type lda,
                     const ngraph::element::Float32::type* B,
                     const ngraph::element::Int64::type ldb,
                     const ngraph::element::Float32::type beta,
                     ngraph::element::Float32::type* C,
                     const ngraph::element::Int64::type ldc);
    }
}
// Minimal local declaration of MKL's out-of-place matrix copy/transpose so
// emitted code can link against it without the MKL headers.
namespace mkl
{
    extern "C" {
    // Out-of-place scaled copy/transpose of a rows x cols matrix:
    // B = alpha * op(A). 'ordering' selects row/column major ('R'/'C');
    // 'trans' selects transpose ('T') or plain copy ('N').
    // See Intel MKL mkl_somatcopy documentation for exact semantics.
    void MKL_Somatcopy(char ordering,
                       char trans,
                       size_t rows,
                       size_t cols,
                       const ngraph::element::Float32::type alpha,
                       const ngraph::element::Float32::type* A,
                       size_t lda,
                       ngraph::element::Float32::type* B,
                       size_t ldb);
    }
}
...@@ -12,6 +12,7 @@ ...@@ -12,6 +12,7 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// ---------------------------------------------------------------------------- // ----------------------------------------------------------------------------
#include <algorithm>
#include <iostream> #include <iostream>
#include <string> #include <string>
#include <typeindex> #include <typeindex>
...@@ -23,7 +24,13 @@ ...@@ -23,7 +24,13 @@
#include "ngraph/ops/broadcast.hpp" #include "ngraph/ops/broadcast.hpp"
#include "ngraph/ops/concatenate.hpp" #include "ngraph/ops/concatenate.hpp"
#include "ngraph/ops/constant.hpp" #include "ngraph/ops/constant.hpp"
#include "ngraph/ops/function_call.hpp"
#include "ngraph/ops/get_tuple_element.hpp" #include "ngraph/ops/get_tuple_element.hpp"
#include "ngraph/ops/reduce.hpp"
#include "ngraph/ops/reshape.hpp"
#include "ngraph/ops/slice.hpp"
#include "ngraph/ops/sum.hpp"
#include "ngraph/runtime/cpu/call_frame.hpp"
#include "ngraph/runtime/cpu/emitter.hpp" #include "ngraph/runtime/cpu/emitter.hpp"
#include "ngraph/runtime/cpu/external_function.hpp" #include "ngraph/runtime/cpu/external_function.hpp"
#include "ngraph/runtime/tensor_view_info.hpp" #include "ngraph/runtime/tensor_view_info.hpp"
...@@ -177,30 +184,58 @@ void Emitter::EMITTER_DECL(EmitDot) ...@@ -177,30 +184,58 @@ void Emitter::EMITTER_DECL(EmitDot)
auto arg1_layout = inputs[1].get_layout<DenseTensorViewLayout>(); auto arg1_layout = inputs[1].get_layout<DenseTensorViewLayout>();
auto out_layout = outputs[0].get_layout<DenseTensorViewLayout>(); auto out_layout = outputs[0].get_layout<DenseTensorViewLayout>();
TU += // Emit an MKL SGEMM call if possible
" {\n" // clang-format off
" auto arg0 = call_frame->get_tensor_view_data<" + if (arg0_element_type == ngraph::element::Float32::element_type())
element_type_names[TI(arg0_element_type)] + ">(" + to_string(inputs[0].get_index()) + {
");\n" TU +=
" auto arg1 = call_frame->get_tensor_view_data<" + " {\n"
element_type_names[TI(arg0_element_type)] + ">(" + to_string(inputs[1].get_index()) + " auto arg0 = call_frame->get_tensor_view_data<" +
");\n" element_type_names[TI(arg0_element_type)] + ">(" + to_string(inputs[0].get_index()) +
" auto out = call_frame->get_tensor_view_data<" + ");\n"
element_type_names[TI(arg0_element_type)] + ">(" + to_string(outputs[0].get_index()) + " auto arg1 = call_frame->get_tensor_view_data<" +
");\n" element_type_names[TI(arg0_element_type)] + ">(" + to_string(inputs[1].get_index()) +
" EigenMatrix<" + ");\n"
element_type_names[TI(arg0_element_type)] + ">(out, " + " auto out = call_frame->get_tensor_view_data<" +
EIGEN_MATRIX_FORMAT(out_layout->get_shape(), out_layout->get_strides()) + element_type_names[TI(arg0_element_type)] + ">(" + to_string(outputs[0].get_index()) +
") = \n" ");\n"
" EigenMatrix<" + " cblas::cblas_sgemm(cblas::Layout::RowMajor, cblas::Transpose::None, cblas::Transpose::None, " +
element_type_names[TI(arg0_element_type)] + ">(arg0, " + to_string(arg0_shape[0]) + ", " + to_string(arg1_shape[1]) + ", " + to_string(arg0_shape[1]) + ",\n"
EIGEN_MATRIX_FORMAT(arg0_layout->get_shape(), arg0_layout->get_strides()) + " 1.0f, arg0, " + to_string(max(1UL, arg0_shape[1])) + ", arg1, " + to_string(max(1UL, arg1_shape[1])) + ", 0.0f,\n"
") * " " out, " + to_string(max(1UL, arg1_shape[1])) + ");\n"
"EigenMatrix<" + " }\n";
element_type_names[TI(arg0_element_type)] + ">(arg1, " + }
EIGEN_MATRIX_FORMAT(arg1_layout->get_shape(), arg1_layout->get_strides()) + // clang-format on
");\n" else
" }\n"; {
TU +=
" {\n"
" auto arg0 = call_frame->get_tensor_view_data<" +
element_type_names[TI(arg0_element_type)] + ">(" +
to_string(inputs[0].get_index()) +
");\n"
" auto arg1 = call_frame->get_tensor_view_data<" +
element_type_names[TI(arg0_element_type)] + ">(" +
to_string(inputs[1].get_index()) +
");\n"
" auto out = call_frame->get_tensor_view_data<" +
element_type_names[TI(arg0_element_type)] + ">(" +
to_string(outputs[0].get_index()) +
");\n"
" EigenMatrix<" +
element_type_names[TI(arg0_element_type)] + ">(out, " +
EIGEN_MATRIX_FORMAT(out_layout->get_shape(), out_layout->get_strides()) +
") = \n"
" EigenMatrix<" +
element_type_names[TI(arg0_element_type)] + ">(arg0, " +
EIGEN_MATRIX_FORMAT(arg0_layout->get_shape(), arg0_layout->get_strides()) +
") * "
"EigenMatrix<" +
element_type_names[TI(arg0_element_type)] + ">(arg1, " +
EIGEN_MATRIX_FORMAT(arg1_layout->get_shape(), arg1_layout->get_strides()) +
");\n"
" }\n";
}
} }
else else
{ {
...@@ -501,7 +536,26 @@ void Emitter::EMITTER_DECL(EmitMaximum) ...@@ -501,7 +536,26 @@ void Emitter::EMITTER_DECL(EmitMaximum)
" EigenArray1d<" + element_type_names[TI(et)] + ">(out, " " EigenArray1d<" + element_type_names[TI(et)] + ">(out, "
EIGEN_VECTOR_FORMAT(outputs[0].get_layout<DenseTensorViewLayout>()->get_size()) ") =\n" EIGEN_VECTOR_FORMAT(outputs[0].get_layout<DenseTensorViewLayout>()->get_size()) ") =\n"
" EigenArray1d<" + element_type_names[TI(et)] + ">(arg0, " " EigenArray1d<" + element_type_names[TI(et)] + ">(arg0, "
EIGEN_VECTOR_FORMAT(inputs[0].get_layout<DenseTensorViewLayout>()->get_size()) ").max(" EIGEN_VECTOR_FORMAT(inputs[0].get_layout<DenseTensorViewLayout>()->get_size()) ").max(\n"
" EigenArray1d<" + element_type_names[TI(et)] + ">(arg1, "
EIGEN_VECTOR_FORMAT(inputs[1].get_layout<DenseTensorViewLayout>()->get_size()) "));\n"
" }\n";
}
void Emitter::EMITTER_DECL(EmitMinimum)
{
const element::Type& et =
(dynamic_pointer_cast<const TensorViewType>(n->get_arguments().at(0)->get_value_type()))
->get_element_type();
TU += " {\n"
" auto arg0 = call_frame->get_tensor_view_data<" + element_type_names[TI(et)] + ">(" + to_string(inputs[0].get_index()) + ");\n"
" auto arg1 = call_frame->get_tensor_view_data<" + element_type_names[TI(et)] + ">(" + to_string(inputs[1].get_index()) + ");\n"
" auto out = call_frame->get_tensor_view_data<" + element_type_names[TI(et)] + ">(" + to_string(outputs[0].get_index()) + ");\n"
" EigenArray1d<" + element_type_names[TI(et)] + ">(out, "
EIGEN_VECTOR_FORMAT(outputs[0].get_layout<DenseTensorViewLayout>()->get_size()) ") =\n"
" EigenArray1d<" + element_type_names[TI(et)] + ">(arg0, "
EIGEN_VECTOR_FORMAT(inputs[0].get_layout<DenseTensorViewLayout>()->get_size()) ").min(\n"
" EigenArray1d<" + element_type_names[TI(et)] + ">(arg1, " " EigenArray1d<" + element_type_names[TI(et)] + ">(arg1, "
EIGEN_VECTOR_FORMAT(inputs[1].get_layout<DenseTensorViewLayout>()->get_size()) "));\n" EIGEN_VECTOR_FORMAT(inputs[1].get_layout<DenseTensorViewLayout>()->get_size()) "));\n"
" }\n"; " }\n";
...@@ -882,3 +936,775 @@ void Emitter::EMITTER_DECL(EmitConvert) ...@@ -882,3 +936,775 @@ void Emitter::EMITTER_DECL(EmitConvert)
".template cast<typename " + element_type_names[TI(result_element_type)] + "::type>();\n" ".template cast<typename " + element_type_names[TI(result_element_type)] + "::type>();\n"
" }\n"; " }\n";
} }
// Emits code that assigns the constant's literal values (as a braced
// std::vector initializer) into the output tensor view.
void Emitter::EMITTER_DECL(EmitConstant)
{
    auto constant_op = static_cast<const op::Constant*>(n);
    auto tensor_type = dynamic_pointer_cast<const TensorViewType>(constant_op->get_value_type());
    assert(tensor_type);
    auto& element_type = tensor_type->get_element_type();

    // Join the constant's value strings with ", " into one initializer list.
    std::string initializer;
    bool first_value = true;
    for (const auto& value_string : constant_op->get_value_strings())
    {
        if (!first_value)
        {
            initializer += ", ";
        }
        initializer += value_string;
        first_value = false;
    }

    const std::string& type_name = element_type_names[TI(element_type)];
    TU +=
        " {\n"
        " call_frame->get_parameterized_tensor_view<" +
        type_name + ">(" + to_string(outputs[0].get_index()) +
        ")->get_vector() = std::vector<" + type_name + "::type>{" +
        initializer + "};\n }\n";
}
// Emits code for Reshape. Three cases are handled:
//   1. No axis reordering (or a degenerate result) -> plain vector copy.
//   2. Rank-2 input with reordered axes -> a 2D transpose, via MKL for
//      Float32 or Eigen otherwise.
//   3. Rank > 2 with reordered axes -> not implemented yet (throws).
void Emitter::EMITTER_DECL(EmitReshape)
{
    auto reshape = static_cast<const op::Reshape*>(n);

    auto arg_type = reshape->get_arguments().at(0)->get_value_type();
    auto arg_tensor_view_type = dynamic_pointer_cast<const TensorViewType>(arg_type);
    assert(arg_tensor_view_type);
    auto arg_shape = arg_tensor_view_type->get_shape();
    auto arg_rank = arg_shape.size();

    auto result_type = reshape->get_value_type();
    auto result_tensor_view_type = dynamic_pointer_cast<const TensorViewType>(result_type);
    assert(result_tensor_view_type);
    auto result_shape = result_tensor_view_type->get_shape();
    auto& result_element_type = result_tensor_view_type->get_element_type();

    auto input_order = reshape->get_input_order();

    // A sorted input order means axes keep their relative order, i.e. the
    // element layout in memory is unchanged.
    bool same_layout = std::is_sorted(input_order.begin(), input_order.end());

    size_t result_shape_product = 1;
    for (auto i : result_shape)
    {
        result_shape_product *= i;
    }

    // If there is no layout change or we are just going from 1^n to 1^m or a zero-size tensor, we can just copy.
    if (same_layout || result_shape_product < 2)
    {
        TU +=
            " {\n"
            " call_frame->get_parameterized_tensor_view<" +
            element_type_names[TI(result_element_type)] + ">(" +
            to_string(outputs.at(0).get_index()) +
            ")->get_vector() =\n"
            " call_frame->get_parameterized_tensor_view<" +
            element_type_names[TI(result_element_type)] + ">(" +
            to_string(inputs.at(0).get_index()) +
            ")->get_vector();\n"
            " }\n";
    }
    // If there *is* a layout change in the 2D case, we transpose the input.
    else if (arg_rank == 2)
    {
        auto arg0_layout = inputs[0].get_layout<DenseTensorViewLayout>();
        auto out_layout = outputs[0].get_layout<DenseTensorViewLayout>();

        // Emit an MKL transpose call if possible
        // clang-format off
        if (result_element_type == ngraph::element::Float32::element_type())
        {
            // 'R' = row-major ordering, 'T' = transpose; shapes are baked into
            // the emitted call as compile-time constants of the generated TU.
            TU +=
                " {\n"
                " auto arg0 = call_frame->get_tensor_view_data<" + element_type_names[TI(result_element_type)] +
                ">(" + to_string(inputs[0].get_index()) + ");\n"
                " auto out = call_frame->get_tensor_view_data<" + element_type_names[TI(result_element_type)] +
                ">(" + to_string(outputs[0].get_index()) + ");\n"
                " mkl::MKL_Somatcopy('R', 'T', " + to_string(arg_shape[0]) + ",\n"
                " " + to_string(arg_shape[1]) + ", 1.0f,\n"
                " arg0, " + to_string(arg_shape[1]) + ",\n"
                " out, " + to_string(arg_shape[0]) + ");\n"
                " }\n";
        }
        // clang-format on
        else
        {
            // Fallback: Eigen matrix transpose for non-Float32 element types.
            TU +=
                " {\n"
                " auto arg0 = call_frame->get_tensor_view_data<" +
                element_type_names[TI(result_element_type)] + ">(" +
                to_string(inputs[0].get_index()) +
                ");\n"
                " auto out = call_frame->get_tensor_view_data<" +
                element_type_names[TI(result_element_type)] + ">(" +
                to_string(outputs[0].get_index()) +
                ");\n"
                " EigenMatrix<" +
                element_type_names[TI(result_element_type)] + ">(out, " +
                EIGEN_MATRIX_FORMAT(out_layout->get_shape(), out_layout->get_strides()) +
                ") =\n"
                " EigenMatrix<" +
                element_type_names[TI(result_element_type)] + ">(arg0, " +
                EIGEN_MATRIX_FORMAT(arg0_layout->get_shape(), arg0_layout->get_strides()) +
                ").transpose();\n"
                " }\n";
        }
    }
    // Other cases (reordering of axes for tensors with rank>2) are not handled yet.
    else
    {
        throw ngraph_error(
            "Axis permutation in reshape is not implemented yet for tensors with rank>2");
    }
}
// Emits code for FunctionCall: compiles (or reuses) the callee's
// ExternalFunction, registers a CallFrame for it in ef's callee list, and
// emits code that forwards this op's tensor views into that frame.
void Emitter::EMITTER_DECL(EmitFunctionCall)
{
    auto function_call = static_cast<const op::FunctionCall*>(n);
    auto function = function_call->get_function();

    // Look up (or lazily build) the callee's ExternalFunction. find() replaces
    // the original at() + catch, which used exceptions for ordinary control
    // flow and caught std::out_of_range by value.
    std::shared_ptr<ExternalFunction> external;
    auto it = function_map.find(function);
    if (it != function_map.end())
    {
        external = it->second;
    }
    else
    {
        external = make_shared<ExternalFunction>(function);
        function_map.insert({function, external});
    }

    // The compiled code indexes into `callees` at runtime, so record the
    // call frame's position now and bake that index into the emitted source.
    std::shared_ptr<CallFrame> cf =
        std::dynamic_pointer_cast<CallFrame>(external->make_call_frame());
    ef->get_callees().emplace_back(cf);

    TU +=
        " {\n"
        " auto cf = callees.at(" +
        to_string(ef->get_callees().size() - 1) +
        ");\n"
        " std::vector<std::shared_ptr<ngraph::runtime::Value>> inputs;\n"
        " std::vector<std::shared_ptr<ngraph::runtime::Value>> outputs;\n";

    // Forward each of this op's tensor views into the callee's input/output
    // lists in order.
    for (const auto& in : inputs)
    {
        TU += " inputs.emplace_back(call_frame->get_tensor_view(" +
              to_string(in.get_index()) + "));\n";
    }

    for (const auto& out : outputs)
    {
        TU += " outputs.emplace_back(call_frame->get_tensor_view(" +
              to_string(out.get_index()) + "));\n";
    }

    TU +=
        " (*cf)(inputs, outputs);\n"
        " }\n";
}
// TODO: This and other ops include comments/notes that
// we don't want to just copy-paste here. Figure out a better way
// or just point to ngvm/external_function.cpp with a note that
// the compiled version of these ops is intended to have semantics identical
// to what's seen there (for now atleast)
// Emits code for Reduce over vectors and matrices. The reduction function is
// compiled to its own CallFrame and invoked elementwise through a lambda in
// the emitted code. Zero-length reduction axes are special-cased to copy or
// broadcast the base element instead (see the long comment below).
void Emitter::EMITTER_DECL(EmitReduce)
{
    auto reduce = static_cast<const op::Reduce*>(n);
    auto reduction_function = reduce->get_reduction_function();

    // Look up (or lazily build) the reduction function. find() replaces the
    // original at() + catch, which used exceptions for ordinary control flow
    // and caught std::out_of_range by value.
    std::shared_ptr<ExternalFunction> external;
    auto it = function_map.find(reduction_function);
    if (it != function_map.end())
    {
        external = it->second;
    }
    else
    {
        external = make_shared<ExternalFunction>(reduction_function);
        function_map.insert({reduction_function, external});
    }

    auto reductee_type = reduce->get_arguments().at(0)->get_value_type();
    auto reductee_tensor_view_type = dynamic_pointer_cast<const TensorViewType>(reductee_type);
    assert(reductee_tensor_view_type);
    auto reductee_shape = reductee_tensor_view_type->get_shape();

    auto f_result_type = reduction_function->get_result_type();
    auto f_result_tensor_view_type = dynamic_pointer_cast<const TensorViewType>(f_result_type);
    assert(f_result_tensor_view_type);
    auto& f_result_element_type = f_result_tensor_view_type->get_element_type();

    auto result_type = reduce->get_value_type();
    auto result_tensor_view_type = dynamic_pointer_cast<const TensorViewType>(result_type);
    assert(result_tensor_view_type);
    auto result_shape = result_tensor_view_type->get_shape();

    auto& reduction_axes = reduce->get_reduction_axes();
    auto arg0_layout = inputs[0].get_layout<DenseTensorViewLayout>();

    // Trivial case: no reduction axes (this includes the scalar-reductee case).
    if (reduction_axes.empty())
    {
        TU +=
            " {\n"
            " call_frame->get_parameterized_tensor_view<" +
            element_type_names[TI(f_result_element_type)] + ">(" +
            to_string(outputs.at(0).get_index()) +
            ")->get_vector() =\n"
            " call_frame->get_parameterized_tensor_view<" +
            element_type_names[TI(f_result_element_type)] + ">(" +
            to_string(inputs.at(0).get_index()) +
            ")->get_vector();\n"
            " }\n";
    }
    // Behavior for zero-size axes bears some explanation here. XLA's reduce
    // operator provides an "base" element (usually, but not necessarily,
    // an identity element) that it apparently *may* choose to insert anywhere
    // in the reduction any number of times. For example, given:
    //
    //   reduce{{1,2,3},b,+)
    //
    // any of the following are valid reductions (I think!):
    //
    //   b+(b+1+2)+3
    //   b+(1+(2+3))
    //   (1+2)+3 (I think!)
    //
    // etc. Here we will choose never to instantiate the base element, which
    // works well with Eigen's default behavior for non-zero-length axes. The
    // exceptional case is when we reduce on a zero-length axis. In this case,
    // Eigen's default behavior is to put a zero in the output,  which is not
    // what we want, so we detect that case here and override with a copy
    // instruction (for reduce-to-scalar) or a broadcast (for reduce-to-vector)
    // from the base element.
    //
    // What I'm actually not sure about is whether the identity element is
    // required to appear at least once. If so, this will need to be reworked,
    // assuming we actually want to mimic XLA's semantics that closely, which
    // we may not.
    else if ((reductee_shape.size() == 1 && reduction_axes == AxisSet{0}) ||
             (reductee_shape.size() == 2 && reduction_axes == AxisSet{0, 1}))
    {
        if (reductee_shape.at(0) == 0 || (reductee_shape.size() == 2 && reductee_shape.at(1) == 0))
        {
            // Zero-length axis: copy the base element (second input) through.
            TU +=
                " {\n"
                " call_frame->get_parameterized_tensor_view<" +
                element_type_names[TI(f_result_element_type)] + ">(" +
                to_string(outputs.at(0).get_index()) +
                ")->get_vector() =\n"
                " call_frame->get_parameterized_tensor_view<" +
                element_type_names[TI(f_result_element_type)] + ">(" +
                to_string(inputs.at(1).get_index()) +
                ")->get_vector();\n"
                " }\n";
        }
        else
        {
            // Reduce-to-scalar: wrap the compiled reduction function in a
            // scalar binary lambda and hand it to Eigen's redux().
            std::shared_ptr<CallFrame> cf =
                std::dynamic_pointer_cast<CallFrame>(external->make_call_frame());
            ef->get_callees().emplace_back(cf);

            TU +=
                " {\n"
                " using ET = " + element_type_names[TI(f_result_element_type)] + ";\n"
                " auto cf = callees.at(" + to_string(ef->get_callees().size() - 1) + ");\n"
                " auto f = [cf](typename ET::type x, typename ET::type y) -> typename ET::type {\n"
                " auto tx = ngraph::runtime::make_tensor<ET>(ngraph::Shape{});\n"
                " *tx = std::vector<typename ET::type>({x});\n"
                " auto ty = ngraph::runtime::make_tensor<ET>(ngraph::Shape{});\n"
                " *ty = std::vector<typename ET::type>({y});\n"
                " auto tr = ngraph::runtime::make_tensor<ET>(ngraph::Shape{});\n"
                " (*cf)({tx, ty}, {tr});\n"
                " return tr->get_vector()[0];\n"
                " };\n"
                " auto arg0 = call_frame->get_tensor_view_data<" + element_type_names[TI(f_result_element_type)] +
                ">(" + to_string(inputs[0].get_index()) + ");\n"
                " auto out = call_frame->get_tensor_view_data<" + element_type_names[TI(f_result_element_type)] +
                ">(" + to_string(outputs[0].get_index()) + ");\n"
                " EigenArray1d<" + element_type_names[TI(f_result_element_type)] + ">(out, "
                EIGEN_VECTOR_FORMAT(outputs[0].get_layout<DenseTensorViewLayout>()->get_size()) ") =\n"
                " EigenArray1d<" + element_type_names[TI(f_result_element_type)] + ">(arg0, "
                EIGEN_VECTOR_FORMAT(inputs[0].get_layout<DenseTensorViewLayout>()->get_size()) ").redux(f);\n"
                " }\n";
        }
    }
    else if (reductee_shape.size() == 2 && reduction_axes == AxisSet{1})
    {
        if (reductee_shape.at(1) == 0)
        {
            // Zero-length reduction axis: broadcast the base element.
            TU += " {\n"
                " auto arg1 = call_frame->get_tensor_view_data<" + element_type_names[TI(f_result_element_type)] +
                ">(" + to_string(inputs[1].get_index()) + ");\n"
                " auto out = call_frame->get_tensor_view_data<" + element_type_names[TI(f_result_element_type)] +
                ">(" + to_string(outputs[0].get_index()) + ");\n"
                " EigenArray1d<" + element_type_names[TI(f_result_element_type)] + ">(out, "
                EIGEN_VECTOR_FORMAT(outputs[0].get_layout<DenseTensorViewLayout>()->get_size()) ") =\n"
                " EigenArray1d<" + element_type_names[TI(f_result_element_type)] + ">(arg1, "
                EIGEN_VECTOR_FORMAT(inputs[1].get_layout<DenseTensorViewLayout>()->get_size()) ")(0, 0);\n"
                " }\n";
        }
        else
        {
            // Row-wise reduction via Eigen's rowwise().redux(f).
            std::shared_ptr<CallFrame> cf =
                std::dynamic_pointer_cast<CallFrame>(external->make_call_frame());
            ef->get_callees().emplace_back(cf);

            TU +=
                " {\n"
                " using ET = " + element_type_names[TI(f_result_element_type)] + ";\n"
                " auto cf = callees.at(" + to_string(ef->get_callees().size() - 1) + ");\n"
                " auto f = [cf](typename ET::type x, typename ET::type y) -> typename ET::type {\n"
                " auto tx = ngraph::runtime::make_tensor<ET>(ngraph::Shape{});\n"
                " *tx = std::vector<typename ET::type>({x});\n"
                " auto ty = ngraph::runtime::make_tensor<ET>(ngraph::Shape{});\n"
                " *ty = std::vector<typename ET::type>({y});\n"
                " auto tr = ngraph::runtime::make_tensor<ET>(ngraph::Shape{});\n"
                " (*cf)({tx, ty}, {tr});\n"
                " return tr->get_vector()[0];\n"
                " };\n"
                " auto arg0 = call_frame->get_tensor_view_data<" + element_type_names[TI(f_result_element_type)] +
                ">(" + to_string(inputs[0].get_index()) + ");\n"
                " auto out = call_frame->get_tensor_view_data<" + element_type_names[TI(f_result_element_type)] +
                ">(" + to_string(outputs[0].get_index()) + ");\n"
                " EigenVector<" + element_type_names[TI(f_result_element_type)] + ">(out, "
                EIGEN_VECTOR_FORMAT(outputs[0].get_layout<DenseTensorViewLayout>()->get_size()) ") =\n"
                " EigenMatrix<" + element_type_names[TI(f_result_element_type)] + ">(arg0, " +
                EIGEN_MATRIX_FORMAT(arg0_layout->get_shape(), arg0_layout->get_strides()) + ").rowwise().redux(f);\n"
                " }\n";
        }
    }
    else if (reductee_shape.size() == 2 && reduction_axes == AxisSet{0})
    {
        if (reductee_shape.at(0) == 0)
        {
            // Zero-length reduction axis: broadcast the base element.
            TU += " {\n"
                " auto arg1 = call_frame->get_tensor_view_data<" + element_type_names[TI(f_result_element_type)] +
                ">(" + to_string(inputs[1].get_index()) + ");\n"
                " auto out = call_frame->get_tensor_view_data<" + element_type_names[TI(f_result_element_type)] +
                ">(" + to_string(outputs[0].get_index()) + ");\n"
                " EigenArray1d<" + element_type_names[TI(f_result_element_type)] + ">(out, "
                EIGEN_VECTOR_FORMAT(outputs[0].get_layout<DenseTensorViewLayout>()->get_size()) ") =\n"
                " EigenArray1d<" + element_type_names[TI(f_result_element_type)] + ">(arg1, "
                EIGEN_VECTOR_FORMAT(inputs[1].get_layout<DenseTensorViewLayout>()->get_size()) ")(0, 0);\n"
                " }\n";
        }
        else
        {
            // Column-wise reduction via Eigen's colwise().redux(f).
            std::shared_ptr<CallFrame> cf =
                std::dynamic_pointer_cast<CallFrame>(external->make_call_frame());
            ef->get_callees().emplace_back(cf);

            TU +=
                " {\n"
                " using ET = " + element_type_names[TI(f_result_element_type)] + ";\n"
                " auto cf = callees.at(" + to_string(ef->get_callees().size() - 1) + ");\n"
                " auto f = [cf](typename ET::type x, typename ET::type y) -> typename ET::type {\n"
                " auto tx = ngraph::runtime::make_tensor<ET>(ngraph::Shape{});\n"
                " *tx = std::vector<typename ET::type>({x});\n"
                " auto ty = ngraph::runtime::make_tensor<ET>(ngraph::Shape{});\n"
                " *ty = std::vector<typename ET::type>({y});\n"
                " auto tr = ngraph::runtime::make_tensor<ET>(ngraph::Shape{});\n"
                " (*cf)({tx, ty}, {tr});\n"
                " return tr->get_vector()[0];\n"
                " };\n"
                " auto arg0 = call_frame->get_tensor_view_data<" + element_type_names[TI(f_result_element_type)] +
                ">(" + to_string(inputs[0].get_index()) + ");\n"
                " auto out = call_frame->get_tensor_view_data<" + element_type_names[TI(f_result_element_type)] +
                ">(" + to_string(outputs[0].get_index()) + ");\n"
                " EigenVector<" + element_type_names[TI(f_result_element_type)] + ">(out, "
                EIGEN_VECTOR_FORMAT(outputs[0].get_layout<DenseTensorViewLayout>()->get_size()) ") =\n"
                " EigenMatrix<" + element_type_names[TI(f_result_element_type)] + ">(arg0, " +
                EIGEN_MATRIX_FORMAT(arg0_layout->get_shape(), arg0_layout->get_strides()) + ").colwise().redux(f);\n"
                " }\n";
        }
    }
    else
    {
        throw ngraph_error("Reduce: only vectors and matrices are currently supported");
    }
}
// Emits code applying the elementwise sign function to the single input,
// via Eigen's Array::sign(), writing the result to the output tensor view.
void Emitter::EMITTER_DECL(EmitSign)
{
    // Element type of the input; the output is emitted with the same type.
    const element::Type& et =
        (dynamic_pointer_cast<const TensorViewType>(n->get_arguments().at(0)->get_value_type()))
            ->get_element_type();

    TU += " {\n"
        " auto arg0 = call_frame->get_tensor_view_data<" + element_type_names[TI(et)] + ">(" + to_string(inputs[0].get_index()) + ");\n"
        " auto out = call_frame->get_tensor_view_data<" + element_type_names[TI(et)] + ">(" + to_string(outputs[0].get_index()) + ");\n"
        " EigenArray1d<" + element_type_names[TI(et)] + ">(out, "
        EIGEN_VECTOR_FORMAT(outputs[0].get_layout<DenseTensorViewLayout>()->get_size()) ") =\n"
        " EigenArray1d<" + element_type_names[TI(et)] + ">(arg0, "
        EIGEN_VECTOR_FORMAT(inputs[0].get_layout<DenseTensorViewLayout>()->get_size()) ").sign();\n"
        " }\n";
}
void Emitter::EMITTER_DECL(EmitSlice)
{
    // Emits generated code copying a sub-region of the input tensor into the
    // output. Only unit step is supported; rank 0 (scalar copy), rank 1
    // (vector segment) and rank 2 (matrix block) inputs are handled.
    auto slice = static_cast<const op::Slice*>(n);

    // Strided slices would need strided Eigen maps; reject them for now.
    for (auto d : slice->get_step())
    {
        if (1 != d)
        {
            throw ngraph_error("Slice does not support non-unit step yet");
        }
    }

    auto arg_type = slice->get_arguments().at(0)->get_value_type();
    auto arg_tensor_view_type = dynamic_pointer_cast<const TensorViewType>(arg_type);
    assert(arg_tensor_view_type);
    auto arg_shape = arg_tensor_view_type->get_shape();
    auto arg_rank = arg_shape.size();
    auto& arg_element_type = arg_tensor_view_type->get_element_type();

    auto& lower_bounds = slice->get_lower_bounds();
    auto& upper_bounds = slice->get_upper_bounds();

    // Scalar slice is necessarily just a copy.
    if (arg_rank == 0)
    {
        TU +=
            "    {\n"
            "        call_frame->get_parameterized_tensor_view<" +
            element_type_names[TI(arg_element_type)] + ">(" + to_string(outputs.at(0).get_index()) +
            ")->get_vector() =\n"
            "        call_frame->get_parameterized_tensor_view<" +
            element_type_names[TI(arg_element_type)] + ">(" + to_string(inputs.at(0).get_index()) +
            ")->get_vector();\n"
            "    }\n";
    }
    // Vector slice: contiguous segment [lower, upper) of the 1-d data.
    else if (arg_rank == 1)
    {
        TU +=
            "    {\n"
            "        auto arg0 = call_frame->get_tensor_view_data<" + element_type_names[TI(arg_element_type)] +
            ">(" + to_string(inputs[0].get_index()) + ");\n"
            "        auto out = call_frame->get_tensor_view_data<" + element_type_names[TI(arg_element_type)] +
            ">(" + to_string(outputs[0].get_index()) + ");\n"
            "        EigenVector<" + element_type_names[TI(arg_element_type)] +
            ">(out, " EIGEN_VECTOR_FORMAT(outputs[0].get_layout<DenseTensorViewLayout>()->get_size()) ") =\n"
            "            EigenVector<" + element_type_names[TI(arg_element_type)] +
            ">(arg0, " EIGEN_VECTOR_FORMAT(inputs[0].get_layout<DenseTensorViewLayout>()->get_size()) ").segment(\n"
            "            " + to_string(lower_bounds[0]) + ", " + to_string(upper_bounds[0] - lower_bounds[0]) + ");\n"
            "    }\n";
    }
    // Matrix slice: Eigen .block(row, col, n_rows, n_cols) on the 2-d map.
    else if (arg_rank == 2)
    {
        auto arg0_layout = inputs[0].get_layout<DenseTensorViewLayout>();
        auto out_layout = outputs[0].get_layout<DenseTensorViewLayout>();

        TU +=
            "    {\n"
            "        auto arg0 = call_frame->get_tensor_view_data<" +
            element_type_names[TI(arg_element_type)] + ">(" + to_string(inputs[0].get_index()) +
            ");\n"
            "        auto out = call_frame->get_tensor_view_data<" +
            element_type_names[TI(arg_element_type)] + ">(" + to_string(outputs[0].get_index()) +
            ");\n"
            "        EigenMatrix<" +
            element_type_names[TI(arg_element_type)] + ">(out, " +
            EIGEN_MATRIX_FORMAT(out_layout->get_shape(), out_layout->get_strides()) +
            ") = \n"
            "            EigenMatrix<" +
            element_type_names[TI(arg_element_type)] + ">(arg0, " +
            EIGEN_MATRIX_FORMAT(arg0_layout->get_shape(), arg0_layout->get_strides()) + ").block(" +
            to_string(lower_bounds[0]) + ", " + to_string(lower_bounds[1]) +
            ",\n"
            "            " +
            to_string(upper_bounds[0] - lower_bounds[0]) +
            ",\n"
            "            " +
            to_string(upper_bounds[1] - lower_bounds[1]) +
            ");\n"
            "    }\n";
    }
    // Other cases (reordering of axes for tensors with rank>2) are not handled yet.
    else
    {
        throw ngraph_error("Slice is not implemented yet for tensors with rank>2");
    }
}
void Emitter::EMITTER_DECL(EmitSum)
{
    // Emits generated code summing the input tensor over the requested
    // reduction axes. Supported cases: no reduction (plain copy), full
    // reduction of a vector or matrix (sum to scalar), and single-axis
    // reduction of a matrix (row-wise / column-wise sum).
    auto s = static_cast<const op::Sum*>(n);
    auto s_tensor_view_type = dynamic_pointer_cast<const TensorViewType>(s->get_value_type());
    assert(s_tensor_view_type);
    auto& s_element_type = s_tensor_view_type->get_element_type();
    auto s_shape = s_tensor_view_type->get_shape();

    auto arg = s->get_arguments().at(0);
    auto arg_type = arg->get_value_type();
    auto arg_tensor_view_type = dynamic_pointer_cast<const TensorViewType>(arg_type);
    assert(arg_tensor_view_type);
    auto arg_shape = arg_tensor_view_type->get_shape();
    auto arg_rank = arg_shape.size();

    auto& reduction_axes = s->get_reduction_axes();

    // Trivial case: no reduction axes.
    if (reduction_axes.size() == 0)
    {
        TU +=
            "    {\n"
            "        call_frame->get_parameterized_tensor_view<" +
            element_type_names[TI(s_element_type)] + ">(" + to_string(outputs.at(0).get_index()) +
            ")->get_vector() =\n"
            "        call_frame->get_parameterized_tensor_view<" +
            element_type_names[TI(s_element_type)] + ">(" + to_string(inputs.at(0).get_index()) +
            ")->get_vector();\n"
            "    }\n";
    }
    // Full reduction? Then sum to scalar.
    else if ((arg_rank == 1 && reduction_axes == AxisSet{0}) ||
             (arg_rank == 2 && reduction_axes == AxisSet{0, 1}))
    {
        TU +=
            "    {\n"
            "        auto arg0 = call_frame->get_tensor_view_data<" + element_type_names[TI(s_element_type)] +
            ">(" + to_string(inputs[0].get_index()) + ");\n"
            "        auto out = call_frame->get_tensor_view_data<" + element_type_names[TI(s_element_type)] +
            ">(" + to_string(outputs[0].get_index()) + ");\n"
            "        EigenArray1d<" + element_type_names[TI(s_element_type)] + ">(out, "
            EIGEN_VECTOR_FORMAT(outputs[0].get_layout<DenseTensorViewLayout>()->get_size()) ") =\n"
            "            EigenArray1d<" + element_type_names[TI(s_element_type)] + ">(arg0, "
            EIGEN_VECTOR_FORMAT(inputs[0].get_layout<DenseTensorViewLayout>()->get_size()) ").sum();\n"
            "    }\n";
    }
    // Reduce along axis 1: one sum per row.
    else if (arg_rank == 2 && reduction_axes == AxisSet{1})
    {
        auto arg0_layout = inputs[0].get_layout<DenseTensorViewLayout>();

        TU +=
            "    {\n"
            "        auto arg0 = call_frame->get_tensor_view_data<" + element_type_names[TI(s_element_type)] +
            ">(" + to_string(inputs[0].get_index()) + ");\n"
            "        auto out = call_frame->get_tensor_view_data<" + element_type_names[TI(s_element_type)] +
            ">(" + to_string(outputs[0].get_index()) + ");\n"
            "        EigenVector<" + element_type_names[TI(s_element_type)] + ">(out, "
            EIGEN_VECTOR_FORMAT(outputs[0].get_layout<DenseTensorViewLayout>()->get_size()) ") =\n"
            "            EigenMatrix<" + element_type_names[TI(s_element_type)] + ">(arg0, " +
            EIGEN_MATRIX_FORMAT(arg0_layout->get_shape(), arg0_layout->get_strides()) + ").rowwise().sum();\n"
            "    }\n";
    }
    // Reduce along axis 0: one sum per column.
    else if (arg_rank == 2 && reduction_axes == AxisSet{0})
    {
        auto arg0_layout = inputs[0].get_layout<DenseTensorViewLayout>();

        TU +=
            "    {\n"
            "        auto arg0 = call_frame->get_tensor_view_data<" + element_type_names[TI(s_element_type)] +
            ">(" + to_string(inputs[0].get_index()) + ");\n"
            "        auto out = call_frame->get_tensor_view_data<" + element_type_names[TI(s_element_type)] +
            ">(" + to_string(outputs[0].get_index()) + ");\n"
            "        EigenVector<" + element_type_names[TI(s_element_type)] + ">(out, "
            EIGEN_VECTOR_FORMAT(outputs[0].get_layout<DenseTensorViewLayout>()->get_size()) ") =\n"
            "            EigenMatrix<" + element_type_names[TI(s_element_type)] + ">(arg0, " +
            EIGEN_MATRIX_FORMAT(arg0_layout->get_shape(), arg0_layout->get_strides()) + ").colwise().sum();\n"
            "    }\n";
    }
    else
    {
        throw ngraph_error("Sum: only vectors and matrices are currently supported");
    }
}
void Emitter::EMITTER_DECL(EmitExp)
{
    // Emits generated code for element-wise exp: out = exp(arg0), applied
    // over the flattened tensor via Eigen's 1-d array .exp().
    //
    // Fix: assert the dynamic_pointer_cast result before dereferencing it,
    // matching the validation style used by EmitSlice/EmitSum in this file.
    auto arg_tensor_view_type =
        dynamic_pointer_cast<const TensorViewType>(n->get_arguments().at(0)->get_value_type());
    assert(arg_tensor_view_type);
    const element::Type& et = arg_tensor_view_type->get_element_type();

    TU +=
        "    {\n"
        "        auto arg0 = call_frame->get_tensor_view_data<" + element_type_names[TI(et)] + ">(" +
        to_string(inputs[0].get_index()) + ");\n"
        "        auto out = call_frame->get_tensor_view_data<" + element_type_names[TI(et)] + ">(" +
        to_string(outputs[0].get_index()) + ");\n"
        "        EigenArray1d<" + element_type_names[TI(et)] + ">(out, "
        EIGEN_VECTOR_FORMAT(outputs[0].get_layout<DenseTensorViewLayout>()->get_size()) ") =\n"
        "            EigenArray1d<" + element_type_names[TI(et)] + ">(arg0, "
        EIGEN_VECTOR_FORMAT(inputs[0].get_layout<DenseTensorViewLayout>()->get_size()) ").exp();\n"
        "    }\n";
}
void Emitter::EMITTER_DECL(EmitSin)
{
    // Emits generated code for element-wise sine: out = sin(arg0), applied
    // over the flattened tensor via Eigen's 1-d array .sin().
    //
    // Fix: assert the dynamic_pointer_cast result before dereferencing it,
    // matching the validation style used by EmitSlice/EmitSum in this file.
    auto arg_tensor_view_type =
        dynamic_pointer_cast<const TensorViewType>(n->get_arguments().at(0)->get_value_type());
    assert(arg_tensor_view_type);
    const element::Type& et = arg_tensor_view_type->get_element_type();

    TU +=
        "    {\n"
        "        auto arg0 = call_frame->get_tensor_view_data<" + element_type_names[TI(et)] + ">(" +
        to_string(inputs[0].get_index()) + ");\n"
        "        auto out = call_frame->get_tensor_view_data<" + element_type_names[TI(et)] + ">(" +
        to_string(outputs[0].get_index()) + ");\n"
        "        EigenArray1d<" + element_type_names[TI(et)] + ">(out, "
        EIGEN_VECTOR_FORMAT(outputs[0].get_layout<DenseTensorViewLayout>()->get_size()) ") =\n"
        "            EigenArray1d<" + element_type_names[TI(et)] + ">(arg0, "
        EIGEN_VECTOR_FORMAT(inputs[0].get_layout<DenseTensorViewLayout>()->get_size()) ").sin();\n"
        "    }\n";
}
void Emitter::EMITTER_DECL(EmitSinh)
{
    // Emits generated code for element-wise hyperbolic sine: out = sinh(arg0),
    // applied over the flattened tensor via Eigen's 1-d array .sinh().
    //
    // Fix: assert the dynamic_pointer_cast result before dereferencing it,
    // matching the validation style used by EmitSlice/EmitSum in this file.
    auto arg_tensor_view_type =
        dynamic_pointer_cast<const TensorViewType>(n->get_arguments().at(0)->get_value_type());
    assert(arg_tensor_view_type);
    const element::Type& et = arg_tensor_view_type->get_element_type();

    TU +=
        "    {\n"
        "        auto arg0 = call_frame->get_tensor_view_data<" + element_type_names[TI(et)] + ">(" +
        to_string(inputs[0].get_index()) + ");\n"
        "        auto out = call_frame->get_tensor_view_data<" + element_type_names[TI(et)] + ">(" +
        to_string(outputs[0].get_index()) + ");\n"
        "        EigenArray1d<" + element_type_names[TI(et)] + ">(out, "
        EIGEN_VECTOR_FORMAT(outputs[0].get_layout<DenseTensorViewLayout>()->get_size()) ") =\n"
        "            EigenArray1d<" + element_type_names[TI(et)] + ">(arg0, "
        EIGEN_VECTOR_FORMAT(inputs[0].get_layout<DenseTensorViewLayout>()->get_size()) ").sinh();\n"
        "    }\n";
}
void Emitter::EMITTER_DECL(EmitCos)
{
    // Emits generated code for element-wise cosine: out = cos(arg0), applied
    // over the flattened tensor via Eigen's 1-d array .cos().
    //
    // Fix: assert the dynamic_pointer_cast result before dereferencing it,
    // matching the validation style used by EmitSlice/EmitSum in this file.
    auto arg_tensor_view_type =
        dynamic_pointer_cast<const TensorViewType>(n->get_arguments().at(0)->get_value_type());
    assert(arg_tensor_view_type);
    const element::Type& et = arg_tensor_view_type->get_element_type();

    TU +=
        "    {\n"
        "        auto arg0 = call_frame->get_tensor_view_data<" + element_type_names[TI(et)] + ">(" +
        to_string(inputs[0].get_index()) + ");\n"
        "        auto out = call_frame->get_tensor_view_data<" + element_type_names[TI(et)] + ">(" +
        to_string(outputs[0].get_index()) + ");\n"
        "        EigenArray1d<" + element_type_names[TI(et)] + ">(out, "
        EIGEN_VECTOR_FORMAT(outputs[0].get_layout<DenseTensorViewLayout>()->get_size()) ") =\n"
        "            EigenArray1d<" + element_type_names[TI(et)] + ">(arg0, "
        EIGEN_VECTOR_FORMAT(inputs[0].get_layout<DenseTensorViewLayout>()->get_size()) ").cos();\n"
        "    }\n";
}
void Emitter::EMITTER_DECL(EmitCosh)
{
    // Emits generated code for element-wise hyperbolic cosine: out = cosh(arg0),
    // applied over the flattened tensor via Eigen's 1-d array .cosh().
    //
    // Fix: assert the dynamic_pointer_cast result before dereferencing it,
    // matching the validation style used by EmitSlice/EmitSum in this file.
    auto arg_tensor_view_type =
        dynamic_pointer_cast<const TensorViewType>(n->get_arguments().at(0)->get_value_type());
    assert(arg_tensor_view_type);
    const element::Type& et = arg_tensor_view_type->get_element_type();

    TU +=
        "    {\n"
        "        auto arg0 = call_frame->get_tensor_view_data<" + element_type_names[TI(et)] + ">(" +
        to_string(inputs[0].get_index()) + ");\n"
        "        auto out = call_frame->get_tensor_view_data<" + element_type_names[TI(et)] + ">(" +
        to_string(outputs[0].get_index()) + ");\n"
        "        EigenArray1d<" + element_type_names[TI(et)] + ">(out, "
        EIGEN_VECTOR_FORMAT(outputs[0].get_layout<DenseTensorViewLayout>()->get_size()) ") =\n"
        "            EigenArray1d<" + element_type_names[TI(et)] + ">(arg0, "
        EIGEN_VECTOR_FORMAT(inputs[0].get_layout<DenseTensorViewLayout>()->get_size()) ").cosh();\n"
        "    }\n";
}
void Emitter::EMITTER_DECL(EmitTan)
{
    // Emits generated code for element-wise tangent: out = tan(arg0), applied
    // over the flattened tensor via Eigen's 1-d array .tan().
    //
    // Fix: assert the dynamic_pointer_cast result before dereferencing it,
    // matching the validation style used by EmitSlice/EmitSum in this file.
    auto arg_tensor_view_type =
        dynamic_pointer_cast<const TensorViewType>(n->get_arguments().at(0)->get_value_type());
    assert(arg_tensor_view_type);
    const element::Type& et = arg_tensor_view_type->get_element_type();

    TU +=
        "    {\n"
        "        auto arg0 = call_frame->get_tensor_view_data<" + element_type_names[TI(et)] + ">(" +
        to_string(inputs[0].get_index()) + ");\n"
        "        auto out = call_frame->get_tensor_view_data<" + element_type_names[TI(et)] + ">(" +
        to_string(outputs[0].get_index()) + ");\n"
        "        EigenArray1d<" + element_type_names[TI(et)] + ">(out, "
        EIGEN_VECTOR_FORMAT(outputs[0].get_layout<DenseTensorViewLayout>()->get_size()) ") =\n"
        "            EigenArray1d<" + element_type_names[TI(et)] + ">(arg0, "
        EIGEN_VECTOR_FORMAT(inputs[0].get_layout<DenseTensorViewLayout>()->get_size()) ").tan();\n"
        "    }\n";
}
void Emitter::EMITTER_DECL(EmitTanh)
{
    // Emits generated code for element-wise hyperbolic tangent over the
    // tensor's backing vector, one element at a time via std::transform.
    //
    // Fix: assert the dynamic_pointer_cast result before dereferencing it,
    // matching the validation style used by EmitSlice/EmitSum in this file.
    auto arg_tensor_view_type =
        dynamic_pointer_cast<const TensorViewType>(n->get_arguments().at(0)->get_value_type());
    assert(arg_tensor_view_type);
    const element::Type& et = arg_tensor_view_type->get_element_type();

    // Eigen's generic_fast_tanh_float<float> is currently miscompiled by Clang/LLVM
    // so we fall-back to std::tanh
    // TODO: Implement our own internal fast/approximate tanh if this actually gets used
    // by models
    TU +=
        "    {\n"
        "        auto& arg0 = call_frame->get_parameterized_tensor_view<" +
        element_type_names[TI(et)] + ">(" + to_string(inputs[0].get_index()) +
        ")->get_vector();\n"
        "        auto& out = call_frame->get_parameterized_tensor_view<" +
        element_type_names[TI(et)] + ">(" + to_string(outputs[0].get_index()) +
        ")->get_vector();\n"
        "        std::transform(arg0.begin(), arg0.end(), out.begin(), [](" +
        element_type_names[TI(et)] + "::type x) -> " + element_type_names[TI(et)] +
        "::type { return std::tanh(x); });\n"
        "    }\n";
}
void Emitter::EMITTER_DECL(EmitAsin)
{
    // Emits generated code for element-wise arcsine: out = asin(arg0), applied
    // over the flattened tensor via Eigen's 1-d array .asin().
    //
    // Fix: assert the dynamic_pointer_cast result before dereferencing it,
    // matching the validation style used by EmitSlice/EmitSum in this file.
    auto arg_tensor_view_type =
        dynamic_pointer_cast<const TensorViewType>(n->get_arguments().at(0)->get_value_type());
    assert(arg_tensor_view_type);
    const element::Type& et = arg_tensor_view_type->get_element_type();

    TU +=
        "    {\n"
        "        auto arg0 = call_frame->get_tensor_view_data<" + element_type_names[TI(et)] + ">(" +
        to_string(inputs[0].get_index()) + ");\n"
        "        auto out = call_frame->get_tensor_view_data<" + element_type_names[TI(et)] + ">(" +
        to_string(outputs[0].get_index()) + ");\n"
        "        EigenArray1d<" + element_type_names[TI(et)] + ">(out, "
        EIGEN_VECTOR_FORMAT(outputs[0].get_layout<DenseTensorViewLayout>()->get_size()) ") =\n"
        "            EigenArray1d<" + element_type_names[TI(et)] + ">(arg0, "
        EIGEN_VECTOR_FORMAT(inputs[0].get_layout<DenseTensorViewLayout>()->get_size()) ").asin();\n"
        "    }\n";
}
void Emitter::EMITTER_DECL(EmitAcos)
{
    // Emits generated code for element-wise arccosine: out = acos(arg0),
    // applied over the flattened tensor via Eigen's 1-d array .acos().
    //
    // Fix: assert the dynamic_pointer_cast result before dereferencing it,
    // matching the validation style used by EmitSlice/EmitSum in this file.
    auto arg_tensor_view_type =
        dynamic_pointer_cast<const TensorViewType>(n->get_arguments().at(0)->get_value_type());
    assert(arg_tensor_view_type);
    const element::Type& et = arg_tensor_view_type->get_element_type();

    TU +=
        "    {\n"
        "        auto arg0 = call_frame->get_tensor_view_data<" + element_type_names[TI(et)] + ">(" +
        to_string(inputs[0].get_index()) + ");\n"
        "        auto out = call_frame->get_tensor_view_data<" + element_type_names[TI(et)] + ">(" +
        to_string(outputs[0].get_index()) + ");\n"
        "        EigenArray1d<" + element_type_names[TI(et)] + ">(out, "
        EIGEN_VECTOR_FORMAT(outputs[0].get_layout<DenseTensorViewLayout>()->get_size()) ") =\n"
        "            EigenArray1d<" + element_type_names[TI(et)] + ">(arg0, "
        EIGEN_VECTOR_FORMAT(inputs[0].get_layout<DenseTensorViewLayout>()->get_size()) ").acos();\n"
        "    }\n";
}
void Emitter::EMITTER_DECL(EmitAtan)
{
    // Emits generated code for element-wise arctangent: out = atan(arg0),
    // applied over the flattened tensor via Eigen's 1-d array .atan().
    //
    // Fix: assert the dynamic_pointer_cast result before dereferencing it,
    // matching the validation style used by EmitSlice/EmitSum in this file.
    auto arg_tensor_view_type =
        dynamic_pointer_cast<const TensorViewType>(n->get_arguments().at(0)->get_value_type());
    assert(arg_tensor_view_type);
    const element::Type& et = arg_tensor_view_type->get_element_type();

    TU +=
        "    {\n"
        "        auto arg0 = call_frame->get_tensor_view_data<" + element_type_names[TI(et)] + ">(" +
        to_string(inputs[0].get_index()) + ");\n"
        "        auto out = call_frame->get_tensor_view_data<" + element_type_names[TI(et)] + ">(" +
        to_string(outputs[0].get_index()) + ");\n"
        "        EigenArray1d<" + element_type_names[TI(et)] + ">(out, "
        EIGEN_VECTOR_FORMAT(outputs[0].get_layout<DenseTensorViewLayout>()->get_size()) ") =\n"
        "            EigenArray1d<" + element_type_names[TI(et)] + ">(arg0, "
        EIGEN_VECTOR_FORMAT(inputs[0].get_layout<DenseTensorViewLayout>()->get_size()) ").atan();\n"
        "    }\n";
}
...@@ -61,6 +61,7 @@ namespace ngraph ...@@ -61,6 +61,7 @@ namespace ngraph
void EMITTER_DECL(EmitLessEq); void EMITTER_DECL(EmitLessEq);
void EMITTER_DECL(EmitLog); void EMITTER_DECL(EmitLog);
void EMITTER_DECL(EmitMaximum); void EMITTER_DECL(EmitMaximum);
void EMITTER_DECL(EmitMinimum);
void EMITTER_DECL(EmitNegative); void EMITTER_DECL(EmitNegative);
void EMITTER_DECL(EmitNotEqual); void EMITTER_DECL(EmitNotEqual);
void EMITTER_DECL(EmitSelect); void EMITTER_DECL(EmitSelect);
...@@ -75,6 +76,23 @@ namespace ngraph ...@@ -75,6 +76,23 @@ namespace ngraph
void EMITTER_DECL(EmitParameterizedConstantUInt64); void EMITTER_DECL(EmitParameterizedConstantUInt64);
void EMITTER_DECL(EmitBroadcast); void EMITTER_DECL(EmitBroadcast);
void EMITTER_DECL(EmitConvert); void EMITTER_DECL(EmitConvert);
void EMITTER_DECL(EmitConstant);
void EMITTER_DECL(EmitReshape);
void EMITTER_DECL(EmitFunctionCall);
void EMITTER_DECL(EmitReduce);
void EMITTER_DECL(EmitSign);
void EMITTER_DECL(EmitSlice);
void EMITTER_DECL(EmitSum);
void EMITTER_DECL(EmitExp);
void EMITTER_DECL(EmitSin);
void EMITTER_DECL(EmitSinh);
void EMITTER_DECL(EmitCos);
void EMITTER_DECL(EmitCosh);
void EMITTER_DECL(EmitTan);
void EMITTER_DECL(EmitTanh);
void EMITTER_DECL(EmitAsin);
void EMITTER_DECL(EmitAcos);
void EMITTER_DECL(EmitAtan);
}; };
} }
} }
......
...@@ -27,14 +27,20 @@ ...@@ -27,14 +27,20 @@
#include "ngraph/function.hpp" #include "ngraph/function.hpp"
#include "ngraph/node.hpp" #include "ngraph/node.hpp"
#include "ngraph/ops/abs.hpp" #include "ngraph/ops/abs.hpp"
#include "ngraph/ops/acos.hpp"
#include "ngraph/ops/add.hpp" #include "ngraph/ops/add.hpp"
#include "ngraph/ops/asin.hpp"
#include "ngraph/ops/atan.hpp"
#include "ngraph/ops/broadcast.hpp" #include "ngraph/ops/broadcast.hpp"
#include "ngraph/ops/concatenate.hpp" #include "ngraph/ops/concatenate.hpp"
#include "ngraph/ops/constant.hpp" #include "ngraph/ops/constant.hpp"
#include "ngraph/ops/convert.hpp" #include "ngraph/ops/convert.hpp"
#include "ngraph/ops/cos.hpp"
#include "ngraph/ops/cosh.hpp"
#include "ngraph/ops/divide.hpp" #include "ngraph/ops/divide.hpp"
#include "ngraph/ops/dot.hpp" #include "ngraph/ops/dot.hpp"
#include "ngraph/ops/equal.hpp" #include "ngraph/ops/equal.hpp"
#include "ngraph/ops/exp.hpp"
#include "ngraph/ops/function_call.hpp" #include "ngraph/ops/function_call.hpp"
#include "ngraph/ops/get_tuple_element.hpp" #include "ngraph/ops/get_tuple_element.hpp"
#include "ngraph/ops/greater.hpp" #include "ngraph/ops/greater.hpp"
...@@ -43,12 +49,21 @@ ...@@ -43,12 +49,21 @@
#include "ngraph/ops/less_eq.hpp" #include "ngraph/ops/less_eq.hpp"
#include "ngraph/ops/log.hpp" #include "ngraph/ops/log.hpp"
#include "ngraph/ops/maximum.hpp" #include "ngraph/ops/maximum.hpp"
#include "ngraph/ops/minimum.hpp"
#include "ngraph/ops/multiply.hpp" #include "ngraph/ops/multiply.hpp"
#include "ngraph/ops/negative.hpp" #include "ngraph/ops/negative.hpp"
#include "ngraph/ops/not_equal.hpp" #include "ngraph/ops/not_equal.hpp"
#include "ngraph/ops/reduce.hpp" #include "ngraph/ops/reduce.hpp"
#include "ngraph/ops/reshape.hpp"
#include "ngraph/ops/select.hpp" #include "ngraph/ops/select.hpp"
#include "ngraph/ops/sign.hpp"
#include "ngraph/ops/sin.hpp"
#include "ngraph/ops/sinh.hpp"
#include "ngraph/ops/slice.hpp"
#include "ngraph/ops/subtract.hpp" #include "ngraph/ops/subtract.hpp"
#include "ngraph/ops/sum.hpp"
#include "ngraph/ops/tan.hpp"
#include "ngraph/ops/tanh.hpp"
#include "ngraph/ops/tuple.hpp" #include "ngraph/ops/tuple.hpp"
#include "ngraph/pass/assign_layout.hpp" #include "ngraph/pass/assign_layout.hpp"
#include "ngraph/pass/assign_tensors.hpp" #include "ngraph/pass/assign_tensors.hpp"
...@@ -84,6 +99,7 @@ static const OpMap dispatcher{ ...@@ -84,6 +99,7 @@ static const OpMap dispatcher{
{TI(ngraph::op::LessEq), &Emitter::EmitLessEq}, {TI(ngraph::op::LessEq), &Emitter::EmitLessEq},
{TI(ngraph::op::Log), &Emitter::EmitLog}, {TI(ngraph::op::Log), &Emitter::EmitLog},
{TI(ngraph::op::Maximum), &Emitter::EmitMaximum}, {TI(ngraph::op::Maximum), &Emitter::EmitMaximum},
{TI(ngraph::op::Minimum), &Emitter::EmitMinimum},
{TI(ngraph::op::Negative), &Emitter::EmitNegative}, {TI(ngraph::op::Negative), &Emitter::EmitNegative},
{TI(ngraph::op::NotEqual), &Emitter::EmitNotEqual}, {TI(ngraph::op::NotEqual), &Emitter::EmitNotEqual},
{TI(ngraph::op::Select), &Emitter::EmitSelect}, {TI(ngraph::op::Select), &Emitter::EmitSelect},
...@@ -106,6 +122,23 @@ static const OpMap dispatcher{ ...@@ -106,6 +122,23 @@ static const OpMap dispatcher{
&Emitter::EmitParameterizedConstantUInt64}, &Emitter::EmitParameterizedConstantUInt64},
{TI(ngraph::op::Broadcast), &Emitter::EmitBroadcast}, {TI(ngraph::op::Broadcast), &Emitter::EmitBroadcast},
{TI(ngraph::op::Convert), &Emitter::EmitConvert}, {TI(ngraph::op::Convert), &Emitter::EmitConvert},
{TI(ngraph::op::Constant), &Emitter::EmitConstant},
{TI(ngraph::op::Reshape), &Emitter::EmitReshape},
{TI(ngraph::op::FunctionCall), &Emitter::EmitFunctionCall},
{TI(ngraph::op::Reduce), &Emitter::EmitReduce},
{TI(ngraph::op::Sign), &Emitter::EmitSign},
{TI(ngraph::op::Slice), &Emitter::EmitSlice},
{TI(ngraph::op::Sum), &Emitter::EmitSum},
{TI(ngraph::op::Exp), &Emitter::EmitExp},
{TI(ngraph::op::Sin), &Emitter::EmitSin},
{TI(ngraph::op::Sinh), &Emitter::EmitSinh},
{TI(ngraph::op::Cos), &Emitter::EmitCos},
{TI(ngraph::op::Cosh), &Emitter::EmitCosh},
{TI(ngraph::op::Tan), &Emitter::EmitTan},
{TI(ngraph::op::Tanh), &Emitter::EmitTanh},
{TI(ngraph::op::Asin), &Emitter::EmitAsin},
{TI(ngraph::op::Acos), &Emitter::EmitAcos},
{TI(ngraph::op::Atan), &Emitter::EmitAtan},
}; };
#undef TI #undef TI
...@@ -174,7 +207,9 @@ void ExternalFunction::compile(FunctionMap& function_map) ...@@ -174,7 +207,9 @@ void ExternalFunction::compile(FunctionMap& function_map)
// Now we build the TU // Now we build the TU
Emitter emitter; Emitter emitter;
auto& TU = emitter.GetTU(); auto& TU = emitter.GetTU();
TU += R"( TU += R"(// Generated by the NGraph CPU backend
#include <algorithm>
#include <cmath>
#include <memory> #include <memory>
#include <vector> #include <vector>
...@@ -182,17 +217,18 @@ void ExternalFunction::compile(FunctionMap& function_map) ...@@ -182,17 +217,18 @@ void ExternalFunction::compile(FunctionMap& function_map)
#include "ngraph/descriptor/layout/dense_tensor_view_layout.hpp" #include "ngraph/descriptor/layout/dense_tensor_view_layout.hpp"
#include "ngraph/runtime/cpu/call_frame.hpp" #include "ngraph/runtime/cpu/call_frame.hpp"
#include "ngraph/runtime/cpu/cpu_kernels.hpp"
#include "ngraph/runtime/cpu/eigen_utils.hpp" #include "ngraph/runtime/cpu/eigen_utils.hpp"
#include "ngraph/runtime/tensor_view_info.hpp" #include "ngraph/runtime/utils.hpp"
void *__dso_handle = 0;
using namespace ngraph::element; using namespace ngraph::element;
using namespace ngraph::runtime; using namespace ngraph::runtime;
using namespace ngraph::runtime::cpu::eigen; using namespace ngraph::runtime::cpu::eigen;
extern "C" void __entrypoint(ngraph::runtime::cpu::CallFrame* call_frame, extern "C" void __entrypoint(ngraph::runtime::cpu::CallFrame* call_frame,
ngraph::runtime::TensorViewPtrs& tensor_views) ngraph::runtime::TensorViewPtrs& tensor_views,
const std::vector<std::shared_ptr<ngraph::runtime::cpu::CallFrame>>& callees)
{ {
)"; )";
...@@ -243,8 +279,10 @@ extern "C" void __entrypoint(ngraph::runtime::cpu::CallFrame* call_frame, ...@@ -243,8 +279,10 @@ extern "C" void __entrypoint(ngraph::runtime::cpu::CallFrame* call_frame,
assert(llvm_module); assert(llvm_module);
estate.add_module(llvm_module); estate.add_module(llvm_module);
estate.finalize(); estate.finalize();
compiled_function = estate.find_function<void( compiled_function =
ngraph::runtime::cpu::CallFrame*, ngraph::runtime::TensorViewPtrs&)>("__entrypoint"); estate.find_function<void(ngraph::runtime::cpu::CallFrame*,
ngraph::runtime::TensorViewPtrs&,
const std::vector<std::shared_ptr<CallFrame>>&)>("__entrypoint");
assert(compiled_function); assert(compiled_function);
m_is_compiled = true; m_is_compiled = true;
...@@ -322,5 +360,5 @@ shared_ptr<ngraph::runtime::CallFrame> ExternalFunction::make_call_frame() ...@@ -322,5 +360,5 @@ shared_ptr<ngraph::runtime::CallFrame> ExternalFunction::make_call_frame()
#undef M #undef M
} }
return make_shared<ngraph::runtime::cpu::CallFrame>( return make_shared<ngraph::runtime::cpu::CallFrame>(
compiled_function, m_n_outputs, m_n_inputs, temps); compiled_function, m_n_outputs, m_n_inputs, temps, callees);
} }
...@@ -47,8 +47,10 @@ namespace ngraph ...@@ -47,8 +47,10 @@ namespace ngraph
using OpMap = std::unordered_map<std::type_index, OpFunction>; using OpMap = std::unordered_map<std::type_index, OpFunction>;
using EntryPoint = std::function<void(ngraph::runtime::cpu::CallFrame*, using EntryPoint = std::function<void(
ngraph::runtime::TensorViewPtrs&)>; ngraph::runtime::cpu::CallFrame*,
ngraph::runtime::TensorViewPtrs&,
const std::vector<std::shared_ptr<ngraph::runtime::cpu::CallFrame>>&)>;
class ExternalFunction : public ngraph::runtime::ExternalFunction class ExternalFunction : public ngraph::runtime::ExternalFunction
{ {
...@@ -56,7 +58,7 @@ namespace ngraph ...@@ -56,7 +58,7 @@ namespace ngraph
ExternalFunction(const std::shared_ptr<ngraph::Function>& function, ExternalFunction(const std::shared_ptr<ngraph::Function>& function,
bool release_function = true); bool release_function = true);
std::shared_ptr<ngraph::runtime::CallFrame> make_call_frame(); std::shared_ptr<ngraph::runtime::CallFrame> make_call_frame();
std::vector<std::shared_ptr<CallFrame>>& get_callees() { return callees; }
protected: protected:
void compile(FunctionMap& function_map); void compile(FunctionMap& function_map);
...@@ -64,6 +66,7 @@ namespace ngraph ...@@ -64,6 +66,7 @@ namespace ngraph
size_t m_n_outputs; size_t m_n_outputs;
ngraph::descriptor::TensorViewPtrs m_temp_views; ngraph::descriptor::TensorViewPtrs m_temp_views;
EntryPoint compiled_function; EntryPoint compiled_function;
std::vector<std::shared_ptr<CallFrame>> callees;
}; };
} }
} }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment