Merge branch 'master' into master

b3d70927 · Scott Cyphers · GitHub · c0b0bf8f · a4b9e6b7 · b3d70927
Unverified Commit b3d70927 authored Dec 01, 2018 by Scott Cyphers Committed by GitHub Dec 01, 2018
23 changed files
--- a/README.md
+++ b/README.md
-# nGraph Compiler Stack
+# nGraph Compiler Stack (Beta)

 [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://github.com/NervanaSystems/ngraph/blob/master/LICENSE) [![Build Status][build-status-badge]][build-status] 

@@ -16,12 +16,12 @@ workloads on CPU for inference, please refer to the links below.

 |  Framework (Version)       | Installation guide                     | Notes  
 |----------------------------|----------------------------------------|-----------------------------------
-| TensorFlow* 1.12           | [Pip package] or [Build from source]   | 17 [Validated workloads]
-| MXNet* 1.4                 | [Enable the module] or [Source compile]| 17 [Validated workloads]   
-| ONNX 1.3                   | [Pip package]                          | 14 [Validated workloads] 
+| TensorFlow* 1.12           | [Pip install](https://github.com/NervanaSystems/ngraph-tf) or [Build from source](https://github.com/NervanaSystems/ngraph-tf)   | 20 [Validated workloads]
+| MXNet* 1.3                 | [Pip install](https://github.com/NervanaSystems/ngraph-mxnet#Installation) or [Build from source](https://github.com/NervanaSystems/ngraph-mxnet#building-with-ngraph-support)| 18 [Validated workloads]   
+| ONNX 1.3                   | [Pip install](https://github.com/NervanaSystems/ngraph-onnx#installation)                          | 14 [Validated workloads] 

 Frameworks using nGraph Compiler stack to execute workloads have shown 
-**up to 45X** performance boost when compared to native framework 
+[**up to 45X**](https://ai.intel.com/ngraph-compiler-stack-beta-release/) performance boost when compared to native framework 
 implementations. We've also seen performance boosts running workloads that 
 are not included on the list of [Validated workloads], thanks to our 
 powerful subgraph pattern matching.
@@ -100,9 +100,6 @@ to improve it:
 [develop-without-lockin]: doc/sphinx/source/graphics/develop-without-lockin.png "Develop on any part of the stack without lockin"
 [Movidius™ Myriad™ 2]:https://www.movidius.com/solutions/vision-processing-unit
 [PlaidML]: https://github.com/plaidml/plaidml
-[Pip package]: https://github.com/NervanaSystems/ngraph-onnx#installing-ngraph-onnx
-[Build from source]: https://github.com/NervanaSystems/ngraph-tf
-[Enable the module]: https://github.com/NervanaSystems/ngraph/blob/mbrookhart/mxnet_tutorial/doc/sphinx/source/shared/mxnet_tutorial.rst
 [Source compile]: https://github.com/NervanaSystems/ngraph-mxnet/blob/master/README.md
 [nGraph-ONNX]: https://github.com/NervanaSystems/ngraph-onnx/blob/master/README.md
 [nGraph-ONNX adaptable]: https://ai.intel.com/adaptable-deep-learning-solutions-with-ngraph-compiler-and-onnx/

--- a/cmake/external_halide.cmake
+++ b/cmake/external_halide.cmake
@@ -45,6 +45,7 @@ ExternalProject_Add(
    GIT_REPOSITORY ${HALIDE_GIT_REPO_URL}
    GIT_TAG ${HALIDE_GIT_TAG}
    UPDATE_COMMAND ""
+    PATCH_COMMAND patch -p1 --forward --reject-file=- -i ${CMAKE_SOURCE_DIR}/cmake/halide.patch || exit 0
    CMAKE_ARGS
    -DLLVM_DIR=${HALIDE_LLVM_DIR}
    -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}

--- a/cmake/halide.patch
+++ b/cmake/halide.patch
+diff --git a/CMakeLists.txt b/CMakeLists.txt
+index d70fdc79d..60aa4c3b7 100644
+--- a/CMakeLists.txt
+++ b/CMakeLists.txt
+@@ -131,7 +131,8 @@ function(check_llvm_target TARGET HAS_TARGET)
+     set(_llvm_required_version ${ARGV2})
+   endif()
+   if (NOT LLVM_VERSION LESS _llvm_required_version)
+-    list(FIND LLVM_TARGETS_TO_BUILD ${TARGET} _found_target)
++    set(NGRAPH_TARGETS_TO_BUILD "X86")
++    list(FIND NGRAPH_TARGETS_TO_BUILD ${TARGET} _found_target)
+     if (_found_target GREATER -1)
+       set(${HAS_TARGET} ON PARENT_SCOPE)
+     else()
--- a/doc/sphinx/ngraph_theme/static/css/theme.css
+++ b/doc/sphinx/ngraph_theme/static/css/theme.css
@@ -1634,7 +1634,7 @@ body {
  color: #38403f;
  min-height: 100%;
  overflow-x: hidden;
-  background: #edf0f2;
+  background: #fcfcfc;
 }

 .wy-text-left {
@@ -3193,7 +3193,7 @@ footer span.commit code, footer span.commit .rst-content tt, .rst-content footer
 }
 @media screen and (min-width: 1400px) {
  .wy-nav-content-wrap {
-    background: #0C7881;
+    background: #fcfcfc;
  }

  .wy-nav-content {

--- a/doc/sphinx/source/conf.py
+++ b/doc/sphinx/source/conf.py
@@ -73,9 +73,11 @@ author = 'Intel Corporation'
 # built documents.
 #
 # The short X.Y version.
-version = '0.9'
-# The full version, including alpha/beta/rc tags.
-release = '0.9.0'
+version = '0.10'
+# The full documentation version, including alpha/beta/rc tags. Some features
+# available in the latest code may not be documented yet.
+release = '0.10.1'
+

 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.

--- a/doc/sphinx/source/framework-integration-guides.rst
+++ b/doc/sphinx/source/framework-integration-guides.rst
@@ -50,4 +50,4 @@ nGraph-TensorFlow bridge.
 .. _MXNet: http://mxnet.incubator.apache.org
 .. _DSO: http://csweb.cs.wfu.edu/%7Etorgerse/Kokua/More_SGI/007-2360-010/sgi_html/ch03.html
 .. _being the fastest: https://github.com/soumith/convnet-benchmarks
-.. _ngraph tensorflow bridge README: https://github.com/NervanaSystems/ngraph-tf
+.. _ngraph tensorflow bridge README: https://github.com/NervanaSystems/ngraph-tf/blob/master/README.md
--- a/doc/sphinx/source/frameworks/validation-testing.rst
+++ b/doc/sphinx/source/frameworks/validation-testing.rst
@@ -15,19 +15,22 @@ TensorFlow
   :widths: 27, 53
   :escape: ~

-   Resnet50 v1 and v2, Image recognition
-   Inception V3 and V4, Image recognition
+   Resnet50 v1, Image recognition
+   Resnet50 v2, Image recognition
+   Inception V3, Image recognition
+   Inception V4, Image recognition
   Inception-ResNetv2, Image recognition
   MobileNet v1, Image recognition
-   SqueezeNet v1.1, Image recognition
-   DenseNet-121, Image recognition
+   MobileNet v2, Image recognition
+   VGG16, Image recognition
   SSD-VGG16, Object detection
   SSD-MobileNetv1, Object detection
+   R-FCN, Object detection
   Faster RCNN, Object detection
   Yolo v2, Object detection
+   Transformer-LT, Language translation
   Wide & Deep, Recommender system
   NCF, Recommender system
-   WaveNet, Speech generation
   U-Net, Image segmentation
   DCGAN, Generative adversarial network
   DRAW, Image generation
@@ -41,7 +44,8 @@ MXNet
   :widths: 27, 53
   :escape: ~

-   Resnet50 v1 and v2, Image recognition
+   Resnet50 v1, Image recognition
+   Resnet50 v2, Image recognition
   DenseNet-121, Image recognition
   InceptionV3, Image recognition
   InceptionV4, Image recognition
@@ -70,10 +74,10 @@ Additionally, we validated the following workloads are functional through nGraph
   :widths: 27, 53
   :escape: ~

+   ResNet-50, Image recognition
   DenseNet-121, Image recognition
   Inception-v1, Image recognition
   Inception-v2, Image recognition
-   ResNet-50, Image recognition
   Shufflenet, Image recognition
   SqueezeNet, Image recognition
   VGG-19, Image recognition

--- a/licenses/notifications.txt
+++ b/licenses/notifications.txt
+The MPL 2.0 license used by the eigen library used by this ngraph core
+component requires distribution of the following information:
+
+          Eigen source code can be viewed or downloaded from here:
+          http://eigen.tuxfamily.org
--- a/python/setup.py
+++ b/python/setup.py
@@ -22,7 +22,7 @@ import os
 import distutils.ccompiler

 __version__ = os.environ.get('NGRAPH_VERSION', '0.0.0-dev')
-PYNGRAPH_SOURCE_DIR = os.path.abspath(os.path.dirname(__file__))
+PYNGRAPH_ROOT_DIR = os.path.abspath(os.path.dirname(__file__))
 NGRAPH_DEFAULT_INSTALL_DIR = os.environ.get('HOME')
 NGRAPH_ONNX_IMPORT_ENABLE = os.environ.get('NGRAPH_ONNX_IMPORT_ENABLE')

@@ -50,7 +50,7 @@ def find_pybind_headers_dir():
    if os.environ.get('PYBIND_HEADERS_PATH'):
        pybind_headers_dir = os.environ.get('PYBIND_HEADERS_PATH')
    else:
-        pybind_headers_dir = os.path.join(PYNGRAPH_SOURCE_DIR, 'pybind11')
+        pybind_headers_dir = os.path.join(PYNGRAPH_ROOT_DIR, 'pybind11')

    found = os.path.exists(os.path.join(pybind_headers_dir, 'include/pybind11'))
    if not found:
@@ -233,13 +233,13 @@ sources = [
 ]

 package_dir = {
-    'ngraph': PYNGRAPH_SOURCE_DIR + "/ngraph",
-    'ngraph.utils': PYNGRAPH_SOURCE_DIR + "/ngraph/utils",
-    'ngraph.impl': PYNGRAPH_SOURCE_DIR + "/ngraph/impl",
-    'ngraph.impl.op': PYNGRAPH_SOURCE_DIR + "/ngraph/impl/op",
-    'ngraph.impl.op.util': PYNGRAPH_SOURCE_DIR + "/ngraph/impl/op/util",
-    'ngraph.impl.passes': PYNGRAPH_SOURCE_DIR + "/ngraph/impl/passes",
-    'ngraph.impl.runtime': PYNGRAPH_SOURCE_DIR + "/ngraph/impl/runtime",
+    'ngraph': PYNGRAPH_ROOT_DIR + "/ngraph",
+    'ngraph.utils': PYNGRAPH_ROOT_DIR + "/ngraph/utils",
+    'ngraph.impl': PYNGRAPH_ROOT_DIR + "/ngraph/impl",
+    'ngraph.impl.op': PYNGRAPH_ROOT_DIR + "/ngraph/impl/op",
+    'ngraph.impl.op.util': PYNGRAPH_ROOT_DIR + "/ngraph/impl/op/util",
+    'ngraph.impl.passes': PYNGRAPH_ROOT_DIR + "/ngraph/impl/passes",
+    'ngraph.impl.runtime': PYNGRAPH_ROOT_DIR + "/ngraph/impl/runtime",
 }
 packages = [
    'ngraph',
@@ -251,9 +251,9 @@ packages = [
    'ngraph.impl.runtime',
 ]

-sources = [PYNGRAPH_SOURCE_DIR + "/" + source for source in sources]
+sources = [PYNGRAPH_ROOT_DIR + "/" + source for source in sources]

-include_dirs = [PYNGRAPH_SOURCE_DIR, NGRAPH_CPP_INCLUDE_DIR, PYBIND11_INCLUDE_DIR]
+include_dirs = [PYNGRAPH_ROOT_DIR, NGRAPH_CPP_INCLUDE_DIR, PYBIND11_INCLUDE_DIR]

 library_dirs = [NGRAPH_CPP_LIBRARY_DIR]

@@ -274,13 +274,13 @@ data_files = [
    (
        'licenses',
        [
-            PYNGRAPH_SOURCE_DIR + "/../licenses/" + license
-            for license in os.listdir(PYNGRAPH_SOURCE_DIR + "/../licenses")
+            PYNGRAPH_ROOT_DIR + "/../licenses/" + license
+            for license in os.listdir(PYNGRAPH_ROOT_DIR + "/../licenses")
        ],
    ),
    (
        '',
-        [PYNGRAPH_SOURCE_DIR + "/../LICENSE"],
+        [PYNGRAPH_ROOT_DIR + "/../LICENSE"],
    )
 ]

@@ -302,10 +302,10 @@ if NGRAPH_ONNX_IMPORT_ENABLE == 'TRUE':
        'pyngraph/pyngraph_onnx_import.cpp',
        'pyngraph/onnx_import/onnx_import.cpp',
    ]
-    onnx_sources = [PYNGRAPH_SOURCE_DIR + "/" + source for source in onnx_sources]
+    onnx_sources = [PYNGRAPH_ROOT_DIR + "/" + source for source in onnx_sources]

    package_dir['ngraph.impl.onnx_import'] = (
-        PYNGRAPH_SOURCE_DIR + "/ngraph/impl/onnx_import"
+        PYNGRAPH_ROOT_DIR + "/ngraph/impl/onnx_import"
    )
    packages.append('ngraph.impl.onnx_import')

@@ -360,17 +360,17 @@ class BuildExt(build_ext):
        build_ext.build_extensions(self)


-with open(os.path.join(PYNGRAPH_SOURCE_DIR, 'requirements.txt')) as req:
+with open(os.path.join(PYNGRAPH_ROOT_DIR, 'requirements.txt')) as req:
    requirements = req.read().splitlines()

 setup(
    name='ngraph-core',
+    description=open(os.path.join(PYNGRAPH_ROOT_DIR, 'README.md')).read(),
    version=__version__,
    author='Intel',
    author_email='intelnervana@intel.com',
    url='https://ai.intel.com/',
    license='License :: OSI Approved :: Apache Software License',
-    description='Python API for nGraph',
    long_description='',
    ext_modules=ext_modules,
    package_dir=package_dir,

--- a/src/ngraph/log.cpp
+++ b/src/ngraph/log.cpp
@@ -28,26 +28,6 @@
 using namespace std;
 using namespace ngraph;

-namespace
-{
-    class NilStreamBuf final : public streambuf
-    {
-        // N.B. We derive from the base streambuf implementation, in
-        //      which underflow() and overflow() both return
-        //      Traits::eof() -- any access returns a failure.
-    };
-}
-
-ostream& ngraph::get_nil_stream()
-{
-    // N.B. When debug logging is disabled, multiple threads may
-    //      access the nil stream simultaneously, so it's important to
-    //      return a threadsafe nil stream implementation.
-    static NilStreamBuf nil_buf;
-    static ostream nil{&nil_buf};
-    return nil;
-}
-
 void ngraph::default_logger_handler_func(const string& s)
 {
    cout << s << endl;

--- a/src/ngraph/log.hpp
+++ b/src/ngraph/log.hpp
@@ -100,8 +100,6 @@ namespace ngraph
        static std::deque<std::string> m_queue;
    };

-    extern std::ostream& get_nil_stream();
-
    void default_logger_handler_func(const std::string& s);

 #define NGRAPH_ERR                                                                                 \
@@ -133,6 +131,33 @@ namespace ngraph
                      ngraph::default_logger_handler_func)                                         \
        .stream()
 #else
-#define NGRAPH_DEBUG ngraph::get_nil_stream()
+
+    struct NullLogger // Empty sink type; NGRAPH_DEBUG below expands to a temporary of it.
+    {
+    };
+
+    template <typename T> // Takes any operand, ignores it, returns the logger so << can chain.
+    NullLogger&& operator<<(NullLogger&& logger, T&&)
+    {
+        return std::move(logger);
+    }
+
+    template <typename T> // Same no-op for operands taken by const reference.
+    NullLogger&& operator<<(NullLogger&& logger, const T&)
+    {
+        return std::move(logger);
+    }
+
+    inline NullLogger&& // No-op for functions of the form ostream& f(ostream&), such as std::endl.
+        operator<<(NullLogger&& logger,
+                   std::basic_ostream<char, std::char_traits<char>>& (&)(std::basic_ostream<
+                                                                         char,
+                                                                         std::char_traits<char>>&))
+    {
+        return std::move(logger);
+    }
+
+#define NGRAPH_DEBUG                                                                               \
+    ::ngraph::NullLogger {}
 #endif
 }
--- a/src/ngraph/op/experimental/quantized_conv_relu.cpp
+++ b/src/ngraph/op/experimental/quantized_conv_relu.cpp
-/*******************************************************************************
-* Copyright 2017-2018 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*     http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*******************************************************************************/
+//*****************************************************************************
+// Copyright 2017-2018 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//*****************************************************************************

 #include <numeric>


--- a/src/ngraph/runtime/cpu/unit_test.manifest
+++ b/src/ngraph/runtime/cpu/unit_test.manifest
@@ -16,3 +16,6 @@ quantize_clamp_int32

 # failing in CI build but passing on local machine
 max_3d_to_scalar_int32
+
+argmin_trivial_in_i32
+argmax_4D_axis_3_i64_in_i32
--- a/src/ngraph/runtime/gpu/cudnn_emitter.cpp
+++ b/src/ngraph/runtime/gpu/cudnn_emitter.cpp
--- a/src/ngraph/runtime/intelgpu/unit_test.manifest
+++ b/src/ngraph/runtime/intelgpu/unit_test.manifest
@@ -134,6 +134,7 @@ shape_of_vector
 shape_of_matrix
 shape_of_5d
 sum_stable_acc
+sum_trivial_in_double
 product_2d_to_scalar_int32
 product_to_scalar_int32
 product_to_scalar_int8
@@ -141,3 +142,6 @@ max_matrix_rows_zero_int32
 max_to_scalar_int8
 min_to_scalar_int8
 max_3d_to_scalar_double
+argmin_trivial_in_i32
+argmax_4D_axis_3_i64_in_i32
+argmin_trivial_in_double
--- a/src/ngraph/runtime/plaidml/plaidml_backend.cpp
+++ b/src/ngraph/runtime/plaidml/plaidml_backend.cpp
@@ -52,11 +52,7 @@ bool ngraph::runtime::plaidml::PlaidML_Backend::call(
    const std::vector<std::shared_ptr<runtime::Tensor>>& outputs,
    const std::vector<std::shared_ptr<runtime::Tensor>>& inputs)
 {
-    auto cfunc = m_cache.try_lookup(func);
-    if (!cfunc)
-    {
-        cfunc = m_compiler.compile(func);
-    }
+    auto cfunc = m_cache.compile(func, &m_compiler);
    cfunc->schedule_invocation(inputs, outputs);
    return true;
 }

--- a/src/ngraph/runtime/plaidml/plaidml_compiled_function.cpp
+++ b/src/ngraph/runtime/plaidml/plaidml_compiled_function.cpp
@@ -42,19 +42,31 @@ bool ngraph::runtime::plaidml::CompiledFunction::schedule_invocation(

    NGRAPH_DEBUG << "Binding PlaidML function " << this;

+    m_bound_inputs.resize(inputs.size());
+    m_bound_outputs.resize(outputs.size());
+
    std::size_t input_count = 0;
    for (const auto& param : m_func->get_parameters())
    {
        for (std::size_t idx = 0; idx < param->get_output_size(); ++idx)
        {
            descriptor::Tensor* tv = param->get_output_tensor_ptr(idx).get();
-            auto rtv = dynamic_cast<PlaidML_Tensor*>(inputs[input_count++].get());
+            auto& input = inputs.at(input_count);
+            auto rtv = dynamic_cast<PlaidML_Tensor*>(input.get());
            if (!rtv)
            {
                throw std::runtime_error{
-                    "The PlaidML backend only operations on PlaidML tensor views"};
+                    "The PlaidML backend only operates on PlaidML tensor views"};
            }
            rtv->sync_input();
+            auto& bound_input = m_bound_inputs.at(input_count);
+            ++input_count;
+            if (bound_input.lock() == input)
+            {
+                // No need to re-bind this input.
+                continue;
+            }
+            bound_input = input;
            NGRAPH_DEBUG << "Binding input " << m_input_names.at(tv) << " to tensor " << rtv;
            m_invoker.set_input(m_input_names.at(tv), rtv->tensor());
        }
@@ -66,12 +78,21 @@ bool ngraph::runtime::plaidml::CompiledFunction::schedule_invocation(
        for (std::size_t idx = 0; idx < result->get_output_size(); ++idx)
        {
            descriptor::Tensor* tv = result->get_output_tensor_ptr(idx).get();
-            auto rtv = dynamic_cast<PlaidML_Tensor*>(outputs[output_count++].get());
+            auto& output = outputs.at(output_count);
+            auto rtv = dynamic_cast<PlaidML_Tensor*>(output.get());
            if (!rtv)
            {
                throw std::runtime_error{
-                    "The PlaidML backend only operations on PlaidML tensor views"};
+                    "The PlaidML backend only operates on PlaidML tensor views"};
+            }
+            auto& bound_output = m_bound_outputs.at(output_count);
+            ++output_count;
+            if (bound_output.lock() == output)
+            {
+                // No need to re-bind this output.
+                continue;
            }
+            bound_output = output;
            NGRAPH_DEBUG << "Binding output " << m_output_names.at(tv) << " to tensor " << rtv;
            m_invoker.set_output(m_output_names.at(tv), rtv->tensor());
        }
@@ -91,7 +112,7 @@ bool ngraph::runtime::plaidml::CompiledFunction::schedule_invocation(
            if (!rtv)
            {
                throw std::runtime_error{
-                    "The PlaidML backend only operations on PlaidML tensor views"};
+                    "The PlaidML backend only operates on PlaidML tensor views"};
            }
            rtv->sync_output();
        }

--- a/src/ngraph/runtime/plaidml/plaidml_compiled_function.hpp
+++ b/src/ngraph/runtime/plaidml/plaidml_compiled_function.hpp
@@ -58,5 +58,7 @@ private:
    std::shared_ptr<Function> m_func;
    std::unordered_map<descriptor::Tensor*, std::string> m_input_names;
    std::unordered_map<descriptor::Tensor*, std::string> m_output_names;
+    mutable std::vector<std::weak_ptr<runtime::Tensor>> m_bound_inputs;
+    mutable std::vector<std::weak_ptr<runtime::Tensor>> m_bound_outputs;
    mutable vertexai::plaidml::invoker m_invoker;
 };
--- a/src/ngraph/runtime/plaidml/plaidml_tensor.cpp
+++ b/src/ngraph/runtime/plaidml/plaidml_tensor.cpp
@@ -101,6 +101,11 @@ void ngraph::runtime::plaidml::PlaidML_Tensor::read(void* p, size_t tensor_offse

 void ngraph::runtime::plaidml::PlaidML_Tensor::sync_input()
 {
+    if (!get_stale())
+    {
+        return;
+    }
+    set_stale(false);
    if (!m_memory)
    {
        if (m_is_logically_zero)
@@ -122,6 +127,7 @@ void ngraph::runtime::plaidml::PlaidML_Tensor::sync_output()
 {
    // The tensor's been used for an output, so it's no longer logically zero.
    m_is_logically_zero = false;
+    set_stale(false);

    if (!m_memory)
    {

--- a/src/ngraph/runtime/plaidml/unit_test.manifest
+++ b/src/ngraph/runtime/plaidml/unit_test.manifest
@@ -26,12 +26,15 @@ topk_1d_max_one                         # No plans to implement TopK
 topk_1d_min_all                         # No plans to implement TopK
 topk_1d_min_partial                     # No plans to implement TopK
 topk_1d_min_one                         # No plans to implement TopK
+topk_3d_large_input_max                 # No plans to implement TopK
+topk_3d_large_input_min                 # No plans to implement TopK
 topk_3d_max_all                         # No plans to implement TopK
 topk_3d_max_partial                     # No plans to implement TopK
 topk_3d_max_one                         # No plans to implement TopK
 topk_3d_min_all                         # No plans to implement TopK
 topk_3d_min_partial                     # No plans to implement TopK
 topk_3d_min_one                         # No plans to implement TopK
+topk_3d_single_output                   # No plans to implement TopK
 topk_2d_max_all                         # No plans to implement TopK
 topk_2d_max_partial                     # No plans to implement TopK
 topk_2d_max_one                         # No plans to implement TopK
@@ -43,15 +46,21 @@ topk_5d_max_partial                     # No plans to implement TopK

 # Tests that PlaidML might be able to run at some point.
 backwards_maxpool_n2_c1_hw5_3x3_str2_max_pad1x2_2x3
+backwards_maxpool_n4c1h4w4_kh2kw2_sh1sw1
+backwards_maxpool_n2c1h5w5_kh3kw3_sh2sw2
+backwards_maxpool_n4_c1_hw4_2x2_max
+backwards_maxpool_n2_c1_hw5_3x3_str2_max
 backwards_slice
 batchnorm_fprop_bprop  # To debug
 batchnorm_fprop_bprop_2step  # To debug
+softmax_axis_3d_double  # To debug
 reduce_matrix_rows_zero  # To debug: possible broadcasting error?
 reduce_matrix_cols_zero  # To debug: possible broadcasting error?
 reduce_3d_to_vector  # To debug: possible broadcasting error?
 replace_slice_matrix_inplace
 max_pool_2d_1channel_1image_overpadded
 max_pool_3d
+maxpool_bprop_larger_than_cache
 reduce_window_emulating_max_pool_1d_1channel_1image
 reduce_window_emulating_max_pool_1d_1channel_2image
 reduce_window_emulating_max_pool_1d_2channel_2image
@@ -60,31 +69,49 @@ reduce_window_emulating_max_pool_2d_1channel_1image_strided
 select_and_scatter_with_overlap
 select_and_scatter_without_overlap
 select_and_scatter_3d_without_overlap
+generate_mask
 avg_pool_3d
 avg_pool_3d_uneven_strided_padded_include_in_computation
-dequantize_zero_offset              # Quantization/Dequantization is unimplemented
-quantize_ROUND_NEAREST_TOWARD_ZERO  # Quantization/Dequantization is unimplemented
-quantize_ROUND_NEAREST_UPWARD       # Quantization/Dequantization is unimplemented
-quantize_ROUND_NEAREST_DOWNWARD     # Quantization/Dequantization is unimplemented
-quantize_ROUND_NEAREST_TOWARD_EVEN  # Quantization/Dequantization is unimplemented
-quantize_ROUND_TOWARD_INFINITY      # Quantization/Dequantization is unimplemented
-quantize_ROUND_TOWARD_ZERO          # Quantization/Dequantization is unimplemented
-quantize_ROUND_UP                   # Quantization/Dequantization is unimplemented
-quantize_ROUND_DOWN                 # Quantization/Dequantization is unimplemented
-quantize                            # Quantization/Dequantization is unimplemented
-quantize_axes                       # Quantization/Dequantization is unimplemented
-quantize_int8                       # Quantization/Dequantization is unimplemented
-quantize_clamp                      # Quantization/Dequantization is unimplemented
-dequantize                          # Quantization/Dequantization is unimplemented
-dequantize_axes                     # Quantization/Dequantization is unimplemented
-dequantize_int8                     # Quantization/Dequantization is unimplemented
-sum_matrix_rows_zero                # Empty dims apparently should produce shaped 0s
-sum_matrix_cols_zero                # Empty dims apparently should produce shaped 0s
-sum_vector_zero                     # Empty dims apparently should produce shaped 0s
-sum_matrix_to_scalar_zero_by_zero   # Empty dims apparently should produce shaped 0s
-sum_3d_eliminate_zero_dim           # Empty dims apparently should produce shaped 0s
-dot_0_0                             # Empty dims apparently should produce shaped 0s
-dot_matrix_2x0_0x2                  # Empty dims apparently should produce shaped 0s
-dot_2x0_0                           # Empty dims apparently should produce shaped 0s
+dequantize_int8_zero_offset             # Quantization/Dequantization is unimplemented
+dequantize_int32                        # Quantization/Dequantization is unimplemented
+dequantize_int32_zero_offset            # Quantization/Dequantization is unimplemented
+dequantize_zero_offset                  # Quantization/Dequantization is unimplemented
+quantize_ROUND_NEAREST_TOWARD_ZERO      # Quantization/Dequantization is unimplemented
+quantize_ROUND_NEAREST_UPWARD           # Quantization/Dequantization is unimplemented
+quantize_ROUND_NEAREST_DOWNWARD         # Quantization/Dequantization is unimplemented
+quantize_ROUND_NEAREST_TOWARD_EVEN      # Quantization/Dequantization is unimplemented
+quantize_ROUND_NEAREST_TOWARD_INFINITY  # Quantization/Dequantization is unimplemented
+quantize_ROUND_TOWARD_INFINITY          # Quantization/Dequantization is unimplemented
+quantize_ROUND_TOWARD_ZERO              # Quantization/Dequantization is unimplemented
+quantize_ROUND_UP                       # Quantization/Dequantization is unimplemented
+quantize_ROUND_DOWN                     # Quantization/Dequantization is unimplemented
+quantize                                # Quantization/Dequantization is unimplemented
+quantize_zero_offset                    # Quantization/Dequantization is unimplemented
+quantize_axes                           # Quantization/Dequantization is unimplemented
+quantize_int8                           # Quantization/Dequantization is unimplemented
+quantize_int8_zero_offset               # Quantization/Dequantization is unimplemented
+quantize_int32                          # Quantization/Dequantization is unimplemented
+quantize_int32_zero_offset              # Quantization/Dequantization is unimplemented
+quantize_clamp                          # Quantization/Dequantization is unimplemented
+quantize_clamp_int8                     # Quantization/Dequantization is unimplemented
+quantize_clamp_int32                    # Quantization/Dequantization is unimplemented
+quantize_clamp_int32_zero_offset        # Quantization/Dequantization is unimplemented
+quantize_clamp_uint8                    # Quantization/Dequantization is unimplemented
+dequantize                              # Quantization/Dequantization is unimplemented
+dequantize_axes                         # Quantization/Dequantization is unimplemented
+dequantize_int8                         # Quantization/Dequantization is unimplemented
+sum_matrix_rows_zero                    # Empty dims apparently should produce shaped 0s
+sum_matrix_cols_zero                    # Empty dims apparently should produce shaped 0s
+sum_vector_zero                         # Empty dims apparently should produce shaped 0s
+sum_matrix_to_scalar_zero_by_zero       # Empty dims apparently should produce shaped 0s
+sum_3d_eliminate_zero_dim               # Empty dims apparently should produce shaped 0s
+dot_0_0                                 # Empty dims apparently should produce shaped 0s
+dot_matrix_2x0_0x2                      # Empty dims apparently should produce shaped 0s
+dot_2x0_0                               # Empty dims apparently should produce shaped 0s
 numeric_float_nan
 numeric_double_nan
+shape_of_scalar
+shape_of_vector
+shape_of_matrix
+shape_of_5d
+
--- a/src/tools/ngraph-to-plaidml/ngraph-to-plaidml.cpp
+++ b/src/tools/ngraph-to-plaidml/ngraph-to-plaidml.cpp
-/*******************************************************************************
-* Copyright 2018 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*     http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*******************************************************************************/
+//*****************************************************************************
+// Copyright 2017-2018 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//*****************************************************************************

 #include <getopt.h>


--- a/test/backend_arg_reduce.in.cpp
+++ b/test/backend_arg_reduce.in.cpp
@@ -311,3 +311,82 @@ NGRAPH_TEST(${BACKEND_NAME}, argmax_4D_axis_3)
                   .get_vector()),
              read_vector<int>(result));
 }
+
+NGRAPH_TEST(${BACKEND_NAME}, argmin_trivial_in_i32) // ArgMin on an i32 input with i32 indices.
+{
+    Shape shape{4, 3}; // input: 4 x 3
+    Shape rshape{3};   // result: one index per column
+    auto A = make_shared<op::Parameter>(element::i32, shape);
+    auto f = make_shared<Function>(make_shared<op::ArgMin>(A, 0, element::i32), ParameterVector{A});

+    auto backend = runtime::Backend::create("${BACKEND_NAME}");

+    // Create some tensors for input/output
+    auto a = backend->create_tensor(element::i32, shape);
+    copy_data(a, vector<int32_t>{12, 2, 10, 9, 8, 4, 6, 1, 5, 3, 11, 7});
+    auto result = backend->create_tensor(element::i32, rshape);

+    backend->call_with_validate(f, {result}, {a});
+    EXPECT_EQ((vector<int>{3, 2, 1}), read_vector<int>(result)); // row of each column's minimum, assuming row-major data
+}
+
+NGRAPH_TEST(${BACKEND_NAME}, argmax_4D_axis_3_i64_in_i32) // ArgMax on an i32 input with i64 indices.
+{
+    Shape shape{2, 2, 5, 5}; // NCHW ->(0,1,2,3)
+    Shape rshape{2, 2, 5}; // N, C, H: the last input dimension (W) is dropped
+    auto A = make_shared<op::Parameter>(element::i32, shape);
+    auto f = make_shared<Function>(make_shared<op::ArgMax>(A, 3, element::i64), ParameterVector{A});
+    auto backend = runtime::Backend::create("${BACKEND_NAME}");
+    // Create some tensors for input/output
+    auto a = backend->create_tensor(element::i32, shape);
+    copy_data(a,
+              test::NDArray<int32_t, 4>({{{{0, 1, 0, 2, 1}, // img 0 ch 0
+                                           {0, 3, 2, 0, 0},
+                                           {2, 0, 0, 0, 1},
+                                           {2, 0, 1, 1, 2},
+                                           {0, 2, 1, 0, 0}},

+                                          {{0, 0, 0, 2, 0}, // img 0 ch 1
+                                           {0, 2, 3, 0, 1},
+                                           {2, 0, 1, 0, 2},
+                                           {3, 1, 0, 0, 0},
+                                           {2, 0, 0, 0, 0}}},

+                                         {{{0, 2, 1, 1, 0}, // img 1 ch 0
+                                           {0, 0, 2, 0, 1},
+                                           {0, 0, 1, 2, 3},
+                                           {2, 0, 0, 3, 0},
+                                           {0, 0, 0, 0, 0}},

+                                          {{2, 1, 0, 0, 1}, // img 1 ch 1
+                                           {0, 2, 0, 0, 0},
+                                           {1, 1, 2, 0, 2},
+                                           {1, 1, 1, 0, 1},
+                                           {1, 0, 0, 0, 2}}}})
+                  .get_vector());
+    auto result = backend->create_tensor(element::i64, rshape);

+    backend->call_with_validate(f, {result}, {a});
+    EXPECT_EQ((test::NDArray<int64_t, 3>({{{3, 1, 0, 0, 1}, {3, 2, 0, 0, 0}},  // img 0 (ch 0, ch 1)
+                                          {{1, 2, 4, 3, 0}, {0, 1, 2, 0, 4}}}) // img 1 (ch 0, ch 1)
+                   .get_vector()),
+              read_vector<int64_t>(result));
+}
+
+NGRAPH_TEST(${BACKEND_NAME}, argmin_trivial_in_double) // ArgMin on an f64 input with i32 indices.
+{
+    Shape shape{4, 3}; // input: 4 x 3
+    Shape rshape{3};   // result: one index per column
+    auto A = make_shared<op::Parameter>(element::f64, shape);
+    auto f = make_shared<Function>(make_shared<op::ArgMin>(A, 0, element::i32), ParameterVector{A});

+    auto backend = runtime::Backend::create("${BACKEND_NAME}");

+    // Create some tensors for input/output
+    auto a = backend->create_tensor(element::f64, shape);
+    copy_data(a, vector<double>{12, 2, 10, 9, 8, 4, 6, 1, 5, 3, 11, 7});
+    auto result = backend->create_tensor(element::i32, rshape);

+    backend->call_with_validate(f, {result}, {a});
+    EXPECT_EQ((vector<int32_t>{3, 2, 1}), read_vector<int32_t>(result)); // row of each column's minimum, assuming row-major data
+}
--- a/test/backend_sum.in.cpp
+++ b/test/backend_sum.in.cpp
@@ -485,6 +485,24 @@ NGRAPH_TEST(${BACKEND_NAME}, sum_2d_to_scalar_int8)
    EXPECT_EQ(std::vector<int8_t>{45}, read_vector<int8_t>(result));
 }

+NGRAPH_TEST(${BACKEND_NAME}, sum_trivial_in_double) // Sum over AxisSet{0} of a 4 x 3 f64 tensor.
+{
+    Shape shape{4, 3}; // input: 4 x 3
+    Shape rshape{3};   // result: one sum per column
+    auto A = make_shared<op::Parameter>(element::f64, shape);
+    auto f = make_shared<Function>(make_shared<op::Sum>(A, AxisSet{0}), ParameterVector{A});
+
+    auto backend = runtime::Backend::create("${BACKEND_NAME}");
+
+    // Create some tensors for input/output
+    auto a = backend->create_tensor(element::f64, shape);
+    copy_data(a, vector<double>{12, 2, 10, 9, 8, 4, 6, 1, 5, 3, 11, 7});
+    auto result = backend->create_tensor(element::f64, rshape);
+
+    backend->call_with_validate(f, {result}, {a});
+    EXPECT_EQ((vector<double>{30, 22, 26}), read_vector<double>(result)); // 12+9+6+3, 2+8+1+11, 10+4+5+7 (row-major data assumed)
+}
+
 #if NGRAPH_INTERPRETER_ENABLE

 NGRAPH_TEST(${BACKEND_NAME}, sum_stable_acc)