Unverified commit b3d70927, authored by Scott Cyphers and committed by GitHub

Merge branch 'master' into master

parents c0b0bf8f a4b9e6b7
# nGraph Compiler Stack # nGraph Compiler Stack (Beta)
[![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://github.com/NervanaSystems/ngraph/blob/master/LICENSE) [![Build Status][build-status-badge]][build-status] [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://github.com/NervanaSystems/ngraph/blob/master/LICENSE) [![Build Status][build-status-badge]][build-status]
...@@ -16,12 +16,12 @@ workloads on CPU for inference, please refer to the links below. ...@@ -16,12 +16,12 @@ workloads on CPU for inference, please refer to the links below.
| Framework (Version) | Installation guide | Notes | Framework (Version) | Installation guide | Notes
|----------------------------|----------------------------------------|----------------------------------- |----------------------------|----------------------------------------|-----------------------------------
| TensorFlow* 1.12 | [Pip package] or [Build from source] | 17 [Validated workloads] | TensorFlow* 1.12 | [Pip install](https://github.com/NervanaSystems/ngraph-tf) or [Build from source](https://github.com/NervanaSystems/ngraph-tf) | 20 [Validated workloads]
| MXNet* 1.4 | [Enable the module] or [Source compile]| 17 [Validated workloads] | MXNet* 1.3 | [Pip install](https://github.com/NervanaSystems/ngraph-mxnet#Installation) or [Build from source](https://github.com/NervanaSystems/ngraph-mxnet#building-with-ngraph-support)| 18 [Validated workloads]
| ONNX 1.3 | [Pip package] | 14 [Validated workloads] | ONNX 1.3 | [Pip install](https://github.com/NervanaSystems/ngraph-onnx#installation) | 14 [Validated workloads]
Frameworks using nGraph Compiler stack to execute workloads have shown Frameworks using nGraph Compiler stack to execute workloads have shown
**up to 45X** performance boost when compared to native framework [**up to 45X**](https://ai.intel.com/ngraph-compiler-stack-beta-release/) performance boost when compared to native framework
implementations. We've also seen performance boosts running workloads that implementations. We've also seen performance boosts running workloads that
are not included on the list of [Validated workloads], thanks to our are not included on the list of [Validated workloads], thanks to our
powerful subgraph pattern matching. powerful subgraph pattern matching.
...@@ -100,9 +100,6 @@ to improve it: ...@@ -100,9 +100,6 @@ to improve it:
[develop-without-lockin]: doc/sphinx/source/graphics/develop-without-lockin.png "Develop on any part of the stack without lockin" [develop-without-lockin]: doc/sphinx/source/graphics/develop-without-lockin.png "Develop on any part of the stack without lockin"
[Movidius™ Myriad™ 2]:https://www.movidius.com/solutions/vision-processing-unit [Movidius™ Myriad™ 2]:https://www.movidius.com/solutions/vision-processing-unit
[PlaidML]: https://github.com/plaidml/plaidml [PlaidML]: https://github.com/plaidml/plaidml
[Pip package]: https://github.com/NervanaSystems/ngraph-onnx#installing-ngraph-onnx
[Build from source]: https://github.com/NervanaSystems/ngraph-tf
[Enable the module]: https://github.com/NervanaSystems/ngraph/blob/mbrookhart/mxnet_tutorial/doc/sphinx/source/shared/mxnet_tutorial.rst
[Source compile]: https://github.com/NervanaSystems/ngraph-mxnet/blob/master/README.md [Source compile]: https://github.com/NervanaSystems/ngraph-mxnet/blob/master/README.md
[nGraph-ONNX]: https://github.com/NervanaSystems/ngraph-onnx/blob/master/README.md [nGraph-ONNX]: https://github.com/NervanaSystems/ngraph-onnx/blob/master/README.md
[nGraph-ONNX adaptable]: https://ai.intel.com/adaptable-deep-learning-solutions-with-ngraph-compiler-and-onnx/ [nGraph-ONNX adaptable]: https://ai.intel.com/adaptable-deep-learning-solutions-with-ngraph-compiler-and-onnx/
......
...@@ -45,6 +45,7 @@ ExternalProject_Add( ...@@ -45,6 +45,7 @@ ExternalProject_Add(
GIT_REPOSITORY ${HALIDE_GIT_REPO_URL} GIT_REPOSITORY ${HALIDE_GIT_REPO_URL}
GIT_TAG ${HALIDE_GIT_TAG} GIT_TAG ${HALIDE_GIT_TAG}
UPDATE_COMMAND "" UPDATE_COMMAND ""
PATCH_COMMAND patch -p1 --forward --reject-file=- -i ${CMAKE_SOURCE_DIR}/cmake/halide.patch || exit 0
CMAKE_ARGS CMAKE_ARGS
-DLLVM_DIR=${HALIDE_LLVM_DIR} -DLLVM_DIR=${HALIDE_LLVM_DIR}
-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
......
diff --git a/CMakeLists.txt b/CMakeLists.txt
index d70fdc79d..60aa4c3b7 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -131,7 +131,8 @@ function(check_llvm_target TARGET HAS_TARGET)
set(_llvm_required_version ${ARGV2})
endif()
if (NOT LLVM_VERSION LESS _llvm_required_version)
- list(FIND LLVM_TARGETS_TO_BUILD ${TARGET} _found_target)
+ set(NGRAPH_TARGETS_TO_BUILD "X86")
+ list(FIND NGRAPH_TARGETS_TO_BUILD ${TARGET} _found_target)
if (_found_target GREATER -1)
set(${HAS_TARGET} ON PARENT_SCOPE)
else()
...@@ -1634,7 +1634,7 @@ body { ...@@ -1634,7 +1634,7 @@ body {
color: #38403f; color: #38403f;
min-height: 100%; min-height: 100%;
overflow-x: hidden; overflow-x: hidden;
background: #edf0f2; background: #fcfcfc;
} }
.wy-text-left { .wy-text-left {
...@@ -3193,7 +3193,7 @@ footer span.commit code, footer span.commit .rst-content tt, .rst-content footer ...@@ -3193,7 +3193,7 @@ footer span.commit code, footer span.commit .rst-content tt, .rst-content footer
} }
@media screen and (min-width: 1400px) { @media screen and (min-width: 1400px) {
.wy-nav-content-wrap { .wy-nav-content-wrap {
background: #0C7881; background: #fcfcfc;
} }
.wy-nav-content { .wy-nav-content {
......
...@@ -73,9 +73,11 @@ author = 'Intel Corporation' ...@@ -73,9 +73,11 @@ author = 'Intel Corporation'
# built documents. # built documents.
# #
# The short X.Y version. # The short X.Y version.
version = '0.9' version = '0.10'
# The full version, including alpha/beta/rc tags. # The Documentation full version, including alpha/beta/rc tags. Some features
release = '0.9.0' # available in the latest code will not necessarily be documented first
release = '0.10.1'
# The language for content autogenerated by Sphinx. Refer to documentation # The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages. # for a list of supported languages.
......
...@@ -50,4 +50,4 @@ nGraph-TensorFlow bridge. ...@@ -50,4 +50,4 @@ nGraph-TensorFlow bridge.
.. _MXNet: http://mxnet.incubator.apache.org .. _MXNet: http://mxnet.incubator.apache.org
.. _DSO: http://csweb.cs.wfu.edu/%7Etorgerse/Kokua/More_SGI/007-2360-010/sgi_html/ch03.html .. _DSO: http://csweb.cs.wfu.edu/%7Etorgerse/Kokua/More_SGI/007-2360-010/sgi_html/ch03.html
.. _being the fastest: https://github.com/soumith/convnet-benchmarks .. _being the fastest: https://github.com/soumith/convnet-benchmarks
.. _ngraph tensorflow bridge README: https://github.com/NervanaSystems/ngraph-tf .. _ngraph tensorflow bridge README: https://github.com/NervanaSystems/ngraph-tf/blob/master/README.md
...@@ -15,19 +15,22 @@ TensorFlow ...@@ -15,19 +15,22 @@ TensorFlow
:widths: 27, 53 :widths: 27, 53
:escape: ~ :escape: ~
Resnet50 v1 and v2, Image recognition Resnet50 v1, Image recognition
Inception V3 and V4, Image recognition Resnet50 v2, Image recognition
Inception V3, Image recognition
Inception V4, Image recognition
Inception-ResNetv2, Image recognition Inception-ResNetv2, Image recognition
MobileNet v1, Image recognition MobileNet v1, Image recognition
SqueezeNet v1.1, Image recognition MobileNet v2, Image recognition
DenseNet-121, Image recognition VGG16, Image recognition
SSD-VGG16, Object detection SSD-VGG16, Object detection
SSD-MobileNetv1, Object detection SSD-MobileNetv1, Object detection
R-FCN, Object detection
Faster RCNN, Object detection Faster RCNN, Object detection
Yolo v2, Object detection Yolo v2, Object detection
Transformer-LT, Language translation
Wide & Deep, Recommender system Wide & Deep, Recommender system
NCF, Recommender system NCF, Recommender system
WaveNet, Speech generation
U-Net, Image segmentation U-Net, Image segmentation
DCGAN, Generative adversarial network DCGAN, Generative adversarial network
DRAW, Image generation DRAW, Image generation
...@@ -41,7 +44,8 @@ MXNet ...@@ -41,7 +44,8 @@ MXNet
:widths: 27, 53 :widths: 27, 53
:escape: ~ :escape: ~
Resnet50 v1 and v2, Image recognition Resnet50 v1, Image recognition
Resnet50 v2, Image recognition
DenseNet-121, Image recognition DenseNet-121, Image recognition
InceptionV3, Image recognition InceptionV3, Image recognition
InceptionV4, Image recognition InceptionV4, Image recognition
...@@ -70,10 +74,10 @@ Additionally, we validated the following workloads are functional through nGraph ...@@ -70,10 +74,10 @@ Additionally, we validated the following workloads are functional through nGraph
:widths: 27, 53 :widths: 27, 53
:escape: ~ :escape: ~
ResNet-50, Image recognition
DenseNet-121, Image recognition DenseNet-121, Image recognition
Inception-v1, Image recognition Inception-v1, Image recognition
Inception-v2, Image recognition Inception-v2, Image recognition
ResNet-50, Image recognition
Shufflenet, Image recognition Shufflenet, Image recognition
SqueezeNet, Image recognition SqueezeNet, Image recognition
VGG-19, Image recognition VGG-19, Image recognition
......
The MPL 2.0 license used by the eigen library used by this ngraph core
component requires distribution of the following information:
Eigen source code can be viewed or downloaded from here:
http://eigen.tuxfamily.org
...@@ -22,7 +22,7 @@ import os ...@@ -22,7 +22,7 @@ import os
import distutils.ccompiler import distutils.ccompiler
__version__ = os.environ.get('NGRAPH_VERSION', '0.0.0-dev') __version__ = os.environ.get('NGRAPH_VERSION', '0.0.0-dev')
PYNGRAPH_SOURCE_DIR = os.path.abspath(os.path.dirname(__file__)) PYNGRAPH_ROOT_DIR = os.path.abspath(os.path.dirname(__file__))
NGRAPH_DEFAULT_INSTALL_DIR = os.environ.get('HOME') NGRAPH_DEFAULT_INSTALL_DIR = os.environ.get('HOME')
NGRAPH_ONNX_IMPORT_ENABLE = os.environ.get('NGRAPH_ONNX_IMPORT_ENABLE') NGRAPH_ONNX_IMPORT_ENABLE = os.environ.get('NGRAPH_ONNX_IMPORT_ENABLE')
...@@ -50,7 +50,7 @@ def find_pybind_headers_dir(): ...@@ -50,7 +50,7 @@ def find_pybind_headers_dir():
if os.environ.get('PYBIND_HEADERS_PATH'): if os.environ.get('PYBIND_HEADERS_PATH'):
pybind_headers_dir = os.environ.get('PYBIND_HEADERS_PATH') pybind_headers_dir = os.environ.get('PYBIND_HEADERS_PATH')
else: else:
pybind_headers_dir = os.path.join(PYNGRAPH_SOURCE_DIR, 'pybind11') pybind_headers_dir = os.path.join(PYNGRAPH_ROOT_DIR, 'pybind11')
found = os.path.exists(os.path.join(pybind_headers_dir, 'include/pybind11')) found = os.path.exists(os.path.join(pybind_headers_dir, 'include/pybind11'))
if not found: if not found:
...@@ -233,13 +233,13 @@ sources = [ ...@@ -233,13 +233,13 @@ sources = [
] ]
package_dir = { package_dir = {
'ngraph': PYNGRAPH_SOURCE_DIR + "/ngraph", 'ngraph': PYNGRAPH_ROOT_DIR + "/ngraph",
'ngraph.utils': PYNGRAPH_SOURCE_DIR + "/ngraph/utils", 'ngraph.utils': PYNGRAPH_ROOT_DIR + "/ngraph/utils",
'ngraph.impl': PYNGRAPH_SOURCE_DIR + "/ngraph/impl", 'ngraph.impl': PYNGRAPH_ROOT_DIR + "/ngraph/impl",
'ngraph.impl.op': PYNGRAPH_SOURCE_DIR + "/ngraph/impl/op", 'ngraph.impl.op': PYNGRAPH_ROOT_DIR + "/ngraph/impl/op",
'ngraph.impl.op.util': PYNGRAPH_SOURCE_DIR + "/ngraph/impl/op/util", 'ngraph.impl.op.util': PYNGRAPH_ROOT_DIR + "/ngraph/impl/op/util",
'ngraph.impl.passes': PYNGRAPH_SOURCE_DIR + "/ngraph/impl/passes", 'ngraph.impl.passes': PYNGRAPH_ROOT_DIR + "/ngraph/impl/passes",
'ngraph.impl.runtime': PYNGRAPH_SOURCE_DIR + "/ngraph/impl/runtime", 'ngraph.impl.runtime': PYNGRAPH_ROOT_DIR + "/ngraph/impl/runtime",
} }
packages = [ packages = [
'ngraph', 'ngraph',
...@@ -251,9 +251,9 @@ packages = [ ...@@ -251,9 +251,9 @@ packages = [
'ngraph.impl.runtime', 'ngraph.impl.runtime',
] ]
sources = [PYNGRAPH_SOURCE_DIR + "/" + source for source in sources] sources = [PYNGRAPH_ROOT_DIR + "/" + source for source in sources]
include_dirs = [PYNGRAPH_SOURCE_DIR, NGRAPH_CPP_INCLUDE_DIR, PYBIND11_INCLUDE_DIR] include_dirs = [PYNGRAPH_ROOT_DIR, NGRAPH_CPP_INCLUDE_DIR, PYBIND11_INCLUDE_DIR]
library_dirs = [NGRAPH_CPP_LIBRARY_DIR] library_dirs = [NGRAPH_CPP_LIBRARY_DIR]
...@@ -274,13 +274,13 @@ data_files = [ ...@@ -274,13 +274,13 @@ data_files = [
( (
'licenses', 'licenses',
[ [
PYNGRAPH_SOURCE_DIR + "/../licenses/" + license PYNGRAPH_ROOT_DIR + "/../licenses/" + license
for license in os.listdir(PYNGRAPH_SOURCE_DIR + "/../licenses") for license in os.listdir(PYNGRAPH_ROOT_DIR + "/../licenses")
], ],
), ),
( (
'', '',
[PYNGRAPH_SOURCE_DIR + "/../LICENSE"], [PYNGRAPH_ROOT_DIR + "/../LICENSE"],
) )
] ]
...@@ -302,10 +302,10 @@ if NGRAPH_ONNX_IMPORT_ENABLE == 'TRUE': ...@@ -302,10 +302,10 @@ if NGRAPH_ONNX_IMPORT_ENABLE == 'TRUE':
'pyngraph/pyngraph_onnx_import.cpp', 'pyngraph/pyngraph_onnx_import.cpp',
'pyngraph/onnx_import/onnx_import.cpp', 'pyngraph/onnx_import/onnx_import.cpp',
] ]
onnx_sources = [PYNGRAPH_SOURCE_DIR + "/" + source for source in onnx_sources] onnx_sources = [PYNGRAPH_ROOT_DIR + "/" + source for source in onnx_sources]
package_dir['ngraph.impl.onnx_import'] = ( package_dir['ngraph.impl.onnx_import'] = (
PYNGRAPH_SOURCE_DIR + "/ngraph/impl/onnx_import" PYNGRAPH_ROOT_DIR + "/ngraph/impl/onnx_import"
) )
packages.append('ngraph.impl.onnx_import') packages.append('ngraph.impl.onnx_import')
...@@ -360,17 +360,17 @@ class BuildExt(build_ext): ...@@ -360,17 +360,17 @@ class BuildExt(build_ext):
build_ext.build_extensions(self) build_ext.build_extensions(self)
with open(os.path.join(PYNGRAPH_SOURCE_DIR, 'requirements.txt')) as req: with open(os.path.join(PYNGRAPH_ROOT_DIR, 'requirements.txt')) as req:
requirements = req.read().splitlines() requirements = req.read().splitlines()
setup( setup(
name='ngraph-core', name='ngraph-core',
description=open(os.path.join(PYNGRAPH_ROOT_DIR, 'README.md')).read(),
version=__version__, version=__version__,
author='Intel', author='Intel',
author_email='intelnervana@intel.com', author_email='intelnervana@intel.com',
url='https://ai.intel.com/', url='https://ai.intel.com/',
license='License :: OSI Approved :: Apache Software License', license='License :: OSI Approved :: Apache Software License',
description='Python API for nGraph',
long_description='', long_description='',
ext_modules=ext_modules, ext_modules=ext_modules,
package_dir=package_dir, package_dir=package_dir,
......
...@@ -28,26 +28,6 @@ ...@@ -28,26 +28,6 @@
using namespace std; using namespace std;
using namespace ngraph; using namespace ngraph;
// File-local helper: the anonymous namespace gives NilStreamBuf internal
// linkage, so it is only usable from this translation unit.
namespace
{
// Stream buffer with no storage and no overrides. It is the backing buffer
// of the stream handed out by ngraph::get_nil_stream().
class NilStreamBuf final : public streambuf
{
// N.B. We derive from the base streambuf implementation, in
// which underflow() and overflow() both return
// Traits::eof() -- any access returns a failure.
};
}
// Returns a reference to a single, lazily constructed output stream whose
// buffer is a NilStreamBuf. Every call yields the same stream object.
ostream& ngraph::get_nil_stream()
{
    // N.B. With debug logging disabled, several threads may reach this
    // stream at once, so a threadsafe nil stream implementation is required
    // here. Both objects are function-local statics: the buffer is declared
    // first so it exists before the stream that points at it.
    static NilStreamBuf discard_buf;
    static ostream discard_stream(&discard_buf);
    return discard_stream;
}
void ngraph::default_logger_handler_func(const string& s) void ngraph::default_logger_handler_func(const string& s)
{ {
cout << s << endl; cout << s << endl;
......
...@@ -100,8 +100,6 @@ namespace ngraph ...@@ -100,8 +100,6 @@ namespace ngraph
static std::deque<std::string> m_queue; static std::deque<std::string> m_queue;
}; };
extern std::ostream& get_nil_stream();
void default_logger_handler_func(const std::string& s); void default_logger_handler_func(const std::string& s);
#define NGRAPH_ERR \ #define NGRAPH_ERR \
...@@ -133,6 +131,33 @@ namespace ngraph ...@@ -133,6 +131,33 @@ namespace ngraph
ngraph::default_logger_handler_func) \ ngraph::default_logger_handler_func) \
.stream() .stream()
#else #else
#define NGRAPH_DEBUG ngraph::get_nil_stream()
// Empty, stateless sink type. The NGRAPH_DEBUG macro in this #else branch
// expands to a temporary NullLogger, so a chain such as
// `NGRAPH_DEBUG << a << b << std::endl` still compiles while every streamed
// operand is ignored by the operators below.
struct NullLogger
{
};

// Accepts any operand through a forwarding reference and ignores it.
// The sink is passed on as an rvalue so further `<<` calls can be chained.
template <typename T>
NullLogger&& operator<<(NullLogger&& sink, T&&)
{
    return std::move(sink);
}

// Same as above for operands taken by const reference.
template <typename T>
NullLogger&& operator<<(NullLogger&& sink, const T&)
{
    return std::move(sink);
}

// Accepts narrow-character stream manipulators such as std::endl. These are
// function templates, so the generic overloads above cannot deduce T from
// them; this overload names the exact function type instead.
inline NullLogger&& operator<<(NullLogger&& sink, std::ostream& (&)(std::ostream&))
{
    return std::move(sink);
}
#define NGRAPH_DEBUG \
::ngraph::NullLogger {}
#endif #endif
} }
/******************************************************************************* //*****************************************************************************
* Copyright 2017-2018 Intel Corporation // Copyright 2017-2018 Intel Corporation
* //
* Licensed under the Apache License, Version 2.0 (the "License"); // Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. // you may not use this file except in compliance with the License.
* You may obtain a copy of the License at // You may obtain a copy of the License at
* //
* http://www.apache.org/licenses/LICENSE-2.0 // http://www.apache.org/licenses/LICENSE-2.0
* //
* Unless required by applicable law or agreed to in writing, software // Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, // distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
* limitations under the License. // limitations under the License.
*******************************************************************************/ //*****************************************************************************
#include <numeric> #include <numeric>
......
...@@ -16,3 +16,6 @@ quantize_clamp_int32 ...@@ -16,3 +16,6 @@ quantize_clamp_int32
# failing in CI build but passing on local machine # failing in CI build but passing on local machine
max_3d_to_scalar_int32 max_3d_to_scalar_int32
argmin_trivial_in_i32
argmax_4D_axis_3_i64_in_i32
...@@ -165,6 +165,15 @@ size_t runtime::gpu::CUDNNEmitter::build_reduce_forward(const cudnnReduceTensorO ...@@ -165,6 +165,15 @@ size_t runtime::gpu::CUDNNEmitter::build_reduce_forward(const cudnnReduceTensorO
const ReductionMode& reduction_mode) const ReductionMode& reduction_mode)
{ {
auto input_type = dtypes[0]; auto input_type = dtypes[0];
bool use_cudnn_reduce = !((reduction_mode == ReductionMode::Reduce) &&
((input_type == element::i32) || (input_type == element::i8)));
NGRAPH_ASSERT(use_cudnn_reduce)
<< "cuDNN reduce for input type int32_t or int8_t currently not supported";
bool unsupported_int8_type_arg_reduce =
!((reduction_mode == ReductionMode::ArgReduce) && (input_type == element::i8));
NGRAPH_ASSERT(unsupported_int8_type_arg_reduce)
<< "cuDNN arg_reduce for input type int8_t currently not supported";
auto output_type = dtypes[1]; auto output_type = dtypes[1];
std::stringstream ss; std::stringstream ss;
ss << "reduce_" << reduce_op << "_" << input_type.c_type_string() << "_" ss << "reduce_" << reduce_op << "_" << input_type.c_type_string() << "_"
...@@ -180,7 +189,8 @@ size_t runtime::gpu::CUDNNEmitter::build_reduce_forward(const cudnnReduceTensorO ...@@ -180,7 +189,8 @@ size_t runtime::gpu::CUDNNEmitter::build_reduce_forward(const cudnnReduceTensorO
} }
auto& desc = m_descriptors.build<cudnnReduceTensorDescriptor_t>(); auto& desc = m_descriptors.build<cudnnReduceTensorDescriptor_t>();
cudnnDataType_t data_type = get_cudnn_datatype(input_type); auto modified_input_type = (input_type == element::i32) ? element::f64 : input_type;
cudnnDataType_t data_type = get_cudnn_datatype(modified_input_type);
cudnnTensorFormat_t tensor_format = CUDNN_TENSOR_NCHW; cudnnTensorFormat_t tensor_format = CUDNN_TENSOR_NCHW;
auto& input_desc = tensor_descriptor_from_shape(input_shape, data_type, tensor_format); auto& input_desc = tensor_descriptor_from_shape(input_shape, data_type, tensor_format);
Shape output_shape = input_shape; Shape output_shape = input_shape;
...@@ -193,15 +203,6 @@ size_t runtime::gpu::CUDNNEmitter::build_reduce_forward(const cudnnReduceTensorO ...@@ -193,15 +203,6 @@ size_t runtime::gpu::CUDNNEmitter::build_reduce_forward(const cudnnReduceTensorO
// get an allocator for transient per kernel gpu memory // get an allocator for transient per kernel gpu memory
GPUAllocator allocator = this->m_primitive_emitter->get_memory_allocator(); GPUAllocator allocator = this->m_primitive_emitter->get_memory_allocator();
size_t workspace_size = 0;
CUDNN_SAFE_CALL(cudnnGetReductionWorkspaceSize(
*m_ctx->cudnn_handle, desc, input_desc, output_desc, &workspace_size));
size_t input_buffer_size = shape_size(input_shape) * input_type.size();
if (workspace_size < input_buffer_size)
{
workspace_size = input_buffer_size;
}
size_t workspace_idx = allocator.reserve_workspace(workspace_size);
void* alpha = m_host_parameters.allocate_by_datatype(data_type, 1.0); void* alpha = m_host_parameters.allocate_by_datatype(data_type, 1.0);
void* beta = m_host_parameters.allocate_by_datatype(data_type, 0); void* beta = m_host_parameters.allocate_by_datatype(data_type, 0);
...@@ -217,6 +218,12 @@ size_t runtime::gpu::CUDNNEmitter::build_reduce_forward(const cudnnReduceTensorO ...@@ -217,6 +218,12 @@ size_t runtime::gpu::CUDNNEmitter::build_reduce_forward(const cudnnReduceTensorO
CUDNN_NOT_PROPAGATE_NAN, CUDNN_NOT_PROPAGATE_NAN,
CUDNN_REDUCE_TENSOR_NO_INDICES, CUDNN_REDUCE_TENSOR_NO_INDICES,
CUDNN_32BIT_INDICES)); CUDNN_32BIT_INDICES));
size_t workspace_size = 0;
CUDNN_SAFE_CALL(cudnnGetReductionWorkspaceSize(
*m_ctx->cudnn_handle, desc, input_desc, output_desc, &workspace_size));
size_t workspace_idx = allocator.reserve_workspace(workspace_size);
// emit reduce operation // emit reduce operation
reduce.reset(new gpu::primitive{ reduce.reset(new gpu::primitive{
[=, &desc, &input_desc, &output_desc](void** inputs, void** outputs) { [=, &desc, &input_desc, &output_desc](void** inputs, void** outputs) {
...@@ -243,71 +250,83 @@ size_t runtime::gpu::CUDNNEmitter::build_reduce_forward(const cudnnReduceTensorO ...@@ -243,71 +250,83 @@ size_t runtime::gpu::CUDNNEmitter::build_reduce_forward(const cudnnReduceTensorO
{ {
if (output_type == element::i32 || output_type == element::i64) if (output_type == element::i32 || output_type == element::i64)
{ {
size_t indices_size = shape_size(output_shape) * output_type.size(); // Since cuDNN only outputs int32 indices
size_t indices_size = shape_size(output_shape) * element::i32.size();
size_t reduce_buffer_idx = size_t reduce_buffer_idx =
allocator.reserve_workspace(shape_size(output_shape) * input_type.size()); allocator.reserve_workspace(shape_size(output_shape) * modified_input_type.size());
CUDNN_SAFE_CALL(cudnnSetReduceTensorDescriptor(desc, CUDNN_SAFE_CALL(cudnnSetReduceTensorDescriptor(desc,
reduce_op, reduce_op,
data_type, data_type,
CUDNN_NOT_PROPAGATE_NAN, CUDNN_NOT_PROPAGATE_NAN,
CUDNN_REDUCE_TENSOR_FLATTENED_INDICES, CUDNN_REDUCE_TENSOR_FLATTENED_INDICES,
CUDNN_32BIT_INDICES)); CUDNN_32BIT_INDICES));
size_t workspace_size = 0;
CUDNN_SAFE_CALL(cudnnGetReductionWorkspaceSize(
*m_ctx->cudnn_handle, desc, input_desc, output_desc, &workspace_size));
size_t workspace_idx = allocator.reserve_workspace(workspace_size);
auto& cuda_emitter = m_primitive_emitter->get_cuda_emitter();
std::function<void(void**, void**)> convert_output = [](void** inputs, void** outputs) {
};
std::function<void*(void*)> convert_output_space = [](void* ptr) { return ptr; };
if (output_type == element::i64) if (output_type == element::i64)
{ {
size_t workspace_indices_idx = size_t workspace_indices_idx = allocator.reserve_workspace(indices_size);
allocator.reserve_workspace(shape_size(output_shape) * input_type.size()); auto convert_idx = cuda_emitter->template build_elementwise<op::Convert>(
auto& cuda_emitter = m_primitive_emitter->get_cuda_emitter();
auto convert_idx = cuda_emitter->build_elementwise<op::Convert>(
{element::i32.c_type_string(), element::i64.c_type_string()}, output_shape); {element::i32.c_type_string(), element::i64.c_type_string()}, output_shape);
reduce.reset(new gpu::primitive{ convert_output = [=](void** inputs, void** outputs) {
[=, &desc, &input_desc, &output_desc](void** inputs, void** outputs) { gpu::invoke_primitive(m_ctx, convert_idx, inputs, outputs);
void* workspace_indices_ptr = };
runtime::gpu::invoke_memory_primitive(m_ctx, workspace_indices_idx); convert_output_space = [=](void* ptr) {
void* workspace_ptr = return runtime::gpu::invoke_memory_primitive(m_ctx, workspace_indices_idx);
runtime::gpu::invoke_memory_primitive(m_ctx, workspace_idx); };
void* reduce_buffer =
runtime::gpu::invoke_memory_primitive(m_ctx, reduce_buffer_idx);
CUDNN_SAFE_CALL(cudnnReduceTensor(*m_ctx->cudnn_handle,
desc,
workspace_indices_ptr,
indices_size,
workspace_ptr,
workspace_size,
alpha,
input_desc,
inputs[0],
beta,
output_desc,
reduce_buffer));
gpu::invoke_primitive(m_ctx, convert_idx, &workspace_indices_ptr, outputs);
debug_sync();
}});
} }
else
std::function<void(void**, void**)> convert_input = [](void** inputs, void** outputs) {
};
std::function<void*(void*)> convert_input_space = [](void* ptr) { return ptr; };
if (input_type == element::i32)
{ {
reduce.reset(new gpu::primitive{ size_t input_idx = allocator.reserve_workspace(shape_size(input_shape) *
[=, &desc, &input_desc, &output_desc](void** inputs, void** outputs) { modified_input_type.size());
auto convert_input_idx = cuda_emitter->template build_elementwise<op::Convert>(
{input_type.c_type_string(), modified_input_type.c_type_string()}, input_shape);
convert_input = [=](void** inputs, void** outputs) {
gpu::invoke_primitive(m_ctx, convert_input_idx, inputs, outputs);
};
convert_input_space = [=](void* ptr) {
return runtime::gpu::invoke_memory_primitive(m_ctx, input_idx);
};
}
void* workspace_ptr = reduce.reset(new gpu::primitive{[=, &desc, &input_desc, &output_desc](void** inputs,
runtime::gpu::invoke_memory_primitive(m_ctx, workspace_idx); void** outputs) {
void* input_ptr = convert_input_space(inputs[0]);
void* workspace_indices_ptr = convert_output_space(outputs[0]);
void* workspace_ptr = runtime::gpu::invoke_memory_primitive(m_ctx, workspace_idx);
void* reduce_buffer = void* reduce_buffer =
runtime::gpu::invoke_memory_primitive(m_ctx, reduce_buffer_idx); runtime::gpu::invoke_memory_primitive(m_ctx, reduce_buffer_idx);
convert_input(inputs, &input_ptr);
CUDNN_SAFE_CALL(cudnnReduceTensor(*m_ctx->cudnn_handle, CUDNN_SAFE_CALL(cudnnReduceTensor(*m_ctx->cudnn_handle,
desc, desc,
outputs[0], workspace_indices_ptr,
indices_size, indices_size,
workspace_ptr, workspace_ptr,
workspace_size, workspace_size,
alpha, alpha,
input_desc, input_desc,
inputs[0], input_ptr,
beta, beta,
output_desc, output_desc,
reduce_buffer)); reduce_buffer));
convert_output(&workspace_indices_ptr, outputs);
debug_sync(); debug_sync();
}}); }});
} }
}
else else
{ {
std::stringstream ss_er; std::stringstream ss_er;
......
...@@ -134,6 +134,7 @@ shape_of_vector ...@@ -134,6 +134,7 @@ shape_of_vector
shape_of_matrix shape_of_matrix
shape_of_5d shape_of_5d
sum_stable_acc sum_stable_acc
sum_trivial_in_double
product_2d_to_scalar_int32 product_2d_to_scalar_int32
product_to_scalar_int32 product_to_scalar_int32
product_to_scalar_int8 product_to_scalar_int8
...@@ -141,3 +142,6 @@ max_matrix_rows_zero_int32 ...@@ -141,3 +142,6 @@ max_matrix_rows_zero_int32
max_to_scalar_int8 max_to_scalar_int8
min_to_scalar_int8 min_to_scalar_int8
max_3d_to_scalar_double max_3d_to_scalar_double
argmin_trivial_in_i32
argmax_4D_axis_3_i64_in_i32
argmin_trivial_in_double
...@@ -52,11 +52,7 @@ bool ngraph::runtime::plaidml::PlaidML_Backend::call( ...@@ -52,11 +52,7 @@ bool ngraph::runtime::plaidml::PlaidML_Backend::call(
const std::vector<std::shared_ptr<runtime::Tensor>>& outputs, const std::vector<std::shared_ptr<runtime::Tensor>>& outputs,
const std::vector<std::shared_ptr<runtime::Tensor>>& inputs) const std::vector<std::shared_ptr<runtime::Tensor>>& inputs)
{ {
auto cfunc = m_cache.try_lookup(func); auto cfunc = m_cache.compile(func, &m_compiler);
if (!cfunc)
{
cfunc = m_compiler.compile(func);
}
cfunc->schedule_invocation(inputs, outputs); cfunc->schedule_invocation(inputs, outputs);
return true; return true;
} }
......
...@@ -42,19 +42,31 @@ bool ngraph::runtime::plaidml::CompiledFunction::schedule_invocation( ...@@ -42,19 +42,31 @@ bool ngraph::runtime::plaidml::CompiledFunction::schedule_invocation(
NGRAPH_DEBUG << "Binding PlaidML function " << this; NGRAPH_DEBUG << "Binding PlaidML function " << this;
m_bound_inputs.resize(inputs.size());
m_bound_outputs.resize(outputs.size());
std::size_t input_count = 0; std::size_t input_count = 0;
for (const auto& param : m_func->get_parameters()) for (const auto& param : m_func->get_parameters())
{ {
for (std::size_t idx = 0; idx < param->get_output_size(); ++idx) for (std::size_t idx = 0; idx < param->get_output_size(); ++idx)
{ {
descriptor::Tensor* tv = param->get_output_tensor_ptr(idx).get(); descriptor::Tensor* tv = param->get_output_tensor_ptr(idx).get();
auto rtv = dynamic_cast<PlaidML_Tensor*>(inputs[input_count++].get()); auto& input = inputs.at(input_count);
auto rtv = dynamic_cast<PlaidML_Tensor*>(input.get());
if (!rtv) if (!rtv)
{ {
throw std::runtime_error{ throw std::runtime_error{
"The PlaidML backend only operations on PlaidML tensor views"}; "The PlaidML backend only operates on PlaidML tensor views"};
} }
rtv->sync_input(); rtv->sync_input();
auto& bound_input = m_bound_inputs.at(input_count);
++input_count;
if (bound_input.lock() == input)
{
// No need to re-bind this input.
continue;
}
bound_input = input;
NGRAPH_DEBUG << "Binding input " << m_input_names.at(tv) << " to tensor " << rtv; NGRAPH_DEBUG << "Binding input " << m_input_names.at(tv) << " to tensor " << rtv;
m_invoker.set_input(m_input_names.at(tv), rtv->tensor()); m_invoker.set_input(m_input_names.at(tv), rtv->tensor());
} }
...@@ -66,12 +78,21 @@ bool ngraph::runtime::plaidml::CompiledFunction::schedule_invocation( ...@@ -66,12 +78,21 @@ bool ngraph::runtime::plaidml::CompiledFunction::schedule_invocation(
for (std::size_t idx = 0; idx < result->get_output_size(); ++idx) for (std::size_t idx = 0; idx < result->get_output_size(); ++idx)
{ {
descriptor::Tensor* tv = result->get_output_tensor_ptr(idx).get(); descriptor::Tensor* tv = result->get_output_tensor_ptr(idx).get();
auto rtv = dynamic_cast<PlaidML_Tensor*>(outputs[output_count++].get()); auto& output = outputs.at(output_count);
auto rtv = dynamic_cast<PlaidML_Tensor*>(output.get());
if (!rtv) if (!rtv)
{ {
throw std::runtime_error{ throw std::runtime_error{
"The PlaidML backend only operations on PlaidML tensor views"}; "The PlaidML backend only operates on PlaidML tensor views"};
}
auto& bound_output = m_bound_outputs.at(output_count);
++output_count;
if (bound_output.lock() == output)
{
// No need to re-bind this output.
continue;
} }
bound_output = output;
NGRAPH_DEBUG << "Binding output " << m_output_names.at(tv) << " to tensor " << rtv; NGRAPH_DEBUG << "Binding output " << m_output_names.at(tv) << " to tensor " << rtv;
m_invoker.set_output(m_output_names.at(tv), rtv->tensor()); m_invoker.set_output(m_output_names.at(tv), rtv->tensor());
} }
...@@ -91,7 +112,7 @@ bool ngraph::runtime::plaidml::CompiledFunction::schedule_invocation( ...@@ -91,7 +112,7 @@ bool ngraph::runtime::plaidml::CompiledFunction::schedule_invocation(
if (!rtv) if (!rtv)
{ {
throw std::runtime_error{ throw std::runtime_error{
"The PlaidML backend only operations on PlaidML tensor views"}; "The PlaidML backend only operates on PlaidML tensor views"};
} }
rtv->sync_output(); rtv->sync_output();
} }
......
...@@ -58,5 +58,7 @@ private: ...@@ -58,5 +58,7 @@ private:
std::shared_ptr<Function> m_func; std::shared_ptr<Function> m_func;
std::unordered_map<descriptor::Tensor*, std::string> m_input_names; std::unordered_map<descriptor::Tensor*, std::string> m_input_names;
std::unordered_map<descriptor::Tensor*, std::string> m_output_names; std::unordered_map<descriptor::Tensor*, std::string> m_output_names;
mutable std::vector<std::weak_ptr<runtime::Tensor>> m_bound_inputs;
mutable std::vector<std::weak_ptr<runtime::Tensor>> m_bound_outputs;
mutable vertexai::plaidml::invoker m_invoker; mutable vertexai::plaidml::invoker m_invoker;
}; };
...@@ -101,6 +101,11 @@ void ngraph::runtime::plaidml::PlaidML_Tensor::read(void* p, size_t tensor_offse ...@@ -101,6 +101,11 @@ void ngraph::runtime::plaidml::PlaidML_Tensor::read(void* p, size_t tensor_offse
void ngraph::runtime::plaidml::PlaidML_Tensor::sync_input() void ngraph::runtime::plaidml::PlaidML_Tensor::sync_input()
{ {
if (!get_stale())
{
return;
}
set_stale(false);
if (!m_memory) if (!m_memory)
{ {
if (m_is_logically_zero) if (m_is_logically_zero)
...@@ -122,6 +127,7 @@ void ngraph::runtime::plaidml::PlaidML_Tensor::sync_output() ...@@ -122,6 +127,7 @@ void ngraph::runtime::plaidml::PlaidML_Tensor::sync_output()
{ {
// The tensor's been used for an output, so it's no longer logically zero. // The tensor's been used for an output, so it's no longer logically zero.
m_is_logically_zero = false; m_is_logically_zero = false;
set_stale(false);
if (!m_memory) if (!m_memory)
{ {
......
...@@ -26,12 +26,15 @@ topk_1d_max_one # No plans to implement TopK ...@@ -26,12 +26,15 @@ topk_1d_max_one # No plans to implement TopK
topk_1d_min_all # No plans to implement TopK topk_1d_min_all # No plans to implement TopK
topk_1d_min_partial # No plans to implement TopK topk_1d_min_partial # No plans to implement TopK
topk_1d_min_one # No plans to implement TopK topk_1d_min_one # No plans to implement TopK
topk_3d_large_input_max # No plans to implement TopK
topk_3d_large_input_min # No plans to implement TopK
topk_3d_max_all # No plans to implement TopK topk_3d_max_all # No plans to implement TopK
topk_3d_max_partial # No plans to implement TopK topk_3d_max_partial # No plans to implement TopK
topk_3d_max_one # No plans to implement TopK topk_3d_max_one # No plans to implement TopK
topk_3d_min_all # No plans to implement TopK topk_3d_min_all # No plans to implement TopK
topk_3d_min_partial # No plans to implement TopK topk_3d_min_partial # No plans to implement TopK
topk_3d_min_one # No plans to implement TopK topk_3d_min_one # No plans to implement TopK
topk_3d_single_output # No plans to implement TopK
topk_2d_max_all # No plans to implement TopK topk_2d_max_all # No plans to implement TopK
topk_2d_max_partial # No plans to implement TopK topk_2d_max_partial # No plans to implement TopK
topk_2d_max_one # No plans to implement TopK topk_2d_max_one # No plans to implement TopK
...@@ -43,15 +46,21 @@ topk_5d_max_partial # No plans to implement TopK ...@@ -43,15 +46,21 @@ topk_5d_max_partial # No plans to implement TopK
# Tests that PlaidML might be able to run at some point. # Tests that PlaidML might be able to run at some point.
backwards_maxpool_n2_c1_hw5_3x3_str2_max_pad1x2_2x3 backwards_maxpool_n2_c1_hw5_3x3_str2_max_pad1x2_2x3
backwards_maxpool_n4c1h4w4_kh2kw2_sh1sw1
backwards_maxpool_n2c1h5w5_kh3kw3_sh2sw2
backwards_maxpool_n4_c1_hw4_2x2_max
backwards_maxpool_n2_c1_hw5_3x3_str2_max
backwards_slice backwards_slice
batchnorm_fprop_bprop # To debug batchnorm_fprop_bprop # To debug
batchnorm_fprop_bprop_2step # To debug batchnorm_fprop_bprop_2step # To debug
softmax_axis_3d_double # To debug
reduce_matrix_rows_zero # To debug: possible broadcasting error? reduce_matrix_rows_zero # To debug: possible broadcasting error?
reduce_matrix_cols_zero # To debug: possible broadcasting error? reduce_matrix_cols_zero # To debug: possible broadcasting error?
reduce_3d_to_vector # To debug: possible broadcasting error? reduce_3d_to_vector # To debug: possible broadcasting error?
replace_slice_matrix_inplace replace_slice_matrix_inplace
max_pool_2d_1channel_1image_overpadded max_pool_2d_1channel_1image_overpadded
max_pool_3d max_pool_3d
maxpool_bprop_larger_than_cache
reduce_window_emulating_max_pool_1d_1channel_1image reduce_window_emulating_max_pool_1d_1channel_1image
reduce_window_emulating_max_pool_1d_1channel_2image reduce_window_emulating_max_pool_1d_1channel_2image
reduce_window_emulating_max_pool_1d_2channel_2image reduce_window_emulating_max_pool_1d_2channel_2image
...@@ -60,21 +69,34 @@ reduce_window_emulating_max_pool_2d_1channel_1image_strided ...@@ -60,21 +69,34 @@ reduce_window_emulating_max_pool_2d_1channel_1image_strided
select_and_scatter_with_overlap select_and_scatter_with_overlap
select_and_scatter_without_overlap select_and_scatter_without_overlap
select_and_scatter_3d_without_overlap select_and_scatter_3d_without_overlap
generate_mask
avg_pool_3d avg_pool_3d
avg_pool_3d_uneven_strided_padded_include_in_computation avg_pool_3d_uneven_strided_padded_include_in_computation
dequantize_int8_zero_offset # Quantization/Dequantization is unimplemented
dequantize_int32 # Quantization/Dequantization is unimplemented
dequantize_int32_zero_offset # Quantization/Dequantization is unimplemented
dequantize_zero_offset # Quantization/Dequantization is unimplemented dequantize_zero_offset # Quantization/Dequantization is unimplemented
quantize_ROUND_NEAREST_TOWARD_ZERO # Quantization/Dequantization is unimplemented quantize_ROUND_NEAREST_TOWARD_ZERO # Quantization/Dequantization is unimplemented
quantize_ROUND_NEAREST_UPWARD # Quantization/Dequantization is unimplemented quantize_ROUND_NEAREST_UPWARD # Quantization/Dequantization is unimplemented
quantize_ROUND_NEAREST_DOWNWARD # Quantization/Dequantization is unimplemented quantize_ROUND_NEAREST_DOWNWARD # Quantization/Dequantization is unimplemented
quantize_ROUND_NEAREST_TOWARD_EVEN # Quantization/Dequantization is unimplemented quantize_ROUND_NEAREST_TOWARD_EVEN # Quantization/Dequantization is unimplemented
quantize_ROUND_NEAREST_TOWARD_INFINITY # Quantization/Dequantization is unimplemented
quantize_ROUND_TOWARD_INFINITY # Quantization/Dequantization is unimplemented quantize_ROUND_TOWARD_INFINITY # Quantization/Dequantization is unimplemented
quantize_ROUND_TOWARD_ZERO # Quantization/Dequantization is unimplemented quantize_ROUND_TOWARD_ZERO # Quantization/Dequantization is unimplemented
quantize_ROUND_UP # Quantization/Dequantization is unimplemented quantize_ROUND_UP # Quantization/Dequantization is unimplemented
quantize_ROUND_DOWN # Quantization/Dequantization is unimplemented quantize_ROUND_DOWN # Quantization/Dequantization is unimplemented
quantize # Quantization/Dequantization is unimplemented quantize # Quantization/Dequantization is unimplemented
quantize_zero_offset # Quantization/Dequantization is unimplemented
quantize_axes # Quantization/Dequantization is unimplemented quantize_axes # Quantization/Dequantization is unimplemented
quantize_int8 # Quantization/Dequantization is unimplemented quantize_int8 # Quantization/Dequantization is unimplemented
quantize_int8_zero_offset # Quantization/Dequantization is unimplemented
quantize_int32 # Quantization/Dequantization is unimplemented
quantize_int32_zero_offset # Quantization/Dequantization is unimplemented
quantize_clamp # Quantization/Dequantization is unimplemented quantize_clamp # Quantization/Dequantization is unimplemented
quantize_clamp_int8 # Quantization/Dequantization is unimplemented
quantize_clamp_int32 # Quantization/Dequantization is unimplemented
quantize_clamp_int32_zero_offset # Quantization/Dequantization is unimplemented
quantize_clamp_uint8 # Quantization/Dequantization is unimplemented
dequantize # Quantization/Dequantization is unimplemented dequantize # Quantization/Dequantization is unimplemented
dequantize_axes # Quantization/Dequantization is unimplemented dequantize_axes # Quantization/Dequantization is unimplemented
dequantize_int8 # Quantization/Dequantization is unimplemented dequantize_int8 # Quantization/Dequantization is unimplemented
...@@ -88,3 +110,8 @@ dot_matrix_2x0_0x2 # Empty dims apparently should produce shape ...@@ -88,3 +110,8 @@ dot_matrix_2x0_0x2 # Empty dims apparently should produce shape
dot_2x0_0 # Empty dims apparently should produce shaped 0s dot_2x0_0 # Empty dims apparently should produce shaped 0s
numeric_float_nan numeric_float_nan
numeric_double_nan numeric_double_nan
shape_of_scalar
shape_of_vector
shape_of_matrix
shape_of_5d
/******************************************************************************* //*****************************************************************************
* Copyright 2018 Intel Corporation // Copyright 2017-2018 Intel Corporation
* //
* Licensed under the Apache License, Version 2.0 (the "License"); // Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. // you may not use this file except in compliance with the License.
* You may obtain a copy of the License at // You may obtain a copy of the License at
* //
* http://www.apache.org/licenses/LICENSE-2.0 // http://www.apache.org/licenses/LICENSE-2.0
* //
* Unless required by applicable law or agreed to in writing, software // Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, // distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
* limitations under the License. // limitations under the License.
*******************************************************************************/ //*****************************************************************************
#include <getopt.h> #include <getopt.h>
......
...@@ -311,3 +311,82 @@ NGRAPH_TEST(${BACKEND_NAME}, argmax_4D_axis_3) ...@@ -311,3 +311,82 @@ NGRAPH_TEST(${BACKEND_NAME}, argmax_4D_axis_3)
.get_vector()), .get_vector()),
read_vector<int>(result)); read_vector<int>(result));
} }
// ArgMin over axis 0 of a 4x3 i32 input, with the indices emitted as i32.
NGRAPH_TEST(${BACKEND_NAME}, argmin_trivial_in_i32)
{
    Shape shape{4, 3};
    Shape rshape{3};
    auto A = make_shared<op::Parameter>(element::i32, shape);
    auto f = make_shared<Function>(make_shared<op::ArgMin>(A, 0, element::i32), ParameterVector{A});

    auto backend = runtime::Backend::create("${BACKEND_NAME}");

    // Create some tensors for input/output
    auto a = backend->create_tensor(element::i32, shape);
    copy_data(a, vector<int32_t>{12, 2, 10, 9, 8, 4, 6, 1, 5, 3, 11, 7});
    auto result = backend->create_tensor(element::i32, rshape);

    backend->call_with_validate(f, {result}, {a});

    // Column minima are 3 (row 3), 1 (row 2) and 4 (row 1). The result tensor
    // is element::i32, so it is read back with the fixed-width int32_t, the
    // same type used to fill the input above.
    EXPECT_EQ((vector<int32_t>{3, 2, 1}), read_vector<int32_t>(result));
}
// ArgMax over the innermost axis (axis 3) of a 2x2x5x5 i32 input, with the
// resulting indices emitted as i64.
NGRAPH_TEST(${BACKEND_NAME}, argmax_4D_axis_3_i64_in_i32)
{
    Shape input_shape{2, 2, 5, 5}; // NCHW ->(0,1,2,3)
    Shape index_shape{2, 2, 5};

    auto param = make_shared<op::Parameter>(element::i32, input_shape);
    auto argmax = make_shared<op::ArgMax>(param, 3, element::i64);
    auto func = make_shared<Function>(argmax, ParameterVector{param});

    auto backend = runtime::Backend::create("${BACKEND_NAME}");

    // Input tensor: two images, two channels each, 5x5 values per channel.
    auto input = backend->create_tensor(element::i32, input_shape);
    auto input_values = test::NDArray<int32_t, 4>({{{{0, 1, 0, 2, 1}, // img 0 ch 0
                                                     {0, 3, 2, 0, 0},
                                                     {2, 0, 0, 0, 1},
                                                     {2, 0, 1, 1, 2},
                                                     {0, 2, 1, 0, 0}},

                                                    {{0, 0, 0, 2, 0}, // img 0 ch 1
                                                     {0, 2, 3, 0, 1},
                                                     {2, 0, 1, 0, 2},
                                                     {3, 1, 0, 0, 0},
                                                     {2, 0, 0, 0, 0}}},

                                                   {{{0, 2, 1, 1, 0}, // img 1 ch 0
                                                     {0, 0, 2, 0, 1},
                                                     {0, 0, 1, 2, 3},
                                                     {2, 0, 0, 3, 0},
                                                     {0, 0, 0, 0, 0}},

                                                    {{2, 1, 0, 0, 1}, // img 1 ch 1
                                                     {0, 2, 0, 0, 0},
                                                     {1, 1, 2, 0, 2},
                                                     {1, 1, 1, 0, 1},
                                                     {1, 0, 0, 0, 2}}}})
                            .get_vector();
    copy_data(input, input_values);

    // Output tensor: one i64 index per row of every channel.
    auto output = backend->create_tensor(element::i64, index_shape);

    backend->call_with_validate(func, {output}, {input});

    // Expected position of the maximum in each row; the test data expects the
    // first occurrence where a row holds its maximum more than once.
    auto expected_indices =
        test::NDArray<int64_t, 3>({{{3, 1, 0, 0, 1}, {3, 2, 0, 0, 0}},  // img 0: ch 0, ch 1
                                   {{1, 2, 4, 3, 0}, {0, 1, 2, 0, 4}}}) // img 1: ch 0, ch 1
            .get_vector();
    EXPECT_EQ(expected_indices, read_vector<int64_t>(output));
}
// ArgMin over axis 0 of a 4x3 f64 input, with the indices emitted as i32.
NGRAPH_TEST(${BACKEND_NAME}, argmin_trivial_in_double)
{
    Shape input_shape{4, 3};
    Shape index_shape{3};

    auto param = make_shared<op::Parameter>(element::f64, input_shape);
    auto argmin = make_shared<op::ArgMin>(param, 0, element::i32);
    auto func = make_shared<Function>(argmin, ParameterVector{param});

    auto backend = runtime::Backend::create("${BACKEND_NAME}");

    // Input holds the values row by row; output receives one index per column.
    auto input = backend->create_tensor(element::f64, input_shape);
    vector<double> input_values{12, 2, 10, 9, 8, 4, 6, 1, 5, 3, 11, 7};
    copy_data(input, input_values);
    auto output = backend->create_tensor(element::i32, index_shape);

    backend->call_with_validate(func, {output}, {input});

    // Column minima are 3 (row 3), 1 (row 2) and 4 (row 1).
    vector<int32_t> expected_indices{3, 2, 1};
    EXPECT_EQ(expected_indices, read_vector<int32_t>(output));
}
...@@ -485,6 +485,24 @@ NGRAPH_TEST(${BACKEND_NAME}, sum_2d_to_scalar_int8) ...@@ -485,6 +485,24 @@ NGRAPH_TEST(${BACKEND_NAME}, sum_2d_to_scalar_int8)
EXPECT_EQ(std::vector<int8_t>{45}, read_vector<int8_t>(result)); EXPECT_EQ(std::vector<int8_t>{45}, read_vector<int8_t>(result));
} }
// Sum over axis 0 of a 4x3 f64 input, yielding one f64 total per column.
NGRAPH_TEST(${BACKEND_NAME}, sum_trivial_in_double)
{
    Shape input_shape{4, 3};
    Shape sum_shape{3};

    auto param = make_shared<op::Parameter>(element::f64, input_shape);
    auto column_sum = make_shared<op::Sum>(param, AxisSet{0});
    auto func = make_shared<Function>(column_sum, ParameterVector{param});

    auto backend = runtime::Backend::create("${BACKEND_NAME}");

    // Input holds the values row by row; output receives the column totals.
    auto input = backend->create_tensor(element::f64, input_shape);
    vector<double> input_values{12, 2, 10, 9, 8, 4, 6, 1, 5, 3, 11, 7};
    copy_data(input, input_values);
    auto output = backend->create_tensor(element::f64, sum_shape);

    backend->call_with_validate(func, {output}, {input});

    // 12+9+6+3 = 30, 2+8+1+11 = 22, 10+4+5+7 = 26.
    vector<double> expected_sums{30, 22, 26};
    EXPECT_EQ(expected_sums, read_vector<double>(output));
}
#if NGRAPH_INTERPRETER_ENABLE #if NGRAPH_INTERPRETER_ENABLE
NGRAPH_TEST(${BACKEND_NAME}, sum_stable_acc) NGRAPH_TEST(${BACKEND_NAME}, sum_stable_acc)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment