Unverified Commit b3d70927 authored by Scott Cyphers's avatar Scott Cyphers Committed by GitHub

Merge branch 'master' into master

parents c0b0bf8f a4b9e6b7
# nGraph Compiler Stack
# nGraph Compiler Stack (Beta)
[![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://github.com/NervanaSystems/ngraph/blob/master/LICENSE) [![Build Status][build-status-badge]][build-status]
......@@ -16,12 +16,12 @@ workloads on CPU for inference, please refer to the links below.
| Framework (Version) | Installation guide | Notes
|----------------------------|----------------------------------------|-----------------------------------
| TensorFlow* 1.12 | [Pip package] or [Build from source] | 17 [Validated workloads]
| MXNet* 1.4 | [Enable the module] or [Source compile]| 17 [Validated workloads]
| ONNX 1.3 | [Pip package] | 14 [Validated workloads]
| TensorFlow* 1.12 | [Pip install](https://github.com/NervanaSystems/ngraph-tf) or [Build from source](https://github.com/NervanaSystems/ngraph-tf) | 20 [Validated workloads]
| MXNet* 1.3 | [Pip install](https://github.com/NervanaSystems/ngraph-mxnet#Installation) or [Build from source](https://github.com/NervanaSystems/ngraph-mxnet#building-with-ngraph-support)| 18 [Validated workloads]
| ONNX 1.3 | [Pip install](https://github.com/NervanaSystems/ngraph-onnx#installation) | 14 [Validated workloads]
Frameworks using nGraph Compiler stack to execute workloads have shown
**up to 45X** performance boost when compared to native framework
[**up to 45X**](https://ai.intel.com/ngraph-compiler-stack-beta-release/) performance boost when compared to native framework
implementations. We've also seen performance boosts running workloads that
are not included on the list of [Validated workloads], thanks to our
powerful subgraph pattern matching.
......@@ -100,9 +100,6 @@ to improve it:
[develop-without-lockin]: doc/sphinx/source/graphics/develop-without-lockin.png "Develop on any part of the stack without lockin"
[Movidius™ Myriad™ 2]:https://www.movidius.com/solutions/vision-processing-unit
[PlaidML]: https://github.com/plaidml/plaidml
[Pip package]: https://github.com/NervanaSystems/ngraph-onnx#installing-ngraph-onnx
[Build from source]: https://github.com/NervanaSystems/ngraph-tf
[Enable the module]: https://github.com/NervanaSystems/ngraph/blob/mbrookhart/mxnet_tutorial/doc/sphinx/source/shared/mxnet_tutorial.rst
[Source compile]: https://github.com/NervanaSystems/ngraph-mxnet/blob/master/README.md
[nGraph-ONNX]: https://github.com/NervanaSystems/ngraph-onnx/blob/master/README.md
[nGraph-ONNX adaptable]: https://ai.intel.com/adaptable-deep-learning-solutions-with-ngraph-compiler-and-onnx/
......
......@@ -45,6 +45,7 @@ ExternalProject_Add(
GIT_REPOSITORY ${HALIDE_GIT_REPO_URL}
GIT_TAG ${HALIDE_GIT_TAG}
UPDATE_COMMAND ""
PATCH_COMMAND patch -p1 --forward --reject-file=- -i ${CMAKE_SOURCE_DIR}/cmake/halide.patch || exit 0
CMAKE_ARGS
-DLLVM_DIR=${HALIDE_LLVM_DIR}
-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
......
diff --git a/CMakeLists.txt b/CMakeLists.txt
index d70fdc79d..60aa4c3b7 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -131,7 +131,8 @@ function(check_llvm_target TARGET HAS_TARGET)
set(_llvm_required_version ${ARGV2})
endif()
if (NOT LLVM_VERSION LESS _llvm_required_version)
- list(FIND LLVM_TARGETS_TO_BUILD ${TARGET} _found_target)
+ set(NGRAPH_TARGETS_TO_BUILD "X86")
+ list(FIND NGRAPH_TARGETS_TO_BUILD ${TARGET} _found_target)
if (_found_target GREATER -1)
set(${HAS_TARGET} ON PARENT_SCOPE)
else()
......@@ -1634,7 +1634,7 @@ body {
color: #38403f;
min-height: 100%;
overflow-x: hidden;
background: #edf0f2;
background: #fcfcfc;
}
.wy-text-left {
......@@ -3193,7 +3193,7 @@ footer span.commit code, footer span.commit .rst-content tt, .rst-content footer
}
@media screen and (min-width: 1400px) {
.wy-nav-content-wrap {
background: #0C7881;
background: #fcfcfc;
}
.wy-nav-content {
......
......@@ -73,9 +73,11 @@ author = 'Intel Corporation'
# built documents.
#
# The short X.Y version.
version = '0.9'
# The full version, including alpha/beta/rc tags.
release = '0.9.0'
version = '0.10'
# The full documentation version, including alpha/beta/rc tags. Some features
# available in the latest code may not be documented yet.
release = '0.10.1'
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
......
......@@ -50,4 +50,4 @@ nGraph-TensorFlow bridge.
.. _MXNet: http://mxnet.incubator.apache.org
.. _DSO: http://csweb.cs.wfu.edu/%7Etorgerse/Kokua/More_SGI/007-2360-010/sgi_html/ch03.html
.. _being the fastest: https://github.com/soumith/convnet-benchmarks
.. _ngraph tensorflow bridge README: https://github.com/NervanaSystems/ngraph-tf
.. _ngraph tensorflow bridge README: https://github.com/NervanaSystems/ngraph-tf/blob/master/README.md
......@@ -15,19 +15,22 @@ TensorFlow
:widths: 27, 53
:escape: ~
Resnet50 v1 and v2, Image recognition
Inception V3 and V4, Image recognition
Resnet50 v1, Image recognition
Resnet50 v2, Image recognition
Inception V3, Image recognition
Inception V4, Image recognition
Inception-ResNetv2, Image recognition
MobileNet v1, Image recognition
SqueezeNet v1.1, Image recognition
DenseNet-121, Image recognition
MobileNet v2, Image recognition
VGG16, Image recognition
SSD-VGG16, Object detection
SSD-MobileNetv1, Object detection
R-FCN, Object detection
Faster RCNN, Object detection
Yolo v2, Object detection
Transformer-LT, Language translation
Wide & Deep, Recommender system
NCF, Recommender system
WaveNet, Speech generation
U-Net, Image segmentation
DCGAN, Generative adversarial network
DRAW, Image generation
......@@ -41,7 +44,8 @@ MXNet
:widths: 27, 53
:escape: ~
Resnet50 v1 and v2, Image recognition
Resnet50 v1, Image recognition
Resnet50 v2, Image recognition
DenseNet-121, Image recognition
InceptionV3, Image recognition
InceptionV4, Image recognition
......@@ -70,10 +74,10 @@ Additionally, we validated the following workloads are functional through nGraph
:widths: 27, 53
:escape: ~
ResNet-50, Image recognition
DenseNet-121, Image recognition
Inception-v1, Image recognition
Inception-v2, Image recognition
ResNet-50, Image recognition
Shufflenet, Image recognition
SqueezeNet, Image recognition
VGG-19, Image recognition
......
The MPL 2.0 license used by the eigen library used by this ngraph core
component requires distribution of the following information:
Eigen source code can be viewed or downloaded from here:
http://eigen.tuxfamily.org
......@@ -22,7 +22,7 @@ import os
import distutils.ccompiler
__version__ = os.environ.get('NGRAPH_VERSION', '0.0.0-dev')
PYNGRAPH_SOURCE_DIR = os.path.abspath(os.path.dirname(__file__))
PYNGRAPH_ROOT_DIR = os.path.abspath(os.path.dirname(__file__))
NGRAPH_DEFAULT_INSTALL_DIR = os.environ.get('HOME')
NGRAPH_ONNX_IMPORT_ENABLE = os.environ.get('NGRAPH_ONNX_IMPORT_ENABLE')
......@@ -50,7 +50,7 @@ def find_pybind_headers_dir():
if os.environ.get('PYBIND_HEADERS_PATH'):
pybind_headers_dir = os.environ.get('PYBIND_HEADERS_PATH')
else:
pybind_headers_dir = os.path.join(PYNGRAPH_SOURCE_DIR, 'pybind11')
pybind_headers_dir = os.path.join(PYNGRAPH_ROOT_DIR, 'pybind11')
found = os.path.exists(os.path.join(pybind_headers_dir, 'include/pybind11'))
if not found:
......@@ -233,13 +233,13 @@ sources = [
]
package_dir = {
'ngraph': PYNGRAPH_SOURCE_DIR + "/ngraph",
'ngraph.utils': PYNGRAPH_SOURCE_DIR + "/ngraph/utils",
'ngraph.impl': PYNGRAPH_SOURCE_DIR + "/ngraph/impl",
'ngraph.impl.op': PYNGRAPH_SOURCE_DIR + "/ngraph/impl/op",
'ngraph.impl.op.util': PYNGRAPH_SOURCE_DIR + "/ngraph/impl/op/util",
'ngraph.impl.passes': PYNGRAPH_SOURCE_DIR + "/ngraph/impl/passes",
'ngraph.impl.runtime': PYNGRAPH_SOURCE_DIR + "/ngraph/impl/runtime",
'ngraph': PYNGRAPH_ROOT_DIR + "/ngraph",
'ngraph.utils': PYNGRAPH_ROOT_DIR + "/ngraph/utils",
'ngraph.impl': PYNGRAPH_ROOT_DIR + "/ngraph/impl",
'ngraph.impl.op': PYNGRAPH_ROOT_DIR + "/ngraph/impl/op",
'ngraph.impl.op.util': PYNGRAPH_ROOT_DIR + "/ngraph/impl/op/util",
'ngraph.impl.passes': PYNGRAPH_ROOT_DIR + "/ngraph/impl/passes",
'ngraph.impl.runtime': PYNGRAPH_ROOT_DIR + "/ngraph/impl/runtime",
}
packages = [
'ngraph',
......@@ -251,9 +251,9 @@ packages = [
'ngraph.impl.runtime',
]
sources = [PYNGRAPH_SOURCE_DIR + "/" + source for source in sources]
sources = [PYNGRAPH_ROOT_DIR + "/" + source for source in sources]
include_dirs = [PYNGRAPH_SOURCE_DIR, NGRAPH_CPP_INCLUDE_DIR, PYBIND11_INCLUDE_DIR]
include_dirs = [PYNGRAPH_ROOT_DIR, NGRAPH_CPP_INCLUDE_DIR, PYBIND11_INCLUDE_DIR]
library_dirs = [NGRAPH_CPP_LIBRARY_DIR]
......@@ -274,13 +274,13 @@ data_files = [
(
'licenses',
[
PYNGRAPH_SOURCE_DIR + "/../licenses/" + license
for license in os.listdir(PYNGRAPH_SOURCE_DIR + "/../licenses")
PYNGRAPH_ROOT_DIR + "/../licenses/" + license
for license in os.listdir(PYNGRAPH_ROOT_DIR + "/../licenses")
],
),
(
'',
[PYNGRAPH_SOURCE_DIR + "/../LICENSE"],
[PYNGRAPH_ROOT_DIR + "/../LICENSE"],
)
]
......@@ -302,10 +302,10 @@ if NGRAPH_ONNX_IMPORT_ENABLE == 'TRUE':
'pyngraph/pyngraph_onnx_import.cpp',
'pyngraph/onnx_import/onnx_import.cpp',
]
onnx_sources = [PYNGRAPH_SOURCE_DIR + "/" + source for source in onnx_sources]
onnx_sources = [PYNGRAPH_ROOT_DIR + "/" + source for source in onnx_sources]
package_dir['ngraph.impl.onnx_import'] = (
PYNGRAPH_SOURCE_DIR + "/ngraph/impl/onnx_import"
PYNGRAPH_ROOT_DIR + "/ngraph/impl/onnx_import"
)
packages.append('ngraph.impl.onnx_import')
......@@ -360,17 +360,17 @@ class BuildExt(build_ext):
build_ext.build_extensions(self)
with open(os.path.join(PYNGRAPH_SOURCE_DIR, 'requirements.txt')) as req:
with open(os.path.join(PYNGRAPH_ROOT_DIR, 'requirements.txt')) as req:
requirements = req.read().splitlines()
setup(
name='ngraph-core',
description=open(os.path.join(PYNGRAPH_ROOT_DIR, 'README.md')).read(),
version=__version__,
author='Intel',
author_email='intelnervana@intel.com',
url='https://ai.intel.com/',
license='License :: OSI Approved :: Apache Software License',
description='Python API for nGraph',
long_description='',
ext_modules=ext_modules,
package_dir=package_dir,
......
......@@ -28,26 +28,6 @@
using namespace std;
using namespace ngraph;
namespace
{
class NilStreamBuf final : public streambuf
{
// N.B. We derive from the base streambuf implementation, in
// which underflow() and overflow() both return
// Traits::eof() -- any access returns a failure.
};
}
ostream& ngraph::get_nil_stream()
{
// N.B. When debug logging is disabled, multiple threads may
// access the nil stream simultaneously, so it's important to
// return a threadsafe nil stream implementation.
static NilStreamBuf nil_buf;
static ostream nil{&nil_buf};
return nil;
}
void ngraph::default_logger_handler_func(const string& s)
{
cout << s << endl;
......
......@@ -100,8 +100,6 @@ namespace ngraph
static std::deque<std::string> m_queue;
};
extern std::ostream& get_nil_stream();
void default_logger_handler_func(const std::string& s);
#define NGRAPH_ERR \
......@@ -133,6 +131,33 @@ namespace ngraph
ngraph::default_logger_handler_func) \
.stream()
#else
#define NGRAPH_DEBUG ngraph::get_nil_stream()
/// Empty sink type. In the configuration where NGRAPH_DEBUG is not backed by
/// a real stream, the macro expands to a temporary of this type, and every
/// insertion below simply hands that temporary back without looking at the
/// operand.
struct NullLogger
{
};

/// Swallows an operand bound through a forwarding reference.
template <typename T>
NullLogger&& operator<<(NullLogger&& sink, T&& /*unused*/)
{
    return static_cast<NullLogger&&>(sink);
}

/// Swallows an operand bound through a const lvalue reference.
template <typename T>
NullLogger&& operator<<(NullLogger&& sink, const T& /*unused*/)
{
    return static_cast<NullLogger&&>(sink);
}

/// Swallows a narrow-stream manipulator (for example std::endl). A dedicated
/// non-template overload is needed because T cannot be deduced from the name
/// of a function template.
inline NullLogger&& operator<<(NullLogger&& sink, std::ostream& (&/*manip*/)(std::ostream&))
{
    return static_cast<NullLogger&&>(sink);
}
#define NGRAPH_DEBUG \
::ngraph::NullLogger {}
#endif
}
/*******************************************************************************
* Copyright 2017-2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
//*****************************************************************************
// Copyright 2017-2018 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#include <numeric>
......
......@@ -16,3 +16,6 @@ quantize_clamp_int32
# failing in CI build but passing on local machine
max_3d_to_scalar_int32
argmin_trivial_in_i32
argmax_4D_axis_3_i64_in_i32
......@@ -165,6 +165,15 @@ size_t runtime::gpu::CUDNNEmitter::build_reduce_forward(const cudnnReduceTensorO
const ReductionMode& reduction_mode)
{
auto input_type = dtypes[0];
bool use_cudnn_reduce = !((reduction_mode == ReductionMode::Reduce) &&
((input_type == element::i32) || (input_type == element::i8)));
NGRAPH_ASSERT(use_cudnn_reduce)
<< "cuDNN reduce for input type int32_t or int8_t currently not supported";
bool unsupported_int8_type_arg_reduce =
!((reduction_mode == ReductionMode::ArgReduce) && (input_type == element::i8));
NGRAPH_ASSERT(unsupported_int8_type_arg_reduce)
<< "cuDNN arg_reduce for input type int8_t currently not supported";
auto output_type = dtypes[1];
std::stringstream ss;
ss << "reduce_" << reduce_op << "_" << input_type.c_type_string() << "_"
......@@ -180,7 +189,8 @@ size_t runtime::gpu::CUDNNEmitter::build_reduce_forward(const cudnnReduceTensorO
}
auto& desc = m_descriptors.build<cudnnReduceTensorDescriptor_t>();
cudnnDataType_t data_type = get_cudnn_datatype(input_type);
auto modified_input_type = (input_type == element::i32) ? element::f64 : input_type;
cudnnDataType_t data_type = get_cudnn_datatype(modified_input_type);
cudnnTensorFormat_t tensor_format = CUDNN_TENSOR_NCHW;
auto& input_desc = tensor_descriptor_from_shape(input_shape, data_type, tensor_format);
Shape output_shape = input_shape;
......@@ -193,15 +203,6 @@ size_t runtime::gpu::CUDNNEmitter::build_reduce_forward(const cudnnReduceTensorO
// get an allocator for transient per kernel gpu memory
GPUAllocator allocator = this->m_primitive_emitter->get_memory_allocator();
size_t workspace_size = 0;
CUDNN_SAFE_CALL(cudnnGetReductionWorkspaceSize(
*m_ctx->cudnn_handle, desc, input_desc, output_desc, &workspace_size));
size_t input_buffer_size = shape_size(input_shape) * input_type.size();
if (workspace_size < input_buffer_size)
{
workspace_size = input_buffer_size;
}
size_t workspace_idx = allocator.reserve_workspace(workspace_size);
void* alpha = m_host_parameters.allocate_by_datatype(data_type, 1.0);
void* beta = m_host_parameters.allocate_by_datatype(data_type, 0);
......@@ -217,6 +218,12 @@ size_t runtime::gpu::CUDNNEmitter::build_reduce_forward(const cudnnReduceTensorO
CUDNN_NOT_PROPAGATE_NAN,
CUDNN_REDUCE_TENSOR_NO_INDICES,
CUDNN_32BIT_INDICES));
size_t workspace_size = 0;
CUDNN_SAFE_CALL(cudnnGetReductionWorkspaceSize(
*m_ctx->cudnn_handle, desc, input_desc, output_desc, &workspace_size));
size_t workspace_idx = allocator.reserve_workspace(workspace_size);
// emit reduce operation
reduce.reset(new gpu::primitive{
[=, &desc, &input_desc, &output_desc](void** inputs, void** outputs) {
......@@ -243,70 +250,82 @@ size_t runtime::gpu::CUDNNEmitter::build_reduce_forward(const cudnnReduceTensorO
{
if (output_type == element::i32 || output_type == element::i64)
{
size_t indices_size = shape_size(output_shape) * output_type.size();
// Since cuDNN only outputs int32 indices
size_t indices_size = shape_size(output_shape) * element::i32.size();
size_t reduce_buffer_idx =
allocator.reserve_workspace(shape_size(output_shape) * input_type.size());
allocator.reserve_workspace(shape_size(output_shape) * modified_input_type.size());
CUDNN_SAFE_CALL(cudnnSetReduceTensorDescriptor(desc,
reduce_op,
data_type,
CUDNN_NOT_PROPAGATE_NAN,
CUDNN_REDUCE_TENSOR_FLATTENED_INDICES,
CUDNN_32BIT_INDICES));
size_t workspace_size = 0;
CUDNN_SAFE_CALL(cudnnGetReductionWorkspaceSize(
*m_ctx->cudnn_handle, desc, input_desc, output_desc, &workspace_size));
size_t workspace_idx = allocator.reserve_workspace(workspace_size);
auto& cuda_emitter = m_primitive_emitter->get_cuda_emitter();
std::function<void(void**, void**)> convert_output = [](void** inputs, void** outputs) {
};
std::function<void*(void*)> convert_output_space = [](void* ptr) { return ptr; };
if (output_type == element::i64)
{
size_t workspace_indices_idx =
allocator.reserve_workspace(shape_size(output_shape) * input_type.size());
auto& cuda_emitter = m_primitive_emitter->get_cuda_emitter();
auto convert_idx = cuda_emitter->build_elementwise<op::Convert>(
size_t workspace_indices_idx = allocator.reserve_workspace(indices_size);
auto convert_idx = cuda_emitter->template build_elementwise<op::Convert>(
{element::i32.c_type_string(), element::i64.c_type_string()}, output_shape);
reduce.reset(new gpu::primitive{
[=, &desc, &input_desc, &output_desc](void** inputs, void** outputs) {
void* workspace_indices_ptr =
runtime::gpu::invoke_memory_primitive(m_ctx, workspace_indices_idx);
void* workspace_ptr =
runtime::gpu::invoke_memory_primitive(m_ctx, workspace_idx);
void* reduce_buffer =
runtime::gpu::invoke_memory_primitive(m_ctx, reduce_buffer_idx);
CUDNN_SAFE_CALL(cudnnReduceTensor(*m_ctx->cudnn_handle,
desc,
workspace_indices_ptr,
indices_size,
workspace_ptr,
workspace_size,
alpha,
input_desc,
inputs[0],
beta,
output_desc,
reduce_buffer));
gpu::invoke_primitive(m_ctx, convert_idx, &workspace_indices_ptr, outputs);
debug_sync();
}});
convert_output = [=](void** inputs, void** outputs) {
gpu::invoke_primitive(m_ctx, convert_idx, inputs, outputs);
};
convert_output_space = [=](void* ptr) {
return runtime::gpu::invoke_memory_primitive(m_ctx, workspace_indices_idx);
};
}
else
std::function<void(void**, void**)> convert_input = [](void** inputs, void** outputs) {
};
std::function<void*(void*)> convert_input_space = [](void* ptr) { return ptr; };
if (input_type == element::i32)
{
reduce.reset(new gpu::primitive{
[=, &desc, &input_desc, &output_desc](void** inputs, void** outputs) {
void* workspace_ptr =
runtime::gpu::invoke_memory_primitive(m_ctx, workspace_idx);
void* reduce_buffer =
runtime::gpu::invoke_memory_primitive(m_ctx, reduce_buffer_idx);
CUDNN_SAFE_CALL(cudnnReduceTensor(*m_ctx->cudnn_handle,
desc,
outputs[0],
indices_size,
workspace_ptr,
workspace_size,
alpha,
input_desc,
inputs[0],
beta,
output_desc,
reduce_buffer));
debug_sync();
}});
size_t input_idx = allocator.reserve_workspace(shape_size(input_shape) *
modified_input_type.size());
auto convert_input_idx = cuda_emitter->template build_elementwise<op::Convert>(
{input_type.c_type_string(), modified_input_type.c_type_string()}, input_shape);
convert_input = [=](void** inputs, void** outputs) {
gpu::invoke_primitive(m_ctx, convert_input_idx, inputs, outputs);
};
convert_input_space = [=](void* ptr) {
return runtime::gpu::invoke_memory_primitive(m_ctx, input_idx);
};
}
reduce.reset(new gpu::primitive{[=, &desc, &input_desc, &output_desc](void** inputs,
void** outputs) {
void* input_ptr = convert_input_space(inputs[0]);
void* workspace_indices_ptr = convert_output_space(outputs[0]);
void* workspace_ptr = runtime::gpu::invoke_memory_primitive(m_ctx, workspace_idx);
void* reduce_buffer =
runtime::gpu::invoke_memory_primitive(m_ctx, reduce_buffer_idx);
convert_input(inputs, &input_ptr);
CUDNN_SAFE_CALL(cudnnReduceTensor(*m_ctx->cudnn_handle,
desc,
workspace_indices_ptr,
indices_size,
workspace_ptr,
workspace_size,
alpha,
input_desc,
input_ptr,
beta,
output_desc,
reduce_buffer));
convert_output(&workspace_indices_ptr, outputs);
debug_sync();
}});
}
else
{
......
......@@ -134,6 +134,7 @@ shape_of_vector
shape_of_matrix
shape_of_5d
sum_stable_acc
sum_trivial_in_double
product_2d_to_scalar_int32
product_to_scalar_int32
product_to_scalar_int8
......@@ -141,3 +142,6 @@ max_matrix_rows_zero_int32
max_to_scalar_int8
min_to_scalar_int8
max_3d_to_scalar_double
argmin_trivial_in_i32
argmax_4D_axis_3_i64_in_i32
argmin_trivial_in_double
......@@ -52,11 +52,7 @@ bool ngraph::runtime::plaidml::PlaidML_Backend::call(
const std::vector<std::shared_ptr<runtime::Tensor>>& outputs,
const std::vector<std::shared_ptr<runtime::Tensor>>& inputs)
{
auto cfunc = m_cache.try_lookup(func);
if (!cfunc)
{
cfunc = m_compiler.compile(func);
}
auto cfunc = m_cache.compile(func, &m_compiler);
cfunc->schedule_invocation(inputs, outputs);
return true;
}
......
......@@ -42,19 +42,31 @@ bool ngraph::runtime::plaidml::CompiledFunction::schedule_invocation(
NGRAPH_DEBUG << "Binding PlaidML function " << this;
m_bound_inputs.resize(inputs.size());
m_bound_outputs.resize(outputs.size());
std::size_t input_count = 0;
for (const auto& param : m_func->get_parameters())
{
for (std::size_t idx = 0; idx < param->get_output_size(); ++idx)
{
descriptor::Tensor* tv = param->get_output_tensor_ptr(idx).get();
auto rtv = dynamic_cast<PlaidML_Tensor*>(inputs[input_count++].get());
auto& input = inputs.at(input_count);
auto rtv = dynamic_cast<PlaidML_Tensor*>(input.get());
if (!rtv)
{
throw std::runtime_error{
"The PlaidML backend only operations on PlaidML tensor views"};
"The PlaidML backend only operates on PlaidML tensor views"};
}
rtv->sync_input();
auto& bound_input = m_bound_inputs.at(input_count);
++input_count;
if (bound_input.lock() == input)
{
// No need to re-bind this input.
continue;
}
bound_input = input;
NGRAPH_DEBUG << "Binding input " << m_input_names.at(tv) << " to tensor " << rtv;
m_invoker.set_input(m_input_names.at(tv), rtv->tensor());
}
......@@ -66,12 +78,21 @@ bool ngraph::runtime::plaidml::CompiledFunction::schedule_invocation(
for (std::size_t idx = 0; idx < result->get_output_size(); ++idx)
{
descriptor::Tensor* tv = result->get_output_tensor_ptr(idx).get();
auto rtv = dynamic_cast<PlaidML_Tensor*>(outputs[output_count++].get());
auto& output = outputs.at(output_count);
auto rtv = dynamic_cast<PlaidML_Tensor*>(output.get());
if (!rtv)
{
throw std::runtime_error{
"The PlaidML backend only operations on PlaidML tensor views"};
"The PlaidML backend only operates on PlaidML tensor views"};
}
auto& bound_output = m_bound_outputs.at(output_count);
++output_count;
if (bound_output.lock() == output)
{
// No need to re-bind this output.
continue;
}
bound_output = output;
NGRAPH_DEBUG << "Binding output " << m_output_names.at(tv) << " to tensor " << rtv;
m_invoker.set_output(m_output_names.at(tv), rtv->tensor());
}
......@@ -91,7 +112,7 @@ bool ngraph::runtime::plaidml::CompiledFunction::schedule_invocation(
if (!rtv)
{
throw std::runtime_error{
"The PlaidML backend only operations on PlaidML tensor views"};
"The PlaidML backend only operates on PlaidML tensor views"};
}
rtv->sync_output();
}
......
......@@ -58,5 +58,7 @@ private:
std::shared_ptr<Function> m_func;
std::unordered_map<descriptor::Tensor*, std::string> m_input_names;
std::unordered_map<descriptor::Tensor*, std::string> m_output_names;
mutable std::vector<std::weak_ptr<runtime::Tensor>> m_bound_inputs;
mutable std::vector<std::weak_ptr<runtime::Tensor>> m_bound_outputs;
mutable vertexai::plaidml::invoker m_invoker;
};
......@@ -101,6 +101,11 @@ void ngraph::runtime::plaidml::PlaidML_Tensor::read(void* p, size_t tensor_offse
void ngraph::runtime::plaidml::PlaidML_Tensor::sync_input()
{
if (!get_stale())
{
return;
}
set_stale(false);
if (!m_memory)
{
if (m_is_logically_zero)
......@@ -122,6 +127,7 @@ void ngraph::runtime::plaidml::PlaidML_Tensor::sync_output()
{
// The tensor's been used for an output, so it's no longer logically zero.
m_is_logically_zero = false;
set_stale(false);
if (!m_memory)
{
......
......@@ -26,12 +26,15 @@ topk_1d_max_one # No plans to implement TopK
topk_1d_min_all # No plans to implement TopK
topk_1d_min_partial # No plans to implement TopK
topk_1d_min_one # No plans to implement TopK
topk_3d_large_input_max # No plans to implement TopK
topk_3d_large_input_min # No plans to implement TopK
topk_3d_max_all # No plans to implement TopK
topk_3d_max_partial # No plans to implement TopK
topk_3d_max_one # No plans to implement TopK
topk_3d_min_all # No plans to implement TopK
topk_3d_min_partial # No plans to implement TopK
topk_3d_min_one # No plans to implement TopK
topk_3d_single_output # No plans to implement TopK
topk_2d_max_all # No plans to implement TopK
topk_2d_max_partial # No plans to implement TopK
topk_2d_max_one # No plans to implement TopK
......@@ -43,15 +46,21 @@ topk_5d_max_partial # No plans to implement TopK
# Tests that PlaidML might be able to run at some point.
backwards_maxpool_n2_c1_hw5_3x3_str2_max_pad1x2_2x3
backwards_maxpool_n4c1h4w4_kh2kw2_sh1sw1
backwards_maxpool_n2c1h5w5_kh3kw3_sh2sw2
backwards_maxpool_n4_c1_hw4_2x2_max
backwards_maxpool_n2_c1_hw5_3x3_str2_max
backwards_slice
batchnorm_fprop_bprop # To debug
batchnorm_fprop_bprop_2step # To debug
softmax_axis_3d_double # To debug
reduce_matrix_rows_zero # To debug: possible broadcasting error?
reduce_matrix_cols_zero # To debug: possible broadcasting error?
reduce_3d_to_vector # To debug: possible broadcasting error?
replace_slice_matrix_inplace
max_pool_2d_1channel_1image_overpadded
max_pool_3d
maxpool_bprop_larger_than_cache
reduce_window_emulating_max_pool_1d_1channel_1image
reduce_window_emulating_max_pool_1d_1channel_2image
reduce_window_emulating_max_pool_1d_2channel_2image
......@@ -60,31 +69,49 @@ reduce_window_emulating_max_pool_2d_1channel_1image_strided
select_and_scatter_with_overlap
select_and_scatter_without_overlap
select_and_scatter_3d_without_overlap
generate_mask
avg_pool_3d
avg_pool_3d_uneven_strided_padded_include_in_computation
dequantize_zero_offset # Quantization/Dequantization is unimplemented
quantize_ROUND_NEAREST_TOWARD_ZERO # Quantization/Dequantization is unimplemented
quantize_ROUND_NEAREST_UPWARD # Quantization/Dequantization is unimplemented
quantize_ROUND_NEAREST_DOWNWARD # Quantization/Dequantization is unimplemented
quantize_ROUND_NEAREST_TOWARD_EVEN # Quantization/Dequantization is unimplemented
quantize_ROUND_TOWARD_INFINITY # Quantization/Dequantization is unimplemented
quantize_ROUND_TOWARD_ZERO # Quantization/Dequantization is unimplemented
quantize_ROUND_UP # Quantization/Dequantization is unimplemented
quantize_ROUND_DOWN # Quantization/Dequantization is unimplemented
quantize # Quantization/Dequantization is unimplemented
quantize_axes # Quantization/Dequantization is unimplemented
quantize_int8 # Quantization/Dequantization is unimplemented
quantize_clamp # Quantization/Dequantization is unimplemented
dequantize # Quantization/Dequantization is unimplemented
dequantize_axes # Quantization/Dequantization is unimplemented
dequantize_int8 # Quantization/Dequantization is unimplemented
sum_matrix_rows_zero # Empty dims apparently should produce shaped 0s
sum_matrix_cols_zero # Empty dims apparently should produce shaped 0s
sum_vector_zero # Empty dims apparently should produce shaped 0s
sum_matrix_to_scalar_zero_by_zero # Empty dims apparently should produce shaped 0s
sum_3d_eliminate_zero_dim # Empty dims apparently should produce shaped 0s
dot_0_0 # Empty dims apparently should produce shaped 0s
dot_matrix_2x0_0x2 # Empty dims apparently should produce shaped 0s
dot_2x0_0 # Empty dims apparently should produce shaped 0s
dequantize_int8_zero_offset # Quantization/Dequantization is unimplemented
dequantize_int32 # Quantization/Dequantization is unimplemented
dequantize_int32_zero_offset # Quantization/Dequantization is unimplemented
dequantize_zero_offset # Quantization/Dequantization is unimplemented
quantize_ROUND_NEAREST_TOWARD_ZERO # Quantization/Dequantization is unimplemented
quantize_ROUND_NEAREST_UPWARD # Quantization/Dequantization is unimplemented
quantize_ROUND_NEAREST_DOWNWARD # Quantization/Dequantization is unimplemented
quantize_ROUND_NEAREST_TOWARD_EVEN # Quantization/Dequantization is unimplemented
quantize_ROUND_NEAREST_TOWARD_INFINITY # Quantization/Dequantization is unimplemented
quantize_ROUND_TOWARD_INFINITY # Quantization/Dequantization is unimplemented
quantize_ROUND_TOWARD_ZERO # Quantization/Dequantization is unimplemented
quantize_ROUND_UP # Quantization/Dequantization is unimplemented
quantize_ROUND_DOWN # Quantization/Dequantization is unimplemented
quantize # Quantization/Dequantization is unimplemented
quantize_zero_offset # Quantization/Dequantization is unimplemented
quantize_axes # Quantization/Dequantization is unimplemented
quantize_int8 # Quantization/Dequantization is unimplemented
quantize_int8_zero_offset # Quantization/Dequantization is unimplemented
quantize_int32 # Quantization/Dequantization is unimplemented
quantize_int32_zero_offset # Quantization/Dequantization is unimplemented
quantize_clamp # Quantization/Dequantization is unimplemented
quantize_clamp_int8 # Quantization/Dequantization is unimplemented
quantize_clamp_int32 # Quantization/Dequantization is unimplemented
quantize_clamp_int32_zero_offset # Quantization/Dequantization is unimplemented
quantize_clamp_uint8 # Quantization/Dequantization is unimplemented
dequantize # Quantization/Dequantization is unimplemented
dequantize_axes # Quantization/Dequantization is unimplemented
dequantize_int8 # Quantization/Dequantization is unimplemented
sum_matrix_rows_zero # Empty dims apparently should produce shaped 0s
sum_matrix_cols_zero # Empty dims apparently should produce shaped 0s
sum_vector_zero # Empty dims apparently should produce shaped 0s
sum_matrix_to_scalar_zero_by_zero # Empty dims apparently should produce shaped 0s
sum_3d_eliminate_zero_dim # Empty dims apparently should produce shaped 0s
dot_0_0 # Empty dims apparently should produce shaped 0s
dot_matrix_2x0_0x2 # Empty dims apparently should produce shaped 0s
dot_2x0_0 # Empty dims apparently should produce shaped 0s
numeric_float_nan
numeric_double_nan
shape_of_scalar
shape_of_vector
shape_of_matrix
shape_of_5d
/*******************************************************************************
* Copyright 2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
//*****************************************************************************
// Copyright 2017-2018 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#include <getopt.h>
......
......@@ -311,3 +311,82 @@ NGRAPH_TEST(${BACKEND_NAME}, argmax_4D_axis_3)
.get_vector()),
read_vector<int>(result));
}
// Checks ArgMin along axis 0 of a 4x3 int32 input, with the indices also
// produced as int32. Column-wise minima of the data below sit in rows 3, 2, 1.
NGRAPH_TEST(${BACKEND_NAME}, argmin_trivial_in_i32)
{
    Shape shape{4, 3};
    Shape rshape{3};
    auto A = make_shared<op::Parameter>(element::i32, shape);
    auto f = make_shared<Function>(make_shared<op::ArgMin>(A, 0, element::i32), ParameterVector{A});

    auto backend = runtime::Backend::create("${BACKEND_NAME}");

    // Create some tensors for input/output
    auto a = backend->create_tensor(element::i32, shape);
    copy_data(a, vector<int32_t>{12, 2, 10, 9, 8, 4, 6, 1, 5, 3, 11, 7});
    auto result = backend->create_tensor(element::i32, rshape);

    backend->call_with_validate(f, {result}, {a});
    // The result tensor is element::i32, so read it back with the fixed-width
    // int32_t (as the input above and the f64 sibling test do) instead of
    // relying on plain int being 32 bits wide.
    EXPECT_EQ((vector<int32_t>{3, 2, 1}), read_vector<int32_t>(result));
}
// Checks ArgMax along axis 3 of a 2x2x5x5 int32 input when the index output
// type is int64. Each expected entry is the position of the maximum within
// one 5-element innermost row; where the maximum repeats, the expected data
// below uses the first occurrence.
NGRAPH_TEST(${BACKEND_NAME}, argmax_4D_axis_3_i64_in_i32)
{
Shape shape{2, 2, 5, 5}; // NCHW ->(0,1,2,3)
// Reducing over axis 3 drops the last dimension.
Shape rshape{2, 2, 5};
auto A = make_shared<op::Parameter>(element::i32, shape);
auto f = make_shared<Function>(make_shared<op::ArgMax>(A, 3, element::i64), ParameterVector{A});
auto backend = runtime::Backend::create("${BACKEND_NAME}");
// Create some tensors for input/output
auto a = backend->create_tensor(element::i32, shape);
copy_data(a,
test::NDArray<int32_t, 4>({{{{0, 1, 0, 2, 1}, // img 0 ch 0
{0, 3, 2, 0, 0},
{2, 0, 0, 0, 1},
{2, 0, 1, 1, 2},
{0, 2, 1, 0, 0}},
{{0, 0, 0, 2, 0}, // img 0 ch 1
{0, 2, 3, 0, 1},
{2, 0, 1, 0, 2},
{3, 1, 0, 0, 0},
{2, 0, 0, 0, 0}}},
{{{0, 2, 1, 1, 0}, // img 1 ch 0
{0, 0, 2, 0, 1},
{0, 0, 1, 2, 3},
{2, 0, 0, 3, 0},
{0, 0, 0, 0, 0}},
{{2, 1, 0, 0, 1}, // img 1 ch 1
{0, 2, 0, 0, 0},
{1, 1, 2, 0, 2},
{1, 1, 1, 0, 1},
{1, 0, 0, 0, 2}}}})
.get_vector());
// Indices are requested as i64, so the result tensor is i64 as well.
auto result = backend->create_tensor(element::i64, rshape);
backend->call_with_validate(f, {result}, {a});
EXPECT_EQ((test::NDArray<int64_t, 3>({{{3, 1, 0, 0, 1}, {3, 2, 0, 0, 0}}, // img 0 (ch 0, ch 1)
{{1, 2, 4, 3, 0}, {0, 1, 2, 0, 4}}}) // img 1 (ch 0, ch 1)
.get_vector()),
read_vector<int64_t>(result));
}
// ArgMin over axis 0 of a 4x3 f64 tensor, with the indices emitted as i32.
NGRAPH_TEST(${BACKEND_NAME}, argmin_trivial_in_double)
{
    const Shape input_shape{4, 3};
    const Shape index_shape{3};

    // Build the single-op graph under test.
    auto data_param = make_shared<op::Parameter>(element::f64, input_shape);
    auto argmin_node = make_shared<op::ArgMin>(data_param, 0, element::i32);
    auto func = make_shared<Function>(argmin_node, ParameterVector{data_param});

    auto backend = runtime::Backend::create("${BACKEND_NAME}");

    // Input tensor holding the row-major 4x3 data, and the i32 output tensor.
    auto input = backend->create_tensor(element::f64, input_shape);
    const vector<double> values{12, 2, 10, 9, 8, 4, 6, 1, 5, 3, 11, 7};
    copy_data(input, values);
    auto indices = backend->create_tensor(element::i32, index_shape);

    backend->call_with_validate(func, {indices}, {input});

    // Column minima are 3, 1 and 4, found in rows 3, 2 and 1 respectively.
    const vector<int32_t> expected{3, 2, 1};
    EXPECT_EQ(expected, read_vector<int32_t>(indices));
}
......@@ -485,6 +485,24 @@ NGRAPH_TEST(${BACKEND_NAME}, sum_2d_to_scalar_int8)
EXPECT_EQ(std::vector<int8_t>{45}, read_vector<int8_t>(result));
}
// Sum over axis 0 of a 4x3 f64 tensor, yielding one f64 total per column.
NGRAPH_TEST(${BACKEND_NAME}, sum_trivial_in_double)
{
    const Shape input_shape{4, 3};
    const Shape sum_shape{3};

    // Build the single-op graph under test.
    auto data_param = make_shared<op::Parameter>(element::f64, input_shape);
    auto sum_node = make_shared<op::Sum>(data_param, AxisSet{0});
    auto func = make_shared<Function>(sum_node, ParameterVector{data_param});

    auto backend = runtime::Backend::create("${BACKEND_NAME}");

    // Input tensor holding the row-major 4x3 data, and the f64 output tensor.
    auto input = backend->create_tensor(element::f64, input_shape);
    const vector<double> values{12, 2, 10, 9, 8, 4, 6, 1, 5, 3, 11, 7};
    copy_data(input, values);
    auto totals = backend->create_tensor(element::f64, sum_shape);

    backend->call_with_validate(func, {totals}, {input});

    // Column totals: 12+9+6+3, 2+8+1+11, 10+4+5+7.
    const vector<double> expected{30, 22, 26};
    EXPECT_EQ(expected, read_vector<double>(totals));
}
#if NGRAPH_INTERPRETER_ENABLE
NGRAPH_TEST(${BACKEND_NAME}, sum_stable_acc)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment