Commit e5757725 authored by nishant.b.patel

Merge branch 'master' into quantized_conv_temp

parents b13eacf6 d0f03eec
......@@ -19,7 +19,7 @@ import test
def pytest_addoption(parser):
parser.addoption('--backend', default='INTERPRETER',
choices=['INTERPRETER', 'CPU', 'GPU', 'NNP', 'PlaidML'],
choices=['INTERPRETER', 'CPU', 'GPU', 'NNP', 'PlaidML', 'INTELGPU'],
help='Select from available backends')
......@@ -31,20 +31,25 @@ def pytest_configure(config):
def pytest_collection_modifyitems(config, items):
backend_name = config.getvalue('backend')
gpu_skip = pytest.mark.skip(reason='Skipping test on the GPU backend.')
cpu_skip = pytest.mark.skip(reason='Skipping test on the CPU backend.')
nnp_skip = pytest.mark.skip(reason='Skipping test on the NNP backend.')
interpreter_skip = pytest.mark.skip(reason='Skipping test on the INTERPRETER backend.')
plaidml_skip = pytest.mark.skip(reason='Skipping test on the PlaidML backend.')
keywords = {
'GPU': 'skip_on_gpu',
'CPU': 'skip_on_cpu',
'NNP': 'skip_on_nnp',
'INTERPRETER': 'skip_on_interpreter',
'PlaidML': 'skip_on_plaidml',
'INTELGPU': 'skip_on_intelgpu',
}
skip_markers = {
'GPU': pytest.mark.skip(reason='Skipping test on the GPU backend.'),
'CPU': pytest.mark.skip(reason='Skipping test on the CPU backend.'),
'NNP': pytest.mark.skip(reason='Skipping test on the NNP backend.'),
'INTERPRETER': pytest.mark.skip(reason='Skipping test on the INTERPRETER backend.'),
'PlaidML': pytest.mark.skip(reason='Skipping test on the PlaidML backend.'),
'INTELGPU': pytest.mark.skip(reason='Skipping test on the INTELGPU backend.'),
}
for item in items:
if backend_name == 'GPU' and 'skip_on_gpu' in item.keywords:
item.add_marker(gpu_skip)
if backend_name == 'CPU' and 'skip_on_cpu' in item.keywords:
item.add_marker(cpu_skip)
if backend_name == 'NNP' and 'skip_on_nnp' in item.keywords:
item.add_marker(nnp_skip)
if backend_name == 'INTERPRETER' and 'skip_on_interpreter' in item.keywords:
item.add_marker(interpreter_skip)
if backend_name == 'PlaidML' and 'skip_on_plaidml' in item.keywords:
item.add_marker(plaidml_skip)
skip_this_backend = keywords[backend_name]
if skip_this_backend in item.keywords:
item.add_marker(skip_markers[backend_name])
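A usage note for context: with this table-driven form, a test opts into being skipped on the new backend the same way as for the existing ones, e.g. by decorating it with @pytest.mark.skip_on_intelgpu and running pytest with --backend=INTELGPU; the keywords/skip_markers maps above then attach the matching skip marker.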
......@@ -33,7 +33,6 @@ from test.ngraph.util import run_op_numeric_data, run_op_node
(ng.exp, np.exp, -100., 100.),
(ng.floor, np.floor, -100., 100.),
(ng.log, np.log, 0, 100.),
(ng.logical_not, np.logical_not, -10, 10),
(ng.relu, lambda x: np.maximum(0, x), -100., 100.),
(ng.sign, np.sign, -100., 100.),
(ng.sin, np.sin, -100., 100.),
......@@ -68,7 +67,6 @@ def test_unary_op_array(ng_api_fn, numpy_fn, range_start, range_end):
(ng.exp, np.exp, np.float32(1.5)),
(ng.floor, np.floor, np.float32(1.5)),
(ng.log, np.log, np.float32(1.5)),
(ng.logical_not, np.logical_not, np.int32(0)),
(ng.relu, lambda x: np.maximum(0, x), np.float32(-0.125)),
(ng.sign, np.sign, np.float32(0.)),
(ng.sin, np.sin, np.float32(np.pi / 4.0)),
......@@ -86,3 +84,19 @@ def test_unary_op_scalar(ng_api_fn, numpy_fn, input_data):
result = run_op_numeric_data(input_data, ng_api_fn)
assert np.allclose(result, expected)
@pytest.mark.parametrize('input_data', [
(np.array([True, False, True, False])),
(np.array(True)),
(np.array(False)),
])
@pytest.mark.skip_on_gpu
def test_logical_not(input_data):
expected = np.logical_not(input_data)
result = run_op_node([input_data], ng.logical_not)[0]
assert np.array_equal(result, expected)
result = run_op_numeric_data(input_data, ng.logical_not)[0]
assert np.array_equal(result, expected)
......@@ -818,6 +818,7 @@ def test_slice():
@pytest.mark.skip_on_gpu
@pytest.mark.skip_on_intelgpu
def test_replace_slice():
element_type = Type.f32
......
......@@ -482,6 +482,8 @@ set(SRC ${SRC}
if(NGRAPH_JSON_ENABLE)
list(APPEND SRC serializer.cpp serializer.hpp event_tracing.cpp event_tracing.hpp)
else()
list(APPEND SRC serializer_stub.cpp)
endif()
configure_file(version.in.hpp version.hpp)
......
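A note for context: serializer_stub.cpp is added by this merge so that the serializer entry points still get compiled when NGRAPH_JSON_ENABLE is off; it replaces the inline stubs that are removed from serializer.hpp further down in this diff.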
......@@ -22,6 +22,41 @@
using namespace ngraph;
NGRAPH_API const reduction::Type reduction::sum(reduction::Type_t::sum);
NGRAPH_API const reduction::Type reduction::prod(reduction::Type_t::prod);
NGRAPH_API const reduction::Type reduction::min(reduction::Type_t::min);
NGRAPH_API const reduction::Type reduction::max(reduction::Type_t::max);
std::ostream& reduction::operator<<(std::ostream& out, const reduction::Type& obj)
{
#if !(defined(__GNUC__) && (__GNUC__ == 4 && __GNUC_MINOR__ == 8))
#pragma GCC diagnostic push
#pragma GCC diagnostic error "-Wswitch"
#pragma GCC diagnostic error "-Wswitch-enum"
#endif
switch (obj.get_type())
{
case reduction::Type_t::sum: out << "sum"; break;
case reduction::Type_t::prod: out << "prod"; break;
case reduction::Type_t::min: out << "min"; break;
case reduction::Type_t::max: out << "max"; break;
}
#if !(defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ == 8)
#pragma GCC diagnostic pop
#endif
return out;
};
bool reduction::Type::operator==(const reduction::Type& other) const
{
return m_type == other.m_type;
}
reduction::Type_t reduction::Type::get_type() const
{
return m_type;
}
static std::unique_ptr<DistributedInterface> s_distributed_interface;
void ngraph::set_distributed_interface(std::unique_ptr<DistributedInterface> distributed_interface)
......
......@@ -24,6 +24,38 @@
namespace ngraph
{
namespace reduction
{
enum class Type_t
{
sum,
prod,
min,
max,
};
class Type
{
public:
Type(const Type_t t)
: m_type(t)
{
}
friend std::ostream& operator<<(std::ostream&, const Type&);
bool operator==(const Type& other) const;
bool operator!=(const Type& other) const { return !(*this == other); }
Type_t get_type() const;
private:
Type_t m_type;
};
std::ostream& operator<<(std::ostream& out, const Type& obj);
extern NGRAPH_API const Type sum;
extern NGRAPH_API const Type prod;
extern NGRAPH_API const Type min;
extern NGRAPH_API const Type max;
}
class DistributedInterface
{
public:
......@@ -33,8 +65,11 @@ namespace ngraph
virtual int get_rank() = 0;
virtual void log_print(const std::string& timestamp, const std::vector<char>& buf) = 0;
virtual void
all_reduce(void* in, void* out, element::Type_t element_type, size_t count) = 0;
virtual void all_reduce(void* in,
void* out,
element::Type_t element_type,
reduction::Type reduce_type,
size_t count) = 0;
virtual void
broadcast(void* in, element::Type_t element_type, size_t count, int root_id) = 0;
};
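A minimal usage sketch for the new reduction::Type wrapper (the function name and the ngraph/distributed.hpp include path are illustrative assumptions, not part of this diff); because the Type(Type_t) constructor is not explicit, a Type compares directly against Type_t enumerators as well as against the exported constants:
#include <cassert>
#include "ngraph/distributed.hpp" // assumed location of the declarations above
void reduction_type_example()
{
    ngraph::reduction::Type t = ngraph::reduction::Type_t::max; // implicit Type_t -> Type
    assert(t == ngraph::reduction::Type_t::max);                // rhs converts through the same ctor
    assert(t != ngraph::reduction::sum);                        // compare against an exported constant
    assert(t.get_type() == ngraph::reduction::Type_t::max);     // raw enumerator back out
}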
......
......@@ -65,8 +65,11 @@ namespace ngraph
std::printf("%s [MLSL RANK: %d]: %s\n", timestamp.c_str(), get_rank(), buf.data());
}
void
all_reduce(void* in, void* out, element::Type_t element_type, size_t count) override
void all_reduce(void* in,
void* out,
element::Type_t element_type,
reduction::Type reduce_type,
size_t count) override
{
auto data_type = MLSL::DT_FLOAT;
......@@ -83,10 +86,29 @@ namespace ngraph
throw std::runtime_error("AllReduce op supports only f32 and f64 types");
}
decltype(MLSL::RT_SUM) mlsl_reduce_type;
#if !(defined(__GNUC__) && (__GNUC__ == 4 && __GNUC_MINOR__ == 8))
#pragma GCC diagnostic push
#pragma GCC diagnostic error "-Wswitch"
#pragma GCC diagnostic error "-Wswitch-enum"
#endif
switch (reduce_type.get_type())
{
case reduction::Type_t::sum: mlsl_reduce_type = MLSL::RT_SUM; break;
case reduction::Type_t::prod:
throw std::runtime_error("MLSL doesn't support allreduce prod");
break;
case reduction::Type_t::min: mlsl_reduce_type = MLSL::RT_MIN; break;
case reduction::Type_t::max: mlsl_reduce_type = MLSL::RT_MAX; break;
}
#if !(defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ == 8)
#pragma GCC diagnostic pop
#endif
MLSL::Environment& env = MLSL::Environment::GetEnv();
MLSL::Distribution* distribution = env.CreateDistribution(env.GetProcessCount(), 1);
MLSL::CommReq* req =
distribution->AllReduce(in, out, count, data_type, MLSL::RT_SUM, MLSL::GT_DATA);
MLSL::CommReq* req = distribution->AllReduce(
in, out, count, data_type, mlsl_reduce_type, MLSL::GT_DATA);
env.Wait(req);
env.DeleteDistribution(distribution);
}
......
......@@ -35,8 +35,11 @@ namespace ngraph
{
std::printf("%s: %s\n", timestamp.c_str(), buf.data());
}
void
all_reduce(void* in, void* out, element::Type_t element_type, size_t count) override
void all_reduce(void* in,
void* out,
element::Type_t element_type,
reduction::Type reduce_type,
size_t count) override
{
throw ngraph_error("Distributed Library not supported/mentioned");
}
......
......@@ -77,8 +77,11 @@ namespace ngraph
"%s [OpenMPI RANK: %d]: %s\n", timestamp.c_str(), get_rank(), buf.data());
}
void
all_reduce(void* in, void* out, element::Type_t element_type, size_t count) override
void all_reduce(void* in,
void* out,
element::Type_t element_type,
reduction::Type reduce_type,
size_t count) override
{
auto data_type = MPI_FLOAT;
......@@ -95,7 +98,24 @@ namespace ngraph
throw std::runtime_error("AllReduce op supports only f32 and f64 types");
}
MPI_Allreduce(in, out, count, data_type, MPI_SUM, MPI_COMM_WORLD);
decltype(MPI_SUM) mpi_reduce_type;
#if !(defined(__GNUC__) && (__GNUC__ == 4 && __GNUC_MINOR__ == 8))
#pragma GCC diagnostic push
#pragma GCC diagnostic error "-Wswitch"
#pragma GCC diagnostic error "-Wswitch-enum"
#endif
switch (reduce_type.get_type())
{
case reduction::Type_t::sum: mpi_reduce_type = MPI_SUM; break;
case reduction::Type_t::prod: mpi_reduce_type = MPI_PROD; break;
case reduction::Type_t::min: mpi_reduce_type = MPI_MIN; break;
case reduction::Type_t::max: mpi_reduce_type = MPI_MAX; break;
}
#if !(defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ == 8)
#pragma GCC diagnostic pop
#endif
MPI_Allreduce(in, out, count, data_type, mpi_reduce_type, MPI_COMM_WORLD);
}
void broadcast(void* in,
......
......@@ -22,11 +22,13 @@ using namespace ngraph;
const string op::AllReduce::type_name{"AllReduce"};
op::AllReduce::AllReduce()
: m_reduce_type(reduction::sum)
{
}
op::AllReduce::AllReduce(const shared_ptr<Node>& arg)
op::AllReduce::AllReduce(const shared_ptr<Node>& arg, const reduction::Type reduce_type)
: Op(check_single_output_args({arg}))
, m_reduce_type(reduce_type)
{
constructor_validate_and_infer_types();
}
......@@ -47,5 +49,10 @@ void op::AllReduce::validate_and_infer_types()
shared_ptr<Node> op::AllReduce::copy_with_new_args(const NodeVector& new_args) const
{
check_new_args_count(this, new_args);
return make_shared<AllReduce>(new_args.at(0));
return make_shared<AllReduce>(new_args.at(0), get_reduce_type());
}
reduction::Type op::AllReduce::get_reduce_type() const
{
return m_reduce_type;
}
......@@ -30,11 +30,16 @@ namespace ngraph
static const std::string type_name;
const std::string& description() const override { return type_name; }
AllReduce();
AllReduce(const std::shared_ptr<Node>& arg);
AllReduce(const std::shared_ptr<Node>& arg,
const reduction::Type reduce_type = reduction::sum);
void validate_and_infer_types() override;
std::shared_ptr<Node> copy_with_new_args(const NodeVector& new_args) const override;
reduction::Type get_reduce_type() const;
private:
const reduction::Type m_reduce_type;
};
}
}
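A construction sketch for the extended AllReduce API, assuming the usual ngraph/ngraph.hpp umbrella header; the function name and graph scaffolding are illustrative only and mirror the distributed tests later in this diff:
#include <memory>
#include "ngraph/ngraph.hpp" // assumed umbrella header
using namespace ngraph;
std::shared_ptr<Function> make_allreduce_min_graph()
{
    auto arg = std::make_shared<op::Parameter>(element::f32, Shape{2, 2});
    // Omitting the second argument keeps the previous behaviour (reduction::sum).
    auto ar = std::make_shared<op::AllReduce>(arg, reduction::min);
    return std::make_shared<Function>(NodeVector{ar}, ParameterVector{arg});
}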
......@@ -27,8 +27,14 @@
using namespace std;
using namespace ngraph;
op::GroupConvolution::GroupConvolution(const shared_ptr<Node>& data_batch,
const shared_ptr<Node>& filters,
const string op::GroupConvolution::type_name{"GroupConvolution"};
op::GroupConvolution::GroupConvolution()
{
}
op::GroupConvolution::GroupConvolution(const Output<Node>& data_batch,
const Output<Node>& filters,
const Strides& window_movement_strides,
const Strides& window_dilation_strides,
const CoordinateDiff& padding_below,
......@@ -36,7 +42,7 @@ op::GroupConvolution::GroupConvolution(const shared_ptr<Node>& data_batch,
const Strides& data_dilation_strides,
const size_t groups,
const PadType& pad_type)
: FusedOp("GroupConvolution", check_single_output_args({data_batch, filters}))
: FusedOp({data_batch, filters})
, m_window_movement_strides(window_movement_strides)
, m_window_dilation_strides(window_dilation_strides)
, m_padding_below(padding_below)
......@@ -45,7 +51,6 @@ op::GroupConvolution::GroupConvolution(const shared_ptr<Node>& data_batch,
, m_groups(groups)
, m_pad_type(pad_type)
{
// TODO: Move this out of constructor to validate_and_infer_types()
constructor_validate_and_infer_types();
}
......@@ -129,35 +134,35 @@ shared_ptr<Node> op::GroupConvolution::copy_with_new_args(const NodeVector& new_
NodeVector op::GroupConvolution::decompose_op() const
{
auto data = get_argument(0);
auto filters = get_argument(1);
auto data = input(0);
auto filters = input(1);
// Split one convolution op to N ops where N is the number of groups
// and concat results after computation.
// reference: https://github.com/NervanaSystems/ngraph-mxnet/blob/fdd692/src/ngraph/ngraph_emitter.cc#L822-L856
std::size_t n_data_channels{data->get_shape().at(1)};
std::size_t n_filters_channels{filters->get_shape().at(0)};
std::size_t n_data_channels{data.get_shape().at(1)};
std::size_t n_filters_channels{filters.get_shape().at(0)};
std::size_t data_group_size{n_data_channels / m_groups};
std::size_t filters_group_size{n_filters_channels / m_groups};
NodeVector convolution_nodes;
// initial bounds for splice
std::vector<std::size_t> data_lower_bounds(data->get_shape().size());
std::vector<std::size_t> data_upper_bounds{data->get_shape()};
std::vector<std::size_t> filters_lower_bounds(filters->get_shape().size());
std::vector<std::size_t> filters_upper_bounds{filters->get_shape()};
std::vector<std::size_t> data_lower_bounds(data.get_shape().size());
std::vector<std::size_t> data_upper_bounds{data.get_shape()};
std::vector<std::size_t> filters_lower_bounds(filters.get_shape().size());
std::vector<std::size_t> filters_upper_bounds{filters.get_shape()};
for (std::size_t group{0}; group < m_groups; ++group)
{
// slice data
data_lower_bounds[1] = group * data_group_size;
data_upper_bounds[1] = (group + 1) * data_group_size;
auto sliced_data =
std::make_shared<ngraph::op::Slice>(data, data_lower_bounds, data_upper_bounds);
auto sliced_data = std::make_shared<ngraph::op::Slice>(
data.get_source_output(), data_lower_bounds, data_upper_bounds);
// slice filters
filters_lower_bounds[0] = group * filters_group_size;
filters_upper_bounds[0] = (group + 1) * filters_group_size;
auto sliced_filters = std::make_shared<ngraph::op::Slice>(
filters, filters_lower_bounds, filters_upper_bounds);
filters.get_source_output(), filters_lower_bounds, filters_upper_bounds);
convolution_nodes.push_back(
std::make_shared<ngraph::op::Convolution>(sliced_data,
......
......@@ -29,8 +29,12 @@ namespace ngraph
class GroupConvolution : public ngraph::op::util::FusedOp
{
public:
GroupConvolution(const std::shared_ptr<Node>& data_batch,
const std::shared_ptr<Node>& filters,
NGRAPH_API
static const std::string type_name;
const std::string& description() const override { return type_name; }
GroupConvolution();
GroupConvolution(const Output<Node>& data_batch,
const Output<Node>& filters,
const Strides& window_movement_strides,
const Strides& window_dilation_strides,
const CoordinateDiff& padding_below,
......
......@@ -19,11 +19,17 @@
using namespace std;
using namespace ngraph;
op::Slice::Slice(const shared_ptr<Node>& arg,
const string op::Slice::type_name{"Slice"};
op::Slice::Slice()
{
}
op::Slice::Slice(const Output<Node>& arg,
const Coordinate& lower_bounds,
const Coordinate& upper_bounds,
const Strides& strides)
: Op("Slice", check_single_output_args({arg}))
: Op({arg})
, m_lower_bounds(lower_bounds)
, m_upper_bounds(upper_bounds)
, m_strides(strides)
......@@ -31,10 +37,10 @@ op::Slice::Slice(const shared_ptr<Node>& arg,
constructor_validate_and_infer_types();
}
op::Slice::Slice(const shared_ptr<Node>& arg,
op::Slice::Slice(const Output<Node>& arg,
const Coordinate& lower_bounds,
const Coordinate& upper_bounds)
: Op("Slice", check_single_output_args({arg}))
: Op({arg})
, m_lower_bounds(lower_bounds)
, m_upper_bounds(upper_bounds)
, m_strides(Strides())
......
......@@ -28,6 +28,11 @@ namespace ngraph
class Slice : public Op
{
public:
NGRAPH_API
static const std::string type_name;
const std::string& description() const override { return type_name; }
/// \brief Constructs a tensor slice operation
Slice();
/// \brief Constructs a tensor slice operation.
///
/// \param arg The tensor to be sliced.
......@@ -35,17 +40,16 @@ namespace ngraph
/// \param upper_bounds The axiswise upper bounds of the slice (exclusive).
/// \param strides The slicing strides; for example, strides of `{n,m}` means to take
/// every nth row and every mth column of the input matrix.
Slice(const std::shared_ptr<Node>& arg,
Slice(const Output<Node>& arg,
const Coordinate& lower_bounds,
const Coordinate& upper_bounds,
const Strides& strides);
/// \brief Constructs a tensor slice operation with unit strides; i.e., every element inside the bounding box will be copied to the output slice.
///
/// \param arg The tensor to be sliced.
/// \param lower_bounds The axiswise lower bounds of the slice (inclusive).
/// \param upper_bounds The axiswise upper bounds of the slice (exclusive).
Slice(const std::shared_ptr<Node>& arg,
Slice(const Output<Node>& arg,
const Coordinate& lower_bounds,
const Coordinate& upper_bounds);
......
......@@ -30,6 +30,11 @@ op::util::FusedOp::FusedOp(const NodeVector& args)
{
}
op::util::FusedOp::FusedOp(const OutputVector& args)
: Op(args)
{
}
op::util::FusedOp::FusedOp(const std::string& node_type, const NodeVector& args)
: Op(node_type, args)
{
......
......@@ -51,6 +51,8 @@ namespace ngraph
/// \param args Nodes that produce the input tensors for the fused op
FusedOp(const NodeVector& args);
FusedOp(const OutputVector& args);
/// \brief Constructs a FusedOp
///
/// \param args Nodes that produce the input tensors for the fused op
......
......@@ -37,6 +37,9 @@ namespace ngraph
auto out_buffer_index = external_function->get_buffer_index(out[0].get_name());
auto count = static_cast<int>(out[0].get_size());
auto data_type = args[0].get_element_type().get_type_enum();
const ngraph::op::AllReduce* allreduce =
static_cast<const ngraph::op::AllReduce*>(node);
auto reduce_type = allreduce->get_reduce_type();
auto external_function_name = external_function->get_function_name();
NGRAPH_DEBUG_PRINT(
......@@ -48,11 +51,13 @@ namespace ngraph
node->get_friendly_name().c_str(),
count);
auto functor = [&, count, data_type, arg_buffer_index, out_buffer_index](
auto functor =
[&, count, reduce_type, data_type, arg_buffer_index, out_buffer_index](
CPURuntimeContext* ctx, CPUExecutionContext* ectx) {
get_distributed_interface()->all_reduce(ctx->buffer_data[arg_buffer_index],
ctx->buffer_data[out_buffer_index],
data_type,
reduce_type,
count);
};
functors.emplace_back(functor);
......
......@@ -53,9 +53,11 @@ namespace ngraph
if (is_int64)
{
if (args[0].get_element_type() == element::f32 ||
if ((args[0].get_element_type() == element::f32 ||
args[0].get_element_type() == element::f64 ||
args[0].get_element_type() == element::u8)
args[0].get_element_type() == element::u8 ||
args[0].get_element_type() == element::i8) &&
params_shape.size() <= 3 && out_shape.size() <= 3)
{
std::function<decltype(runtime::cpu::kernel::gather_i64<float, 2, 2>)>
kernel;
......@@ -111,9 +113,11 @@ namespace ngraph
else
{
if (args[0].get_element_type() == element::f32 ||
if ((args[0].get_element_type() == element::f32 ||
args[0].get_element_type() == element::f64 ||
args[0].get_element_type() == element::u8)
args[0].get_element_type() == element::u8 ||
args[0].get_element_type() == element::i8) &&
params_shape.size() <= 3 && out_shape.size() <= 3)
{
std::function<decltype(runtime::cpu::kernel::gather_i32<float, 2, 2>)>
kernel;
......
......@@ -46,7 +46,9 @@ namespace ngraph
}
if (args[0].get_element_type() != element::f64 &&
args[0].get_element_type() != element::f32)
args[0].get_element_type() != element::f32 &&
args[0].get_element_type() != element::u8 &&
args[0].get_element_type() != element::i8)
{
throw ngraph_error("Unsupported type in CPU Builder for ScatterAdd");
}
......@@ -59,6 +61,8 @@ namespace ngraph
auto element_type = args[0].get_element_type();
if (is_int64)
{
if (inputs_shape.size() <= 3 && updates_shape.size() <= 3)
{
std::function<decltype(runtime::cpu::kernel::scatter_add_i64<float, 2, 2>)>
kernel;
......@@ -91,6 +95,13 @@ namespace ngraph
functors.emplace_back(functor);
}
else
{
throw ngraph_error("Unsupported ranks in CPU Builder for ScatterAdd");
}
}
else
{
if (inputs_shape.size() <= 3 && updates_shape.size() <= 3)
{
std::function<decltype(runtime::cpu::kernel::scatter_add_i32<float, 2, 2>)>
kernel;
......@@ -122,6 +133,11 @@ namespace ngraph
};
functors.emplace_back(functor);
}
else
{
throw ngraph_error("Unsupported ranks in CPU Builder for ScatterAdd");
}
}
}
REGISTER_OP_BUILDER(ScatterAdd);
}
......
......@@ -211,14 +211,6 @@
KV = K<ET, 2, R2>; \
else if (R1 == 3) \
KV = K<ET, 3, R2>; \
else if (R1 == 4) \
KV = K<ET, 4, R2>; \
else if (R1 == 5) \
KV = K<ET, 5, R2>; \
else if (R1 == 6) \
KV = K<ET, 6, R2>; \
else if (R1 == 7) \
KV = K<ET, 7, R2>; \
else \
throw ngraph_error("Unsupported first rank " + std::to_string(R1) + " for kernel " #K);
......@@ -235,22 +227,6 @@
{ \
SELECT_RANK1(KV, ET, R1, 3, K); \
} \
else if (R2 == 4) \
{ \
SELECT_RANK1(KV, ET, R1, 4, K); \
} \
else if (R2 == 5) \
{ \
SELECT_RANK1(KV, ET, R1, 5, K); \
} \
else if (R2 == 6) \
{ \
SELECT_RANK1(KV, ET, R1, 6, K); \
} \
else if (R2 == 7) \
{ \
SELECT_RANK1(KV, ET, R1, 7, K); \
} \
else \
{ \
throw ngraph_error("Unsupported second rank " + std::to_string(R2) + " for kernel " #K); \
......@@ -270,6 +246,10 @@
{ \
SELECT_2RANKS(KV, uint8_t, R1, R2, K); \
} \
else if (ET == element::i8) \
{ \
SELECT_2RANKS(KV, int8_t, R1, R2, K); \
} \
else \
{ \
throw ngraph_error("Unsupported element type " + ET.c_type_string() + " for kernel " #K); \
......
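For context: these macro edits narrow the gather/scatter kernel instantiations from ranks up to 7 down to ranks up to 3 while adding an int8_t specialization, which is why the DEX builders earlier in this diff and the codegen emitter that follows guard their fast paths with an element::i8 check and shape-rank <= 3 checks.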
......@@ -271,10 +271,13 @@ namespace ngraph
template <>
void CPU_Emitter::EMITTER_DECL(ngraph::op::AllReduce)
{
const ngraph::op::AllReduce* allreduce =
static_cast<const ngraph::op::AllReduce*>(node);
writer << "ngraph::get_distributed_interface()->all_reduce(" << args[0].get_name()
<< ", " << out[0].get_name() << ", "
<< "ngraph::element::Type_t::" << args[0].get_element_type().get_type_name()
<< ", " << out[0].get_size() << ");\n";
<< ", " << out[0].get_size() << ", "
<< "ngraph::Reduce_t::" << allreduce->get_reduce_type() << ");\n";
}
template <>
......@@ -1842,8 +1845,9 @@ namespace ngraph
writer.block_begin();
if ((args[0].get_element_type() == element::f64 ||
args[0].get_element_type() == element::f32 ||
args[0].get_element_type() == element::u8) &&
gather->get_axis() == 0)
args[0].get_element_type() == element::u8 ||
args[0].get_element_type() == element::i8) &&
args[0].get_shape().size() <= 3 && out[0].get_shape().size() <= 3)
{
writer << "cpu::kernel::gather<" << args[0].get_type() << ", "
<< args[1].get_element_type().c_type_string() << ", "
......@@ -1903,8 +1907,11 @@ namespace ngraph
}
writer.block_begin();
if (args[0].get_element_type() == element::f64 ||
args[0].get_element_type() == element::f32)
if ((args[0].get_element_type() == element::f64 ||
args[0].get_element_type() == element::f32 ||
args[0].get_element_type() == element::u8 ||
args[0].get_element_type() == element::i8) &&
args[0].get_shape().size() <= 3 && args[2].get_shape().size() <= 3)
{
writer << "cpu::kernel::scatter_add<" << args[0].get_type() << ", "
<< args[1].get_element_type().c_type_string() << ", "
......
......@@ -31,7 +31,7 @@ namespace ngraph
{
namespace kernel
{
// Calculate the indices from position 0 to rank-1.
// Calculate the indices for positions 0 to rank-1.
static void
get_indices(const Shape& shape, int index, std::vector<int>& indices, int rank)
{
......@@ -93,8 +93,11 @@ namespace ngraph
if (indices_rank == 0)
{
//TODO Enable this if compiler issue with CODEGEN is fixed or DEX needs it.
#if 0
#ifdef _OPENMP
#pragma omp parallel for
#endif
#endif
for (int i = 0; i < outer_loop_num; i++)
{
......@@ -142,7 +145,11 @@ namespace ngraph
}
else
{
auto num_indices = shape_size(indices_shape);
size_t num_indices = 1;
for (auto d : indices_shape)
{
num_indices *= d;
}
#ifdef _OPENMP
#pragma omp parallel for
......
......@@ -2238,6 +2238,32 @@ void ngraph::runtime::cpu::pass::CPUQuantFusion::construct_qconvb_add()
std::dynamic_pointer_cast<ngraph::op::Add>(m.get_match_root()->get_argument(0));
auto dq_l_m = std::dynamic_pointer_cast<ngraph::op::Dequantize>(pattern_map[dq_l_label]);
auto dq_r_m = std::dynamic_pointer_cast<ngraph::op::Dequantize>(pattern_map[dq_r_label]);
// both left and right are QuantizedConvolutionBias
if (dq_r_m->get_argument(0)->description() == "QuantizedConvolutionBias")
{
for (auto user : m.get_match_root()->get_users())
{
auto q_m = std::dynamic_pointer_cast<ngraph::op::Quantize>(user);
if (q_m)
{
auto q_m_scale = q_m->get_argument(1);
auto dq_l_m_scale = dq_l_m->get_argument(1);
auto dq_r_m_scale = dq_r_m->get_argument(1);
if (!ngraph::compare_constants(q_m_scale, dq_l_m_scale) &&
ngraph::compare_constants(q_m_scale, dq_r_m_scale))
{
NGRAPH_DEBUG << "Scales of Q and DQ of right branch match";
// switch left and right branch
auto temp = dq_l_m;
dq_l_m = dq_r_m;
dq_r_m = temp;
}
break;
}
}
}
auto qconv =
std::static_pointer_cast<ngraph::op::QuantizedConvolutionBias>(dq_l_m->get_argument(0));
auto inplace_input = dq_r_m->get_argument(0);
......
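The intent of the inserted block, as far as the shown pattern goes: when both Add inputs come from QuantizedConvolutionBias and the downstream Quantize scale matches the right branch's Dequantize scale rather than the left one's, the two branches are swapped so that dq_l_m (and hence qconv below) refers to the branch whose scales line up with the requantization, leaving the other branch as the in-place summand.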
......@@ -24,6 +24,7 @@
#include <vector>
#include "ngraph/op/all.hpp"
#include "ngraph/op/allreduce.hpp"
#include "ngraph/op/any.hpp"
#include "ngraph/op/argmax.hpp"
#include "ngraph/op/argmin.hpp"
......@@ -255,9 +256,12 @@ private:
}
case OP_TYPEID::AllReduce:
{
const ngraph::op::AllReduce* allreduce =
static_cast<const ngraph::op::AllReduce*>(&node);
reference::allreduce<T>(args[0]->get_data_ptr<T>(),
out[0]->get_data_ptr<T>(),
node.get_input_element_type(0).get_type_enum(),
allreduce->get_reduce_type(),
static_cast<int>(shape_size(node.get_input_shape(0))));
break;
}
......
......@@ -76,11 +76,11 @@ namespace ngraph
// input count.
void check_inputs(std::size_t expected_input_count) const
{
if (op().get_input_size() != expected_input_count)
if (op().get_input_size() < expected_input_count)
{
std::ostringstream os;
os << "The PlaidML nGraph backend only supports " << op().description()
<< " operations with an input count == " << expected_input_count
<< " operations with an input count >= " << expected_input_count
<< " (got " << op().get_input_size() << " inputs)";
throw std::runtime_error{os.str()};
}
......
......@@ -261,92 +261,25 @@ batch_mat_mul_forward
dot_matrix_2x0_0x2
# dgkutnic ww24.5: these tests are to be triaged by the PlaidML team
convolution_3d_1item_large_5o3i_padded_uneven_filter_uneven_data_dilation_data_dilated
select
product_trivial
product_trivial_5d
product_to_scalar
product_matrix_columns
product_matrix_rows
product_3d_to_matrix_most_sig
product_3d_to_matrix_least_sig
product_3d_to_vector
product_3d_to_scalar
product_2d_to_scalar_int32
product_to_scalar_int32
product_to_scalar_int8
max_trivial
max_trivial_5d
max_to_scalar
max_to_scalar_int8
max_matrix_columns
max_matrix_rows
max_matrix_rows_int32
max_3d_to_matrix_most_sig
max_3d_to_matrix_least_sig
max_3d_to_vector
max_3d_to_scalar
max_3d_to_scalar_int32
min_trivial
min_trivial_5d
min_trivial_5d_int32
min_to_scalar
min_to_scalar_int8
min_matrix_columns
min_matrix_rows
min_matrix_rows_int32
min_3d_to_matrix_most_sig
min_3d_to_matrix_least_sig
min_3d_to_vector
min_3d_to_scalar
min_3d_to_scalar_int32
sum_to_scalar
# ww25.2: re-scrubbed this list of tests after fixing check_inputs
# initial debug points to some of these failing due to precision issues
sqrt
batch_norm_inference_0eps_f32
batch_norm_inference_f32
batch_norm_training_0eps_f32
argmin_trivial
argmax_trivial
argmin_trivial_in_i32
sum_large_1d_to_scalar
sum_matrix_columns
sum_matrix_6d
sum_matrix_rows
sum_3d_to_matrix_most_sig
sum_3d_to_matrix_least_sig
sum_3d_to_vector
sum_3d_to_scalar
sum_3d_to_scalar_int32
sum_5d_to_scalar
sum_5d_to_scalar_int32
sum_2d_to_scalar_int8
sum_stable_acc
sum_stable_simple_float
one_hot_scalar_2_in_3
one_hot_scalar_1_in_3
one_hot_scalar_0_in_3
lstm_cell_no_bias_no_peepholes
lstm_cell_bias_peepholes
lstm_cell_bias_peepholes_clip_input_forget
lstm_cell_activaction_functions
group_conv_transpose
group_conv_transpose_output_shape
divide_python_rounding_int32
any_2x2_to_scalar_true
any_2x2_to_scalar_false
any_2x3_eliminate_col_dim
any_2x3_eliminate_row_dim
any_2x2x3_eliminate_dim_1
any_2x2x3_eliminate_dim_2
any_2x2x3_eliminate_dims_0_1
any_2x2x3_eliminate_dims_0_2
any_2x2x3_eliminate_dims_1_2
any_2x2x3_eliminate_dims_0_1_2
all_trivial
all_2x2_to_scalar_false
all_2x2_to_scalar_true
all_2x3_eliminate_col_dim
all_2x3_eliminate_row_dim
all_2x2x3_eliminate_dim_0
all_2x2x3_eliminate_dim_1
all_2x2x3_eliminate_dim_2
all_2x2x3_eliminate_dims_0_1
all_2x2x3_eliminate_dims_0_2
all_2x2x3_eliminate_dims_1_2
all_2x2x3_eliminate_dims_0_1_2
all_dynamic_axis
all_change_axis
backwards_broadcast0
backwards_broadcast1
backwards_select
backwards_select_nested
backwards_sum_v2s
backwards_sum_m2s
backwards_sum_m2v_0
backwards_sum_m2v_1
backwards_batchmatmul_tensor2_tensor2
......@@ -25,9 +25,13 @@ namespace ngraph
namespace reference
{
template <typename T>
void allreduce(T* arg, T* out, const element::Type_t element_type, int count)
void allreduce(T* arg,
T* out,
const element::Type_t element_type,
const reduction::Type reduce_type,
int count)
{
get_distributed_interface()->all_reduce(arg, out, element_type, count);
get_distributed_interface()->all_reduce(arg, out, element_type, reduce_type, count);
}
}
}
......
......@@ -62,42 +62,3 @@ namespace ngraph
/// Option may be enabled by setting the environment variable NGRAPH_SERIALIZER_OUTPUT_SHAPES
void set_serialize_output_shapes(bool enable);
}
#ifdef NGRAPH_JSON_DISABLE
// Rather than making every reference to the serializer conditionally compile here we just
// provide some null stubs to resolve link issues
// The `inline` is so we don't get multiple definitions of the functions
std::string inline ngraph::serialize(std::shared_ptr<ngraph::Function> func, size_t indent)
{
return "";
}
void inline ngraph::serialize(const std::string& path,
std::shared_ptr<ngraph::Function> func,
size_t indent)
{
throw std::runtime_error("serializer disabled in build");
}
void inline ngraph::serialize(std::ostream& out,
std::shared_ptr<ngraph::Function> func,
size_t indent)
{
throw std::runtime_error("serializer disabled in build");
}
std::shared_ptr<ngraph::Function> inline ngraph::deserialize(std::istream& in)
{
throw std::runtime_error("serializer disabled in build");
}
std::shared_ptr<ngraph::Function> inline ngraph::deserialize(const std::string& str)
{
throw std::runtime_error("serializer disabled in build");
}
void inline ngraph::set_serialize_output_shapes(bool enable)
{
throw std::runtime_error("serializer disabled in build");
}
#endif
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#include "ngraph/serializer.hpp"
std::string ngraph::serialize(std::shared_ptr<ngraph::Function> func, size_t indent)
{
throw std::runtime_error("serializer disabled in build");
}
void ngraph::serialize(const std::string& path,
std::shared_ptr<ngraph::Function> func,
size_t indent)
{
throw std::runtime_error("serializer disabled in build");
}
void ngraph::serialize(std::ostream& out, std::shared_ptr<ngraph::Function> func, size_t indent)
{
throw std::runtime_error("serializer disabled in build");
}
std::shared_ptr<ngraph::Function> ngraph::deserialize(std::istream& in)
{
throw std::runtime_error("serializer disabled in build");
}
std::shared_ptr<ngraph::Function> ngraph::deserialize(const std::string& str)
{
throw std::runtime_error("serializer disabled in build");
}
void ngraph::set_serialize_output_shapes(bool enable)
{
throw std::runtime_error("serializer disabled in build");
}
......@@ -35,6 +35,7 @@ using namespace ngraph;
static string s_manifest = "${MANIFEST}";
#if 0
NGRAPH_TEST(${BACKEND_NAME}, scatter_add_4d_indices)
{
Shape ref_shape{3, 3, 3};
......@@ -122,13 +123,14 @@ NGRAPH_TEST(${BACKEND_NAME}, scatter_add_3d_indices)
read_vector<float>(result),
MIN_FLOAT_TOLERANCE_BITS));
}
#endif
NGRAPH_TEST(${BACKEND_NAME}, scatter_add_2d_indices)
{
Shape ref_shape{2, 3, 3};
Shape ref_shape{3};
Shape indices_shape{2, 2};
Shape updates_shape{2, 2, 3, 3};
Shape out_shape{2, 3, 3};
Shape updates_shape{2, 2};
Shape out_shape{3};
auto R = make_shared<op::Parameter>(element::f32, ref_shape);
auto I = make_shared<op::Parameter>(element::i32, indices_shape);
auto U = make_shared<op::Parameter>(element::f32, updates_shape);
......@@ -140,20 +142,17 @@ NGRAPH_TEST(${BACKEND_NAME}, scatter_add_2d_indices)
// Create some tensors for input/output
auto r = backend->create_tensor(element::f32, ref_shape);
copy_data(r, vector<float>{0, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 9});
copy_data(r, vector<float>{0, 1, 2});
auto i = backend->create_tensor(element::i32, indices_shape);
copy_data(i, vector<int32_t>{0, 1, 1, 0});
auto u = backend->create_tensor(element::f32, updates_shape);
copy_data(u, vector<float>{0, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 9,
1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8});
copy_data(u, vector<float>{1, 2, 3, 4});
auto result = backend->create_tensor(element::f32, out_shape);
auto c = backend->compile(f);
c->call_with_validate({result}, {r, i, u});
EXPECT_TRUE(test::all_close_f(
(vector<float>{0, 3, 6, 9, 12, 15, 18, 21, 24, 3, 6, 9, 12, 15, 18, 21, 24, 27}),
read_vector<float>(result),
MIN_FLOAT_TOLERANCE_BITS));
(vector<float>{5, 6, 2}), read_vector<float>(result), MIN_FLOAT_TOLERANCE_BITS));
}
NGRAPH_TEST(${BACKEND_NAME}, scatter_add_1d_indices)
......
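The rewritten expectation can be checked by hand: with ref {0, 1, 2}, indices {0, 1, 1, 0} and updates {1, 2, 3, 4}, index 0 accumulates updates 1 and 4 (0 + 5 = 5), index 1 accumulates 2 and 3 (1 + 5 = 6), and index 2 is untouched, giving {5, 6, 2}.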
......@@ -150,3 +150,19 @@ TEST(build_graph, no_arg_construction)
validate_nodes_and_infer_types(ops);
ASSERT_EQ(add1->get_output_shape(0), Shape{7});
}
TEST(build_graph, multi_output_split)
{
const auto data = make_shared<op::Parameter>(element::f32, Shape{64, 8, 100, 150});
auto filters = make_shared<op::Parameter>(element::f32, Shape{128, 2, 10, 20});
const auto split = make_shared<op::Split>(data, 1, 2);
auto conv = make_shared<op::GroupConvolution>(split->output(1),
filters,
Strides{1, 1},
Strides{1, 1},
CoordinateDiff{0, 0},
CoordinateDiff{0, 0},
Strides{1, 1},
2);
EXPECT_EQ(conv->get_shape(), (Shape{64, 128, 91, 131}));
}
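The expected shape can be checked by hand: Split(data, 1, 2) yields inputs of shape {64, 4, 100, 150}; with groups = 2 each group convolves 2 of those channels with 64 of the 128 filters ({128, 2, 10, 20}), so the concatenated output has 128 channels, and the valid-convolution spatial dims are 100 - 10 + 1 = 91 and 150 - 20 + 1 = 131, i.e. Shape{64, 128, 91, 131}.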
......@@ -3691,6 +3691,120 @@ TEST(cpu_quant_fusion, qconvba)
EXPECT_TRUE(test::all_close(cpu1_results.at(0), cpu2_results.at(0)));
}
TEST(cpu_quant_fusion, qconvba_q)
{
auto make_function = []() {
Shape shape_input{1, 2, 2, 2};
Shape shape_weights{1, 2, 1, 1};
Shape shape_summand{1, 1, 2, 2};
auto input_l = std::make_shared<op::Parameter>(element::f32, shape_input);
auto weights_l = std::make_shared<op::Parameter>(element::f32, shape_weights);
auto bias_l = std::make_shared<op::Parameter>(element::f32, Shape{shape_weights[0]});
auto input_r = std::make_shared<op::Parameter>(element::f32, shape_input);
auto weights_r = std::make_shared<op::Parameter>(element::f32, shape_weights);
auto bias_r = std::make_shared<op::Parameter>(element::f32, Shape{shape_weights[0]});
auto input_scale_l = op::Constant::create(element::f32, Shape{}, {2.0f});
auto weights_scale_l = op::Constant::create(element::f32, Shape{}, {2.0f});
auto output_scale_l = op::Constant::create(element::f32, Shape{}, {4.0f});
auto input_scale_r = op::Constant::create(element::f32, Shape{}, {5.0f});
auto weights_scale_r = op::Constant::create(element::f32, Shape{}, {5.0f});
auto output_scale_r = op::Constant::create(element::f32, Shape{}, {20.0f});
auto int8_zero = op::Constant::create(element::i8, Shape{}, {0});
auto int32_zero = op::Constant::create(element::i32, Shape{}, {0});
auto uint8_zero = op::Constant::create(element::u8, Shape{}, {0});
op::Quantize::RoundMode round_mode = op::Quantize::RoundMode::ROUND_NEAREST_TOWARD_EVEN;
auto q_input_l = std::make_shared<op::Quantize>(
input_l, input_scale_l, uint8_zero, element::u8, AxisSet{}, round_mode);
auto q_weights_l = std::make_shared<op::Quantize>(
weights_l, weights_scale_l, int8_zero, element::i8, AxisSet{}, round_mode);
auto q_bias_l = std::make_shared<op::Quantize>(bias_l,
input_scale_l * weights_scale_l,
int32_zero,
element::i32,
AxisSet{},
round_mode);
auto q_input_r = std::make_shared<op::Quantize>(
input_r, input_scale_r, uint8_zero, element::u8, AxisSet{}, round_mode);
auto q_weights_r = std::make_shared<op::Quantize>(
weights_r, weights_scale_r, int8_zero, element::i8, AxisSet{}, round_mode);
auto q_bias_r = std::make_shared<op::Quantize>(bias_r,
input_scale_r * weights_scale_r,
int32_zero,
element::i32,
AxisSet{},
round_mode);
// Left Graph
auto requant_scale_l = (input_scale_l * weights_scale_l) / output_scale_l;
auto conv_l = std::make_shared<op::QuantizedConvolutionBias>(q_input_l,
q_weights_l,
q_bias_l,
Strides{1, 1},
Strides{1, 1},
CoordinateDiff{0, 0},
CoordinateDiff{0, 0},
Strides{1, 1},
requant_scale_l);
auto dq_l = std::make_shared<op::Dequantize>(
conv_l, output_scale_l, int8_zero, element::f32, AxisSet{});
auto r_l = std::make_shared<op::Reshape>(dq_l, AxisVector{0, 1, 2, 3}, Shape{1, 2, 2});
auto b_l = std::make_shared<op::Broadcast>(r_l, Shape{1, 1, 2, 2}, AxisSet{0});
// Right Graph
auto requant_scale_r = (input_scale_r * weights_scale_r) / output_scale_r;
auto conv_r = std::make_shared<op::QuantizedConvolutionBias>(q_input_r,
q_weights_r,
q_bias_r,
Strides{1, 1},
Strides{1, 1},
CoordinateDiff{0, 0},
CoordinateDiff{0, 0},
Strides{1, 1},
requant_scale_r);
auto dq_r = std::make_shared<op::Dequantize>(
conv_r, output_scale_r, int8_zero, element::f32, AxisSet{});
auto r_r = std::make_shared<op::Reshape>(dq_r, AxisVector{0, 1, 2, 3}, Shape{1, 2, 2});
auto b_r = std::make_shared<op::Broadcast>(r_r, Shape{1, 1, 2, 2}, AxisSet{0});
auto add = b_l + b_r;
auto relu = std::make_shared<op::Relu>(add);
auto q = std::make_shared<op::Quantize>(
relu, output_scale_r, uint8_zero, element::u8, AxisSet{}, round_mode);
auto dq = std::make_shared<op::Dequantize>(
q, output_scale_r, uint8_zero, element::f32, AxisSet{});
return make_shared<Function>(
NodeVector{dq},
ParameterVector{input_l, weights_l, bias_l, input_r, weights_r, bias_r});
};
auto cpu_f1 = make_function();
auto cpu_f2 = make_function();
test::Uniform<float> rng(2.0f, 2.0f);
vector<vector<float>> args;
for (shared_ptr<op::Parameter> param : cpu_f1->get_parameters())
{
vector<float> tensor_val(shape_size(param->get_shape()));
rng.initialize(tensor_val);
args.push_back(tensor_val);
}
// Disable CPUQuantFusion
set_environment("NGRAPH_PASS_ENABLES", "CPUQuantFusion:0", 1);
auto cpu1_results = execute(cpu_f1, args, "CPU");
// Enable CPUQuantFusion
set_environment("NGRAPH_PASS_ENABLES", "CPUQuantFusion:1", 1);
auto cpu2_results = execute(cpu_f2, args, "CPU");
EXPECT_TRUE(test::all_close(cpu1_results.at(0), cpu2_results.at(0)));
auto backend = runtime::Backend::create("CPU");
auto fuse = make_function();
backend->compile(fuse);
ASSERT_EQ(count_ops_of_type<op::Quantize>(fuse), 6);
}
#ifndef NGRAPH_JSON_DISABLE
// Tests that rely on deserializing json files
TEST(cpu_fusion, fuse_conv_bias)
......
......@@ -29,25 +29,61 @@
using namespace std;
using namespace ngraph;
TEST(distributed_${BACKEND_NAME}, allreduce)
static void test_allreduce_common(reduction::Type reduce_type)
{
auto comm_size = get_distributed_interface()->get_size();
if (comm_size > 1)
{
auto shape = Shape{2, 2};
auto A = make_shared<op::Parameter>(element::f32, shape);
auto f = make_shared<Function>(make_shared<op::AllReduce>(A), ParameterVector{A});
auto f =
make_shared<Function>(make_shared<op::AllReduce>(A, reduce_type), ParameterVector{A});
auto backend = runtime::Backend::create("${BACKEND_NAME}");
auto v = vector<float>{1, 2, 3, 4};
auto a = backend->create_tensor(element::f32, shape);
copy_data(a, vector<float>{1, 2, 3, 4});
auto result = backend->create_tensor(element::f32, shape);
#if !(defined(__GNUC__) && (__GNUC__ == 4 && __GNUC_MINOR__ == 8))
#pragma GCC diagnostic push
#pragma GCC diagnostic error "-Wswitch"
#pragma GCC diagnostic error "-Wswitch-enum"
#endif
switch (reduce_type.get_type())
{
case reduction::Type_t::sum:
copy_data(a, v);
std::transform(
v.begin(), v.end(), v.begin(), std::bind1st(std::multiplies<float>(), comm_size));
break;
case reduction::Type_t::prod:
copy_data(a, v);
std::transform(v.begin(), v.end(), v.begin(), [&](float elm) -> float {
return pow(elm, comm_size);
});
break;
case reduction::Type_t::min:
case reduction::Type_t::max:
auto shift = get_distributed_interface()->get_rank();
std::rotate(v.begin(), v.begin() + shift % v.size(), v.end());
copy_data(a, v);
if (reduce_type == reduction::Type_t::min)
{
std::fill(v.begin(), v.end(), 1);
for (int i = 1; i < static_cast<int>(v.size()) - comm_size + 1; i++)
v[i] = i + 1;
}
else
{
std::fill(v.begin(), v.end(), v.size());
for (int i = 0; i < static_cast<int>(v.size()) - comm_size; i++)
v[i] = i + 2;
}
}
#if !(defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ == 8)
#pragma GCC diagnostic pop
#endif
auto handle = backend->compile(f);
handle->call_with_validate({result}, {a});
......@@ -55,6 +91,28 @@ TEST(distributed_${BACKEND_NAME}, allreduce)
}
}
TEST(distributed_${BACKEND_NAME}, allreduce_sum)
{
test_allreduce_common(reduction::sum);
}
TEST(distributed_${BACKEND_NAME}, allreduce_min)
{
test_allreduce_common(reduction::min);
}
TEST(distributed_${BACKEND_NAME}, allreduce_max)
{
test_allreduce_common(reduction::max);
}
#if !defined(NGRAPH_DISTRIBUTED_MLSL_ENABLE)
TEST(distributed_${BACKEND_NAME}, allreduce_prod)
{
test_allreduce_common(reduction::prod);
}
#endif
TEST(distributed_${BACKEND_NAME}, broadcastdistributed)
{
auto shape = Shape{2, 2};
......
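As a worked example of the min/max expectations in test_allreduce_common above, assume comm_size = 2: rank 0 contributes {1, 2, 3, 4} and rank 1 the rotated {2, 3, 4, 1}, so the elementwise min is {1, 2, 3, 1} and the elementwise max is {2, 3, 4, 4}; these are exactly the vectors the two fill-then-loop blocks construct (ones with v[i] = i + 1 for i from 1 to size - comm_size, and size-filled values with v[i] = i + 2 for i < size - comm_size).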
......@@ -324,3 +324,19 @@ TEST(serialize, constant_infinity_nan)
EXPECT_NE(str.find(R"(label="C)"), string::npos);
EXPECT_NE(str.find(R"(label="D)"), string::npos);
}
TEST(serialize, non_zero_node_output)
{
auto arg = make_shared<op::Parameter>(element::f32, Shape{10});
auto topk = make_shared<op::TopK>(arg, 0, element::i32, 5, true);
auto abs = make_shared<op::Abs>(Output<Node>(topk, 1));
auto result = make_shared<op::Result>(abs);
auto f = make_shared<Function>(ResultVector{result}, ParameterVector{arg});
string s = serialize(f);
shared_ptr<Function> g = deserialize(s);
auto g_result = g->get_results().at(0);
auto g_abs = g_result->input(0).get_source_output().get_node_shared_ptr();
auto topk_out = g_abs->input(0).get_source_output();
EXPECT_EQ(topk_out.get_index(), 1);
EXPECT_EQ(topk_out.get_node()->description(), "TopK");
}