Unverified commit 0f05495c authored by Scott Cyphers, committed by GitHub

Merge branch 'master' into master

parents 1b5340c4 244c9fcf
@@ -22,7 +22,7 @@ include(ExternalProject)
#------------------------------------------------------------------------------
set(CLDNN_GIT_REPO_URL https://github.com/intel/clDNN.git)
set(CLDNN_GIT_LABEL df28d2861716cac7a6a9eff4e49e47162959a748)
set(CLDNN_GIT_LABEL 02add7c4ce2baa81e2a32fa02d733dcc4f013108)
set(BOOST_VERSION 1.64.0)
set(OUT_DIR ${EXTERNAL_PROJECTS_ROOT}/cldnn/out)
......
@@ -30,6 +30,7 @@
#include <CPP/crop.hpp>
#include <CPP/data.hpp>
#include <CPP/eltwise.hpp>
#include <CPP/gemm.hpp>
#include <CPP/input_layout.hpp>
#include <CPP/layout.hpp>
#include <CPP/lrn.hpp>
@@ -675,6 +676,41 @@ bool runtime::intelgpu::IntelGPUBackend::compile(shared_ptr<Function> func)
{
arguments_check(op, 2, 1);
const shared_ptr<op::Dot> dot_inst = static_pointer_cast<op::Dot>(op);
const size_t axes_count = dot_inst->get_reduction_axes_count();
const Shape& input0_shape = get_input_shape(op, 0);
const Shape& input1_shape = get_input_shape(op, 1);
const size_t input0_elem_count = shape_size(input0_shape);
const size_t input1_elem_count = shape_size(input1_shape);
if (get_input_type(op) == element::f32 && get_input_type(op, 1) == element::f32 &&
get_output_type(op) == element::f32 && input0_elem_count && input1_elem_count &&
(axes_count == 1) && (input0_shape.size() < 3) && (input1_shape.size() < 3) &&
!input0_shape.empty() && !input1_shape.empty())
{
string input1_name = get_input_name(op, 1);
// If we have A[5] and B[5] here, clDNN sees them as A[1, 1, 1, 5] and B[1, 1, 1, 5];
// B must be reshaped to B[1, 1, 5, 1] before the gemm.
if (!input0_shape.empty() && (input1_shape.size() == 1))
{
const string new_name = input1_name + "_reshaped";
Shape new_shape = input1_shape;
new_shape.push_back(1);
const cldnn::tensor reshaped_tensor =
intelgpu_space::create_cldnn_tensor(new_shape);
const cldnn::reshape reshape_op(new_name, input1_name, reshaped_tensor);
topology.add(reshape_op);
input1_name = new_name;
}
const cldnn::gemm dot_op(get_output_name(op), get_input_name(op, 0), input1_name);
topology.add(dot_op);
}
else
{
do_dot_operation(topology,
get_input_name(op, 0),
get_input_shape(op, 0),
@@ -682,7 +718,9 @@ bool runtime::intelgpu::IntelGPUBackend::compile(shared_ptr<Function> func)
get_input_shape(op, 1),
get_output_name(op),
get_output_shape(op),
get_output_type(op));
get_output_type(op),
axes_count);
}
break;
}
case OP_TYPEID::MaxPool:
......
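For readability, the fast-path test added above can be condensed into a single predicate. The helper below is only an illustrative sketch (the name dot_can_use_cldnn_gemm and the free-function form are not in the commit); it restates the condition from the hunk: cldnn::gemm is used only when both operands and the result are f32, both operand ranks are 1 or 2, there is exactly one reduction axis, and neither input is zero-sized. Everything else falls back to the generated OpenCL kernel in do_dot_operation().

// Illustrative only, not part of this commit; assumes ngraph's Shape,
// element::Type, and shape_size() are in scope.
static bool dot_can_use_cldnn_gemm(const Shape& input0_shape,
                                   const Shape& input1_shape,
                                   const element::Type& input0_type,
                                   const element::Type& input1_type,
                                   const element::Type& output_type,
                                   size_t reduction_axes_count)
{
    return input0_type == element::f32 && input1_type == element::f32 &&
           output_type == element::f32 && reduction_axes_count == 1 &&
           !input0_shape.empty() && !input1_shape.empty() &&
           input0_shape.size() < 3 && input1_shape.size() < 3 &&
           shape_size(input0_shape) != 0 && shape_size(input1_shape) != 0;
}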
@@ -306,16 +306,6 @@ static string access_dims_strided(const Shape& dimentions,
return buffer;
}
static void do_dot_operation_error(const Shape& input0_shape,
const Shape& input1_shape,
const Shape& output_shape)
{
throw invalid_argument("IntelGPU Dot operation. Conbination input0_shape" +
runtime::intelgpu::array_dims(input0_shape) + ", input1_shape" +
runtime::intelgpu::array_dims(input1_shape) + ", output_shape" +
runtime::intelgpu::array_dims(output_shape) + " is not supported.");
}
void runtime::intelgpu::do_pad_operation(cldnn::topology& topology,
const string& input_name,
const Shape& input_shape,
@@ -742,82 +732,30 @@ void runtime::intelgpu::do_avg_pool_backprop_operation(cldnn::topology& topology
topology.add(op_avg_pool_backprop);
}
static void do_1d_scalar_mul(codegen::CodeWriter& writer,
string& entry_point_name,
const Shape& input0_shape,
const Shape& input1_shape,
const string& type_name)
{
const size_t input0_count = input0_shape.empty() ? 0 : shape_size<Shape>(input0_shape);
const size_t input1_count = input1_shape.empty() ? 0 : shape_size<Shape>(input1_shape);
const size_t output_count = max(input0_count, input1_count);
entry_point_name += "_do_1d_scalar_mul";
writer << "__kernel void " << entry_point_name << "(const __global " << type_name << "* input0"
<< ", const __global " << type_name << "* input1, __global " << type_name
<< "* output)\n";
writer.block_begin();
{
writer << "for (uint i1 = 0; i1 < " << output_count << "; ++i1)\n";
writer.block_begin();
{
writer << "output[i1] = input0[" << (input0_count > 0 ? "i1" : "0") << "] * input1["
<< (input1_count > 0 ? "i1" : "0") << "];\n";
}
writer.block_end();
}
writer.block_end();
}
static vector<size_t> do_2d_2d_mul(codegen::CodeWriter& writer,
string& entry_point_name,
void runtime::intelgpu::do_dot_operation(cldnn::topology& topology,
const string& input0_name,
const Shape& input0_shape,
const string& input1_name,
const Shape& input1_shape,
const string& output_name,
const Shape& output_shape,
const string& type_name)
const element::Type& output_type,
size_t reduction_axes_count)
{
entry_point_name += "_do_2d_2d_mul";
const cldnn::layout layout = IntelGPULayout::create_cldnn_layout(output_type, output_shape);
string entry_point_name = "dot_" + output_name;
const string type_name = get_opencl_type_name(output_type);
const size_t input0_axes = input0_shape.size() - reduction_axes_count;
size_t var_idx = reduction_axes_count;
Shape reduction_shape;
codegen::CodeWriter writer;
vector<size_t> gws;
runtime::intelgpu::gen_func_def(writer,
entry_point_name,
{2, type_name},
{input0_shape, input1_shape},
type_name,
output_shape);
writer.block_begin();
for (auto it = input1_shape.begin(); (it != input1_shape.end()) && (var_idx > 0); ++it)
{
// Main loops
gws = runtime::intelgpu::generate_loops(writer, output_shape, true);
// Inner loop
writer << type_name << " sum = 0;\n";
writer << "for (uint i2 = 0; i2 < " << input0_shape.at(1) << "; ++i2)\n";
writer.block_begin();
{
writer << "sum += input0[i0][i2] * input1[i2][i1];\n";
}
writer.block_end();
writer << "output[i0][i1] = sum;\n";
// Closing brackets for main loops
runtime::intelgpu::generate_loops(writer, output_shape, false);
reduction_shape.push_back(*it);
--var_idx;
}
writer.block_end();
return gws;
}
static vector<size_t> do_3d_3d_mul(codegen::CodeWriter& writer,
string& entry_point_name,
const Shape& input0_shape,
const Shape& input1_shape,
const Shape& output_shape,
const string& type_name)
{
entry_point_name += "_do_3d_3d_mul";
vector<size_t> gws;
runtime::intelgpu::gen_func_def(writer,
entry_point_name,
@@ -825,219 +763,84 @@ static vector<size_t> do_3d_3d_mul(codegen::CodeWriter& writer,
{input0_shape, input1_shape},
type_name,
output_shape);
writer.block_begin();
{
// Main loops
gws = runtime::intelgpu::generate_loops(writer, output_shape, true);
// Inner loop
writer << type_name << " sum = 0;\n";
writer << "for (uint i4 = 0; i4 < " << input0_shape.back() << "; ++i4)\n";
writer.block_begin();
{
writer << "sum += input0[i0][i1][i4] * input1[i4][i2][i3];\n";
}
writer.block_end();
writer << "output[i0][i1][i2][i3] = sum;\n";
// Closing brackets for main loops
runtime::intelgpu::generate_loops(writer, output_shape, false);
}
writer.block_end();
return gws;
}
static vector<size_t> do_3d_2d_mul(codegen::CodeWriter& writer,
string& entry_point_name,
const Shape& input0_shape,
const Shape& input1_shape,
const Shape& output_shape,
const string& type_name)
{
entry_point_name += "_do_3d_2d_mul";
vector<size_t> gws;
writer << "// reduction_axes_count:" << reduction_axes_count << "\n"
<< "// reduction_shape:" << reduction_shape << "\n";
runtime::intelgpu::gen_func_def(writer,
entry_point_name,
{2, type_name},
{input0_shape, input1_shape},
type_name,
output_shape);
writer.block_begin();
{
// Main loops
gws = runtime::intelgpu::generate_loops(writer, output_shape, true);
// Inner loop
writer << type_name << " sum = 0;\n";
writer << "for (uint i3 = 0; i3 < " << input0_shape.back() << "; ++i3)\n";
writer.block_begin();
{
writer << "sum += input0[i0][i1][i3] * input1[i3][i2];\n";
}
writer.block_end();
writer << "output[i0][i1][i2] = sum;\n";
// Closing brackets for main loops
runtime::intelgpu::generate_loops(writer, output_shape, false);
}
writer.block_end();
return gws;
}
static vector<size_t> do_2d_1d_mul(codegen::CodeWriter& writer,
string& entry_point_name,
const Shape& input0_shape,
const Shape& input1_shape,
const Shape& output_shape,
const string& type_name)
{
entry_point_name += "_do_2d_1d_mul";
vector<size_t> gws;
runtime::intelgpu::gen_func_def(writer,
entry_point_name,
{2, type_name},
{input0_shape, input1_shape},
type_name,
output_shape);
writer.block_begin();
// Reduction loops
var_idx = 0;
for (auto const& i : reduction_shape)
{
// Main loops
gws = runtime::intelgpu::generate_loops(writer, output_shape, true);
writer << type_name << " sum = 0;\n";
// Inner loop
writer << "for (uint i1 = 0; i1 < " << input0_shape.at(1) << "; ++i1)\n";
writer << "for (uint k" << var_idx << " = 0; k" << var_idx << " < " << i << "; ++k"
<< var_idx << ")\n";
writer.block_begin();
{
writer << "sum += input0[i0][i1] * input1[i1];\n";
}
writer.block_end();
writer << "output[i0] = sum;\n";
// Closing brackets for main loops
runtime::intelgpu::generate_loops(writer, output_shape, false);
++var_idx;
}
writer.block_end();
return gws;
}
static void do_scalar_scalar_mul(codegen::CodeWriter& writer,
string& entry_point_name,
const string& type_name)
{
entry_point_name += "_scalar_scalar_mul";
runtime::intelgpu::gen_func_def(
writer, entry_point_name, {2, type_name}, {{1}, {1}}, type_name, {1});
writer << "sum += input0";
writer.block_begin();
if (input0_shape.empty())
{
writer << "output[0] = input0[0] * input1[0];\n";
writer << "[0]";
}
writer.block_end();
}
static void do_1d_1d_mul(codegen::CodeWriter& writer,
string& entry_point_name,
const Shape& shape,
const string& type_name)
{
if (shape.size() > 1)
else
{
// main axes indexes
for (size_t i = 0; i < input0_axes; ++i)
{
throw invalid_argument("do_1d_1d_mul: Shape" + runtime::intelgpu::array_dims(shape) +
" must be 1D");
writer << "[i" << i << "]";
}
entry_point_name += "_do_1d_1d_mul";
runtime::intelgpu::gen_func_def(
writer, entry_point_name, {2, type_name}, {2, shape}, type_name, {1});
writer.block_begin();
{
writer << type_name << " sum = 0;\n"
<< "for (uint i = 0; i < " << shape.front() << "; ++i)\n";
writer.block_begin();
// reduction axes indexes
for (size_t i = 0; i < reduction_shape.size(); ++i)
{
writer << "sum += input0[i] * input1[i];\n";
writer << "[k" << i << "]";
}
writer.block_end();
writer << "output[0] = sum;\n";
}
writer.block_end();
}
void runtime::intelgpu::do_dot_operation(cldnn::topology& topology,
const string& input0_name,
const Shape& input0_shape,
const string& input1_name,
const Shape& input1_shape,
const string& output_name,
const Shape& output_shape,
const element::Type& output_type)
{
const cldnn::layout layout = IntelGPULayout::create_cldnn_layout(output_type, output_shape);
string entry_point_name = "dot_" + output_name;
const string type_name = get_opencl_type_name(output_type);
codegen::CodeWriter writer;
vector<size_t> gws = {1};
const bool is_input0_scalar = input0_shape.empty();
const bool is_input1_scalar = input1_shape.empty();
const bool is_output_scalar = output_shape.empty();
// operation
writer << " * input1";
if (is_input0_scalar && is_input1_scalar && is_output_scalar)
{
do_scalar_scalar_mul(writer, entry_point_name, type_name);
}
else if (((is_input0_scalar && !is_input1_scalar) || (!is_input0_scalar && is_input1_scalar)) &&
!is_output_scalar)
if (input1_shape.empty())
{
do_1d_scalar_mul(writer, entry_point_name, input0_shape, input1_shape, type_name);
writer << "[0]";
}
else if (!is_input0_scalar && !is_input1_scalar && is_output_scalar)
{
do_1d_1d_mul(writer, entry_point_name, input1_shape, type_name);
}
else if (!is_input0_scalar && !is_input1_scalar && !is_output_scalar)
{
if (input0_shape.size() == 2 && input1_shape.size() == 1)
else
{
gws = do_2d_1d_mul(
writer, entry_point_name, input0_shape, input1_shape, output_shape, type_name);
}
else if (input0_shape.size() == 2 && input1_shape.size() == 2)
// reduction axes indexes
for (size_t i = 0; i < reduction_shape.size(); ++i)
{
gws = do_2d_2d_mul(
writer, entry_point_name, input0_shape, input1_shape, output_shape, type_name);
writer << "[k" << i << "]";
}
else if (input0_shape.size() == 3 && input1_shape.size() == 3)
// main axes indexes
for (size_t i = input0_axes; i < output_shape.size(); ++i)
{
gws = do_3d_3d_mul(
writer, entry_point_name, input0_shape, input1_shape, output_shape, type_name);
writer << "[i" << i << "]";
}
else if (input0_shape.size() == 3 && input1_shape.size() == 2)
{
gws = do_3d_2d_mul(
writer, entry_point_name, input0_shape, input1_shape, output_shape, type_name);
}
else
writer << ";\n";
// Closing brackets for reduction loops
for (auto const& i : reduction_shape)
{
do_dot_operation_error(input0_shape, input1_shape, output_shape);
}
writer.block_end();
}
else
{
do_dot_operation_error(input0_shape, input1_shape, output_shape);
writer << "output" << runtime::intelgpu::access_dims(output_shape) << " = sum;\n";
// Closing brackets for main loops
runtime::intelgpu::generate_loops(writer, output_shape, false);
}
writer.block_end();
const cldnn::custom_gpu_primitive op_dot(output_name,
{input0_name, input1_name},
......
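The rewritten do_dot_operation() above replaces the per-rank helpers (do_1d_1d_mul, do_2d_2d_mul, do_3d_3d_mul, and so on) with one kernel generator parameterized by reduction_axes_count: the output ("main") loops come from generate_loops(), one inner loop is emitted per entry of reduction_shape (the first reduction_axes_count axes of input1), and the body accumulates input0[i...][k...] * input1[k...][i...]. A minimal host-side sketch of the same semantics, specialized to two reduction axes and using hypothetical names (not part of the commit), might look like this:

#include <cstddef>
#include <vector>

// Reference for Dot with reduction_axes_count == 2 (illustrative only):
// a has shape [I, K0, K1], b has shape [K0, K1, J], the result has shape [I, J].
// The generated OpenCL kernel uses the same loop nest: output loops outside,
// one reduction loop per axis of reduction_shape inside.
static std::vector<float> dot_reference_r2(const std::vector<float>& a,
                                           const std::vector<float>& b,
                                           std::size_t I, std::size_t K0,
                                           std::size_t K1, std::size_t J)
{
    std::vector<float> out(I * J, 0.0f);
    for (std::size_t i = 0; i < I; ++i)
        for (std::size_t j = 0; j < J; ++j)
        {
            float sum = 0.0f;
            for (std::size_t k0 = 0; k0 < K0; ++k0)
                for (std::size_t k1 = 0; k1 < K1; ++k1)
                    sum += a[(i * K0 + k0) * K1 + k1] * b[(k0 * K1 + k1) * J + j];
            out[i * J + j] = sum;
        }
    return out;
}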
@@ -72,7 +72,8 @@ namespace ngraph
const Shape& inputB_shape,
const std::string& output_name,
const Shape& output_shape,
const element::Type& output_type);
const element::Type& output_type,
size_t reduction_axes_count);
void do_slice_operation(cldnn::topology& topology,
const std::string& input_name,
......
avg_pool_2d_2channel_2image_padded_only_above_do_not_include_in_computation
avg_pool_2d_2channel_2image_padded_only_above_include_in_computation
avg_pool_3d_uneven_strided_padded
backwards_batch_norm_three_outputs
backwards_batch_norm_training
backwards_dot_scalar_tensor
backwards_dot_tensor3_tensor3
@@ -9,7 +10,6 @@ backwards_dot_tensor_vector
backwards_exp
backwards_maxpool_n2_c1_hw5_3x3_str2_max
backwards_maxpool_n4_c1_hw4_2x2_max
backwards_relu
backwards_replace_slice
backwards_reverse_sequence_n3_c2_h3
backwards_reverse_sequence_n4d2c3h2w2
@@ -21,41 +21,37 @@ batch_norm_training_0eps_f64
batch_norm_one_output
batch_norm_three_outputs
dequantize
dequantize_zero_offset
dequantize_axes
dequantize_int8
dequantize_int8_zero_offset
dequantize_int32
dequantize_int32_zero_offset
dequantize_int8
dequantize_int8_zero_offset
dequantize_zero_offset
divide_by_zero_int32
dot_3d_multi_axis
dot_4d_5d_multi_axis
dot_4d_5d_multi_axis_more
generate_mask
function_call
generate_mask
max_pool_3d
maxpool_bprop_larger_than_cache
numeric_double_inf
numeric_double_nan
quantize
quantize_zero_offset
quantize_axes
quantize_int8
quantize_int8_zero_offset
quantize_clamp_int32
quantize_clamp_int8
quantize_clamp_uint8
quantize_int32
quantize_int32_zero_offset
quantize_clamp_uint8
quantize_clamp_int8
quantize_clamp_int32
quantize_ROUND_NEAREST_TOWARD_ZERO
quantize_ROUND_NEAREST_TOWARD_INFINITY
quantize_ROUND_NEAREST_UPWARD
quantize_int8
quantize_int8_zero_offset
quantize_ROUND_DOWN
quantize_ROUND_NEAREST_DOWNWARD
quantize_ROUND_NEAREST_TOWARD_EVEN
quantize_ROUND_NEAREST_TOWARD_INFINITY
quantize_ROUND_NEAREST_TOWARD_ZERO
quantize_ROUND_NEAREST_UPWARD
quantize_ROUND_TOWARD_INFINITY
quantize_ROUND_TOWARD_ZERO
quantize_ROUND_UP
quantize_ROUND_DOWN
quantize_zero_offset
reduce_window_emulating_max_pool_1d_1channel_1image
reduce_window_emulating_max_pool_1d_1channel_2image
reduce_window_emulating_max_pool_1d_2channel_2image
@@ -73,9 +69,14 @@ reverse_sequence_n2c3h4w2
reverse_sequence_n4c3h2w2
reverse_sequence_n4d2c3h2w2
select_and_scatter_3d_without_overlap
select_and_scatter_with_overlap
select_and_scatter_without_overlap
select_and_scatter_with_overlap
shape_of_5d
shape_of_matrix
shape_of_scalar
shape_of_vector
softmax_axis_3d_double
sum_stable_acc
topk_1d_max_all
topk_1d_max_one
topk_1d_max_partial
@@ -88,17 +89,17 @@ topk_2d_max_partial
topk_2d_min_all
topk_2d_min_one
topk_2d_min_partial
topk_3d_large_input_max
topk_3d_large_input_min
topk_3d_max_all
topk_3d_max_one
topk_3d_max_partial
topk_3d_min_all
topk_3d_min_one
topk_3d_min_partial
topk_3d_single_output
topk_5d_max_partial
topk_int64
topk_3d_large_input_max
topk_3d_large_input_min
topk_3d_single_output
zero_sized_abs
zero_sized_acos
zero_sized_add
@@ -145,3 +146,4 @@ max_3d_to_scalar_double
argmin_trivial_in_i32
argmax_4D_axis_3_i64_in_i32
argmin_trivial_in_double
maxpool_bprop_larger_than_cache