Commit 62e1bc26
authored Feb 14, 2019 by Sergey Shalnov
committed by Scott Cyphers, Feb 14, 2019

IntelGPU backend: Double datatype workaround implemented (#2435)
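clDNN's cldnn::data_types enum has no f64 entry, so before this commit a double tensor either threw "IntelGPULayout::get_cldnn_type: Unknown type" in the layout code or reached generated kernels whose parameters were hard-coded as float. The workaround, visible in the diffs below, has four parts:

- intelgpu_backend.cpp: BatchNorm ops are routed to the backend's own generated OpenCL kernels (do_batch_norm_operation) unless the data input is a 4D f32 tensor, the only case handled by the clDNN primitive here.
- intelgpu_layout.cpp: get_cldnn_type and create_cldnn_layout become switches over element::Type_t; f64 and u64 buffers are described with the same-sized i64 layout, and u32 with i32, so allocation and copies stay byte-correct.
- intelgpu_op_batchnorm.cpp: the generated OpenCL source is parameterized on the element type via get_opencl_type_name() instead of hard-coding float.
- unit_test.manifest: seven double-precision tests are removed from the skip list, i.e. re-enabled.

For orientation, here is a minimal sketch of the kind of element-type-to-OpenCL-type mapping a helper such as get_opencl_type_name() performs. The real helper lives elsewhere in the intelgpu backend sources; treat this as an illustration, not the actual implementation:

    // Sketch only, not ngraph source: map an ngraph element type to the
    // OpenCL C type name spliced into the generated kernel text.
    static std::string opencl_type_name_sketch(const element::Type& type)
    {
        switch (type.get_type_enum())
        {
        case element::Type_t::f64: return "double"; // the case this commit enables
        case element::Type_t::f32: return "float";
        case element::Type_t::i64: return "long";
        case element::Type_t::i32: return "int";
        default: throw std::invalid_argument("no OpenCL type mapping");
        }
    }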
parent f75e10c3

Showing 4 changed files with 87 additions and 75 deletions:

  +4  -2   src/ngraph/runtime/intelgpu/intelgpu_backend.cpp
  +25 -24  src/ngraph/runtime/intelgpu/intelgpu_layout.cpp
  +58 -42  src/ngraph/runtime/intelgpu/intelgpu_op_batchnorm.cpp
  +0  -7   src/ngraph/runtime/intelgpu/unit_test.manifest
src/ngraph/runtime/intelgpu/intelgpu_backend.cpp

@@ -1332,7 +1332,8 @@ shared_ptr<runtime::Executable>
             arguments_check(op, 5, 1);
 
-            if (get_input_shape(op, 2).size() != 4)
+            if ((get_input_shape(op, 2).size() != 4) ||
+                (get_input_type(op) != ngraph::element::f32))
             {
                 do_batch_norm_operation(topology,
                                         get_output_name(op),
...
@@ -1364,7 +1365,8 @@ shared_ptr<runtime::Executable>
                 static_pointer_cast<op::BatchNormTraining>(op);
             const double eps = bnorm->get_eps_value();
 
-            if (get_input_shape(op, 2).size() != 4)
+            if ((get_input_shape(op, 2).size() != 4) ||
+                (get_input_type(op) != ngraph::element::f32))
             {
                 string mean_name;
                 string variance_name;
...
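Both hunks implement the same routing rule: the clDNN batch-norm path is taken only for 4D f32 data (input 2 is the data tensor in this op's argument order), and any other rank or element type, including f64, now falls through to do_batch_norm_operation(), which emits a custom OpenCL kernel. Condensed into a single predicate (illustrative, not literal source):

    // When true, emit a custom OpenCL kernel instead of using the
    // clDNN batch_norm primitive.
    const bool use_custom_kernel = (get_input_shape(op, 2).size() != 4) ||
                                   (get_input_type(op) != ngraph::element::f32);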
src/ngraph/runtime/intelgpu/intelgpu_layout.cpp

@@ -54,32 +54,19 @@ bool runtime::intelgpu::IntelGPULayout::
 cldnn::data_types
     runtime::intelgpu::IntelGPULayout::get_cldnn_type(const element::Type& element_type)
 {
-    if ((element_type == ngraph::element::i8) || (element_type == ngraph::element::boolean))
+    switch (element_type.get_type_enum())
     {
-        return cldnn::data_types::i8;
-    }
-    else if (element_type == ngraph::element::u8)
-    {
-        return cldnn::data_types::u8;
-    }
-    else if (element_type == ngraph::element::i32)
-    {
-        return cldnn::data_types::i32;
-    }
-    else if (element_type == ngraph::element::i64)
-    {
-        return cldnn::data_types::i64;
-    }
-    else if (element_type == ngraph::element::f32)
-    {
-        return cldnn::data_types::f32;
-    }
-    else
-    {
-        ostringstream os;
-        os << "IntelGPULayout::get_cldnn_type: Unknown type " << element_type;
-        throw invalid_argument(os.str());
+    case element::Type_t::i8:
+    case element::Type_t::boolean: return cldnn::data_types::i8;
+    case element::Type_t::u8: return cldnn::data_types::u8;
+    case element::Type_t::i32: return cldnn::data_types::i32;
+    case element::Type_t::i64: return cldnn::data_types::i64;
+    case element::Type_t::f32: return cldnn::data_types::f32;
     }
+
+    ostringstream os;
+    os << "IntelGPULayout::get_cldnn_type: Unknown type " << element_type;
+    throw invalid_argument(os.str());
 }
 
 cldnn::tensor runtime::intelgpu::IntelGPULayout::create_cldnn_tensor(const Shape& element_shape)
...
@@ -131,13 +118,27 @@ cldnn::layout runtime::intelgpu::IntelGPULayout::create_cldnn_layout(
     const cldnn::tensor tensor = create_cldnn_tensor(element_shape);
     cldnn::data_types data_type;
 
-    if ((element_type == ngraph::element::i16) || (element_type == ngraph::element::u16))
+    switch (element_type.get_type_enum())
     {
+    case element::Type_t::i16:
+    case element::Type_t::u16:
+    {
         data_type = cldnn::data_types::f16;
+        break;
     }
-    else
+    case element::Type_t::u32:
     {
-        data_type = get_cldnn_type(element_type);
+        data_type = cldnn::data_types::i32;
+        break;
     }
+    case element::Type_t::u64:
+    case element::Type_t::f64:
+    {
+        data_type = cldnn::data_types::i64;
+        break;
+    }
+    default: { data_type = get_cldnn_type(element_type); }
+    }
+
     return cldnn::layout(data_type, format, tensor);
...
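The new cases rely on size-compatible aliasing: the cldnn layout chiefly drives buffer allocation and copying, so an 8-byte double can travel in an i64-described layout and a 4-byte u32 in an i32 one, with the generated kernels reinterpreting the bytes under the correct OpenCL type (the pre-existing i16/u16 -> f16 case already used the same 2-byte trick). A standalone check of the size assumptions this leans on, as a sketch in plain C++:

    // Sanity checks for the storage-aliasing assumption behind the new cases.
    #include <cstdint>
    static_assert(sizeof(double) == sizeof(std::int64_t), "f64 stored as i64");
    static_assert(sizeof(std::uint64_t) == sizeof(std::int64_t), "u64 stored as i64");
    static_assert(sizeof(std::uint32_t) == sizeof(std::int32_t), "u32 stored as i32");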
src/ngraph/runtime/intelgpu/intelgpu_op_batchnorm.cpp

@@ -64,11 +64,12 @@ void runtime::intelgpu::do_create_mean(cldnn::topology& topology,
     const cldnn::layout layout = IntelGPULayout::create_cldnn_layout(output_type, channel_shape);
     const string entry_point_name = "create_mean_" + output_name;
     const size_t output_counts = shape_size<Shape>(input_shape) / input_shape.at(channel_axis);
+    const string kernel_data_type = get_opencl_type_name(output_type);
     codegen::CodeWriter writer;
 
-    writer << "__kernel void " << entry_point_name << "( const __global float input"
-           << array_dims(input_shape) << ", __global float output" << array_dims(channel_shape)
-           << ")\n";
+    writer << "__kernel void " << entry_point_name << "( const __global "
+           << kernel_data_type << " input" << array_dims(input_shape) << ", __global "
+           << kernel_data_type << " output" << array_dims(channel_shape) << ")\n";
 
     writer.block_begin();
     { // Main function body
...
@@ -78,7 +79,7 @@ void runtime::intelgpu::do_create_mean(cldnn::topology& topology,
            << input_shape.at(channel_axis) << "; ++i" << channel_axis << ")\n";
     writer.block_begin();
     {
-        writer << "float sum = 0.0f;\n";
+        writer << kernel_data_type << " sum = 0.0f;\n";
         size_t var_idx = 0;
         // Main loops
         for (auto const& i : input_shape)
...
@@ -139,11 +140,13 @@ void runtime::intelgpu::do_create_variance(cldnn::topology& topology,
     const cldnn::layout layout = IntelGPULayout::create_cldnn_layout(output_type, channel_shape);
     const string entry_point_name = "create_variance_" + output_name;
     const size_t output_counts = shape_size<Shape>(input_shape) / input_shape.at(channel_axis);
+    const string kernel_data_type = get_opencl_type_name(output_type);
     codegen::CodeWriter writer;
 
-    writer << "__kernel void " << entry_point_name << "( const __global float input"
-           << array_dims(input_shape) << ", const __global float mean" << array_dims(channel_shape)
-           << ", __global float output" << array_dims(channel_shape) << ")\n";
+    writer << "__kernel void " << entry_point_name << "( const __global " << kernel_data_type
+           << " input" << array_dims(input_shape) << ", const __global " << kernel_data_type
+           << " mean" << array_dims(channel_shape) << ", __global " << kernel_data_type << " output"
+           << array_dims(channel_shape) << ")\n";
 
     writer.block_begin();
     { // Main function body
...
@@ -153,7 +156,7 @@ void runtime::intelgpu::do_create_variance(cldnn::topology& topology,
            << input_shape.at(channel_axis) << "; ++i" << channel_axis << ")\n";
     writer.block_begin();
     {
-        writer << "float sum = 0.0f;\n";
+        writer << kernel_data_type << " sum = 0.0f;\n";
         size_t var_idx = 0;
         // Main loops
...
@@ -168,8 +171,8 @@ void runtime::intelgpu::do_create_variance(cldnn::topology& topology,
             ++var_idx;
         }
 
-        writer << "float mean_diff = input" << access_dims(input_shape) << " - mean[i"
-               << channel_axis << "];\n";
+        writer << kernel_data_type << " mean_diff = input" << access_dims(input_shape)
+               << " - mean[i" << channel_axis << "];\n";
         writer << "sum += mean_diff * mean_diff;\n";
         var_idx = 0;
...
@@ -217,14 +220,17 @@ void runtime::intelgpu::do_batch_norm_operation(cldnn::topology& topology,
     const cldnn::layout layout = IntelGPULayout::create_cldnn_layout(output_type, input_shape);
     const vector<size_t> gws(input_shape.begin(), input_shape.begin() + 2);
     const string entry_point_name = "batch_norm_" + output_name;
+    const string kernel_data_type = get_opencl_type_name(output_type);
     codegen::CodeWriter writer;
 
     // The kernel name and parameters
     writer << "__attribute__((reqd_work_group_size(1,1,1)))\n"
            << "__kernel void " << entry_point_name
-           << "(const __global float *input0, const __global float *input1,"
-           << " const __global float *input2, const __global float *input3,"
-           << " const __global float *input4, __global float *output)\n";
+           << "(const __global " << kernel_data_type
+           << " *input0, const __global " << kernel_data_type << " *input1,"
+           << " const __global " << kernel_data_type << " *input2, const __global "
+           << kernel_data_type << " *input3,"
+           << " const __global " << kernel_data_type << " *input4, __global " << kernel_data_type << " *output)\n";
 
     writer.block_begin();
     { // Main function body
...
@@ -234,11 +240,12 @@ void runtime::intelgpu::do_batch_norm_operation(cldnn::topology& topology,
            << "); /* channel_axis trip count " << input_shape.at(channel_axis) << "*/\n";
 
         // Invariants for the rest of the loops
-        writer << "const float gamma = input1[i" << channel_axis << "];\n"
-               << "const float beta = input2[i" << channel_axis << "];\n"
-               << "const float mean = input3[i" << channel_axis << "];\n"
-               << "const float variance = input4[i" << channel_axis << "];\n"
-               << "const float var_sqrt = (gamma / sqrt(variance + " << eps << "));\n";
+        writer << "const " << kernel_data_type << " gamma = input1[i" << channel_axis << "];\n"
+               << "const " << kernel_data_type << " beta = input2[i" << channel_axis << "];\n"
+               << "const " << kernel_data_type << " mean = input3[i" << channel_axis << "];\n"
+               << "const " << kernel_data_type << " variance = input4[i" << channel_axis << "];\n"
+               << "const " << kernel_data_type << " var_sqrt = (gamma / sqrt(variance + " << eps
+               << "));\n";
 
         writer << "const uint i0 = get_global_id(0);"
                << " /* batch axis trip count " << input_shape.at(0) << "*/\n";
...
@@ -285,14 +292,16 @@ void runtime::intelgpu::do_create_variance_back(cldnn::topology& topology,
     const Shape channel_shape = get_channel_shape(input_shape, "create_variance_back");
     const cldnn::layout layout = IntelGPULayout::create_cldnn_layout(output_type, channel_shape);
     const string entry_point_name = "create_variance_back_" + output_name;
+    const string kernel_data_type = get_opencl_type_name(output_type);
     codegen::CodeWriter writer;
     vector<size_t> gws;
 
-    writer << "__kernel void " << entry_point_name << "(const __global float input"
-           << array_dims(input_shape) << ", const __global float delta" << array_dims(input_shape)
-           << ", const __global float mean" << array_dims(channel_shape)
-           << ", const __global float variance" << array_dims(channel_shape)
-           << ", __global float output" << array_dims(channel_shape) << ")\n";
+    writer << "__kernel void " << entry_point_name << "(const __global " << kernel_data_type
+           << " input" << array_dims(input_shape) << ", const __global " << kernel_data_type
+           << " delta" << array_dims(input_shape) << ", const __global " << kernel_data_type
+           << " mean" << array_dims(channel_shape) << ", const __global " << kernel_data_type
+           << " variance" << array_dims(channel_shape) << ", __global " << kernel_data_type
+           << " output" << array_dims(channel_shape) << ")\n";
 
     writer.block_begin();
     { // Main function body
...
@@ -302,10 +311,12 @@ void runtime::intelgpu::do_create_variance_back(cldnn::topology& topology,
         writer << "\nconst uint i" << channel_axis << " = get_global_id(" << channel_axis
                << "); /* channel_axis trip count " << input_shape.at(channel_axis) << "*/\n";
         gws.push_back(input_shape.at(channel_axis));
 
-        writer << "const float mean_loc = mean[i" << channel_axis << "];\n"
-               << "const float variance_loc = variance[i" << channel_axis << "];\n"
-               << "const float var_sqrt = 1.0f / sqrt(variance_loc + " << eps << ");\n";
-        writer << "float sum = 0.0f;\n";
+        writer << "const " << kernel_data_type << " mean_loc = mean[i" << channel_axis << "];\n"
+               << "const " << kernel_data_type << " variance_loc = variance[i" << channel_axis
+               << "];\n"
+               << "const " << kernel_data_type << " var_sqrt = 1.0f / sqrt(variance_loc + " << eps
+               << ");\n";
+        writer << kernel_data_type << " sum = 0.0f;\n";
 
         // Main loops
         writer << "for (uint i0 = 0; i0 < " << input_shape.at(0) << "; ++i0)\n";
...
@@ -317,8 +328,10 @@ void runtime::intelgpu::do_create_variance_back(cldnn::topology& topology,
         writer << "for (uint i3 = 0; i3 < " << input_shape.at(3) << "; ++i3)\n";
         writer.block_begin();
         {
-            writer << "const float input_loc = input" << access_dims(input_shape) << ";\n";
-            writer << "const float delta_loc = delta" << access_dims(input_shape) << ";\n";
+            writer << "const " << kernel_data_type << " input_loc = input"
+                   << access_dims(input_shape) << ";\n";
+            writer << "const " << kernel_data_type << " delta_loc = delta"
+                   << access_dims(input_shape) << ";\n";
             writer << "sum += (input_loc - mean_loc) * var_sqrt * delta_loc;\n";
         }
         writer.block_end();
...
@@ -360,17 +373,19 @@ void runtime::intelgpu::do_batch_norm_backprop_operation(cldnn::topology& topolo
     const cldnn::layout layout = IntelGPULayout::create_cldnn_layout(type, shape);
     const string entry_point_name = "batch_norm_backprop_" + output_name;
     const size_t r_axes_size = shape_size(shape) / shape_size(channel_shape);
+    const string kernel_data_type = get_opencl_type_name(type);
     codegen::CodeWriter writer;
     vector<size_t> gws;
 
-    writer << "__kernel void " << entry_point_name << "(const __global float input"
-           << array_dims(shape) << ", const __global float delta" << array_dims(shape)
-           << ", const __global float mean" << array_dims(channel_shape)
-           << ", const __global float variance" << array_dims(channel_shape)
-           << ", const __global float gamma" << array_dims(channel_shape)
-           << ", const __global float gamma_backprop" << array_dims(channel_shape)
-           << ", const __global float beta_backprop" << array_dims(channel_shape)
-           << ", __global float output" << array_dims(shape) << ")\n";
+    writer << "__kernel void " << entry_point_name << "(const __global " << kernel_data_type
+           << " input" << array_dims(shape) << ", const __global " << kernel_data_type << " delta"
+           << array_dims(shape) << ", const __global " << kernel_data_type << " mean"
+           << array_dims(channel_shape) << ", const __global " << kernel_data_type << " variance"
+           << array_dims(channel_shape) << ", const __global " << kernel_data_type << " gamma"
+           << array_dims(channel_shape) << ", const __global " << kernel_data_type
+           << " gamma_backprop" << array_dims(channel_shape) << ", const __global "
+           << kernel_data_type << " beta_backprop" << array_dims(channel_shape) << ", __global "
+           << kernel_data_type << " output" << array_dims(shape) << ")\n";
 
     writer.block_begin();
     { // Main function body
...
@@ -378,10 +393,11 @@ void runtime::intelgpu::do_batch_norm_backprop_operation(cldnn::topology& topolo
         // Main loops
         gws = runtime::intelgpu::generate_loops(writer, shape, true);
 
-        writer << "float stddev = sqrt(variance[i" << channel_axis << "] + " << eps << ");\n";
-        writer << "float xhat = (input" << access_dims(shape) << " - mean[i" << channel_axis
-               << "]) / stddev;\n";
-        writer << "float norma = gamma[i" << channel_axis << "] / stddev;\n";
+        writer << kernel_data_type << " stddev = sqrt(variance[i" << channel_axis << "] + " << eps
+               << ");\n";
+        writer << kernel_data_type << " xhat = (input" << access_dims(shape) << " - mean[i"
+               << channel_axis << "]) / stddev;\n";
+        writer << kernel_data_type << " norma = gamma[i" << channel_axis << "] / stddev;\n";
 
         writer << "output" << access_dims(shape) << " = norma * (delta" << access_dims(shape)
                << " - (xhat * gamma_backprop[i" << channel_axis << "] + beta_backprop[i"
...
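With kernel_data_type spliced in, the writer emits type-correct OpenCL for every parameter that used to be hard-coded float. As an illustration, assuming a hypothetical f64 tensor of shape [2, 3, 4, 5], channel_axis 1, and an output named X, do_create_mean would now produce a signature roughly like:

    __kernel void create_mean_X( const __global double input[2][3][4][5],
                                 __global double output[3])

whereas the old code emitted float parameters regardless of the tensor's element type.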
src/ngraph/runtime/intelgpu/unit_test.manifest

 all_2x2x3_eliminate_dims_0_1
-argmin_trivial_in_double
 avg_pool_2d_2channel_2image_padded_only_above_do_not_include_in_computation
 avg_pool_2d_2channel_2image_padded_only_above_include_in_computation
 avg_pool_3d_uneven_strided_padded
...
@@ -30,10 +29,7 @@ embedding_lookup_10x1_arbitrary
 embedding_lookup_10x1_arbitrary_index_type_int
 embedding_lookup_4x5_reverse
 generate_mask
-max_3d_to_scalar_double
 max_pool_3d
-numeric_double_inf
-numeric_double_nan
 quantize
 quantize_axes
 quantize_clamp_int32
...
@@ -69,9 +65,6 @@ shape_of_matrix
 shape_of_scalar
 shape_of_vector
 softmax_axis_3d_double
-sum_stable_acc_double
-sum_stable_simple_double
-sum_trivial_in_double
 topk_1d_max_all
 topk_1d_max_one
 topk_1d_max_partial
...