Unverified commit d3ea93e2 authored by Chris Sullivan, committed by GitHub

GPU elementwise emitters now respect input and output tensor types. (#633)

* GPU elementwise emitters now respect input and output tensor types.
This enables the use of binary comparison ops and op::Convert.

* Removed comments.

* All kernels now carry a type signature, even when the input and output
  tensors have the same type, so that kernels specialized for particular
  tensor types are unique (see the illustrative sketch after this message).

NGMX-391 #close 
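For illustration, with these changes a kernel compiled for op::Greater on float inputs and a boolean (char) output is cached under the key "Greater_float_char" and looks roughly like the sketch below. This is a hedged reconstruction: the exact element-type strings and the loop/indexing body of the __global__ wrapper (not visible in this diff) may differ from what the builder actually emits.

typedef signed char int8_t;   // preamble from add_pod_typedefs (remaining typedefs elided)

__device__ char greater_float_char(float x0, float x1)
{
    return x0 > x1;   // math_kernel from CudaOpMap<ngraph::op::Greater>
}

extern "C" __global__ void cuda_Greater_float_char(float* in0, float* in1, char* out, size_t n)
{
    // Indexing is assumed; the wrapper body lies outside the visible hunks.
    size_t i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
    {
        out[i] = greater_float_char(in0[i], in1[i]);
    }
}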
parent 429eae9a
......@@ -18,18 +18,19 @@
using namespace ngraph;
void runtime::gpu::CudaKernelBuilder::get_elementwise_op(codegen::CodeWriter& writer,
const std::string& name,
const std::string& data_type,
const std::string& op,
const size_t& num_inputs)
void runtime::gpu::CudaKernelBuilder::get_elementwise_op(
codegen::CodeWriter& writer,
const std::string& name,
const std::string& op,
const std::array<std::string, 2>& data_types,
const size_t& num_inputs)
{
writer << "extern \"C\" __global__ void cuda_" << name << "(";
for (size_t i = 0; i < num_inputs; i++)
{
writer << data_type << "* in" << i << ", ";
writer << data_types[0] << "* in" << i << ", ";
}
writer << data_type << "* out,"
writer << data_types[1] << "* out,"
<< "size_t n)\n";
writer << "{\n";
writer.indent++;
......@@ -55,20 +56,21 @@ void runtime::gpu::CudaKernelBuilder::get_elementwise_op(codegen::CodeWriter& wr
return;
}
void runtime::gpu::CudaKernelBuilder::get_device_helper(codegen::CodeWriter& writer,
const std::string& name,
const std::string& data_type,
const std::string& math_kernel,
const size_t& num_inputs)
void runtime::gpu::CudaKernelBuilder::get_device_helper(
codegen::CodeWriter& writer,
const std::string& name,
const std::string& math_kernel,
const std::array<std::string, 2>& data_types,
const size_t& num_inputs)
{
if (math_kernel.size())
{
writer << "__device__ " << data_type << " " << name << "(";
writer << "__device__ " << data_types[1] << " " << name << "(";
for (size_t i = 0; i < num_inputs - 1; i++)
{
writer << data_type << " x" << i << ", ";
writer << data_types[0] << " x" << i << ", ";
}
writer << data_type << " x" << num_inputs - 1;
writer << data_types[0] << " x" << num_inputs - 1;
writer << ")\n";
writer << "{\n";
writer.indent++;
......@@ -80,3 +82,16 @@ void runtime::gpu::CudaKernelBuilder::get_device_helper(codegen::CodeWriter& wri
}
return;
}
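The device helper's return type now comes from data_types[1] (the output type) while every parameter uses data_types[0] (the input type), which is what lets one helper shape cover both comparisons (float in, char out) and op::Convert. As a hedged example, assuming element::i32 lowers to the string "int32_t", the helper emitted for a Convert from i32 to f32 would be approximately:

__device__ float convert_int32_t_float(int32_t x0)
{
    return x0;   // math_kernel is "x0"; the implicit conversion to the return type does the cast
}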
void runtime::gpu::CudaKernelBuilder::add_pod_typedefs(codegen::CodeWriter& writer)
{
writer << "typedef signed char int8_t;\n";
writer << "typedef signed short int16_t;\n";
writer << "typedef signed int int32_t;\n";
writer << "typedef signed long int int64_t;\n";
writer << "typedef unsigned char uint8_t;\n";
writer << "typedef unsigned short uint16_t;\n";
writer << "typedef unsigned int uint32_t;\n";
writer << "typedef unsigned long int uint64_t;\n";
writer << "\n";
}
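These POD typedefs are emitted at the top of every kernel string, presumably because the NVRTC-compiled source does not pull in <cstdint> while the fixed-width names can now appear in kernel signatures. A rough sketch of how the builder calls fit together, mirroring the order used in emit_elementwise_op later in this diff (the literal names and types here are made up for illustration):

codegen::CodeWriter writer;
runtime::gpu::CudaKernelBuilder::add_pod_typedefs(writer);   // typedef preamble
runtime::gpu::CudaKernelBuilder::get_device_helper(
    writer, "less_float_char", "x0 < x1", {"float", "char"}, 2);
runtime::gpu::CudaKernelBuilder::get_elementwise_op(
    writer, "Less_float_char", "less_float_char", {"float", "char"}, 2);
std::string kernel = writer.get_code();   // source handed to the function pool / NVRTC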
......@@ -16,6 +16,7 @@
#pragma once
#include <array>
#include <string>
#include <vector>
......@@ -34,15 +35,16 @@ namespace ngraph
public:
static void get_elementwise_op(codegen::CodeWriter& writer,
const std::string& name,
const std::string& data_type,
const std::string& op,
const std::array<std::string, 2>& data_types,
const size_t& num_inputs);
static void get_device_helper(codegen::CodeWriter& writer,
const std::string& name,
const std::string& data_type,
const std::string& math_kernel,
const std::array<std::string, 2>& data_types,
const size_t& num_inputs);
static void add_pod_typedefs(codegen::CodeWriter& writer);
};
}
}
......
......@@ -21,6 +21,7 @@
#include "ngraph/runtime/gpu/gpu_cuda_kernel_ops.hpp"
using namespace ngraph;
void runtime::gpu::emit_broadcast(
void* in, void* out, size_t repeat_size, size_t repeat_times, size_t count)
{
......
......@@ -16,6 +16,9 @@
#pragma once
#include <array>
#include <string>
#include "ngraph/codegen/code_writer.hpp"
#include "ngraph/coordinate.hpp"
#include "ngraph/runtime/gpu/gpu_cuda_function_pool.hpp"
......@@ -36,43 +39,49 @@ namespace ngraph
template <typename T, typename... Inputs>
void emit_elementwise_op(std::string name,
std::array<std::string, 2> data_types,
size_t count,
CUdeviceptr out,
Inputs&&... inputs)
{
// Create an instance of nvrtcProgram with the code string.
if (CudaFunctionPool::instance().get(name) == nullptr)
std::string type_signature = "_" + data_types[0] + "_" + data_types[1];
if (CudaFunctionPool::instance().get(name + type_signature) == nullptr)
{
codegen::CodeWriter writer;
CudaKernelBuilder::add_pod_typedefs(writer);
std::string op_name = CudaOpMap<T>::op;
if (CudaOpMap<T>::math_kernel)
{
op_name += type_signature;
CudaKernelBuilder::get_device_helper(writer,
CudaOpMap<T>::op,
CudaOpMap<T>::type,
op_name,
CudaOpMap<T>::math_kernel,
data_types,
sizeof...(inputs));
}
CudaKernelBuilder::get_elementwise_op(
writer, name, CudaOpMap<T>::type, CudaOpMap<T>::op, sizeof...(inputs));
writer, name + type_signature, op_name, data_types, sizeof...(inputs));
std::string kernel = writer.get_code();
CudaFunctionPool::instance().set(name, kernel);
CudaFunctionPool::instance().set(name + type_signature, kernel);
}
//convert runtime ptr to driver api ptr
void* args_list[] = {&inputs..., &out, &count};
CUDA_SAFE_CALL(cuLaunchKernel(*CudaFunctionPool::instance().get(name).get(),
count,
1,
1, // grid dim
1,
1,
1, // block dim
0,
NULL, // shared mem and stream
args_list,
0)); // arguments
CUDA_SAFE_CALL(
cuLaunchKernel(*CudaFunctionPool::instance().get(name + type_signature).get(),
count,
1,
1, // grid dim
1,
1,
1, // block dim
0,
NULL, // shared mem and stream
args_list,
0)); // arguments
CUDA_SAFE_CALL(cuCtxSynchronize()); // Retrieve and print output.
}
}
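Because the function-pool key is now the op name plus "_<input type>_<output type>", the same op instantiated over different element types compiles to distinct kernels instead of reusing whichever one was cached first. A hypothetical direct invocation is sketched below; in practice this call is emitted as C++ source by GPU_Emitter::EmitElementwise, and the device pointers here are placeholders:

CUdeviceptr in0 = 0;   // placeholder: device buffer of float
CUdeviceptr in1 = 0;   // placeholder: device buffer of float
CUdeviceptr out = 0;   // placeholder: device buffer of char
size_t count = 8;
ngraph::runtime::gpu::emit_elementwise_op<ngraph::op::Greater>(
    "Greater", {"float", "char"}, count, out, in0, in1);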
......
......@@ -38,8 +38,6 @@ namespace ngraph
class Subtract;
class Divide;
class Sign;
// requires different input and output types
class Convert;
class Equal;
class NotEqual;
......@@ -47,8 +45,6 @@ namespace ngraph
class GreaterEq;
class Less;
class LessEq;
// Unimplemented or unused in favor of cuDNN impl.
class Max;
class Min;
class Negative;
......@@ -63,7 +59,6 @@ namespace ngraph
struct CudaOpMap<ngraph::op::Abs>
{
static constexpr const char* op = "fabsf";
static constexpr const char* type = "float";
static constexpr const char* math_kernel = nullptr;
};
......@@ -71,7 +66,6 @@ namespace ngraph
struct CudaOpMap<ngraph::op::Acos>
{
static constexpr const char* op = "acosf";
static constexpr const char* type = "float";
static constexpr const char* math_kernel = nullptr;
};
......@@ -79,7 +73,6 @@ namespace ngraph
struct CudaOpMap<ngraph::op::Asin>
{
static constexpr const char* op = "asinf";
static constexpr const char* type = "float";
static constexpr const char* math_kernel = nullptr;
};
......@@ -87,7 +80,6 @@ namespace ngraph
struct CudaOpMap<ngraph::op::Atan>
{
static constexpr const char* op = "atanf";
static constexpr const char* type = "float";
static constexpr const char* math_kernel = nullptr;
};
......@@ -95,7 +87,6 @@ namespace ngraph
struct CudaOpMap<ngraph::op::Ceiling>
{
static constexpr const char* op = "ceilf";
static constexpr const char* type = "float";
static constexpr const char* math_kernel = nullptr;
};
......@@ -103,7 +94,6 @@ namespace ngraph
struct CudaOpMap<ngraph::op::Cos>
{
static constexpr const char* op = "cosf";
static constexpr const char* type = "float";
static constexpr const char* math_kernel = nullptr;
};
......@@ -111,7 +101,6 @@ namespace ngraph
struct CudaOpMap<ngraph::op::Cosh>
{
static constexpr const char* op = "coshf";
static constexpr const char* type = "float";
static constexpr const char* math_kernel = nullptr;
};
......@@ -119,7 +108,6 @@ namespace ngraph
struct CudaOpMap<ngraph::op::Exp>
{
static constexpr const char* op = "expf";
static constexpr const char* type = "float";
static constexpr const char* math_kernel = nullptr;
};
......@@ -127,7 +115,6 @@ namespace ngraph
struct CudaOpMap<ngraph::op::Floor>
{
static constexpr const char* op = "floorf";
static constexpr const char* type = "float";
static constexpr const char* math_kernel = nullptr;
};
......@@ -135,7 +122,6 @@ namespace ngraph
struct CudaOpMap<ngraph::op::Log>
{
static constexpr const char* op = "logf";
static constexpr const char* type = "float";
static constexpr const char* math_kernel = nullptr;
};
......@@ -143,7 +129,6 @@ namespace ngraph
struct CudaOpMap<ngraph::op::Max>
{
static constexpr const char* op = "fmaxf";
static constexpr const char* type = "float";
static constexpr const char* math_kernel = nullptr;
};
......@@ -151,7 +136,6 @@ namespace ngraph
struct CudaOpMap<ngraph::op::Min>
{
static constexpr const char* op = "fminf";
static constexpr const char* type = "float";
static constexpr const char* math_kernel = nullptr;
};
......@@ -159,7 +143,6 @@ namespace ngraph
struct CudaOpMap<ngraph::op::Sin>
{
static constexpr const char* op = "sinf";
static constexpr const char* type = "float";
static constexpr const char* math_kernel = nullptr;
};
......@@ -167,7 +150,6 @@ namespace ngraph
struct CudaOpMap<ngraph::op::Sinh>
{
static constexpr const char* op = "sinhf";
static constexpr const char* type = "float";
static constexpr const char* math_kernel = nullptr;
};
......@@ -175,7 +157,6 @@ namespace ngraph
struct CudaOpMap<ngraph::op::Sqrt>
{
static constexpr const char* op = "sqrtf";
static constexpr const char* type = "float";
static constexpr const char* math_kernel = nullptr;
};
......@@ -183,7 +164,6 @@ namespace ngraph
struct CudaOpMap<ngraph::op::Tan>
{
static constexpr const char* op = "tanf";
static constexpr const char* type = "float";
static constexpr const char* math_kernel = nullptr;
};
......@@ -191,7 +171,6 @@ namespace ngraph
struct CudaOpMap<ngraph::op::Tanh>
{
static constexpr const char* op = "tanhf";
static constexpr const char* type = "float";
static constexpr const char* math_kernel = nullptr;
};
......@@ -199,7 +178,6 @@ namespace ngraph
struct CudaOpMap<ngraph::op::Power>
{
static constexpr const char* op = "powf";
static constexpr const char* type = "float";
static constexpr const char* math_kernel = nullptr;
};
......@@ -207,7 +185,6 @@ namespace ngraph
struct CudaOpMap<ngraph::op::Subtract>
{
static constexpr const char* op = "subtractf";
static constexpr const char* type = "float";
static constexpr const char* math_kernel = "x0-x1";
};
......@@ -215,7 +192,6 @@ namespace ngraph
struct CudaOpMap<ngraph::op::Divide>
{
static constexpr const char* op = "fdividef";
static constexpr const char* type = "float";
static constexpr const char* math_kernel = nullptr;
};
......@@ -223,9 +199,57 @@ namespace ngraph
struct CudaOpMap<ngraph::op::Sign>
{
static constexpr const char* op = "sign";
static constexpr const char* type = "float";
static constexpr const char* math_kernel = "(x0 > 0) - (x0 < 0)";
};
template <>
struct CudaOpMap<ngraph::op::Convert>
{
static constexpr const char* op = "convert";
static constexpr const char* math_kernel = "x0";
};
template <>
struct CudaOpMap<ngraph::op::Equal>
{
static constexpr const char* op = "equal";
static constexpr const char* math_kernel = "x0 == x1";
};
template <>
struct CudaOpMap<ngraph::op::NotEqual>
{
static constexpr const char* op = "not_equal";
static constexpr const char* math_kernel = "x0 != x1";
};
template <>
struct CudaOpMap<ngraph::op::Greater>
{
static constexpr const char* op = "greater";
static constexpr const char* math_kernel = "x0 > x1";
};
template <>
struct CudaOpMap<ngraph::op::GreaterEq>
{
static constexpr const char* op = "greater_equal";
static constexpr const char* math_kernel = "x0 >= x1";
};
template <>
struct CudaOpMap<ngraph::op::Less>
{
static constexpr const char* op = "less";
static constexpr const char* math_kernel = "x0 < x1";
};
template <>
struct CudaOpMap<ngraph::op::LessEq>
{
static constexpr const char* op = "less_equal";
static constexpr const char* math_kernel = "x0 <= x1";
};
}
}
}
......@@ -97,15 +97,13 @@
#include "ngraph/util.hpp"
using namespace std;
using namespace ngraph;
namespace ngraph
{
namespace runtime
{
namespace gpu
{
void runtime::gpu::GPU_Emitter::EmitElementwise(
void GPU_Emitter::EmitElementwise(
GPU_ExternalFunction* external_function,
codegen::CodeWriter& writer,
const ngraph::Node* n,
......@@ -116,12 +114,14 @@ namespace ngraph
{
return;
}
writer << "{ // " << n->get_name() << "\n";
writer.indent++;
writer << "int count = " << out[0].get_size() << ";\n";
writer << "if(count == 0) return;\n";
writer << "ngraph::runtime::gpu::emit_elementwise_op<ngraph::op::"
<< n->description() << ">(\"" << n->description() << "\""
<< ", {\"" << args[0].get_type() << "\", \"" << out[0].get_type() << "\"}"
<< ", count"
<< ", (CUdeviceptr) " << out[0].get_name();
for (size_t i = 0; i < args.size(); i++)
......
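Putting the emitter changes together, the C++ that EmitElementwise now writes into the generated external function for, say, a Less node over float inputs would look roughly as follows. The node and tensor names (Less_3, arg0, arg1, out0), the "char" string for the boolean output, and the trailing argument casts (the loop that emits them is cut off above) are all assumptions:

{ // Less_3
int count = 8;
if(count == 0) return;
ngraph::runtime::gpu::emit_elementwise_op<ngraph::op::Less>("Less"
, {"float", "char"}
, count
, (CUdeviceptr) out0
, (CUdeviceptr) arg0
, (CUdeviceptr) arg1);
}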
......@@ -168,23 +168,23 @@ static const runtime::gpu::OpMap dispatcher{
{TI(ngraph::op::Abs), &runtime::gpu::GPU_Emitter::EmitElementwise},
{TI(ngraph::op::Concat), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Concat>},
{TI(ngraph::op::Divide), &runtime::gpu::GPU_Emitter::EmitElementwise},
{TI(ngraph::op::Equal), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Equal>},
{TI(ngraph::op::Equal), &runtime::gpu::GPU_Emitter::EmitElementwise},
{TI(ngraph::op::GetOutputElement),
&runtime::gpu::GPU_Emitter::emit<ngraph::op::GetOutputElement>},
{TI(ngraph::op::Greater), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Greater>},
{TI(ngraph::op::GreaterEq), &runtime::gpu::GPU_Emitter::emit<ngraph::op::GreaterEq>},
{TI(ngraph::op::Less), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Less>},
{TI(ngraph::op::LessEq), &runtime::gpu::GPU_Emitter::emit<ngraph::op::LessEq>},
{TI(ngraph::op::Greater), &runtime::gpu::GPU_Emitter::EmitElementwise},
{TI(ngraph::op::GreaterEq), &runtime::gpu::GPU_Emitter::EmitElementwise},
{TI(ngraph::op::Less), &runtime::gpu::GPU_Emitter::EmitElementwise},
{TI(ngraph::op::LessEq), &runtime::gpu::GPU_Emitter::EmitElementwise},
{TI(ngraph::op::Log), &runtime::gpu::GPU_Emitter::EmitElementwise},
{TI(ngraph::op::Maximum), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Maximum>},
{TI(ngraph::op::Minimum), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Minimum>},
{TI(ngraph::op::Negative), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Negative>},
{TI(ngraph::op::NotEqual), &runtime::gpu::GPU_Emitter::emit<ngraph::op::NotEqual>},
{TI(ngraph::op::NotEqual), &runtime::gpu::GPU_Emitter::EmitElementwise},
{TI(ngraph::op::Power), &runtime::gpu::GPU_Emitter::EmitElementwise},
{TI(ngraph::op::Select), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Select>},
{TI(ngraph::op::Subtract), &runtime::gpu::GPU_Emitter::EmitElementwise},
{TI(ngraph::op::Broadcast), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Broadcast>},
{TI(ngraph::op::Convert), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Convert>},
{TI(ngraph::op::Convert), &runtime::gpu::GPU_Emitter::EmitElementwise},
{TI(ngraph::op::Constant), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Constant>},
{TI(ngraph::op::Reshape), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Reshape>},
{TI(ngraph::op::FunctionCall), &runtime::gpu::GPU_Emitter::emit<ngraph::op::FunctionCall>},
......
......@@ -751,7 +751,6 @@ TEST(${BACKEND_NAME}, divide_by_zero_int32)
TEST(${BACKEND_NAME}, equal)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape{2, 2, 2};
auto A = make_shared<op::Parameter>(element::f32, shape);
auto B = make_shared<op::Parameter>(element::f32, shape);
......@@ -1245,7 +1244,6 @@ TEST(${BACKEND_NAME}, dot_matrix_vector_int64)
TEST(${BACKEND_NAME}, greater)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape{2, 2, 2};
auto A = make_shared<op::Parameter>(element::f32, shape);
auto B = make_shared<op::Parameter>(element::f32, shape);
......@@ -1269,7 +1267,6 @@ TEST(${BACKEND_NAME}, greater)
TEST(${BACKEND_NAME}, greatereq)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape{2, 2, 2};
auto A = make_shared<op::Parameter>(element::f32, shape);
auto B = make_shared<op::Parameter>(element::f32, shape);
......@@ -1293,7 +1290,6 @@ TEST(${BACKEND_NAME}, greatereq)
TEST(${BACKEND_NAME}, less)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape{2, 2, 2};
auto A = make_shared<op::Parameter>(element::f32, shape);
auto B = make_shared<op::Parameter>(element::f32, shape);
......@@ -1317,7 +1313,6 @@ TEST(${BACKEND_NAME}, less)
TEST(${BACKEND_NAME}, lesseq)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape{2, 2, 2};
auto A = make_shared<op::Parameter>(element::f32, shape);
auto B = make_shared<op::Parameter>(element::f32, shape);
......@@ -1341,7 +1336,6 @@ TEST(${BACKEND_NAME}, lesseq)
TEST(${BACKEND_NAME}, lesseq_bool)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape{2, 2, 2};
auto A = make_shared<op::Parameter>(element::boolean, shape);
auto B = make_shared<op::Parameter>(element::boolean, shape);
......@@ -1461,7 +1455,6 @@ TEST(${BACKEND_NAME}, negative)
TEST(${BACKEND_NAME}, notequal)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape{2, 2, 2};
auto A = make_shared<op::Parameter>(element::f32, shape);
auto B = make_shared<op::Parameter>(element::f32, shape);
......@@ -1943,7 +1936,6 @@ TEST(${BACKEND_NAME}, broadcast_matrix_2)
TEST(${BACKEND_NAME}, convert_int32_float32)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape{2, 2};
auto A = make_shared<op::Parameter>(element::i32, shape);
auto f =
......@@ -1965,7 +1957,6 @@ TEST(${BACKEND_NAME}, convert_int32_float32)
TEST(${BACKEND_NAME}, convert_int32_bool)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape{2, 2};
auto A = make_shared<op::Parameter>(element::i32, shape);
auto f = make_shared<Function>(make_shared<op::Convert>(A, element::boolean),
......@@ -1987,7 +1978,6 @@ TEST(${BACKEND_NAME}, convert_int32_bool)
TEST(${BACKEND_NAME}, convert_float32_bool)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape{2, 2};
auto A = make_shared<op::Parameter>(element::f32, shape);
auto f = make_shared<Function>(make_shared<op::Convert>(A, element::boolean),
......