Commit 4b009f09 authored by andreykhaykin's avatar andreykhaykin Committed by Scott Cyphers

Tracer of real data in ngraph-CPU for debugging purposes (#3099)

* new debugger to trace real tensors in each kernel

* update after style-apply

* add unit test to debugger

* after style-apply

* templatize data + fixed issues from PR

* solve conflict

* change names of env flags + fixed typo + pass unit-test

* add different types to dump, change architecture of debug class, add support to allow debug to be set programmatically (all requested changes fixed)

* add curly braces to single-line blocks

* fix shadowed names + add int32 for dump + fix names to follow ngraph coding standards

* fix names of member variables + delete some constructors

* try to find issue with CI, do not merge!

* fix one issue with CI

* fix style-apply
parent d1af0bb7
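
For context, tracing is driven entirely by environment variables: NGRAPH_CPU_DEBUG_TRACER turns it on, while NGRAPH_CPU_TRACER_LOG and NGRAPH_CPU_BIN_TRACER_LOG override the default log paths (trace_meta.log and trace_bin_data.log). Below is a minimal sketch of enabling it before compilation, assuming a POSIX environment; the helper name is hypothetical, and the unit test added further down does the same thing through its own set_environment helper.

    // Sketch only: enable the CPU debug tracer before compiling a function (POSIX setenv).
    #include <cstdlib>

    static void enable_cpu_debug_tracer() // hypothetical helper, not part of this commit
    {
        setenv("NGRAPH_CPU_DEBUG_TRACER", "1", 1);                    // turn tracing on
        setenv("NGRAPH_CPU_TRACER_LOG", "trace_meta.log", 1);         // per-tensor metadata (default path)
        setenv("NGRAPH_CPU_BIN_TRACER_LOG", "trace_bin_data.log", 1); // raw tensor bytes (default path)
    }
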
@@ -31,6 +31,7 @@ set(SRC
    cpu_visualize_tree.cpp
    cpu_cse.cpp
    cpu_debugger.cpp
    cpu_debug_tracer.cpp
    builder/add.cpp
    builder/allreduce.cpp
    builder/avg_pool.cpp
...
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#include "ngraph/runtime/cpu/cpu_debug_tracer.hpp"
using namespace std;
using namespace ngraph;
runtime::cpu::CPU_DebugTracer::CPU_DebugTracer()
: m_serial_number(0)
{
static const auto debug_t = std::getenv("NGRAPH_CPU_DEBUG_TRACER");
if (debug_t != nullptr)
{
m_enable_tracing = true;
init_streams();
}
}
void runtime::cpu::CPU_DebugTracer::init_streams()
{
if (m_tracer_stream.is_open())
{
return;
}
static auto trace_file_path = std::getenv("NGRAPH_CPU_TRACER_LOG");
static auto trace_bin_file_path = std::getenv("NGRAPH_CPU_BIN_TRACER_LOG");
if (trace_file_path == nullptr)
{
trace_file_path = const_cast<char*>("trace_meta.log");
}
if (trace_bin_file_path == nullptr)
{
trace_bin_file_path = const_cast<char*>("trace_bin_data.log");
}
m_tracer_stream.open(trace_file_path, ios_base::out | ios_base::ate);
m_tracer_bin_stream.open(trace_bin_file_path, std::ios_base::out | std::ios_base::ate);
}
void runtime::cpu::CPU_DebugTracer::set_enable_tracing(bool new_state)
{
if (!m_enable_tracing && new_state)
{
init_streams();
}
m_enable_tracing = new_state;
}
void runtime::cpu::CPU_DebugTracer::end_of_kernel()
{
m_serial_number++;
m_tracer_stream.flush();
m_tracer_bin_stream.flush();
}
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#pragma once

#include <algorithm>
#include <cctype>
#include <cstring>
#include <fstream>
#include <functional>
#include <iomanip>
#include <iostream>
#include <numeric>
#include <string>
#include <utility>
#include <vector>

#include "ngraph/runtime/tensor.hpp"

namespace ngraph
{
    namespace runtime
    {
        namespace cpu
        {
            class CPU_DebugTracer
            {
            public:
                CPU_DebugTracer();

                void set_enable_tracing(bool new_state);
                bool tracing_is_enabled() { return m_enable_tracing; }
                void end_of_kernel();

                template <typename T>
                void dump_one_tensor(const std::string& kernel_name,
                                     const void* tensor,
                                     const std::string& tensor_name,
                                     const size_t size,
                                     const ngraph::Shape& shape,
                                     const std::string& in_out);

            private:
                CPU_DebugTracer(const CPU_DebugTracer&) = delete;
                CPU_DebugTracer(CPU_DebugTracer&&) = delete;
                CPU_DebugTracer& operator=(const CPU_DebugTracer&) = delete;

                void init_streams();

                size_t m_serial_number;
                std::fstream m_tracer_stream;
                std::fstream m_tracer_bin_stream;
                bool m_enable_tracing = false;
            };
        }
    }
}
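
For orientation, a minimal usage sketch of the interface above; the tensor values, the "t_0" name, and the float element type are illustrative assumptions, and the real call sites are in cpu_external_function.cpp further down.

    // Sketch only: trace one hypothetical input tensor of an Add kernel.
    void trace_example()
    {
        ngraph::runtime::cpu::CPU_DebugTracer tracer; // auto-enabled if NGRAPH_CPU_DEBUG_TRACER is set
        tracer.set_enable_tracing(true);              // or enable it programmatically

        std::vector<float> data{0.f, 1.f, 2.f, 3.f};  // hypothetical tensor contents
        ngraph::Shape shape{2, 2};
        tracer.dump_one_tensor<float>("Add", data.data(), "t_0", data.size(), shape, ">>");
        tracer.end_of_kernel();                       // bump the serial number and flush both logs
    }
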
// Kahan summation is used to reduce the numerical error of the accumulation.
template <typename T>
float find_variance(const std::vector<T>& f_data, float mean, size_t size)
{
    float sum = 0.0f;
    float c = 0.0f; // running compensation for lost low-order bits
    for (auto num : f_data)
    {
        num = (num - mean) * (num - mean);

        float y = num - c;
        float t = sum + y;
        c = (t - sum) - y;
        sum = t;
    }
    return sum / size;
}
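
The helper computes the population variance, var = (1/N) * sum_i (x_i - mean)^2, accumulating the squared deviations with Kahan compensation: c carries the low-order bits that would otherwise be lost when sum + y is rounded. For the tensor b = {1, 2, 3, 4} used in the unit test below, mean = 2.5 and var = (2.25 + 0.25 + 0.25 + 2.25) / 4 = 1.25, which is exactly the value the test checks.
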
template <typename T>
void ngraph::runtime::cpu::CPU_DebugTracer::dump_one_tensor(const std::string& kernel_name,
                                                            const void* tensor,
                                                            const std::string& tensor_name,
                                                            const size_t size,
                                                            const ngraph::Shape& shape,
                                                            const std::string& in_out)
{
    std::string tid{tensor_name.substr(1 + tensor_name.find("_"))};

    size_t num_bytes{(size * sizeof(T))};
    std::vector<T> tensor_data(size);
    memcpy(&tensor_data[0], tensor, num_bytes);

    // One human-readable record per tensor goes to the metadata log...
    m_tracer_stream << " K=" << std::left << std::setw(20) << kernel_name << " S=" << std::left
                    << std::setw(10) << m_serial_number << " TID=" << std::left << std::setw(10)
                    << tid << in_out;

    // ...while the raw element bytes go to the binary log, preceded by the tensor id.
    m_tracer_bin_stream << "TID=" << tid << '\n';

    m_tracer_stream << " size=" << size << " " << shape << " ";
    m_tracer_stream << "bin_data_offset=" << m_tracer_bin_stream.tellp();

    m_tracer_bin_stream.write(reinterpret_cast<const char*>(tensor_data.data()),
                              tensor_data.size() * sizeof(T));

    auto mean = std::accumulate(tensor_data.begin(), tensor_data.end(), 0.0f) / size;
    auto var = find_variance<T>(tensor_data, mean, size);
    m_tracer_stream << " mean=" << mean;
    m_tracer_stream << " var=" << var;

    m_tracer_bin_stream << "\n";
    m_tracer_stream << "\n";
}
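
Putting the stream writes together, each traced tensor produces one metadata record and one binary record. The values below are illustrative only (the shape is whatever ngraph's Shape stream operator prints, and the offset depends on what was written before):

    trace_meta.log:      K=Add                 S=0          TID=0       >> size=4 Shape{2, 2} bin_data_offset=6 mean=1.5 var=1.25
    trace_bin_data.log:  TID=0\n  <16 raw bytes of the float data>  \n
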
@@ -851,12 +851,16 @@ using namespace ngraph::runtime;
            vector<TensorViewWrapper> in;
            vector<string> node_input_names;
            vector<string> node_output_names;
            vector<TensorTracerAttributes> t_in_attrs;
            vector<TensorTracerAttributes> t_out_attrs;
            for (const descriptor::Input& input : node->get_inputs())
            {
                const descriptor::Output& output = input.get_output();
                shared_ptr<descriptor::Tensor> tv = output.get_tensor_ptr();
                in.push_back(TensorViewWrapper(tv, m_variable_name_map[tv->get_name()]));
                node_input_names.emplace_back(tv->get_name());
                t_in_attrs.push_back(TensorTracerAttributes(
                    in.back().get_size(), in.back().get_shape(), in.back().get_element_type()));
            }
            vector<TensorViewWrapper> out;
            for (const descriptor::Output& output : node->get_outputs())
@@ -864,6 +868,8 @@ using namespace ngraph::runtime;
                shared_ptr<descriptor::Tensor> tv = output.get_tensor_ptr();
                out.push_back(TensorViewWrapper(tv, m_variable_name_map[tv->get_name()]));
                node_output_names.emplace_back(tv->get_name());
                t_out_attrs.push_back(TensorTracerAttributes(
                    out.back().get_size(), out.back().get_shape(), out.back().get_element_type()));
            }

            // Emit operation prologue
@@ -871,7 +877,11 @@ using namespace ngraph::runtime;
            {
                if (m_function->get_name() == m_function_name)
                {
                    m_op_attrs.emplace_back(node->description(),
                                            node_output_names,
                                            node_input_names,
                                            t_out_attrs,
                                            t_in_attrs);
                }
                if (m_use_tbb)
                {
@@ -1239,6 +1249,84 @@ bool runtime::cpu::CPU_ExternalFunction::computes_result(Node* node)
    return false;
}

static void dump_one_kernel_with_type(runtime::cpu::CPU_DebugTracer& debug_tracer,
                                      runtime::cpu::TensorTracerAttributes& t_attrs,
                                      const std::string& kernel_name,
                                      const void* tensor,
                                      const std::string& tensor_name,
                                      const std::string& in_out)
{
    switch (t_attrs.m_type_of_element.get_type_enum())
    {
    case element::Type_t::f32:
        debug_tracer.dump_one_tensor<float>(kernel_name,
                                            tensor,
                                            tensor_name,
                                            t_attrs.m_number_of_elements,
                                            t_attrs.m_t_shape,
                                            in_out);
        break;
    case element::Type_t::i8:
        debug_tracer.dump_one_tensor<int8_t>(kernel_name,
                                             tensor,
                                             tensor_name,
                                             t_attrs.m_number_of_elements,
                                             t_attrs.m_t_shape,
                                             in_out);
        break;
    case element::Type_t::u8:
        debug_tracer.dump_one_tensor<uint8_t>(kernel_name,
                                              tensor,
                                              tensor_name,
                                              t_attrs.m_number_of_elements,
                                              t_attrs.m_t_shape,
                                              in_out);
        break;
    case element::Type_t::i32:
        debug_tracer.dump_one_tensor<int32_t>(kernel_name,
                                              tensor,
                                              tensor_name,
                                              t_attrs.m_number_of_elements,
                                              t_attrs.m_t_shape,
                                              in_out);
        break;
    }
}

void runtime::cpu::CPU_ExternalFunction::dump_one_kernel(CPU_DebugTracer& debug_tracer,
                                                         CPURuntimeContext* ctx,
                                                         bool is_it_input)
{
    size_t index = ctx->pc;
    if (is_it_input)
    {
        for (size_t i = 0; i < m_op_attrs.at(index).Inputs.size(); i++)
        {
            dump_one_kernel_with_type(
                debug_tracer,
                m_op_attrs.at(index).m_inputs_tensor_attrs.at(i),
                m_op_attrs.at(index).Description,
                ctx->buffer_data[get_buffer_index(m_op_attrs.at(index).Inputs.at(i))],
                m_op_attrs.at(index).Inputs.at(i),
                ">>");
        }
    }
    else
    {
        for (size_t i = 0; i < m_op_attrs.at(index).Outputs.size(); i++)
        {
            dump_one_kernel_with_type(
                debug_tracer,
                m_op_attrs.at(index).m_outputs_tensor_attrs.at(i),
                m_op_attrs.at(index).Description,
                ctx->buffer_data[get_buffer_index(m_op_attrs.at(index).Outputs.at(i))],
                m_op_attrs.at(index).Outputs.at(i),
                "<<");
        }
        debug_tracer.end_of_kernel();
    }
}
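
Note that the dispatch in dump_one_kernel_with_type only handles f32, i8, u8, and i32; tensors of any other element type are silently skipped, since the switch has no default case. A hypothetical extension for i64 (not part of this commit) would follow the same pattern:

    case element::Type_t::i64:
        debug_tracer.dump_one_tensor<int64_t>(kernel_name,
                                              tensor,
                                              tensor_name,
                                              t_attrs.m_number_of_elements,
                                              t_attrs.m_t_shape,
                                              in_out);
        break;
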
void runtime::cpu::CPU_ExternalFunction::build(ngraph::pass::PassConfig& pass_config)
{
    if (m_is_built)
@@ -1261,6 +1349,12 @@ void runtime::cpu::CPU_ExternalFunction::build(ngraph::pass::PassConfig& pass_config)
    register_common_passes(pass_manager, pass_config);
    pass_manager.run_passes(m_function, false);

    static runtime::cpu::CPU_DebugTracer debug_tracer;
    if (std::getenv("NGRAPH_CPU_DEBUG_TRACER") != nullptr)
    {
        debug_tracer.set_enable_tracing(true);
    }

    // Store layouts assigned for arguments
    for (const auto& parameter : m_function->get_parameters())
    {
@@ -1405,24 +1499,30 @@ void runtime::cpu::CPU_ExternalFunction::build(ngraph::pass::PassConfig& pass_config)
        }

        vector<TensorViewWrapper> in;
        vector<string> in_names;
        vector<TensorTracerAttributes> t_in_attrs;
        for (const descriptor::Input& input : node->get_inputs())
        {
            const descriptor::Output& output = input.get_output();
            shared_ptr<descriptor::Tensor> tv = output.get_tensor_ptr();
            in.push_back(TensorViewWrapper(tv, tv->get_name()));
            in_names.push_back(tv->get_name());
            t_in_attrs.push_back(TensorTracerAttributes(
                in.back().get_size(), in.back().get_shape(), in.back().get_element_type()));
        }

        vector<TensorViewWrapper> out;
        vector<string> out_names;
        vector<TensorTracerAttributes> t_out_attrs;
        for (const descriptor::Output& output : node->get_outputs())
        {
            shared_ptr<descriptor::Tensor> tv = output.get_tensor_ptr();
            out.push_back(TensorViewWrapper(tv, tv->get_name()));
            out_names.push_back(tv->get_name());
            t_out_attrs.push_back(TensorTracerAttributes(
                out.back().get_size(), out.back().get_shape(), out.back().get_element_type()));
        }

        m_op_attrs.emplace_back(node->description(), out_names, in_names, t_out_attrs, t_in_attrs);
        op_names.push_back(node->get_name());
        handler->second(this, node.get(), in, out);
@@ -1712,6 +1812,7 @@ void runtime::cpu::CPU_ExternalFunction::build(ngraph::pass::PassConfig& pass_config)
            std::stringstream ss;
            ss << "\nEXECUTION PLAN:\n";

            for (size_t i = 0; i < functors.size(); i++)
            {
                ss << op_names.at(i) << " will be executed with the following inputs:\n";
@@ -1742,8 +1843,21 @@ void runtime::cpu::CPU_ExternalFunction::build(ngraph::pass::PassConfig& pass_config)
            {
                start_ts = cpu::Clock::now();
            }

            CPUExecutionContext ectx{0};

            if (debug_tracer.tracing_is_enabled())
            {
                this->dump_one_kernel(debug_tracer, ctx, true);
            }

            executor::GetCPUExecutor().execute(functors.at(ctx->pc), ctx, &ectx);

            if (debug_tracer.tracing_is_enabled())
            {
                this->dump_one_kernel(debug_tracer, ctx, false);
            }

            if (ctx->breakpoints.count(ctx->pc + 1))
            {
                ctx->pc++;
...
@@ -44,6 +44,7 @@
#include "ngraph/pass/manager.hpp"
#include "ngraph/pass/pass_config.hpp"
#include "ngraph/runtime/cpu/cpu_call_frame.hpp"
#include "ngraph/runtime/cpu/cpu_debug_tracer.hpp"
#include "ngraph/runtime/cpu/cpu_layout_descriptor.hpp"
#include "ngraph/runtime/cpu/cpu_tensor_view_wrapper.hpp"
#include "ngraph/runtime/cpu/mkldnn_emitter.hpp"
@@ -61,6 +62,7 @@ namespace ngraph
            class CPU_Emitter;
            class CPU_CallFrame;
            class CPU_Debugger;
            class CPU_DebugTracer;

#if !defined(NGRAPH_DEX_ONLY)
@@ -72,18 +74,40 @@ namespace ngraph
            using OpMap = std::unordered_map<std::type_index, OpFunction>;
#endif

            struct TensorTracerAttributes
            {
                size_t m_number_of_elements;
                ngraph::Shape m_t_shape;
                element::Type m_type_of_element;

                TensorTracerAttributes(const size_t size,
                                       const ngraph::Shape& shape,
                                       const element::Type& type)
                    : m_number_of_elements(size)
                    , m_t_shape(shape)
                    , m_type_of_element(type)
                {
                }
            };

            struct OpAttributes
            {
                std::string Description;
                std::vector<std::string> Outputs;
                std::vector<std::string> Inputs;
                std::vector<TensorTracerAttributes> m_outputs_tensor_attrs;
                std::vector<TensorTracerAttributes> m_inputs_tensor_attrs;

                OpAttributes(const std::string& desc,
                             const std::vector<std::string>& outputs,
                             const std::vector<std::string>& inputs,
                             const std::vector<TensorTracerAttributes>& out_t_attrs,
                             const std::vector<TensorTracerAttributes>& in_t_attrs)
                    : Description(desc)
                    , Outputs(outputs)
                    , Inputs(inputs)
                    , m_outputs_tensor_attrs(out_t_attrs)
                    , m_inputs_tensor_attrs(in_t_attrs)
                {
                }
            };
@@ -187,6 +211,10 @@ namespace ngraph
                std::vector<ngraph::State*> m_states;

                void dump_one_kernel(CPU_DebugTracer& debug_tracer,
                                     CPURuntimeContext* ctx,
                                     bool is_it_input);

            private:
                // Register passes that are common to codegen and DEX
                void register_common_passes(ngraph::pass::Manager& pass_manager,
@@ -212,6 +240,7 @@ namespace ngraph
                    const Node&,
                    const Node&,
                    const std::unordered_map<const Node*, std::string>& node_cache);

                std::string emit_op_as_function(const Node&, const std::string& function_name);
                std::string strip_comments(const std::string&);
...
@@ -103,7 +103,7 @@ endif()
if (NGRAPH_CPU_ENABLE)
    list(APPEND SRC core_fusion.cpp builder_quantization.cpp)
    list(APPEND SRC backend_performance.cpp cpu_fusion.cpp cpu_test.cpp cpu_debugger.cpp cpu_debug_tracer.cpp)
    if (NOT NGRAPH_DEX_ONLY)
        list(APPEND SRC cpu_codegen.cpp)
    endif()
...
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#include <algorithm>
#include <cstdio>
#include <fstream>
#include <iostream>
#include <list>
#include <memory>
#include <stdio.h>
#include <vector>

#include "gtest/gtest.h"
#include "misc.hpp"
#include "ngraph/ngraph.hpp"
#include "ngraph/runtime/cpu/cpu_backend.hpp"
#include "ngraph/runtime/cpu/cpu_call_frame.hpp"
#include "ngraph/runtime/cpu/cpu_debug_tracer.hpp"
#include "util/test_tools.hpp"

using namespace ngraph;
using namespace std;

static void set_env_vars(const string& trace_log, const string& bin_log)
{
    set_environment("NGRAPH_CPU_DEBUG_TRACER", "1", 1);
    set_environment("NGRAPH_CPU_TRACER_LOG", trace_log.c_str(), 1);
    set_environment("NGRAPH_CPU_BIN_TRACER_LOG", bin_log.c_str(), 1);
}

static void unset_env_vars()
{
    unset_environment("NGRAPH_CPU_DEBUG_TRACER");
    unset_environment("NGRAPH_CPU_TRACER_LOG");
    unset_environment("NGRAPH_CPU_BIN_TRACER_LOG");
}

static void open_logs(ifstream& meta, ifstream& bin, const string& trace_log, const string& bin_log)
{
    meta.open(trace_log);
    bin.open(bin_log, std::ios::binary);
    ASSERT_TRUE(meta.is_open());
    ASSERT_TRUE(bin.is_open());
}

TEST(cpu_debug_tracer, check_flow_with_external_function)
{
    Shape shape{2, 2};
    auto A = make_shared<op::Parameter>(element::f32, shape);
    auto B = make_shared<op::Parameter>(element::f32, shape);
    auto f = make_shared<Function>(make_shared<op::Add>(A, B), ParameterVector{A, B});

    shared_ptr<runtime::Backend> backend = runtime::Backend::create("CPU");

    // Create some tensors for input/output
    shared_ptr<runtime::Tensor> a = backend->create_tensor(element::f32, shape);
    shared_ptr<runtime::Tensor> b = backend->create_tensor(element::f32, shape);
    shared_ptr<runtime::Tensor> result = backend->create_tensor(element::f32, shape);

    copy_data(a, vector<float>{0, 1, 2, 3});
    copy_data(b, vector<float>{1, 2, 3, 4});

    const string trace_log_file = "trace_meta.log";
    const string bin_log_file = "trace_bin_data.log";
    set_env_vars(trace_log_file, bin_log_file);

    shared_ptr<runtime::Executable> handle = backend->compile(f);
    handle->call_with_validate({result}, {a, b});

    // Open the two logs and parse them.
    ifstream f_meta;
    ifstream f_bin;
    open_logs(f_meta, f_bin, trace_log_file, bin_log_file);

    string line;
    getline(f_meta, line);
    auto str_mean = line.substr(line.find("mean"));
    auto mean =
        std::stod(str_mean.substr(str_mean.find("=") + 1, str_mean.find(" ") - str_mean.find("=")));
    // Mean value of the first tensor (a).
    EXPECT_EQ(mean, 1.5);

    getline(f_meta, line);
    auto str_var = line.substr(line.find("var"));
    auto var = std::stod(str_var.substr(str_var.find("=") + 1));
    // Variance value of the second tensor (b).
    EXPECT_EQ(var, 1.25);

    getline(f_meta, line);
    auto str_bin_offset = line.substr(line.find("bin_data"));
    auto bin_offset = std::stod(str_bin_offset.substr(
        str_bin_offset.find("=") + 1, str_bin_offset.find(" ") - str_bin_offset.find("=")));

    // Check the output tensor against the binary data log.
    f_bin.seekg(bin_offset);
    std::vector<unsigned char> v_c((std::istreambuf_iterator<char>(f_bin)),
                                   std::istreambuf_iterator<char>());
    vector<float> v_f(4);
    memcpy(&v_f[0], &v_c[0], sizeof(float) * 4);

    EXPECT_EQ((vector<float>{1, 3, 5, 7}), (v_f));

    remove(trace_log_file.c_str());
    remove(bin_log_file.c_str());
    unset_env_vars();
}
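
For reference, the expected values follow directly from the inputs: mean(a) = (0 + 1 + 2 + 3) / 4 = 1.5; var(b) = ((1 - 2.5)^2 + (2 - 2.5)^2 + (3 - 2.5)^2 + (4 - 2.5)^2) / 4 = (2.25 + 0.25 + 0.25 + 2.25) / 4 = 1.25; and the traced output of the Add kernel is {0+1, 1+2, 2+3, 3+4} = {1, 3, 5, 7}, which the test reads back from the binary log at the offset recorded on that tensor's metadata line.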