Commit acdde14b authored by Avijit, committed by Sang Ik Lee

Avijit/add diag 4 distributed debugging (#2523)

* Fix bug introduced by #2238

* Added a debug logging macro: NGRAPH_DIST_DEBUG, which prints the timestamp and MPI rank for distributed AllReduce ops (see the usage sketch below)

* Fixed code formatting.

* Moved the logging implementation to log.cpp

* Fixed clang warning

* Don't use namespace in header

* Fix ifdef

* disable warning
parent 797e61a4
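
For context, the macro this diff ultimately defines is NGRAPH_DEBUG_PRINT (the commit message's NGRAPH_DIST_DEBUG appears to be an earlier name). It takes a printf-style format string; when built with NGRAPH_DISTRIBUTED_OMPI_ENABLE, every line it emits is prefixed with a wall-clock timestamp and the caller's MPI rank. A minimal usage sketch, with made-up id/count values (illustrative, not part of the diff):

    // hypothetical call site in a distributed build
    NGRAPH_DEBUG_PRINT("AllReduce Queued[%d]: Size: %d", id, count);
    // sample output on rank 1:
    //   14:03:27.158 [RANK: 1]: AllReduce Queued[0]: Size: 1024
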
@@ -27,6 +27,7 @@
using namespace std;
using namespace ngraph;
using namespace std::chrono;
void ngraph::default_logger_handler_func(const string& s)
{
@@ -69,3 +70,60 @@ LogHelper::~LogHelper()
}
// Logger::log_item(m_stream.str());
}
#if defined(__linux) || defined(__APPLE__)
std::string ngraph::get_timestamp()
{
// get current time
auto now = system_clock::now();
// get number of milliseconds for the current second
// (remainder after division into seconds)
auto ms = duration_cast<milliseconds>(now.time_since_epoch()) % 1000;
// convert to std::time_t in order to convert to std::tm (broken time)
auto timer = system_clock::to_time_t(now);
// convert to broken time
std::tm bt = *std::localtime(&timer);
std::ostringstream timestamp;
timestamp << std::put_time(&bt, "%H:%M:%S"); // HH:MM:SS
timestamp << '.' << std::setfill('0') << std::setw(3) << ms.count(); // .mmm
return timestamp.str();
}
void ngraph::LogPrintf(const char* fmt, ...)
{
va_list args1;
va_start(args1, fmt);
va_list args2;
va_copy(args2, args1); // keep a copy; the first vsnprintf pass consumes args1
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wformat-nonliteral"
std::vector<char> buf(1 + std::vsnprintf(nullptr, 0, fmt, args1)); // first pass: measure the length
va_end(args1);
std::vsnprintf(buf.data(), buf.size(), fmt, args2); // second pass: write into the buffer
#pragma GCC diagnostic pop
va_end(args2);
#ifdef NGRAPH_DISTRIBUTED_OMPI_ENABLE
ngraph::Distributed dist;
std::printf("%s [RANK: %d]: %s\n", get_timestamp().c_str(), dist.get_rank(), buf.data());
#else
std::printf("%s %s\n", get_timestamp().c_str(), buf.data());
#endif
}
// This function will be executed only once during startup (loading of the DSO)
static bool CheckLoggingLevel()
{
return std::getenv("NGRAPH_DISABLE_LOGGING") != nullptr;
}
bool ngraph::DISABLE_LOGGING = CheckLoggingLevel();
#endif
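
LogPrintf above uses the standard two-pass vsnprintf idiom: the first call with a null buffer only computes the formatted length, the buffer is then sized with one extra byte for the terminator, and the copied argument list is formatted in the second pass (the first pass consumes args1, which is why va_copy is needed). A self-contained sketch of the same idiom, with hypothetical names:

    #include <cstdarg>
    #include <cstdio>
    #include <string>
    #include <vector>

    // Format printf-style arguments into a std::string using the
    // measure-then-write vsnprintf idiom.
    static std::string format_to_string(const char* fmt, ...)
    {
        va_list args1;
        va_start(args1, fmt);
        va_list args2;
        va_copy(args2, args1); // pass 1 consumes args1, so keep a copy
        int len = std::vsnprintf(nullptr, 0, fmt, args1); // pass 1: measure
        va_end(args1);
        std::vector<char> buf(len + 1); // +1 for the trailing '\0'
        std::vsnprintf(buf.data(), buf.size(), fmt, args2); // pass 2: write
        va_end(args2);
        return std::string(buf.data(), len);
    }
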
@@ -16,13 +16,41 @@
#pragma once
#include <chrono>
#include <cstdarg>
#include <deque>
#include <functional>
#include <iomanip>
#include <locale>
#include <sstream>
#include <stdexcept>
#if defined(__linux) || defined(__APPLE__)
#include <sys/time.h>
#include <unistd.h>
#endif
#include <vector>
#ifdef NGRAPH_DISTRIBUTED_OMPI_ENABLE
#include "ngraph/distributed.hpp"
#endif
namespace ngraph
{
#if defined(__linux) || defined(__APPLE__)
std::string get_timestamp();
void LogPrintf(const char* fmt, ...);
extern bool DISABLE_LOGGING;
#define NGRAPH_DEBUG_PRINT(fmt, ...) \
do \
{ \
if (!DISABLE_LOGGING) \
{ \
LogPrintf(fmt, ##__VA_ARGS__); \
} \
} while (0)
#else
#define NGRAPH_DEBUG_PRINT(fmt, ...)
#endif
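
The do { ... } while (0) wrapper above is the usual way to make a multi-statement macro behave as a single statement, so it composes safely with unbraced if/else at the call site. A minimal illustration of the pattern, using a hypothetical macro:

    #include <cstdio>

    #define LOG_TWO_LINES()      \
        do                       \
        {                        \
            std::puts("first");  \
            std::puts("second"); \
        } while (0)

    void report(bool verbose)
    {
        if (verbose)
            LOG_TWO_LINES(); // expands to one statement; ';' closes the while (0)
        else
            std::puts("quiet");
    }
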
class ConstString
{
public:
@@ -21,6 +21,7 @@
#include <mpi.h>
#endif
#include "ngraph/log.hpp"
#include "ngraph/op/allreduce.hpp"
#include "ngraph/runtime/cpu/cpu_builder.hpp"
@@ -36,12 +37,23 @@ namespace ngraph
template <>
void Builder::BUILDER_DECL(ngraph::op::AllReduce)
{
static int call_seq = 0;
auto& functors = external_function->get_functors();
auto& arg_tensor = external_function->get_tensor_data(args[0].get_name());
auto& out_tensor = external_function->get_tensor_data(out[0].get_name());
auto count = static_cast<int>(out[0].get_size());
auto external_function_name = external_function->get_function_name();
NGRAPH_DEBUG_PRINT("AllReduce Queued[%d]: Function: %s Node: %s %s Size: %d",
call_seq,
external_function_name.c_str(),
node->get_name().c_str(),
node->get_friendly_name().c_str(),
count);
#ifdef NGRAPH_DISTRIBUTED_MLSL_ENABLE
auto data_type = MLSL::DT_FLOAT;
@@ -72,8 +84,20 @@
data_type = MPI_DOUBLE;
}
auto node_friendly_name = node->get_friendly_name();
auto node_name = node->get_name();
auto func_name = external_function->get_function_name();
int id = call_seq;
call_seq++;
auto functor = [&, id, count, data_type, func_name, node_friendly_name, node_name](
CPURuntimeContext* ctx, CPUExecutionContext* ectx) {
NGRAPH_DEBUG_PRINT("AllReduce Execute[%d]: Function: %s Node: %s %s Size: %d",
id,
func_name.c_str(),
node_name.c_str(),
node_friendly_name.c_str(),
count);
MPI_Allreduce(
arg_tensor, out_tensor, count, data_type, MPI_SUM, MPI_COMM_WORLD);
};
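
Note the capture list of the new functor: call_seq keeps incrementing as later AllReduce nodes are built, so the builder snapshots the current value into id and captures it by value; capturing the counter by reference would make every functor report the final count at execution time. A small standalone illustration of the difference, with hypothetical names:

    #include <cstdio>
    #include <functional>
    #include <vector>

    int main()
    {
        int seq = 0;
        std::vector<std::function<void()>> functors;
        for (int i = 0; i < 3; ++i)
        {
            int id = seq++; // snapshot, as in the AllReduce builder
            functors.push_back([id] { std::printf("by value: %d\n", id); });  // prints 0, 1, 2
            functors.push_back([&seq] { std::printf("by ref: %d\n", seq); }); // prints 3 each time
        }
        for (auto& f : functors)
        {
            f();
        }
    }
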