Commit 39fd329b authored by Scott Cyphers's avatar Scott Cyphers Committed by Sang Ik Lee

Cyphers/distdebug (#2535)

* Fix bug introduced by #2238

* Added a debug logging macro: NGRAPH_DIST_DEBUG which prints the timestamp and MPI rank for distributed AllReduceops

* Fix bug introduced by #2238 on r0.15 (#2515)

* Fix bug introduced by #2238

* style

* Fixed code formatting.

* Moved the logging implementation to log.cpp

* Fixed clang warning

* Don't use namespace in header

* Fix ifdef

* disable warning

* Fixed Centos build issues (gcc 4.8.5 doesn't implement std::put_time - hence the issue). Also fixed a logical error in the distributed setup for unit tests
parent 6ffbd254
...@@ -18,7 +18,6 @@ ...@@ -18,7 +18,6 @@
#include <condition_variable> #include <condition_variable>
#include <ctime> #include <ctime>
#include <functional> #include <functional>
#include <iomanip>
#include <iostream> #include <iostream>
#include <mutex> #include <mutex>
#include <thread> #include <thread>
...@@ -69,3 +68,64 @@ LogHelper::~LogHelper() ...@@ -69,3 +68,64 @@ LogHelper::~LogHelper()
} }
// Logger::log_item(m_stream.str()); // Logger::log_item(m_stream.str());
} }
#if defined(__linux) || defined(__APPLE__)
std::string ngraph::get_timestamp()
{
// get current time
auto now = std::chrono::system_clock::now();
// get number of nanoseconds for the current second
// (remainder after division into seconds)
auto ns =
std::chrono::duration_cast<std::chrono::nanoseconds>(now.time_since_epoch()) % 1000000;
// convert to std::time_t in order to convert to std::tm (broken time)
auto timer = std::chrono::system_clock::to_time_t(now);
// convert to broken time
std::tm* bt = std::localtime(&timer);
char buffer[256];
strftime(buffer, sizeof(buffer), "%H:%M:%S", bt);
std::ostringstream timestamp;
timestamp << buffer;
timestamp << '.' << std::setfill('0') << std::setw(3) << ns.count();
return timestamp.str();
}
void ngraph::LogPrintf(const char* fmt, ...)
{
va_list args1;
va_start(args1, fmt);
va_list args2;
va_copy(args2, args1);
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wformat-nonliteral"
std::vector<char> buf(1 + std::vsnprintf(nullptr, 0, fmt, args1));
va_end(args1);
std::vsnprintf(buf.data(), buf.size(), fmt, args2);
#pragma GCC diagnostic pop
va_end(args2);
#ifdef NGRAPH_DISTRIBUTED_OMPI_ENABLE
ngraph::Distributed dist;
std::printf("%s [RANK: %d]: %s\n", get_timestamp().c_str(), dist.get_rank(), buf.data());
#else
std::printf("%s %s\n", get_timestamp().c_str(), buf.data());
#endif
}
// This function will be executed only once during startup (loading of the DSO)
static bool CheckLoggingLevel()
{
if (std::getenv("NGRAPH_DISABLE_LOGGING") != nullptr)
{
return true;
}
return false;
}
bool ngraph::DISABLE_LOGGING = CheckLoggingLevel();
#endif
...@@ -16,10 +16,23 @@ ...@@ -16,10 +16,23 @@
#pragma once #pragma once
#include <chrono>
#include <cstdarg>
#include <deque> #include <deque>
#include <functional> #include <functional>
#include <iomanip>
#include <locale>
#include <sstream> #include <sstream>
#include <stdexcept> #include <stdexcept>
#if defined(__linux) || defined(__APPLE__)
#include <sys/time.h>
#include <unistd.h>
#endif
#include <vector>
#ifdef NGRAPH_DISTRIBUTED_OMPI_ENABLE
#include "ngraph/distributed.hpp"
#endif
namespace ngraph namespace ngraph
{ {
...@@ -160,4 +173,21 @@ namespace ngraph ...@@ -160,4 +173,21 @@ namespace ngraph
#define NGRAPH_DEBUG \ #define NGRAPH_DEBUG \
::ngraph::NullLogger {} ::ngraph::NullLogger {}
#endif #endif
#if defined(__linux) || defined(__APPLE__)
std::string get_timestamp();
void LogPrintf(const char* fmt, ...);
extern bool DISABLE_LOGGING;
#define NGRAPH_DEBUG_PRINT(fmt, ...) \
do \
{ \
if (!ngraph::DISABLE_LOGGING) \
{ \
ngraph::LogPrintf(fmt, __VA_ARGS__); \
} \
} while (0)
#else
#define NGRAPH_DEBUG_PRINT(fmt, ...)
#endif
} }
...@@ -21,6 +21,7 @@ ...@@ -21,6 +21,7 @@
#include <mpi.h> #include <mpi.h>
#endif #endif
#include "ngraph/log.hpp"
#include "ngraph/op/allreduce.hpp" #include "ngraph/op/allreduce.hpp"
#include "ngraph/runtime/cpu/cpu_builder.hpp" #include "ngraph/runtime/cpu/cpu_builder.hpp"
...@@ -36,12 +37,23 @@ namespace ngraph ...@@ -36,12 +37,23 @@ namespace ngraph
template <> template <>
void Builder::BUILDER_DECL(ngraph::op::AllReduce) void Builder::BUILDER_DECL(ngraph::op::AllReduce)
{ {
auto& functors = external_function->get_functors(); static int call_seq = 0;
auto& functors = external_function->get_functors();
auto& arg_tensor = external_function->get_tensor_data(args[0].get_name()); auto& arg_tensor = external_function->get_tensor_data(args[0].get_name());
auto& out_tensor = external_function->get_tensor_data(out[0].get_name()); auto& out_tensor = external_function->get_tensor_data(out[0].get_name());
auto count = static_cast<int>(out[0].get_size()); auto count = static_cast<int>(out[0].get_size());
auto external_function_name = external_function->get_function_name();
NGRAPH_DEBUG_PRINT(
"AllReduce Queued[%d]: Function: %s Node: %s %s Size: "
"%d",
call_seq,
external_function_name.c_str(),
node->get_name().c_str(),
node->get_friendly_name().c_str(),
count);
#ifdef NGRAPH_DISTRIBUTED_MLSL_ENABLE #ifdef NGRAPH_DISTRIBUTED_MLSL_ENABLE
auto data_type = MLSL::DT_FLOAT; auto data_type = MLSL::DT_FLOAT;
...@@ -72,8 +84,20 @@ namespace ngraph ...@@ -72,8 +84,20 @@ namespace ngraph
data_type = MPI_DOUBLE; data_type = MPI_DOUBLE;
} }
auto functor = [&, count, data_type](CPURuntimeContext* ctx, auto node_friendly_name = node->get_friendly_name();
CPUExecutionContext* ectx) { auto node_name = node->get_name();
auto func_name = external_function->get_function_name();
int id = call_seq;
call_seq++;
auto functor = [&, id, count, data_type, func_name, node_friendly_name, node_name](
CPURuntimeContext* ctx, CPUExecutionContext* ectx) {
NGRAPH_DEBUG_PRINT("AllReduce Execute[%d]: Function: %s Node: %s %s Size: %d",
id,
func_name.c_str(),
node_name.c_str(),
node_friendly_name.c_str(),
count);
MPI_Allreduce( MPI_Allreduce(
arg_tensor, out_tensor, count, data_type, MPI_SUM, MPI_COMM_WORLD); arg_tensor, out_tensor, count, data_type, MPI_SUM, MPI_COMM_WORLD);
}; };
......
...@@ -37,10 +37,6 @@ int main(int argc, char** argv) ...@@ -37,10 +37,6 @@ int main(int argc, char** argv)
DistributedSetup distributed_setup; DistributedSetup distributed_setup;
distributed_setup.set_comm_size(dist->get_size()); distributed_setup.set_comm_size(dist->get_size());
distributed_setup.set_comm_rank(dist->get_rank()); distributed_setup.set_comm_rank(dist->get_rank());
if (dist->get_size() == 1)
{
dist.reset();
}
#endif #endif
const char* exclude = "--gtest_filter=-benchmark.*"; const char* exclude = "--gtest_filter=-benchmark.*";
...@@ -54,8 +50,11 @@ int main(int argc, char** argv) ...@@ -54,8 +50,11 @@ int main(int argc, char** argv)
argc++; argc++;
::testing::InitGoogleTest(&argc, argv_vector.data()); ::testing::InitGoogleTest(&argc, argv_vector.data());
auto start = std::chrono::system_clock::now();
int rc = RUN_ALL_TESTS(); int rc = RUN_ALL_TESTS();
auto elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(
std::chrono::system_clock::now() - start);
NGRAPH_DEBUG_PRINT("[MAIN] Tests finished: Time: %d ms Exit code: %d", elapsed.count(), rc);
#ifdef NGRAPH_DISTRIBUTED_ENABLE #ifdef NGRAPH_DISTRIBUTED_ENABLE
if (dist) if (dist)
{ {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment