Unverified Commit 93fcc5b5 authored by Ge Jun's avatar Ge Jun Committed by GitHub

Merge pull request #484 from zyearn/hc-async

asynchronous health checking
parents f912f0db d4022e4d
// Copyright (c) 2018 brpc authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Authors: Ge,Jun (gejun@baidu.com)
#include <bthread/bthread.h>
#include <bthread/unstable.h>
#include "brpc/periodic_task.h"
namespace brpc {
PeriodicTask::~PeriodicTask() {
}
static void* PeriodicTaskThread(void* arg) {
PeriodicTask* task = static_cast<PeriodicTask*>(arg);
timespec abstime;
if (!task->DoPeriodicTask(&abstime)) { // end
task->DoPeriodicTask(NULL);
return NULL;
}
PeriodicTaskManager::StartTaskAt(task, abstime);
return NULL;
}
static void RunPeriodicTaskThread(void* arg) {
bthread_t th = 0;
int rc = bthread_start_background(
&th, &BTHREAD_ATTR_NORMAL, PeriodicTaskThread, arg);
if (rc != 0) {
LOG(ERROR) << "Fail to start PeriodicTaskThread";
static_cast<PeriodicTask*>(arg)->DoPeriodicTask(NULL);
return;
}
}
void PeriodicTaskManager::StartTaskAt(PeriodicTask* task, const timespec& abstime) {
if (task == NULL) {
LOG(ERROR) << "Param[task] is NULL";
return;
}
bthread_timer_t timer_id;
const int rc = bthread_timer_add(
&timer_id, abstime, RunPeriodicTaskThread, task);
if (rc != 0) {
LOG(ERROR) << "Fail to add timer for RunPerodicTaskThread";
task->DoPeriodicTask(NULL);
return;
}
}
} // namespace brpc
// Copyright (c) 2018 brpc authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Authors: Ge,Jun (gejun@baidu.com)
#ifndef BRPC_HEALTH_CHECK_MANAGER_H
#define BRPC_HEALTH_CHECK_MANAGER_H
namespace brpc {
// Override DoPeriodicTask() with code that needs to be periodically run. If
// the task is completed, the method should return false; Otherwise the method
// should return true and set `next_abstime' to the time that the task should
// be run next time.
// Each call to DoPeriodicTask() is run in a separated bthread which can be
// suspended. To preserve states between different calls, put the states as
// fields of (subclass of) PeriodicTask.
// If any error occurs or DoPeriodicTask() returns false, the task is called
// with DoPeriodicTask(NULL) and will not be scheduled anymore.
class PeriodicTask {
public:
virtual ~PeriodicTask();
virtual bool DoPeriodicTask(timespec* next_abstime) = 0;
};
class PeriodicTaskManager {
public:
static void StartTaskAt(PeriodicTask* task, const timespec& abstime);
};
} // namespace brpc
#endif // BRPC_HEALTH_CHECK_MANAGER_H
...@@ -41,6 +41,7 @@ ...@@ -41,6 +41,7 @@
#include "brpc/stream_impl.h" #include "brpc/stream_impl.h"
#include "brpc/shared_object.h" #include "brpc/shared_object.h"
#include "brpc/policy/rtmp_protocol.h" // FIXME #include "brpc/policy/rtmp_protocol.h" // FIXME
#include "brpc/periodic_task.h"
#if defined(OS_MACOSX) #if defined(OS_MACOSX)
#include <sys/event.h> #include <sys/event.h>
#endif #endif
...@@ -95,13 +96,6 @@ BRPC_VALIDATE_GFLAG(connect_timeout_as_unreachable, ...@@ -95,13 +96,6 @@ BRPC_VALIDATE_GFLAG(connect_timeout_as_unreachable,
const int WAIT_EPOLLOUT_TIMEOUT_MS = 50; const int WAIT_EPOLLOUT_TIMEOUT_MS = 50;
#ifdef BAIDU_INTERNAL
#define BRPC_AUXTHREAD_ATTR \
(sizeof(com_device_t) > 32*1024 ? BTHREAD_ATTR_NORMAL : BTHREAD_ATTR_SMALL)
#else
#define BRPC_AUXTHREAD_ATTR BTHREAD_ATTR_SMALL
#endif
class BAIDU_CACHELINE_ALIGNMENT SocketPool { class BAIDU_CACHELINE_ALIGNMENT SocketPool {
friend class Socket; friend class Socket;
public: public:
...@@ -780,6 +774,16 @@ void Socket::Revive() { ...@@ -780,6 +774,16 @@ void Socket::Revive() {
} }
} }
class HealthCheckTask : public PeriodicTask {
public:
explicit HealthCheckTask(SocketId id) : _id(id) , _first_time(true) {}
bool DoPeriodicTask(timespec* next_abstime);
private:
SocketId _id;
bool _first_time;
};
int Socket::ReleaseAdditionalReference() { int Socket::ReleaseAdditionalReference() {
bool expect = false; bool expect = false;
// Use `relaxed' fence here since `Dereference' has `released' fence // Use `relaxed' fence here since `Dereference' has `released' fence
...@@ -826,10 +830,9 @@ int Socket::SetFailed(int error_code, const char* error_fmt, ...) { ...@@ -826,10 +830,9 @@ int Socket::SetFailed(int error_code, const char* error_fmt, ...) {
// by Channel to revive never-connected socket when server side // by Channel to revive never-connected socket when server side
// comes online. // comes online.
if (_health_check_interval_s > 0) { if (_health_check_interval_s > 0) {
bthread_t th = 0; PeriodicTaskManager::StartTaskAt(
int rc = bthread_start_background( new HealthCheckTask(id()),
&th, &BRPC_AUXTHREAD_ATTR, HealthCheckThread, (void*)id()); butil::milliseconds_from_now(_health_check_interval_s * 500));
CHECK_EQ(0, rc);
} }
// Wake up all threads waiting on EPOLLOUT when closing fd // Wake up all threads waiting on EPOLLOUT when closing fd
_epollout_butex->fetch_add(1, butil::memory_order_relaxed); _epollout_butex->fetch_add(1, butil::memory_order_relaxed);
...@@ -927,81 +930,65 @@ int Socket::Status(SocketId id, int32_t* nref) { ...@@ -927,81 +930,65 @@ int Socket::Status(SocketId id, int32_t* nref) {
return -1; return -1;
} }
void* Socket::HealthCheckThread(void* void_arg) { bool HealthCheckTask::DoPeriodicTask(timespec* next_abstime) {
SocketId socket_id = (SocketId)void_arg; if (next_abstime == NULL) {
bool first_time = true; delete this;
if (bthread_usleep(100000) < 0) { return true;
PLOG_IF(FATAL, errno != ESTOP) << "Fail to sleep";
return NULL;
} }
SocketUniquePtr ptr;
for (;;) { const int rc = Socket::AddressFailedAsWell(_id, &ptr);
butil::EndPoint remote_side; CHECK(rc != 0);
int check_interval_s = 0; if (rc < 0) {
do { RPC_VLOG << "SocketId=" << _id
SocketUniquePtr ptr; << " was abandoned before health checking";
const int rc = AddressFailedAsWell(socket_id, &ptr); return false;
CHECK(rc != 0); }
if (rc < 0) { // Note: Making a Socket re-addessable is hard. An alternative is
RPC_VLOG << "SocketId=" << socket_id // creating another Socket with selected internal fields to replace
<< " was abandoned before health checking"; // failed Socket. Although it avoids concurrent issues with in-place
return NULL; // revive, it changes SocketId: many code need to watch SocketId
} // and update on change, which is impractical. Another issue with
remote_side = ptr->remote_side(); // this method is that it has to move "selected internal fields"
check_interval_s = ptr->_health_check_interval_s; // which may be accessed in parallel, not trivial to be moved.
// Note: Making a Socket re-addessable is hard. An alternative is // Finally we choose a simple-enough solution: wait until the
// creating another Socket with selected internal fields to replace // reference count hits `expected_nref', which basically means no
// failed Socket. Although it avoids concurrent issues with in-place // one is addressing the Socket(except here). Because the Socket
// revive, it changes SocketId: many code need to watch SocketId // is not addressable, the reference count will not increase
// and update on change, which is impractical. Another issue with // again. This solution is not perfect because the `expected_nref'
// this method is that it has to move "selected internal fields" // is implementation specific. In our case, one reference comes
// which may be accessed in parallel, not trivial to be moved. // from SocketMapInsert(socket_map.cpp), one reference is here.
// Finally we choose a simple-enough solution: wait until the // Although WaitAndReset() could hang when someone is addressing
// reference count hits `expected_nref', which basically means no // the failed Socket forever (also indicating bug), this is not an
// one is addressing the Socket(except here). Because the Socket // issue in current code.
// is not addressable, the reference count will not increase if (_first_time) { // Only check at first time.
// again. This solution is not perfect because the `expected_nref' _first_time = false;
// is implementation specific. In our case, one reference comes if (ptr->WaitAndReset(2/*note*/) != 0) {
// from SocketMapInsert(socket_map.cpp), one reference is here. LOG(INFO) << "Cancel checking " << *ptr;
// Although WaitAndReset() could hang when someone is addressing return false;
// the failed Socket forever (also indicating bug), this is not an }
// issue in current code. }
if (first_time) { // Only check at first time.
first_time = false; s_vars->nhealthcheck << 1;
if (ptr->WaitAndReset(2/*note*/) != 0) { int hc = 0;
LOG(INFO) << "Cancel checking " << *ptr; if (ptr->_user) {
return NULL; hc = ptr->_user->CheckHealth(ptr.get());
} } else {
} hc = ptr->CheckHealth();
}
s_vars->nhealthcheck << 1; if (hc == 0) {
int hc = 0; if (ptr->CreatedByConnect()) {
if (ptr->_user) { s_vars->channel_conn << -1;
hc = ptr->_user->CheckHealth(ptr.get());
} else {
hc = ptr->CheckHealth();
}
if (hc == 0) {
if (ptr->CreatedByConnect()) {
s_vars->channel_conn << -1;
}
ptr->Revive();
ptr->_hc_count = 0;
return NULL;
} else if (hc == ESTOP) {
LOG(INFO) << "Cancel checking " << *ptr;
return NULL;
}
++ ptr->_hc_count;
} while (0);
CHECK_GT(check_interval_s, 0);
if (bthread_usleep(check_interval_s * 1000000L) < 0) {
PLOG_IF(FATAL, errno != ESTOP) << "Fail to sleep";
LOG(INFO) << "Cancel checking SocketId="
<< socket_id << '@' << remote_side;
return NULL;
} }
ptr->Revive();
ptr->_hc_count = 0;
return false;
} else if (hc == ESTOP) {
LOG(INFO) << "Cancel checking " << *ptr;
return false;
} }
++ ptr->_hc_count;
*next_abstime = butil::seconds_from_now(ptr->_health_check_interval_s);
return true;
} }
void Socket::OnRecycle() { void Socket::OnRecycle() {
......
...@@ -183,6 +183,7 @@ friend class Controller; ...@@ -183,6 +183,7 @@ friend class Controller;
friend class policy::ConsistentHashingLoadBalancer; friend class policy::ConsistentHashingLoadBalancer;
friend class policy::RtmpContext; friend class policy::RtmpContext;
friend class schan::ChannelBalancer; friend class schan::ChannelBalancer;
friend class HealthCheckTask;
class SharedPart; class SharedPart;
struct Forbidden {}; struct Forbidden {};
struct WriteRequest; struct WriteRequest;
...@@ -523,7 +524,6 @@ friend void DereferenceSocket(Socket*); ...@@ -523,7 +524,6 @@ friend void DereferenceSocket(Socket*);
int ConnectIfNot(const timespec* abstime, WriteRequest* req); int ConnectIfNot(const timespec* abstime, WriteRequest* req);
int ResetFileDescriptor(int fd); int ResetFileDescriptor(int fd);
static void* HealthCheckThread(void*);
// Returns 0 on success, 1 on failed socket, -1 on recycled. // Returns 0 on success, 1 on failed socket, -1 on recycled.
static int AddressFailedAsWell(SocketId id, SocketUniquePtr* ptr); static int AddressFailedAsWell(SocketId id, SocketUniquePtr* ptr);
...@@ -668,7 +668,7 @@ private: ...@@ -668,7 +668,7 @@ private:
int _preferred_index; int _preferred_index;
// Number of HC since the last SetFailed() was called. Set to 0 when the // Number of HC since the last SetFailed() was called. Set to 0 when the
// socket is revived. Only set in HealthCheckThread // socket is revived. Only set in HealthCheckTask::DoPeriodicTask()
int _hc_count; int _hc_count;
// Size of current incomplete message, set to 0 on complete. // Size of current incomplete message, set to 0 on complete.
......
...@@ -36,7 +36,7 @@ namespace policy { ...@@ -36,7 +36,7 @@ namespace policy {
void SendRpcResponse(int64_t correlation_id, Controller* cntl, void SendRpcResponse(int64_t correlation_id, Controller* cntl,
const google::protobuf::Message* req, const google::protobuf::Message* req,
const google::protobuf::Message* res, const google::protobuf::Message* res,
const Server* server_raw, MethodStatus *, long); const Server* server_raw, MethodStatus *, int64_t);
} // policy } // policy
} // brpc } // brpc
...@@ -231,7 +231,7 @@ protected: ...@@ -231,7 +231,7 @@ protected:
const google::protobuf::Message*, const google::protobuf::Message*,
const google::protobuf::Message*, const google::protobuf::Message*,
const brpc::Server*, const brpc::Server*,
brpc::MethodStatus*, long>( brpc::MethodStatus*, int64_t>(
&brpc::policy::SendRpcResponse, &brpc::policy::SendRpcResponse,
meta.correlation_id(), cntl, NULL, res, meta.correlation_id(), cntl, NULL, res,
&ts->_dummy, NULL, -1); &ts->_dummy, NULL, -1);
...@@ -748,7 +748,7 @@ protected: ...@@ -748,7 +748,7 @@ protected:
CallMethod(&subchans[0], &cntl, &req, &res, false); CallMethod(&subchans[0], &cntl, &req, &res, false);
ASSERT_TRUE(cntl.Failed()); ASSERT_TRUE(cntl.Failed());
ASSERT_EQ(brpc::EINTERNAL, cntl.ErrorCode()) << cntl.ErrorText(); ASSERT_EQ(brpc::EINTERNAL, cntl.ErrorCode()) << cntl.ErrorText();
ASSERT_EQ("[E2001][127.0.1.1:0]Method ComboEcho() not implemented.", cntl.ErrorText()); ASSERT_TRUE(butil::StringPiece(cntl.ErrorText()).ends_with("Method ComboEcho() not implemented."));
// do the rpc call. // do the rpc call.
cntl.Reset(); cntl.Reset();
......
...@@ -394,7 +394,7 @@ TEST(NamingServiceTest, consul_with_backup_file) { ...@@ -394,7 +394,7 @@ TEST(NamingServiceTest, consul_with_backup_file) {
restful_map.c_str())); restful_map.c_str()));
ASSERT_EQ(0, server.Start("localhost:8500", NULL)); ASSERT_EQ(0, server.Start("localhost:8500", NULL));
bthread_usleep(1000000); bthread_usleep(2000000);
butil::EndPoint n1; butil::EndPoint n1;
ASSERT_EQ(0, butil::str2endpoint("10.121.36.189:8003", &n1)); ASSERT_EQ(0, butil::str2endpoint("10.121.36.189:8003", &n1));
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment