Make LALB sensitive to errors & remove useless flags in LALB

d9bab076 · gejun · 4749ccd4 · d9bab076 · d9bab076 · d9bab076
Commit d9bab076 authored Nov 08, 2017 by gejun
9 changed files
--- a/src/brpc/channel.cpp
+++ b/src/brpc/channel.cpp
@@ -425,23 +425,9 @@ int Channel::CheckHealth() {
        return -1;
    } else {
        SocketUniquePtr tmp_sock;
-        LoadBalancer::SelectIn sel_in = { 0, false, 0, NULL };
+        LoadBalancer::SelectIn sel_in = { 0, false, false, 0, NULL };
        LoadBalancer::SelectOut sel_out(&tmp_sock);
-        const int rc = _lb->SelectServer(sel_in, &sel_out);
+        return _lb->SelectServer(sel_in, &sel_out);
-        if (rc != 0) {
-            return rc;
-        }
-        if (sel_out.need_feedback) {
-            LoadBalancer::CallInfo info;
-            info.in.begin_time_us = 0;
-            info.in.has_request_code = false;
-            info.in.request_code = 0;
-            info.in.excluded = NULL;
-            info.server_id = tmp_sock->id();
-            info.error_code = ECANCELED;
-            _lb->Feedback(info);
-        }
-        return 0;
    }
 }

--- a/src/brpc/controller.cpp
+++ b/src/brpc/controller.cpp
@@ -648,6 +648,9 @@ void* Controller::RunEndRPC(void* arg) {
 }
 inline bool does_error_affect_main_socket(int error_code) {
+    // Errors tested in this function are reported by pooled connections
+    // and very likely to indicate that the server-side is down and the socket
+    // should be health-checked.
    return error_code == ECONNREFUSED ||
        error_code == ENETUNREACH ||
        error_code == EHOSTUNREACH ||
@@ -732,13 +735,8 @@ void Controller::Call::OnComplete(Controller* c, int error_code/*note*/,
    sending_sock.reset(NULL);
    if (need_feedback) {
-        LoadBalancer::CallInfo info;
+        const LoadBalancer::CallInfo info =
-        info.in.begin_time_us = begin_time_us;
+            { begin_time_us, peer_id, error_code, c };
-        info.in.has_request_code = c->has_request_code();
-        info.in.request_code = c->request_code();
-        info.in.excluded = NULL;
-        info.server_id = peer_id;
-        info.error_code = error_code;
        c->_lb->Feedback(info);
    }
 }
@@ -947,7 +945,8 @@ void Controller::IssueRPC(int64_t start_realtime_us) {
        _current_call.peer_id = _single_server_id;
    } else {
        LoadBalancer::SelectIn sel_in =
-            { start_realtime_us, has_request_code(), _request_code, _accessed };
+            { start_realtime_us, true,
+              has_request_code(), _request_code, _accessed };
        LoadBalancer::SelectOut sel_out(&tmp_sock);
        const int rc = _lb->SelectServer(sel_in, &sel_out);
        if (rc != 0) {

--- a/src/brpc/load_balancer.h
+++ b/src/brpc/load_balancer.h
@@ -28,11 +28,15 @@
 namespace brpc {
+class Controller;
 // Select a server from a set of servers (in form of ServerId).
 class LoadBalancer : public NonConstDescribable, public Destroyable {
 public:
    struct SelectIn {
        int64_t begin_time_us;
+        // Weight of different nodes could be changed.
+        bool changable_weights;
        bool has_request_code;
        uint64_t request_code;
        const ExcludedServers* excluded;
@@ -46,9 +50,17 @@ public:
    };
    struct CallInfo {
-        LoadBalancer::SelectIn in;
+        // Exactly same with SelectIn.begin_time_us, may be different from
+        // controller->_begin_time_us which is beginning of the RPC.
+        int64_t begin_time_us;
+        // Remote side of the call.
        SocketId server_id;
+        // A RPC may have multiple calls, this error may be different from
+        // controller->ErrorCode();
        int error_code;
+        // The controller for the RPC. Should NOT be saved in Feedback()
+        // and used after the function.
+        const Controller* controller;
    };
    LoadBalancer() { }

--- a/src/brpc/policy/locality_aware_load_balancer.cpp
+++ b/src/brpc/policy/locality_aware_load_balancer.cpp
@@ -27,12 +27,12 @@
 namespace brpc {
 namespace policy {
-DEFINE_bool(quadratic_latency, false, "Divide square of latency");
+DEFINE_int64(min_weight, 1000, "Minimum weight of a node in LALB");
-DEFINE_int64(min_weight, 1000, "minimum weight");
+DEFINE_double(punish_inflight_ratio, 1.5, "Decrease weight proportionally if "
-DEFINE_int64(dev_multiple, 0, "Multiple of deviation");
+              "average latency of the inflight requests exeeds average "
-DEFINE_bool(count_inflight, true, "Adjust weight by inflight requests");
+              "latency of the node times this ratio");
+DEFINE_double(punish_error_ratio, 1.2,
-BRPC_VALIDATE_GFLAG(dev_multiple, NonNegativeInteger);
+              "Multiply latencies caused by errors with this ratio");
 static const int64_t DEFAULT_QPS = 1;
 static const size_t INITIAL_WEIGHT_TREE_SIZE = 128;
@@ -304,12 +304,12 @@ int LocalityAwareLoadBalancer::SelectServer(const SelectIn& in, SelectOut* out)
            }
        } else if (Socket::Address(info.server_id, out->ptr) == 0
                   && !(*out->ptr)->IsLogOff()) {
-            if (!FLAGS_count_inflight) {
-                return 0;
-            }
            if ((ntry + 1) == n  // Instead of fail with EHOSTDOWN, we prefer
                                 // choosing the server again.
                || !ExcludedServers::IsExcluded(in.excluded, info.server_id)) {
+                if (!in.changable_weights) {
+                    return 0;
+                }
                const Weight::AddInflightResult r =
                    info.weight->AddInflight(in, index, dice - left);
                if (r.weight_diff) {
@@ -324,7 +324,7 @@ int LocalityAwareLoadBalancer::SelectServer(const SelectIn& in, SelectOut* out)
            if (++ntry >= n) {
                break;
            }
-        } else if (FLAGS_count_inflight) {
+        } else if (in.changable_weights) {
            const int64_t diff =
                info.weight->MarkFailed(index, total / n);
            if (diff) {
@@ -350,18 +350,6 @@ int LocalityAwareLoadBalancer::SelectServer(const SelectIn& in, SelectOut* out)
 }
 void LocalityAwareLoadBalancer::Feedback(const CallInfo& info) {        
-    // Make latency larger when error is caused by backup request or timedout.
-    // If we don't do this, there's not much difference between a timedout
-    // request and a successful request camed back just before timeout.
-    int64_t latency_percent = 0;
-    if (info.error_code == 0) {
-        latency_percent = 100;
-    } else if (info.error_code == EBACKUPREQUEST) {
-        latency_percent = 150;
-    } else if (info.error_code == ERPCTIMEDOUT) {
-        latency_percent = 200;
-    } // else we will not update any weight on other errors
    butil::DoublyBufferedData<Servers>::ScopedPtr s;
    if (_db_servers.Read(&s) != 0) {
        return;
@@ -372,7 +360,7 @@ void LocalityAwareLoadBalancer::Feedback(const CallInfo& info) {
    }
    const size_t index = *pindex;
    Weight* w = s->weight_tree[index].weight;
-    const int64_t diff = w->Update(info, index, latency_percent);
+    const int64_t diff = w->Update(info, index);
    if (diff != 0) {
        s->UpdateParentWeights(diff, index);
        _total.fetch_add(diff, butil::memory_order_relaxed);
@@ -380,8 +368,9 @@ void LocalityAwareLoadBalancer::Feedback(const CallInfo& info) {
 }
 int64_t LocalityAwareLoadBalancer::Weight::Update(
-    const CallInfo& info, size_t index, int64_t latency_percent) {
+    const CallInfo& ci, size_t index) {
    const int64_t end_time_us = butil::gettimeofday_us();
+    const int64_t latency = end_time_us - ci.begin_time_us;
    BAIDU_SCOPED_LOCK(_mutex);
    if (Disabled()) {
        // The weight was disabled and will be removed soon, do nothing
@@ -389,46 +378,85 @@ int64_t LocalityAwareLoadBalancer::Weight::Update(
        return 0;
    }
-    _begin_time_sum -= info.in.begin_time_us;
+    _begin_time_sum -= ci.begin_time_us;
    --_begin_time_count;
-    const int64_t latency =
-        (end_time_us - info.in.begin_time_us) * latency_percent / 100;
    if (latency <= 0) {
-        // error happens or time skews, ignore the sample.
+        // time skews, ignore the sample.
        return 0;
    }
+    if (ci.error_code == 0) {
-    // Update latency and QPS
+        // Add a new entry
-    TimeInfo tm_info = { latency, end_time_us, latency * (double)latency };
+        TimeInfo tm_info = { latency, end_time_us };
        if (!_time_q.empty()) {
            tm_info.latency_sum += _time_q.bottom()->latency_sum;
-        tm_info.squared_latency_sum += _time_q.bottom()->squared_latency_sum;
        }
        _time_q.elim_push(tm_info);
-    const int64_t top = _time_q.top()->end_time_us;
+    } else {
+        // Accumulate into the last entry so that errors always decrease
+        // the overall QPS and latency.
+        // Note that the latency used is linearly mixed from the real latency
+        // (of an errorous call) and the timeout, so that errors that are more
+        // unlikely to be solved by later retries are punished more.
+        // Examples:
+        //   max_retry=0: always use timeout
+        //   max_retry=1, retried=0: latency
+        //   max_retry=1, retried=1: timeout
+        //   max_retry=2, retried=0: latency
+        //   max_retry=2, retried=1: (latency + timeout) / 2
+        //   max_retry=2, retried=2: timeout
+        //   ...
+        int ndone = 1;
+        int nleft = 0;
+        if (ci.controller->max_retry() > 0) {
+            ndone = ci.controller->retried_count();
+            nleft = ci.controller->max_retry() - ndone;
+        }
+        const int64_t err_latency =
+            (nleft * (int64_t)(latency * FLAGS_punish_error_ratio)
+             + ndone * ci.controller->timeout_ms() * 1000L) / (ndone + nleft);
+        if (!_time_q.empty()) {
+            TimeInfo* ti = _time_q.bottom();
+            ti->latency_sum += err_latency;
+            ti->end_time_us = end_time_us;
+        } else {
+            // If the first response is error, enlarge the latency as timedout
+            // since we know nothing about the normal latency yet.
+            const TimeInfo tm_info = {
+                std::max(err_latency, ci.controller->timeout_ms() * 1000L),
+                end_time_us
+            };
+            _time_q.push(tm_info);
+        }
+    }
+    const int64_t top_time_us = _time_q.top()->end_time_us;
    const size_t n = _time_q.size();
    int64_t scaled_qps = DEFAULT_QPS * WEIGHT_SCALE;
-    if (end_time_us > top) {
+    if (end_time_us > top_time_us) {        
+        // Only calculate scaled_qps when the queue is full or the elapse
+        // between bottom and top is reasonably large(so that error of the
+        // calculated QPS is probably smaller).
+        if (n == _time_q.capacity() ||
+            end_time_us >= top_time_us + 1000000L/*1s*/) { 
            // will not overflow.
-        scaled_qps = (n - 1) * 1000000L * WEIGHT_SCALE / (end_time_us - top);
+            scaled_qps = (n - 1) * 1000000L * WEIGHT_SCALE / (end_time_us - top_time_us);
            if (scaled_qps < WEIGHT_SCALE) {
                scaled_qps = WEIGHT_SCALE;
            }
-        _avg_latency = (tm_info.latency_sum - _time_q.top()->latency_sum) / (n - 1);
-        double avg_squared_latency = (tm_info.squared_latency_sum -
-                                      _time_q.top()->squared_latency_sum) / (n - 1);
-        _dev = (int64_t)sqrt(avg_squared_latency - _avg_latency * (double)_avg_latency);
-    } else {
-        _avg_latency = latency;
-        _dev = _avg_latency;
        }
+        _avg_latency = (_time_q.bottom()->latency_sum -
-    if (FLAGS_quadratic_latency) {
+                        _time_q.top()->latency_sum) / (n - 1);
-        _base_weight = scaled_qps / _avg_latency * 100000 / _avg_latency;
+    } else if (n == 1) {
+        _avg_latency = _time_q.bottom()->latency_sum;
    } else {
-        _base_weight = scaled_qps / _avg_latency;
+        // end_time_us <= top_time_us && n > 1: the QPS is so high that
+        // the time elapse between top and bottom is 0(possible in examples),
+        // or time skews, we don't update the weight for safety.
+        return 0;
    }
+    _base_weight = scaled_qps / _avg_latency;
    return ResetWeight(index, end_time_us);
 }
@@ -449,9 +477,8 @@ void LocalityAwareLoadBalancer::Weight::Describe(std::ostream& os, int64_t now)
    size_t n = _time_q.size();
    double qps = 0;
    int64_t avg_latency = _avg_latency;
-    int64_t dev = _dev;
    if (n <= 1UL) {
-        qps = DEFAULT_QPS;
+        qps = 0;
    } else {
        if (n == _time_q.capacity()) {
            --n;
@@ -471,7 +498,6 @@ void LocalityAwareLoadBalancer::Weight::Describe(std::ostream& os, int64_t now)
        os << " inflight_delay=0";
    }
    os  << " avg_latency=" << avg_latency
-        << " dev=" << dev
        << " expected_qps=" << qps;
 }
@@ -492,7 +518,14 @@ void LocalityAwareLoadBalancer::Describe(
        os << '[';
        for (size_t i = 0; i < n; ++i) {
            const ServerInfo & info = s->weight_tree[i];
-            os << "\n{id=" << info.server_id << " left="
+            os << "\n{id=" << info.server_id;
+            {
+                SocketUniquePtr tmp_ptr;
+                if (Socket::Address(info.server_id, &tmp_ptr) != 0) {
+                    os << "(broken)";
+                }
+            }
+            os << " left="
               << info.left->load(butil::memory_order_relaxed) << ' ';
            info.weight->Describe(os, now);
            os << '}';
@@ -511,7 +544,6 @@ LocalityAwareLoadBalancer::Weight::Weight(int64_t initial_weight)
    , _old_index((size_t)-1L)
    , _old_weight(0)
    , _avg_latency(0)
-    , _dev(0)
    , _time_q(_time_q_items, sizeof(_time_q_items), butil::NOT_OWN_STORAGE) {
 }

--- a/src/brpc/policy/locality_aware_load_balancer.h
+++ b/src/brpc/policy/locality_aware_load_balancer.h
@@ -31,7 +31,7 @@ namespace brpc {
 namespace policy {
 DECLARE_int64(min_weight);
-DECLARE_int64(dev_multiple);
+DECLARE_double(punish_inflight_ratio);
 // Locality-aware is an iterative algorithm to send requests to servers which
 // have lowest expected latencies. Read docs/cn/lalb.md to get a peek at the
@@ -54,7 +54,6 @@ private:
    struct TimeInfo {
        int64_t latency_sum;         // microseconds
        int64_t end_time_us;
-        double squared_latency_sum;  // for calculating deviation
    };
    class Servers;
@@ -68,7 +67,7 @@ private:
        // Called in Feedback() to recalculate _weight.
        // Returns diff of _weight.
-        int64_t Update(const CallInfo&, size_t index, int64_t latency_percent);
+        int64_t Update(const CallInfo&, size_t index);
        // Weight of self. Notice that this value may change at any time.
        int64_t volatile_value() const { return _weight; }
@@ -100,7 +99,6 @@ private:
        size_t _old_index;
        int64_t _old_weight;
        int64_t _avg_latency;
-        int64_t _dev;
        butil::BoundedQueue<TimeInfo> _time_q;
        // content of _time_q
        TimeInfo _time_q_items[RECV_QUEUE_SIZE];
@@ -168,15 +166,10 @@ inline int64_t LocalityAwareLoadBalancer::Weight::ResetWeight(
    if (_begin_time_count > 0) {
        const int64_t inflight_delay =
            now_us - _begin_time_sum / _begin_time_count;
-        // note: we only punish latencies at least twice of average latency
+        const int64_t punish_latency =
-        // when FLAGS_dev_multiple is 0.
+            (int64_t)(_avg_latency * FLAGS_punish_inflight_ratio);
-        int64_t punish_latency = _avg_latency * 2;
-        const int64_t dev = FLAGS_dev_multiple * _dev;
-        if (dev > 0) {
-            punish_latency = _avg_latency + dev;
-        }            
        if (inflight_delay >= punish_latency && _avg_latency > 0) {
-            new_weight = new_weight * _avg_latency / inflight_delay;
+            new_weight = new_weight * punish_latency / inflight_delay;
        }
    }
    if (new_weight < FLAGS_min_weight) {

--- a/src/brpc/selective_channel.cpp
+++ b/src/brpc/selective_channel.cpp
@@ -284,6 +284,7 @@ Sender::Sender(Controller* cntl,
 int Sender::IssueRPC(int64_t start_realtime_us) {
    _main_cntl->_current_call.need_feedback = false;
    LoadBalancer::SelectIn sel_in = { start_realtime_us,
+                                      true,
                                      _main_cntl->has_request_code(),
                                      _main_cntl->_request_code,
                                      _main_cntl->_accessed };

--- a/src/brpc/socket.cpp
+++ b/src/brpc/socket.cpp
@@ -74,6 +74,17 @@ DEFINE_int32(max_connection_pool_size, 100,
             "maximum pooled connection count to a single endpoint");
 BRPC_VALIDATE_GFLAG(max_connection_pool_size, PassValidate);
+DEFINE_int32(connect_timeout_as_unreachable, 3,
+             "If the socket failed to connect due to ETIMEDOUT for so many "
+             "times *continuously*, the error is changed to ENETUNREACH which "
+             "fails the main socket as well when this socket is pooled.");
+static bool validate_connect_timeout_as_unreachable(const char*, int32_t v) {
+    return v >= 2 && v < 1000/*large enough*/;
+}
+BRPC_VALIDATE_GFLAG(connect_timeout_as_unreachable,
+                         validate_connect_timeout_as_unreachable);
 const int WAIT_EPOLLOUT_TIMEOUT_MS = 50;
 #ifdef BAIDU_INTERNAL
@@ -148,6 +159,9 @@ public:
    // The socket newing this object.
    SocketId creator_socket_id;
+    // Counting number of continuous ETIMEDOUT
+    butil::atomic<int> num_continuous_connect_timeouts;
    // _in_size, _in_num_messages, _out_size, _out_num_messages of pooled
    // sockets are counted into the corresponding fields in their _main_socket.
    butil::atomic<size_t> in_size;
@@ -168,6 +182,7 @@ public:
 Socket::SharedPart::SharedPart(SocketId creator_socket_id2)
    : socket_pool(NULL)
    , creator_socket_id(creator_socket_id2)
+    , num_continuous_connect_timeouts(0)
    , in_size(0)
    , in_num_messages(0)
    , out_size(0)
@@ -1320,6 +1335,10 @@ void Socket::AfterAppConnected(int err, void* data) {
    WriteRequest* req = static_cast<WriteRequest*>(data);
    if (err == 0) {
        Socket* const s = req->socket;
+        SharedPart* sp = s->GetSharedPart();
+        if (sp) {
+            sp->num_continuous_connect_timeouts.store(0, butil::memory_order_relaxed);
+        }
        // requests are not setup yet. check the comment on Setup() in Write()
        req->Setup(s);
        bthread_t th;
@@ -1330,6 +1349,18 @@ void Socket::AfterAppConnected(int err, void* data) {
        }
    } else {
        SocketUniquePtr s(req->socket);
+        if (err == ETIMEDOUT) {
+            SharedPart* sp = s->GetOrNewSharedPart();
+            if (sp->num_continuous_connect_timeouts.fetch_add(
+                    1, butil::memory_order_relaxed) + 1 >=
+                FLAGS_connect_timeout_as_unreachable) {
+                // the race between store and fetch_add(in another thread) is
+                // OK since a critial error is about to return.
+                sp->num_continuous_connect_timeouts.store(
+                    0, butil::memory_order_relaxed);
+                err = ENETUNREACH;
+            }
+        }
        s->SetFailed(err, "Fail to connect %s: %s",
                     s->description().c_str(), berror(err));
        s->ReleaseAllFailedWriteRequests(req);

--- a/test/brpc_load_balancer_unittest.cpp
+++ b/test/brpc_load_balancer_unittest.cpp
@@ -20,7 +20,6 @@
 namespace brpc {
 namespace policy {
-DECLARE_bool(count_inflight);
 extern uint32_t CRCHash32(const char *key, size_t len);
 }}
@@ -201,7 +200,7 @@ void* select_server(void* arg) {
    brpc::LoadBalancer* c = sa->lb;
    brpc::SocketUniquePtr ptr;
    CountMap *selected_count = new CountMap;
-    brpc::LoadBalancer::SelectIn in = { 0, false, 0u, NULL };
+    brpc::LoadBalancer::SelectIn in = { 0, false, false, 0u, NULL };
    brpc::LoadBalancer::SelectOut out(&ptr);
    uint32_t rand_seed = rand();
    if (sa->hash) {
@@ -233,9 +232,6 @@ class SaveRecycle : public brpc::SocketUser {
 };
 TEST_F(LoadBalancerTest, update_while_selection) {
-    const bool saved = brpc::policy::FLAGS_count_inflight;
-    brpc::policy::FLAGS_count_inflight = false;
    for (size_t round = 0; round < 4; ++round) {
        brpc::LoadBalancer* lb = NULL;
        SelectArg sa = { NULL, NULL};
@@ -256,7 +252,7 @@ TEST_F(LoadBalancerTest, update_while_selection) {
        // Accessing empty lb should result in error.
        brpc::SocketUniquePtr ptr;
-        brpc::LoadBalancer::SelectIn in = { 0, true, 0, NULL };
+        brpc::LoadBalancer::SelectIn in = { 0, false, true, 0, NULL };
        brpc::LoadBalancer::SelectOut out(&ptr);
        ASSERT_EQ(ENODATA, lb->SelectServer(in, &out));
@@ -344,12 +340,9 @@ TEST_F(LoadBalancerTest, update_while_selection) {
        }
        delete lb;
    }
-    brpc::policy::FLAGS_count_inflight = saved;
 }
 TEST_F(LoadBalancerTest, fairness) {
-    const bool saved = brpc::policy::FLAGS_count_inflight;
-    brpc::policy::FLAGS_count_inflight = false;
    for (size_t round = 0; round < 4; ++round) {
        brpc::LoadBalancer* lb = NULL;
        SelectArg sa = { NULL, NULL};
@@ -447,7 +440,6 @@ TEST_F(LoadBalancerTest, fairness) {
        }
        delete lb;
    }
-    brpc::policy::FLAGS_count_inflight = saved;
 }
 TEST_F(LoadBalancerTest, consistent_hashing) {
@@ -492,7 +484,7 @@ TEST_F(LoadBalancerTest, consistent_hashing) {
        const size_t SELECT_TIMES = 1000000;
        std::map<butil::EndPoint, size_t> times;
        brpc::SocketUniquePtr ptr;
-        brpc::LoadBalancer::SelectIn in = { 0, false, 0u, NULL };
+        brpc::LoadBalancer::SelectIn in = { 0, false, false, 0u, NULL };
        ::brpc::LoadBalancer::SelectOut out(&ptr);
        for (size_t i = 0; i < SELECT_TIMES; ++i) {
            in.has_request_code = true;

--- a/test/brpc_naming_service_filter_unittest.cpp
+++ b/test/brpc_naming_service_filter_unittest.cpp
@@ -53,7 +53,7 @@ TEST_F(NamingServiceFilterTest, sanity) {
    ASSERT_EQ(0, butil::hostname2endpoint("10.128.0.1:1234", &ep));
    for (int i = 0; i < 10; ++i) {
        brpc::SocketUniquePtr tmp_sock;
-        brpc::LoadBalancer::SelectIn sel_in = { 0, false, 0, NULL };
+        brpc::LoadBalancer::SelectIn sel_in = { 0, false, false, 0, NULL };
        brpc::LoadBalancer::SelectOut sel_out(&tmp_sock);
        ASSERT_EQ(0, channel._lb->SelectServer(sel_in, &sel_out));
        ASSERT_EQ(ep, tmp_sock->remote_side());