// Copyright (c) 2015 Baidu, Inc.
// 
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// 
//     http://www.apache.org/licenses/LICENSE-2.0
// 
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Author: Ge,Jun (gejun@baidu.com)
// Date: Mon Dec 14 19:12:30 CST 2015

#include <map>
#include <gflags/gflags.h>
#include "butil/memory/singleton_on_pthread_once.h"
#include "bvar/bvar.h"
#include "bvar/collector.h"

namespace bvar {

// TODO: Do we need to expose this flag? Dumping thread may dump different
// kind of samples, users are unlikely to make good decisions on this value.
DEFINE_int32(bvar_collector_max_pending_samples, 1000,
             "Destroy unprocessed samples when they're too many");

DEFINE_int32(bvar_collector_expected_per_second, 1000,
             "Expected number of samples to be collected per second");

// CAUTION: Don't change this value unless you know exactly what it means.
static const int64_t COLLECTOR_GRAB_INTERVAL_US = 100000L; // 100ms

BAIDU_CASSERT(!(COLLECTOR_SAMPLING_BASE & (COLLECTOR_SAMPLING_BASE - 1)),
              must_be_power_of_2);

// Combine two circular linked list into one.
struct CombineCollected {
    void operator()(Collected* & s1, Collected* s2) const {
        if (s2 == NULL) {
            return;
        }
        if (s1 == NULL) {
            s1 = s2;
            return;
        }
        s1->InsertBeforeAsList(s2);
    }
};

// A thread and a special bvar to collect samples submitted.
class Collector : public bvar::Reducer<Collected*, CombineCollected> {
public:
    Collector();
    ~Collector();

    int64_t last_active_cpuwide_us() const { return _last_active_cpuwide_us; }

    void wakeup_grab_thread();

private:
    // The thread for collecting TLS submissions.
    void grab_thread();

    // The thread for calling user's callbacks.
    void dump_thread();

    // Adjust speed_limit if grab_thread collected too many in one round.
    void update_speed_limit(CollectorSpeedLimit* speed_limit,
                            size_t* last_ngrab, size_t cur_ngrab,
                            int64_t interval_us);

    static void* run_grab_thread(void* arg) {
        static_cast<Collector*>(arg)->grab_thread();
        return NULL;
    }

    static void* run_dump_thread(void* arg) {
        static_cast<Collector*>(arg)->dump_thread();
        return NULL;
    }

    static int64_t get_pending_count(void* arg) {
        Collector* d = static_cast<Collector*>(arg);
        return d->_ngrab - d->_ndump - d->_ndrop;
    }
    
private:
    // periodically modified by grab_thread, accessed by every submit.
    // Make sure that this cacheline does not include frequently modified field.
    int64_t _last_active_cpuwide_us;
    
    bool _created;      // Mark validness of _grab_thread.
    bool _stop;         // Set to true in dtor.
    pthread_t _grab_thread;     // For joining.
    pthread_t _dump_thread;
    int64_t _ngrab BAIDU_CACHELINE_ALIGNMENT;
    int64_t _ndrop;
    int64_t _ndump;
    pthread_mutex_t _dump_thread_mutex;
    pthread_cond_t _dump_thread_cond;
    butil::LinkNode<Collected> _dump_root;
    pthread_mutex_t _sleep_mutex;
    pthread_cond_t _sleep_cond;
};

Collector::Collector()
    : _last_active_cpuwide_us(butil::cpuwide_time_us())
    , _created(false)
    , _stop(false)
    , _grab_thread(0)
    , _dump_thread(0)
    , _ngrab(0)
    , _ndrop(0)
    , _ndump(0) {
    pthread_mutex_init(&_dump_thread_mutex, NULL);
    pthread_cond_init(&_dump_thread_cond, NULL);
    pthread_mutex_init(&_sleep_mutex, NULL);
    pthread_cond_init(&_sleep_cond, NULL);
    int rc = pthread_create(&_grab_thread, NULL, run_grab_thread, this);
    if (rc != 0) {
        LOG(ERROR) << "Fail to create Collector, " << berror(rc);
    } else {
        _created = true;
    }
}

Collector::~Collector() {
    if (_created) {
        _stop = true;
        pthread_join(_grab_thread, NULL);
        _created = false;
    }
    pthread_mutex_destroy(&_dump_thread_mutex);
    pthread_cond_destroy(&_dump_thread_cond);
    pthread_mutex_destroy(&_sleep_mutex);
    pthread_cond_destroy(&_sleep_cond);
}

template <typename T>
static T deref_value(void* arg) {
    return *(T*)arg;
}

// for limiting samples returning NULL in speed_limit()
static CollectorSpeedLimit g_null_speed_limit = BVAR_COLLECTOR_SPEED_LIMIT_INITIALIZER;

void Collector::grab_thread() {
    _last_active_cpuwide_us = butil::cpuwide_time_us();
    int64_t last_before_update_sl = _last_active_cpuwide_us;

    // This is the thread for collecting TLS submissions. User's callbacks are
    // called inside the separate _dump_thread to prevent a slow callback
    // (caused by busy disk generally) from blocking collecting code too long
    // that pending requests may explode memory.
    CHECK_EQ(0, pthread_create(&_dump_thread, NULL, run_dump_thread, this));

    // vars
    bvar::PassiveStatus<int64_t> pending_sampled_data(
        "bvar_collector_pending_samples", get_pending_count, this);
    double busy_seconds = 0;
    bvar::PassiveStatus<double> busy_seconds_var(deref_value<double>, &busy_seconds);
    bvar::PerSecond<bvar::PassiveStatus<double> > busy_seconds_second(
        "bvar_collector_grab_thread_usage", &busy_seconds_var);

    bvar::PassiveStatus<int64_t> ngrab_var(deref_value<int64_t>, &_ngrab);
    bvar::PerSecond<bvar::PassiveStatus<int64_t> > ngrab_second(
        "bvar_collector_grab_second", &ngrab_var);

    // Maps for calculating speed limit.
    typedef std::map<CollectorSpeedLimit*, size_t> GrapMap;
    GrapMap last_ngrab_map;
    GrapMap ngrab_map;
    // Map for group samples by preprocessors.
    typedef std::map<CollectorPreprocessor*, std::vector<Collected*> >
        PreprocessorMap;
    PreprocessorMap prep_map;

    // The main loop.
    while (!_stop) {
        const int64_t abstime = _last_active_cpuwide_us + COLLECTOR_GRAB_INTERVAL_US;

        // Clear and reuse vectors in prep_map, don't clear prep_map directly.
        for (PreprocessorMap::iterator it = prep_map.begin(); it != prep_map.end();
             ++it) {
            it->second.clear();
        }

        // Collect TLS submissions and give them to dump_thread.
        butil::LinkNode<Collected>* head = this->reset();
        if (head) {
            butil::LinkNode<Collected> tmp_root;
            head->InsertBeforeAsList(&tmp_root);
            head = NULL;
            
            // Group samples by preprocessors.
            for (butil::LinkNode<Collected>* p = tmp_root.next(); p != &tmp_root;) {
                butil::LinkNode<Collected>* saved_next = p->next();
                p->RemoveFromList();
                CollectorPreprocessor* prep = p->value()->preprocessor();
                prep_map[prep].push_back(p->value());
                p = saved_next;
            }
            // Iterate prep_map
            butil::LinkNode<Collected> root;
            for (PreprocessorMap::iterator it = prep_map.begin();
                 it != prep_map.end(); ++it) {
                std::vector<Collected*> & list = it->second;
                if (it->second.empty()) {
                    // don't call preprocessor when there's no samples.
                    continue;
                }
                if (it->first != NULL) {
                    it->first->process(list);
                }
                for (size_t i = 0; i < list.size(); ++i) {
                    Collected* p = list[i];
                    CollectorSpeedLimit* speed_limit = p->speed_limit();
                    if (speed_limit == NULL) {
                        ++ngrab_map[&g_null_speed_limit];
                    } else {
                        // Add up the samples of certain type.
                        ++ngrab_map[speed_limit];
                    }
                    // Drop samples if dump_thread is too busy.
                    // FIXME: equal probabilities to drop.
                    ++_ngrab;
                    if (_ngrab >= _ndrop + _ndump +
                        FLAGS_bvar_collector_max_pending_samples) {
                        ++_ndrop;
                        p->destroy();
                    } else {
                        p->InsertBefore(&root);
                    }
                }
            }
            // Give the samples to dump_thread
            if (root.next() != &root) {  // non empty
                butil::LinkNode<Collected>* head2 = root.next();
                root.RemoveFromList();
                BAIDU_SCOPED_LOCK(_dump_thread_mutex);
                head2->InsertBeforeAsList(&_dump_root);
                pthread_cond_signal(&_dump_thread_cond);
            }
        }
        int64_t now = butil::cpuwide_time_us();
        int64_t interval = now - last_before_update_sl;
        last_before_update_sl = now;
        for (GrapMap::iterator it = ngrab_map.begin();
             it != ngrab_map.end(); ++it) {
            update_speed_limit(it->first, &last_ngrab_map[it->first],
                               it->second, interval);
        }
        
        now = butil::cpuwide_time_us();
        // calcuate thread usage.
        busy_seconds += (now - _last_active_cpuwide_us) / 1000000.0;
        _last_active_cpuwide_us = now;

        // sleep for the next round.
        if (!_stop && abstime > now) {
            timespec abstimespec = butil::microseconds_from_now(abstime - now);
            pthread_mutex_lock(&_sleep_mutex);
            pthread_cond_timedwait(&_sleep_cond, &_sleep_mutex, &abstimespec);
            pthread_mutex_unlock(&_sleep_mutex);
        }
        _last_active_cpuwide_us = butil::cpuwide_time_us();
    }
    // make sure _stop is true, we may have other reasons to quit above loop
    {
        BAIDU_SCOPED_LOCK(_dump_thread_mutex);
        _stop = true; 
        pthread_cond_signal(&_dump_thread_cond);
    }
    CHECK_EQ(0, pthread_join(_dump_thread, NULL));
}

void Collector::wakeup_grab_thread() {
    pthread_mutex_lock(&_sleep_mutex);
    pthread_cond_signal(&_sleep_cond);
    pthread_mutex_unlock(&_sleep_mutex);
}

// Adjust speed_limit to match collected samples per second
void Collector::update_speed_limit(CollectorSpeedLimit* sl,
                                   size_t* last_ngrab, size_t cur_ngrab,
                                   int64_t interval_us) {
    // FIXME: May become too large at startup.
    const size_t round_ngrab = cur_ngrab - *last_ngrab;
    if (round_ngrab == 0) {
        return;
    }
    *last_ngrab = cur_ngrab;
    if (interval_us < 0) {
        interval_us = 0;
    }
    size_t new_sampling_range = 0;
    const size_t old_sampling_range = sl->sampling_range;
    if (!sl->ever_grabbed) {
        if (sl->first_sample_real_us) {
            interval_us = butil::gettimeofday_us() - sl->first_sample_real_us;
            if (interval_us < 0) {
                interval_us = 0;
            }
        } else {
            // Rare. the timestamp is still not set or visible yet. Just
            // use the default interval which may make the calculated
            // sampling_range larger.
        }
        new_sampling_range = FLAGS_bvar_collector_expected_per_second
            * interval_us * COLLECTOR_SAMPLING_BASE / (1000000L * round_ngrab);
    } else {
        // NOTE: the multiplications are unlikely to overflow.
        new_sampling_range = FLAGS_bvar_collector_expected_per_second
            * interval_us * old_sampling_range / (1000000L * round_ngrab);
        // Don't grow or shrink too fast.
        if (interval_us < 1000000L) {
            new_sampling_range =
                (new_sampling_range * interval_us +
                 old_sampling_range * (1000000L - interval_us)) / 1000000L;
        }
    }
    // Make sure new value is sane.
    if (new_sampling_range == 0) {
        new_sampling_range = 1;
    } else if (new_sampling_range > COLLECTOR_SAMPLING_BASE) {
        new_sampling_range = COLLECTOR_SAMPLING_BASE;
    }

    // NOTE: don't update unmodified fields in sl to avoid meaningless
    // flushing of the cacheline. 
    if (new_sampling_range != old_sampling_range) {
        sl->sampling_range = new_sampling_range;
    }
    if (!sl->ever_grabbed) {
        sl->ever_grabbed = true;
    }
}

size_t is_collectable_before_first_time_grabbed(CollectorSpeedLimit* sl) {
    if (!sl->ever_grabbed) {
        int before_add = sl->count_before_grabbed.fetch_add(
            1, butil::memory_order_relaxed);
        if (before_add == 0) {
            sl->first_sample_real_us = butil::gettimeofday_us();
        } else if (before_add >= FLAGS_bvar_collector_expected_per_second) {
            butil::get_leaky_singleton<Collector>()->wakeup_grab_thread();
        }
    }
    return sl->sampling_range;
}

// Call user's callbacks in this thread.
void Collector::dump_thread() {
    int64_t last_ns = butil::cpuwide_time_ns();

    // vars
    double busy_seconds = 0;
    bvar::PassiveStatus<double> busy_seconds_var(deref_value<double>, &busy_seconds);
    bvar::PerSecond<bvar::PassiveStatus<double> > busy_seconds_second(
        "bvar_collector_dump_thread_usage", &busy_seconds_var);

    bvar::PassiveStatus<int64_t> ndumped_var(deref_value<int64_t>, &_ndump);
    bvar::PerSecond<bvar::PassiveStatus<int64_t> > ndumped_second(
        "bvar::collector_dump_second", &ndumped_var);

    butil::LinkNode<Collected> root;
    size_t round = 0;

    // The main loop
    while (!_stop) {
        ++round;
        // Get new samples set by grab_thread.
        butil::LinkNode<Collected>* newhead = NULL;
        {
            BAIDU_SCOPED_LOCK(_dump_thread_mutex);
            while (!_stop && _dump_root.next() == &_dump_root) {
                const int64_t now_ns = butil::cpuwide_time_ns();
                busy_seconds += (now_ns - last_ns) / 1000000000.0;
                pthread_cond_wait(&_dump_thread_cond, &_dump_thread_mutex);
                last_ns = butil::cpuwide_time_ns();
            }
            if (_stop) {
                break;
            }
            newhead = _dump_root.next();
            _dump_root.RemoveFromList();
        }
        CHECK(newhead != &_dump_root);
        newhead->InsertBeforeAsList(&root);

        // Call callbacks.
        for (butil::LinkNode<Collected>* p = root.next(); !_stop && p != &root;) {
            // We remove p from the list, save next first.
            butil::LinkNode<Collected>* saved_next = p->next();
            p->RemoveFromList();
            Collected* s = p->value();
            s->dump_and_destroy(round);
            ++_ndump;
            p = saved_next;
        }
    }
}

void Collected::submit(int64_t cpuwide_us) {
    Collector* d = butil::get_leaky_singleton<Collector>();
    // Destroy the sample in-place if the grab_thread did not run for twice
    // of the normal interval. This also applies to the situation that
    // grab_thread aborts due to severe errors.
    // Collector::_last_active_cpuwide_us is periodically modified by grab_thread,
    // cache bouncing is tolerable.
    if (cpuwide_us < d->last_active_cpuwide_us() + COLLECTOR_GRAB_INTERVAL_US * 2) {
        *d << this;
    } else {
        destroy();
    }
}

static double get_sampling_ratio(void* arg) {
    return ((const CollectorSpeedLimit*)arg)->sampling_range /
        (double)COLLECTOR_SAMPLING_BASE;
}

DisplaySamplingRatio::DisplaySamplingRatio(const char* name,
                                           const CollectorSpeedLimit* sl)
    : _var(name, get_sampling_ratio, (void*)sl) {
}

}  // namespace bvar