// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_CXX11_THREADPOOL_THREAD_LOCAL_H
#define EIGEN_CXX11_THREADPOOL_THREAD_LOCAL_H

// Standard library dependencies of this header.
#include <atomic>
#include <functional>
#include <mutex>
#include <thread>
#include <unordered_map>

#ifdef EIGEN_AVOID_THREAD_LOCAL

#ifdef EIGEN_THREAD_LOCAL
#undef EIGEN_THREAD_LOCAL
#endif

#else

#if EIGEN_MAX_CPP_VER >= 11 &&                         \
    ((EIGEN_COMP_GNUC && EIGEN_GNUC_AT_LEAST(4, 8)) || \
     __has_feature(cxx_thread_local)                || \
     (EIGEN_COMP_MSVC >= 1900) )
#define EIGEN_THREAD_LOCAL static thread_local
#endif
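
// When the check above passes, EIGEN_THREAD_LOCAL expands to
// `static thread_local`. An illustrative use (the variable name below is not
// from Eigen):
//
//   EIGEN_THREAD_LOCAL int counter = 0;
//   // equivalent to: static thread_local int counter = 0;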

// Disable TLS for Apple and Android builds with older toolchains.
#if defined(__APPLE__)
// Included for TARGET_OS_IPHONE, __IPHONE_OS_VERSION_MIN_REQUIRED,
// __IPHONE_8_0.
#include <Availability.h>
#include <TargetConditionals.h>
#endif
// Checks whether C++11's `thread_local` storage duration specifier is
// supported.
#if defined(__apple_build_version__) &&     \
    ((__apple_build_version__ < 8000042) || \
     (TARGET_OS_IPHONE && __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_9_0))
// Note: Xcode's clang did not support `thread_local` until version 8, and
// even then it is not supported when targeting iOS versions older than 9.0.
#undef EIGEN_THREAD_LOCAL

#elif defined(__ANDROID__) && EIGEN_COMP_CLANG
// There are platforms for which TLS should not be used even though the compiler
// makes it seem like it's supported (Android NDK < r12b for example).
// This is primarily because of linker problems and toolchain misconfiguration:
// TLS isn't supported until NDK r12b per
// https://developer.android.com/ndk/downloads/revision_history.html
// Since NDK r16, `__NDK_MAJOR__` and `__NDK_MINOR__` are defined in
// <android/ndk-version.h>. For NDK < r16, users should define these macros,
// e.g. `-D__NDK_MAJOR__=11 -D__NDK_MINOR__=0` for NDK r11.
#if __has_include(<android/ndk-version.h>)
#include <android/ndk-version.h>
#endif  // __has_include(<android/ndk-version.h>)
#if defined(__ANDROID__) && defined(__clang__) && defined(__NDK_MAJOR__) && \
    defined(__NDK_MINOR__) &&                                               \
    ((__NDK_MAJOR__ < 12) || ((__NDK_MAJOR__ == 12) && (__NDK_MINOR__ < 1)))
#undef EIGEN_THREAD_LOCAL
#endif
#endif  // defined(__apple_build_version__) || defined(__ANDROID__)

#endif  // EIGEN_AVOID_THREAD_LOCAL

namespace Eigen {

namespace internal {
template <typename T>
struct ThreadLocalNoOpInitialize {
  void operator()(T&) const {}
};

template <typename T>
struct ThreadLocalNoOpRelease {
  void operator()(T&) const {}
};

}  // namespace internal

// Thread local container for elements of type T that does not use thread local
// storage. As long as the number of unique threads accessing this storage
// is smaller than `capacity_`, it is lock-free and wait-free. Otherwise it
// falls back on a mutex for synchronization.
//
// Type `T` has to be default constructible, and by default each thread will
// get a default constructed value. It is possible to specify a custom
// `initialize` callable that will be called lazily from each thread accessing
// this object, and that will be passed a default constructed object of type
// `T`. It is also possible to pass a custom `release` callable that will be
// invoked before ~T() is called.
//
// Example:
//
//   struct Counter {
//     int value = 0;
//   };
//
//   Eigen::ThreadLocal<Counter> counter(10);
//
//   // Each thread will have access to its own counter object.
//   Counter& cnt = counter.local();
//   cnt.value++;
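//
// A custom `initialize` and a custom `release` callable can be supplied as
// well. A minimal sketch (the `InitCounter` and `FreeCounter` functors below
// are illustrative, not part of Eigen):
//
//   struct InitCounter {
//     void operator()(Counter& counter) const { counter.value = 1; }
//   };
//
//   struct FreeCounter {
//     void operator()(Counter& counter) const { counter.value = 0; }
//   };
//
//   Eigen::ThreadLocal<Counter, InitCounter, FreeCounter> counter(
//       10, InitCounter(), FreeCounter());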
//
// WARNING: Eigen::ThreadLocal uses the OS-specific value returned by
// std::this_thread::get_id() to identify threads. This value is not guaranteed
// to be unique except for the life of the thread. A newly created thread may
// get an OS-specific ID equal to that of an already destroyed thread.
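//
// For example (an illustrative consequence of the above): if thread A is
// destroyed and thread B is later created with the same
// std::this_thread::get_id() value, B's first call to `local()` returns the
// record that was initialized for A rather than a fresh value.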
//
// Somewhat similar to TBB thread local storage, with similar restrictions:
// https://www.threadingbuildingblocks.org/docs/help/reference/thread_local_storage/enumerable_thread_specific_cls.html
//
template <typename T,
          typename Initialize = internal::ThreadLocalNoOpInitialize<T>,
          typename Release = internal::ThreadLocalNoOpRelease<T>>
class ThreadLocal {
  // We preallocate default constructed elements in MaxSizeVector.
  static_assert(std::is_default_constructible<T>::value,
                "ThreadLocal data type must be default constructible");

 public:
  explicit ThreadLocal(int capacity)
      : ThreadLocal(capacity, internal::ThreadLocalNoOpInitialize<T>(),
                    internal::ThreadLocalNoOpRelease<T>()) {}

  ThreadLocal(int capacity, Initialize initialize)
      : ThreadLocal(capacity, std::move(initialize),
                    internal::ThreadLocalNoOpRelease<T>()) {}

  ThreadLocal(int capacity, Initialize initialize, Release release)
      : initialize_(std::move(initialize)),
        release_(std::move(release)),
        capacity_(capacity),
        data_(capacity_),
        ptr_(capacity_),
        filled_records_(0) {
    eigen_assert(capacity_ >= 0);
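    // Default-construct all records up front and mark every lookup table slot
    // as empty (nullptr).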
    data_.resize(capacity_);
    for (int i = 0; i < capacity_; ++i) {
      ptr_.emplace_back(nullptr);
    }
  }

  T& local() {
    std::thread::id this_thread = std::this_thread::get_id();
    if (capacity_ == 0) return SpilledLocal(this_thread);

    std::size_t h = std::hash<std::thread::id>()(this_thread);
    const int start_idx = h % capacity_;

    // NOTE: From the definition of `std::this_thread::get_id()` it is
    // guaranteed that we can never have concurrent insertions with the same
    // key into our hash-map-like data structure. If we didn't find an element
    // during the initial traversal, it's guaranteed that no one else could
    // have inserted it while we are in this function. This allows us to
    // massively simplify our lock-free insert-only hash map.

    // Check if we already have an element for `this_thread`.
    int idx = start_idx;
    while (ptr_[idx].load() != nullptr) {
      ThreadIdAndValue& record = *(ptr_[idx].load());
      if (record.thread_id == this_thread) return record.value;

      idx += 1;
      if (idx >= capacity_) idx -= capacity_;
      if (idx == start_idx) break;
    }

    // If we are here, it means that we found an insertion point in the lookup
    // table at `idx`, or we did a full traversal and the table is full.

    // If the lock-free storage is full, fall back on the mutex-guarded map.
    if (filled_records_.load() >= capacity_) return SpilledLocal(this_thread);

    // We double check that we still have space to insert an element into the
    // lock-free storage. If the old value of `filled_records_` is greater than
    // or equal to the capacity, it means that some other thread added an
    // element while we were traversing the lookup table.
    int insertion_index =
        filled_records_.fetch_add(1, std::memory_order_relaxed);
    if (insertion_index >= capacity_) return SpilledLocal(this_thread);

    // At this point it's guaranteed that we can access data_[insertion_index]
    // without a data race.
    data_[insertion_index].thread_id = this_thread;
    initialize_(data_[insertion_index].value);

    // That's the pointer we'll put into the lookup table.
    ThreadIdAndValue* inserted = &data_[insertion_index];

    // We'll use a null ThreadIdAndValue pointer in a compare-and-swap loop.
    ThreadIdAndValue* empty = nullptr;

    // Now we have to find an insertion point into the lookup table. We start
    // from the `idx` that was identified as an insertion point above; it's
    // guaranteed that we will find an empty record somewhere in the lookup
    // table (because we reserved a record in `data_`).
    const int insertion_idx = idx;

    do {
      // Always start the search from the original insertion candidate, and
      // reset `empty`: a failed compare-and-swap writes the observed value
      // back into it.
      empty = nullptr;
      idx = insertion_idx;
      while (ptr_[idx].load() != nullptr) {
        idx += 1;
        if (idx >= capacity_) idx -= capacity_;
        // If we did a full loop, it means that we don't have any free entries
        // in the lookup table, and this means that something is terribly wrong.
        eigen_assert(idx != insertion_idx);
      }
      // Atomic CAS of the pointer guarantees that any other thread that
      // follows this pointer will see all the mutations in `data_`.
    } while (!ptr_[idx].compare_exchange_weak(empty, inserted));

    return inserted->value;
  }

  // WARN: It's not thread-safe to call this concurrently with `local()`.
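  //
  // A typical use is to aggregate per-thread values after all worker threads
  // have finished. An illustrative sketch, reusing the `Counter` example from
  // the class comment:
  //
  //   int total = 0;
  //   counter.ForEach(
  //       [&total](std::thread::id, Counter& cnt) { total += cnt.value; });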
  void ForEach(std::function<void(std::thread::id, T&)> f) {
    // Reading directly from `data_` is unsafe, because only CAS to the
    // record in `ptr_` makes all changes visible to other threads.
    for (auto& ptr : ptr_) {
      ThreadIdAndValue* record = ptr.load();
      if (record == nullptr) continue;
      f(record->thread_id, record->value);
    }

    // We did not spill into the map-based storage.
    if (filled_records_.load(std::memory_order_relaxed) < capacity_) return;

    // Adds a happens-before edge from the last call to SpilledLocal().
    std::unique_lock<std::mutex> lock(mu_);
    for (auto& kv : per_thread_map_) {
      f(kv.first, kv.second);
    }
  }

  // WARN: It's not thread-safe to call this concurrently with `local()`.
  ~ThreadLocal() {
    // Reading directly from `data_` is unsafe, because only CAS to the record
    // in `ptr_` makes all changes visible to other threads.
    for (auto& ptr : ptr_) {
      ThreadIdAndValue* record = ptr.load();
      if (record == nullptr) continue;
      release_(record->value);
    }

    // We did not spill into the map-based storage.
    if (filled_records_.load(std::memory_order_relaxed) < capacity_) return;

    // Adds a happens-before edge from the last call to SpilledLocal().
    std::unique_lock<std::mutex> lock(mu_);
    for (auto& kv : per_thread_map_) {
      release_(kv.second);
    }
  }

 private:
  struct ThreadIdAndValue {
    std::thread::id thread_id;
    T value;
  };

  // Use an unordered map guarded by a mutex when the lock-free storage is
  // full.
  T& SpilledLocal(std::thread::id this_thread) {
    std::unique_lock<std::mutex> lock(mu_);

    auto it = per_thread_map_.find(this_thread);
    if (it == per_thread_map_.end()) {
      auto result = per_thread_map_.emplace(this_thread, T());
      eigen_assert(result.second);
      initialize_((*result.first).second);
      return (*result.first).second;
    } else {
      return it->second;
    }
  }

  Initialize initialize_;
  Release release_;
  const int capacity_;

  // Storage that backs the lock-free lookup table `ptr_`. Records are stored
  // in this storage contiguously, starting from index 0.
  MaxSizeVector<ThreadIdAndValue> data_;

  // Atomic pointers to the data stored in `data_`. Used as a lookup table for
  // a linear-probing hash map (https://en.wikipedia.org/wiki/Linear_probing).
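  //
  // For example (illustrative numbers): with `capacity_ == 8` and a thread id
  // that hashes to 13, probing starts at slot 13 % 8 == 5 and continues
  // through slots 6, 7, 0, 1, ... wrapping around until an empty slot or a
  // matching record is found.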
  MaxSizeVector<std::atomic<ThreadIdAndValue*>> ptr_;

  // Number of records stored in `data_`.
  std::atomic<int> filled_records_;

  // We fall back on the per-thread map if the lock-free storage is full. In
  // practice this should never happen if `capacity_` is a reasonable estimate
  // of the number of threads running in the system.
  std::mutex mu_;  // Protects per_thread_map_.
  std::unordered_map<std::thread::id, T> per_thread_map_;
};

}  // namespace Eigen

#endif  // EIGEN_CXX11_THREADPOOL_THREAD_LOCAL_H