Commit 028b8ea7 authored by Kenton Varda's avatar Kenton Varda

Futex-based mutex and once-init implementations for Linux. These are measurably…

Futex-based mutex and once-init implementations for Linux.  These are measurably faster -- and much smaller -- than the pthread-based implementations.
parent 68733fb7
......@@ -24,9 +24,147 @@
#include "mutex.h"
#include "debug.h"
#if KJ_USE_FUTEX
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/futex.h>
#include <limits.h>
#endif
namespace kj {
namespace _ { // private
#if KJ_USE_FUTEX
// =======================================================================================
// Futex-based implementation (Linux-only)
// A freshly constructed mutex has an all-zero state word: not held exclusively, no exclusive
// request pending, and a reader count of zero.
Mutex::Mutex()
    : futex(0) {}
Mutex::~Mutex() {
  // Any non-zero state word means the mutex is still in use.  Destroying it in that condition
  // is going to go badly regardless, so at least fail with a descriptive message.
  KJ_ASSERT(futex == 0, "Mutex destroyed while locked.") {
    break;
  }
}
// Acquires the mutex in the requested mode, blocking on the futex word until it is available.
//
// EXCLUSIVE: succeeds only by atomically changing the state word from 0 (no holder, no readers,
// no pending request) to EXCLUSIVE_HELD.
// SHARED: increments the reader count immediately, then waits until EXCLUSIVE_HELD is not set.
void Mutex::lock(Exclusivity exclusivity) {
switch (exclusivity) {
case EXCLUSIVE:
for (;;) {
// Expected value for the compare-exchange below.  On failure the builtin overwrites it with
// the value actually observed in `futex`.
uint state = 0;
if (KJ_LIKELY(__atomic_compare_exchange_n(&futex, &state, EXCLUSIVE_HELD, false,
__ATOMIC_ACQUIRE, __ATOMIC_RELAXED))) {
// Acquired.
break;
}
// The mutex is contended. Set the exclusive-requested bit and wait.
if ((state & EXCLUSIVE_REQUESTED) == 0) {
if (!__atomic_compare_exchange_n(&futex, &state, state | EXCLUSIVE_REQUESTED, false,
__ATOMIC_RELAXED, __ATOMIC_RELAXED)) {
// Oops, the state changed before we could set the request bit. Start over.
continue;
}
// Keep the local copy in sync with what we just stored, since it is the value passed to
// FUTEX_WAIT below.
state |= EXCLUSIVE_REQUESTED;
}
// Per futex(2), FUTEX_WAIT only sleeps if the word still equals `state`; otherwise it returns
// immediately.  Either way we loop around and retry the acquisition from scratch.
syscall(SYS_futex, &futex, FUTEX_WAIT_PRIVATE, state, NULL, NULL, 0);
}
break;
case SHARED: {
// Register as a reader unconditionally; `state` is the value of the word after the increment.
// NOTE(review): only EXCLUSIVE_HELD is checked below, not EXCLUSIVE_REQUESTED, so new readers
// do not appear to yield to a waiting writer -- confirm this is intended.
uint state = __atomic_add_fetch(&futex, 1, __ATOMIC_ACQUIRE);
for (;;) {
if (KJ_LIKELY((state & EXCLUSIVE_HELD) == 0)) {
// Acquired.
break;
}
// The mutex is exclusively locked by another thread. Since we incremented the counter
// already, we just have to wait for it to be unlocked.
syscall(SYS_futex, &futex, FUTEX_WAIT_PRIVATE, state, NULL, NULL, 0);
// Re-read the word after waking (or after an immediate return) and re-check.
state = __atomic_load_n(&futex, __ATOMIC_ACQUIRE);
}
break;
}
}
}
// Releases a lock previously taken with lock() in the same mode, waking waiters via FUTEX_WAKE
// when the prior state indicates that some may exist.
void Mutex::unlock(Exclusivity exclusivity) {
switch (exclusivity) {
case EXCLUSIVE: {
// NOTE(review): this check reads `futex` with a plain (non-atomic) load; presumably
// KJ_DASSERT is a debug-only assertion -- confirm.
KJ_DASSERT(futex & EXCLUSIVE_HELD, "Unlocked a mutex that wasn't locked.");
// Clear both the held bit and the requested bit in a single atomic step, leaving only the
// reader count.  `oldState` is the value of the word before the clear.
uint oldState = __atomic_fetch_and(
&futex, ~(EXCLUSIVE_HELD | EXCLUSIVE_REQUESTED), __ATOMIC_RELEASE);
// Any bit other than EXCLUSIVE_HELD in the old state (a non-zero reader count or the
// requested bit) means there may be threads blocked on the futex.
if (KJ_UNLIKELY(oldState & ~EXCLUSIVE_HELD)) {
// Other threads are waiting. If there are any shared waiters, they now collectively hold
// the lock, and we must wake them up. If there are any exclusive waiters, we must wake
// them up even if readers are waiting so that at the very least they may re-establish the
// EXCLUSIVE_REQUESTED bit that we just removed.
syscall(SYS_futex, &futex, FUTEX_WAKE_PRIVATE, INT_MAX, NULL, NULL, 0);
}
break;
}
case SHARED: {
KJ_DASSERT(futex & SHARED_COUNT_MASK, "Unshared a mutex that wasn't shared.");
// Drop our reader reference; `state` is the value of the word after the decrement.
uint state = __atomic_sub_fetch(&futex, 1, __ATOMIC_RELEASE);
// The only case where anyone is waiting is if EXCLUSIVE_REQUESTED is set, and the only time
// it makes sense to wake up that waiter is if the shared count has reached zero.
if (KJ_UNLIKELY(state == EXCLUSIVE_REQUESTED)) {
// Try to clear the request bit.  If the word changed in the meantime the exchange fails and
// no wake-up is issued by this thread.
if (__atomic_compare_exchange_n(
&futex, &state, 0, false, __ATOMIC_RELAXED, __ATOMIC_RELAXED)) {
// Wake all exclusive waiters. We have to wake all of them because one of them will
// grab the lock while the others will re-establish the exclusive-requested bit.
syscall(SYS_futex, &futex, FUTEX_WAKE_PRIVATE, INT_MAX, NULL, NULL, 0);
}
}
break;
}
}
}
// Runs `init` if no thread has started initialization yet; otherwise blocks until the state
// word reaches INITIALIZED.
//
// The thread that wins the UNINITIALIZED -> INITIALIZING exchange calls init.run() and then
// publishes INITIALIZED.  Every other caller marks the word INITIALIZING_WITH_WAITERS (if not
// already marked) and sleeps on the futex until the state changes.
void Once::runOnce(Initializer& init) {
// Expected value for the compare-exchange; overwritten with the observed value on failure.
uint state = UNINITIALIZED;
if (__atomic_compare_exchange_n(&futex, &state, INITIALIZING, false,
__ATOMIC_RELAXED, __ATOMIC_RELAXED)) {
// It's our job to initialize!
// NOTE(review): if init.run() exits via an exception, the exchange below is skipped and the
// word stays at INITIALIZING / INITIALIZING_WITH_WAITERS, so waiters are never woken --
// confirm whether Initializer::run() can throw.
init.run();
// Publish completion with release semantics.  The previous value tells us whether any thread
// registered itself as a waiter while we were initializing.
if (__atomic_exchange_n(&futex, INITIALIZED, __ATOMIC_RELEASE) ==
INITIALIZING_WITH_WAITERS) {
// Someone was waiting for us to finish.
syscall(SYS_futex, &futex, FUTEX_WAKE_PRIVATE, INT_MAX, NULL, NULL, 0);
}
} else {
// Another thread got there first.  `state` now holds the value observed by the failed
// exchange above.
for (;;) {
if (state == INITIALIZED) {
break;
} else if (state == INITIALIZING) {
// Initialization is taking place in another thread. Indicate that we're waiting.
// This is a weak compare-exchange (`true`), so it may also fail spuriously; either way we
// retry with the freshly observed state.
if (!__atomic_compare_exchange_n(&futex, &state, INITIALIZING_WITH_WAITERS, true,
__ATOMIC_RELAXED, __ATOMIC_RELAXED)) {
// State changed, retry.
continue;
}
}
// Wait for initialization.
// Per futex(2), this sleeps only while the word still equals INITIALIZING_WITH_WAITERS; if
// it has already changed, the call returns immediately.
syscall(SYS_futex, &futex, FUTEX_WAIT_PRIVATE, INITIALIZING_WITH_WAITERS, NULL, NULL, 0);
state = __atomic_load_n(&futex, __ATOMIC_ACQUIRE);
}
// The docs for __atomic_compare_exchange_n claim that the memmodel for the failure case cannot
// be stronger than the success case. That's disappointing, because what we really want is
// for the two cmpxchg calls above to do an acquire barrier in the failure case only, while
// being relaxed if successful, so that once the state is INITIALIZED we know we've acquired
// it. Oh well, we'll just do an acquire barrier on the way out instead.
KJ_ASSERT(__atomic_load_n(&futex, __ATOMIC_ACQUIRE) == INITIALIZED);
}
}
#else
// =======================================================================================
// Generic pthreads-based implementation
#define KJ_PTHREAD_CALL(code) \
{ \
int pthreadError = code; \
......@@ -85,5 +223,7 @@ void Once::runOnce(Initializer& init) {
__atomic_store_n(&initialized, true, __ATOMIC_RELEASE);
}
#endif
} // namespace _ (private)
} // namespace kj
......@@ -26,11 +26,15 @@
#include "memory.h"
// For now, we use pthreads.
// TODO(someday): On Linux, use raw futexes. pthreads are bloated with optional features and
// debugging bookkeeping that aren't worth the cost. A mutex should be four bytes, not forty,
// and uncontended operations should be entirely inline!
// Default to the futex-based implementation on Linux.  The guard tests the same macro that it
// defines, so a build that sets KJ_USE_FUTEX explicitly (for example to 0, to force the pthread
// implementation on Linux) is respected rather than being redefined to 1.
#if __linux__ && !defined(KJ_USE_FUTEX)
#define KJ_USE_FUTEX 1
#endif
#if !KJ_USE_FUTEX
// On Linux we use futex. On other platforms we wrap pthreads.
// TODO(someday): Write efficient low-level locking primitives for other platforms.
#include <pthread.h>
#endif
namespace kj {
......@@ -56,15 +60,33 @@ public:
void unlock(Exclusivity exclusivity);
private:
#if KJ_USE_FUTEX
// The entire lock state packed into one 32-bit word.  Its address is what lock()/unlock() pass
// to the futex wait/wake system calls.
uint futex;
// bit 31 (msb) = set if exclusive lock held
// bit 30 = set if threads are waiting for exclusive lock
// bits 0-29 = count of readers; If an exclusive lock is held, this is the count of threads
// waiting for a read lock, otherwise it is the count of threads that currently hold a read
// lock.
static constexpr uint EXCLUSIVE_HELD = 1u << 31;
static constexpr uint EXCLUSIVE_REQUESTED = 1u << 30;
// All bits below EXCLUSIVE_REQUESTED, i.e. bits 0-29 (the reader count).
static constexpr uint SHARED_COUNT_MASK = EXCLUSIVE_REQUESTED - 1;
#else
// Non-futex build: state lives in a pthread read/write lock.
mutable pthread_rwlock_t mutex;
#endif
};
class Once {
// Internal implementation details. See `Lazy<T>`.
public:
#if KJ_USE_FUTEX
// Futex build: construction just sets the state word to UNINITIALIZED, so the constructor is
// inline and no destructor is declared.
inline Once() : futex(UNINITIALIZED) {}
#else
// Non-futex build: constructor and destructor are only declared here.
Once();
~Once();
#endif
KJ_DISALLOW_COPY(Once);
class Initializer {
......@@ -76,12 +98,26 @@ public:
inline bool isInitialized() noexcept {
  // Cheap test that lets callers skip runOnce() entirely once initialization has completed.
  // The load uses acquire ordering in both build variants.
#if KJ_USE_FUTEX
  const uint current = __atomic_load_n(&futex, __ATOMIC_ACQUIRE);
  return current == INITIALIZED;
#else
  const bool done = __atomic_load_n(&initialized, __ATOMIC_ACQUIRE);
  return done;
#endif
}
private:
#if KJ_USE_FUTEX
// State word for the futex build; always holds one of the four constants below.
uint futex;
// No thread has started initialization yet (the value the constructor stores).
static constexpr uint UNINITIALIZED = 0;
// Some thread is running the initializer and no other thread has registered as waiting.
static constexpr uint INITIALIZING = 1;
// Some thread is running the initializer and at least one other thread has registered as
// waiting, so the initializing thread must issue a futex wake when it finishes.
static constexpr uint INITIALIZING_WITH_WAITERS = 2;
// Initialization has completed; isInitialized() returns true in this state.
static constexpr uint INITIALIZED = 3;
#else
// Non-futex build: a completion flag (read atomically by isInitialized()) plus a pthread mutex.
bool initialized;
pthread_mutex_t mutex;
#endif
};
} // namespace _ (private)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment