capnproto / Commits / 91acb5b2

Commit 91acb5b2, authored Nov 30, 2013 by Kenton Varda

Make kj::Arena not thread-safe since it hurts performance even when used single-threaded.

Parent: c5bed0d2
Showing 4 changed files, with 72 additions and 182 deletions:

  c++/src/capnp/compiler/compiler.c++   +4   -5
  c++/src/kj/arena-test.c++             +0   -66
  c++/src/kj/arena.c++                  +41  -73
  c++/src/kj/arena.h                    +27  -38
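Note: the practical effect of this commit is that kj::Arena's allocation methods are no longer const and no longer use any locking or atomics, so an Arena must be used from a single thread (or guarded externally). A minimal usage sketch against the post-commit API; the function name and values are illustrative, not taken from the repository:

// Sketch only: basic single-threaded use of kj::Arena after this change.
#include "kj/arena.h"

void buildSomething() {
  kj::Arena arena;  // default chunkSizeHint = 1024; chunks grow geometrically as needed

  // Values live until the Arena itself is destroyed; there is no per-object free.
  int& counter = arena.allocate<int>(0);
  kj::StringPtr name = arena.copyString("example");
  kj::ArrayPtr<char> scratch = arena.allocateArray<char>(name.size() + 8);

  // ... use counter, name, scratch ...
  (void)counter; (void)scratch;
}  // The Arena destructor frees all chunks and runs any registered destructors
   // in reverse order of allocation.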
c++/src/capnp/compiler/compiler.c++
@@ -185,8 +185,7 @@ private:
   // Extract the ID from the declaration, or if it has none, generate one based on the name and
   // parent ID.

-  static kj::StringPtr joinDisplayName(
-      const kj::Arena& arena, Node& parent, kj::StringPtr declName);
+  static kj::StringPtr joinDisplayName(kj::Arena& arena, Node& parent, kj::StringPtr declName);
   // Join the parent's display name with the child's unqualified name to construct the child's
   // display name.
@@ -274,10 +273,10 @@ public:
         bootstrapLoader(loaderCallback) {}
   };

-  const kj::Arena& getNodeArena() { return nodeArena; }
+  kj::Arena& getNodeArena() { return nodeArena; }
   // Arena where nodes and other permanent objects should be allocated.

-  const Workspace& getWorkspace() { return workspace; }
+  Workspace& getWorkspace() { return workspace; }
   // Temporary workspace that can be used to construct bootstrap objects.

   inline bool shouldCompileAnnotations() {
@@ -395,7 +394,7 @@ uint64_t Compiler::Node::generateId(uint64_t parentId, kj::StringPtr declName,
 }

 kj::StringPtr Compiler::Node::joinDisplayName(
-    const kj::Arena& arena, Node& parent, kj::StringPtr declName) {
+    kj::Arena& arena, Node& parent, kj::StringPtr declName) {
   kj::ArrayPtr<char> result = arena.allocateArray<char>(
       parent.displayName.size() + declName.size() + 2);
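The compiler.c++ edits above are purely mechanical fallout: since Arena's methods are no longer const, any signature that took const kj::Arena& must now take kj::Arena&. A hypothetical caller written against the new API (not code from the repository) looks like:

// Hypothetical caller: the arena must now be passed by non-const reference because
// allocateArray() and friends mutate the Arena.
#include <string.h>
#include "kj/arena.h"

kj::StringPtr joinWithDot(kj::Arena& arena, kj::StringPtr prefix, kj::StringPtr suffix) {
  kj::ArrayPtr<char> buf = arena.allocateArray<char>(prefix.size() + suffix.size() + 2);
  char* pos = buf.begin();
  memcpy(pos, prefix.cStr(), prefix.size());  pos += prefix.size();
  *pos++ = '.';
  memcpy(pos, suffix.cStr(), suffix.size());  pos += suffix.size();
  *pos = '\0';
  return kj::StringPtr(buf.begin(), buf.size() - 1);
}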
c++/src/kj/arena-test.c++
@@ -23,7 +23,6 @@
 #include "arena.h"
 #include "debug.h"
-#include "thread.h"
 #include <gtest/gtest.h>
 #include <stdint.h>
@@ -307,70 +306,5 @@ TEST(Arena, Strings) {
   EXPECT_EQ(quux.end() + 1, corge.begin());
 }

-struct ThreadTestObject {
-  ThreadTestObject* next;
-  void* owner;  // points into the owning thread's stack
-
-  ThreadTestObject(ThreadTestObject* next, void* owner)
-      : next(next), owner(owner) {}
-  ~ThreadTestObject() { ++destructorCount; }
-
-  static uint destructorCount;
-};
-uint ThreadTestObject::destructorCount = 0;
-
-TEST(Arena, Threads) {
-  // Test thread-safety.  We allocate objects in four threads simultaneously, verify that they
-  // are not corrupted, then verify that their destructors are all called when the Arena is
-  // destroyed.
-
-  {
-    MutexGuarded<Arena> arena;
-
-    // Func to run in each thread.
-    auto threadFunc = [&]() {
-      int me;
-      ThreadTestObject* head = nullptr;
-
-      {
-        auto lock = arena.lockShared();
-
-        // Allocate a huge linked list.
-        for (uint i = 0; i < 100000; i++) {
-          head = &lock->allocate<ThreadTestObject>(head, &me);
-        }
-      }
-
-      // Wait until all other threads are done before verifying.
-      arena.lockExclusive();
-
-      // Verify that the list hasn't been corrupted.
-      while (head != nullptr) {
-        ASSERT_EQ(&me, head->owner);
-        head = head->next;
-      }
-    };
-
-    {
-      auto lock = arena.lockExclusive();
-      Thread thread1(threadFunc);
-      Thread thread2(threadFunc);
-      Thread thread3(threadFunc);
-      Thread thread4(threadFunc);
-
-      // Wait for threads to be ready.
-      usleep(10000);
-
-      auto release = kj::mv(lock);
-      // As we go out of scope, the lock will be released (since `release` is destroyed first),
-      // allowing all the threads to start running.  We'll then join each thread.
-    }
-
-    EXPECT_EQ(0u, ThreadTestObject::destructorCount);
-  }
-  EXPECT_EQ(400000u, ThreadTestObject::destructorCount);
-}
-
 }  // namespace
 }  // namespace kj
c++/src/kj/arena.c++
@@ -27,10 +27,10 @@
 namespace kj {

-Arena::Arena(size_t chunkSizeHint): state(kj::max(sizeof(ChunkHeader), chunkSizeHint)) {}
+Arena::Arena(size_t chunkSizeHint): nextChunkSize(kj::max(sizeof(ChunkHeader), chunkSizeHint)) {}

 Arena::Arena(ArrayPtr<byte> scratch)
-    : state(kj::max(sizeof(ChunkHeader), scratch.size())) {
+    : nextChunkSize(kj::max(sizeof(ChunkHeader), scratch.size())) {
   if (scratch.size() > sizeof(ChunkHeader)) {
     ChunkHeader* chunk = reinterpret_cast<ChunkHeader*>(scratch.begin());
     chunk->end = scratch.end();
@@ -39,19 +39,19 @@ Arena::Arena(ArrayPtr<byte> scratch)
     // Don't place the chunk in the chunk list because it's not ours to delete.  Just make it the
     // current chunk so that we'll allocate from it until it is empty.
-    state.getWithoutLock().currentChunk = chunk;
+    currentChunk = chunk;
   }
 }

 Arena::~Arena() noexcept(false) {
-  // Run cleanup() explicitly.  It will be executed again implicitly when state's destructor is
-  // called.  This ensures that if the first pass throws an exception, remaining objects are still
-  // destroyed.  If the second pass throws, the program terminates, but any destructors that could
-  // throw should be using UnwindDetector to avoid this.
-  state.getWithoutLock().cleanup();
+  // Run cleanup() explicitly, but if it throws an exception, make sure to run it again as part of
+  // unwind.  The second call will not throw because destructors are required to guard against
+  // exceptions when already unwinding.
+  KJ_ON_SCOPE_FAILURE(cleanup());
+  cleanup();
 }

-void Arena::State::cleanup() {
+void Arena::cleanup() {
   while (objectList != nullptr) {
     void* ptr = objectList + 1;
     auto destructor = objectList->destructor;
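The rewritten destructor above keeps the old guarantee (remaining objects still get destroyed even if one destructor throws) but uses an explicit scope guard instead of relying on State's own destructor. A rough illustration of the idiom behind KJ_ON_SCOPE_FAILURE, hand-rolled here rather than taken from kj:

// Illustration only: if the first cleanup() throws, run it once more during unwind so the
// remaining objects are still destroyed.  The second pass is expected not to throw.
#include <exception>

class ArenaLike {
public:
  ~ArenaLike() noexcept(false) {
    struct RetryOnUnwind {
      ArenaLike& self;
      ~RetryOnUnwind() { if (std::uncaught_exception()) self.cleanup(); }
    } retry{*this};
    cleanup();  // may throw; the guard above re-runs it while unwinding
  }

private:
  void cleanup() {
    // Pop each object off the list *before* destroying it, so a repeated call simply
    // resumes with whatever is left.
  }
};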
@@ -91,17 +91,13 @@ inline size_t alignTo(size_t s, uint alignment) {
 }  // namespace

-void* Arena::allocateBytes(size_t amount, uint alignment, bool hasDisposer) const {
+void* Arena::allocateBytes(size_t amount, uint alignment, bool hasDisposer) {
   if (hasDisposer) {
     alignment = kj::max(alignment, alignof(ObjectHeader));
     amount += alignTo(sizeof(ObjectHeader), alignment);
   }

-  void* result = allocateBytesLockless(amount, alignment);
-
-  if (result == nullptr) {
-    result = allocateBytesFallback(amount, alignment);
-  }
+  void* result = allocateBytesInternal(amount, alignment);

   if (hasDisposer) {
     // Reserve space for the ObjectHeader, but don't add it to the object list yet.
@@ -112,90 +108,62 @@ void* Arena::allocateBytes(size_t amount, uint alignment, bool hasDisposer) const
   return result;
 }

-void* Arena::allocateBytesLockless(size_t amount, uint alignment) const {
-  for (;;) {
-    ChunkHeader* chunk = __atomic_load_n(&state.getWithoutLock().currentChunk, __ATOMIC_ACQUIRE);
-    if (chunk == nullptr) {
-      // No chunks allocated yet.
-      return nullptr;
-    }
-
-    byte* pos = __atomic_load_n(&chunk->pos, __ATOMIC_RELAXED);
-    byte* alignedPos = alignTo(pos, alignment);
-    byte* endPos = alignedPos + amount;
-
-    // Careful about pointer wrapping (e.g. if the chunk is near the end of the address space).
-    if (chunk->end - endPos < 0) {
-      // Not enough space.
-      return nullptr;
-    }
-
-    // There appears to be enough space in this chunk, unless another thread stole it.
-    if (KJ_LIKELY(__atomic_compare_exchange_n(
-        &chunk->pos, &pos, endPos, true, __ATOMIC_RELAXED, __ATOMIC_RELAXED))) {
-      return alignedPos;
-    }
-
-    // Contention.  Retry.
-  }
-}
-
-void* Arena::allocateBytesFallback(size_t amount, uint alignment) const {
-  auto lock = state.lockExclusive();
-
-  // Now that we have the lock, try one more time to allocate from the current chunk.  This could
-  // work if another thread allocated a new chunk while we were waiting for the lock.
-  void* locklessResult = allocateBytesLockless(amount, alignment);
-  if (locklessResult != nullptr) {
-    return locklessResult;
-  }
-
-  // OK, we know the current chunk is out of space and we hold the lock so no one else is
-  // allocating a new one.  Let's do it!
+void* Arena::allocateBytesInternal(size_t amount, uint alignment) {
+  if (currentChunk != nullptr) {
+    ChunkHeader* chunk = currentChunk;
+    byte* alignedPos = alignTo(chunk->pos, alignment);
+
+    // Careful about overflow here.
+    if (amount + (alignedPos - chunk->pos) <= chunk->end - chunk->pos) {
+      // There's enough space in this chunk.
+      chunk->pos = alignedPos + amount;
+      return alignedPos;
+    }
+  }
+
+  // Not enough space in the current chunk.  Allocate a new one.

   // We need to allocate at least enough space for the ChunkHeader and the requested allocation.
+
   // If the alignment is less than that of the chunk header, we'll need to increase it.
   alignment = kj::max(alignment, alignof(ChunkHeader));
+
   // If the ChunkHeader size does not match the alignment, we'll need to pad it up.
   amount += alignTo(sizeof(ChunkHeader), alignment);

-  while (lock->nextChunkSize < amount) {
-    lock->nextChunkSize *= 2;
+  // Make sure we're going to allocate enough space.
+  while (nextChunkSize < amount) {
+    nextChunkSize *= 2;
   }

-  byte* bytes = reinterpret_cast<byte*>(operator new(lock->nextChunkSize));
+  // Allocate.
+  byte* bytes = reinterpret_cast<byte*>(operator new(nextChunkSize));

+  // Set up the ChunkHeader at the beginning of the allocation.
   ChunkHeader* newChunk = reinterpret_cast<ChunkHeader*>(bytes);
-  newChunk->next = lock->chunkList;
+  newChunk->next = chunkList;
   newChunk->pos = bytes + amount;
-  newChunk->end = bytes + lock->nextChunkSize;
-  __atomic_store_n(&lock->currentChunk, newChunk, __ATOMIC_RELEASE);
-
-  lock->nextChunkSize *= 2;
-
-  byte* result = alignTo(bytes + sizeof(ChunkHeader), alignment);
-  lock->chunkList = newChunk;
-
-  return result;
+  newChunk->end = bytes + nextChunkSize;
+  currentChunk = newChunk;
+  chunkList = newChunk;
+  nextChunkSize *= 2;
+
+  // Move past the ChunkHeader to find the position of the allocated object.
+  return alignTo(bytes + sizeof(ChunkHeader), alignment);
 }

-StringPtr Arena::copyString(StringPtr content) const {
+StringPtr Arena::copyString(StringPtr content) {
   char* data = reinterpret_cast<char*>(allocateBytes(content.size() + 1, 1, false));
   memcpy(data, content.cStr(), content.size() + 1);
   return StringPtr(data, content.size());
 }

-void Arena::setDestructor(void* ptr, void (*destructor)(void*)) const {
+void Arena::setDestructor(void* ptr, void (*destructor)(void*)) {
   ObjectHeader* header = reinterpret_cast<ObjectHeader*>(ptr) - 1;
   KJ_DASSERT(reinterpret_cast<uintptr_t>(header) % alignof(ObjectHeader) == 0);
   header->destructor = destructor;
-  header->next = state.getWithoutLock().objectList;
-
-  // We can use relaxed atomics here because the object list is not actually traversed until the
-  // destructor, which needs to be synchronized in its own way.
-  while (!__atomic_compare_exchange_n(
-      &state.getWithoutLock().objectList, &header->next, header, true,
-      __ATOMIC_RELAXED, __ATOMIC_RELAXED)) {
-    // Retry.
-  }
+  header->next = objectList;
+  objectList = header;
 }

 }  // namespace kj
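For intuition, the new single-threaded fast path in allocateBytesInternal() is an ordinary bump allocator: align the current position, check the remaining space in a form that cannot run past the chunk end, and bump. A standalone sketch of that arithmetic (illustrative only, not the kj code):

// Sketch of the bump-allocation check used by allocateBytesInternal().
// `pos` and `end` delimit the free space of the current chunk.
#include <cstddef>
#include <cstdint>

inline char* alignUp(char* p, size_t alignment) {
  return reinterpret_cast<char*>(
      (reinterpret_cast<uintptr_t>(p) + alignment - 1) & ~uintptr_t(alignment - 1));
}

char* tryBumpAllocate(char*& pos, char* end, size_t amount, size_t alignment) {
  char* aligned = alignUp(pos, alignment);
  // Compare sizes, not raw end pointers: `aligned + amount` could wrap past the end of the
  // address space, so test amount plus alignment padding against the space actually remaining.
  if (amount + size_t(aligned - pos) <= size_t(end - pos)) {
    pos = aligned + amount;   // bump
    return aligned;           // aligned start of the new object
  }
  return nullptr;             // caller must start a new chunk (doubling nextChunkSize)
}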
c++/src/kj/arena.h
@@ -27,7 +27,6 @@
 #include "memory.h"
 #include "array.h"
 #include "string.h"
-#include "mutex.h"

 namespace kj {
@@ -35,9 +34,10 @@ class Arena {
   // A class which allows several objects to be allocated in contiguous chunks of memory, then
   // frees them all at once.
   //
-  // Allocating from the same Arena in multiple threads concurrently is safe but not particularly
-  // performant due to contention.  The class could be optimized in the future to use per-thread
-  // chunks to solve this.
+  // Allocating from the same Arena in multiple threads concurrently is NOT safe, because making
+  // it safe would require atomic operations that would slow down allocation even when
+  // single-threaded.  If you need to use arena allocation in a multithreaded context, consider
+  // allocating thread-local arenas.

 public:
   explicit Arena(size_t chunkSizeHint = 1024);
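Under the new contract, concurrent use needs either external synchronization (for example wrapping the arena in kj::MutexGuarded, as the deleted test did) or, as the new comment recommends, one arena per thread. A sketch of the per-thread approach using kj::Thread, with illustrative function names:

// Sketch: give each thread its own Arena instead of sharing one synchronized Arena.
#include "kj/arena.h"
#include "kj/thread.h"

void perThreadWork() {
  kj::Arena arena;  // owned exclusively by this thread: no locks, no atomics
  for (int i = 0; i < 1000; i++) {
    arena.allocate<int>(i);
  }
}  // everything allocated above is freed when this thread's arena goes away

void runWorkers() {
  kj::Thread t1([]() { perThreadWork(); });
  kj::Thread t2([]() { perThreadWork(); });
}  // kj::Thread joins in its destructor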
@@ -52,20 +52,20 @@ public:
   ~Arena() noexcept(false);

   template <typename T, typename... Params>
-  T& allocate(Params&&... params) const;
+  T& allocate(Params&&... params);
   template <typename T>
-  ArrayPtr<T> allocateArray(size_t size) const;
+  ArrayPtr<T> allocateArray(size_t size);
   // Allocate an object or array of type T.  If T has a non-trivial destructor, that destructor
   // will be run during the Arena's destructor.  Such destructors are run in opposite order of
   // allocation.  Note that these methods must maintain a list of destructors to call, which has
   // overhead, but this overhead only applies if T has a non-trivial destructor.

   template <typename T, typename... Params>
-  Own<T> allocateOwn(Params&&... params) const;
+  Own<T> allocateOwn(Params&&... params);
   template <typename T>
-  Array<T> allocateOwnArray(size_t size) const;
+  Array<T> allocateOwnArray(size_t size);
   template <typename T>
-  ArrayBuilder<T> allocateOwnArrayBuilder(size_t capacity) const;
+  ArrayBuilder<T> allocateOwnArrayBuilder(size_t capacity);
   // Allocate an object or array of type T.  Destructors are executed when the returned Own<T>
   // or Array<T> goes out-of-scope, which must happen before the Arena is destroyed.  This variant
   // is useful when you need to control when the destructor is called.  This variant also avoids
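The two families documented above differ only in when destructors run. A brief hedged example (Logger is a made-up type, not part of kj):

// Sketch: allocate<T>() defers destruction to the Arena; allocateOwn<T>() ties it to the Own<T>.
struct Logger {
  ~Logger() { /* flush, close, ... */ }
};

void destructorTiming(kj::Arena& arena) {
  Logger& a = arena.allocate<Logger>();      // ~Logger() for `a` runs when `arena` is destroyed,
                                             // in reverse order of allocation
  {
    kj::Own<Logger> b = arena.allocateOwn<Logger>();
    // ...
  }                                          // ~Logger() for `b` runs here, at end of scope
  (void)a;
}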
@@ -73,11 +73,11 @@ public:
   // slightly more efficient.

   template <typename T>
-  inline T& copy(T&& value) const { return allocate<Decay<T>>(kj::fwd<T>(value)); }
+  inline T& copy(T&& value) { return allocate<Decay<T>>(kj::fwd<T>(value)); }
   // Allocate a copy of the given value in the arena.  This is just a shortcut for calling the
   // type's copy (or move) constructor.

-  StringPtr copyString(StringPtr content) const;
+  StringPtr copyString(StringPtr content);
   // Make a copy of the given string inside the arena, and return a pointer to the copy.

 private:
@@ -91,37 +91,26 @@ private:
     ObjectHeader* next;
   };

-  struct State {
-    size_t nextChunkSize;
-    ChunkHeader* chunkList;
-    mutable ObjectHeader* objectList;
-
-    ChunkHeader* currentChunk;
-
-    inline State(size_t nextChunkSize)
-        : nextChunkSize(nextChunkSize), chunkList(nullptr),
-          objectList(nullptr), currentChunk(nullptr) {}
-    inline ~State() noexcept(false) { cleanup(); }
-
-    void cleanup();
-    // Run all destructors, leaving the above pointers null.  If a destructor throws, the State is
-    // left in a consistent state, such that if cleanup() is called again, it will pick up where
-    // it left off.
-  };
-  MutexGuarded<State> state;
+  size_t nextChunkSize;
+  ChunkHeader* chunkList = nullptr;
+  ObjectHeader* objectList = nullptr;
+
+  ChunkHeader* currentChunk = nullptr;
+
+  void cleanup();
+  // Run all destructors, leaving the above pointers null.  If a destructor throws, the State is
+  // left in a consistent state, such that if cleanup() is called again, it will pick up where
+  // it left off.

-  void* allocateBytes(size_t amount, uint alignment, bool hasDisposer) const;
+  void* allocateBytes(size_t amount, uint alignment, bool hasDisposer);
   // Allocate the given number of bytes.  `hasDisposer` must be true if `setDisposer()` may be
   // called on this pointer later.

-  void* allocateBytesLockless(size_t amount, uint alignment) const;
-  // Try to allocate the given number of bytes without taking a lock.  Fails if and only if there
-  // is no space left in the current chunk.
-
-  void* allocateBytesFallback(size_t amount, uint alignment) const;
-  // Fallback used when the current chunk is out of space.
+  void* allocateBytesInternal(size_t amount, uint alignment);

-  void setDestructor(void* ptr, void (*destructor)(void*)) const;
+  void setDestructor(void* ptr, void (*destructor)(void*));
   // Schedule the given destructor to be executed when the Arena is destroyed.  `ptr` must be a
   // pointer previously returned by an `allocateBytes()` call for which `hasDisposer` was true.
@@ -144,7 +133,7 @@ private:
 // Inline implementation details

 template <typename T, typename... Params>
-T& Arena::allocate(Params&&... params) const {
+T& Arena::allocate(Params&&... params) {
   T& result = *reinterpret_cast<T*>(allocateBytes(
       sizeof(T), alignof(T), !__has_trivial_destructor(T)));
   if (!__has_trivial_constructor(T) || sizeof...(Params) > 0) {
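Note that the !__has_trivial_destructor(T) argument here is what keeps the destructor list free for plain data: only types with non-trivial destructors pay for an ObjectHeader and a setDestructor() call. A small illustrative pair of types (both hypothetical):

// Sketch: which allocations register a destructor with the arena.
struct Pod { int x; };                      // trivial destructor: no ObjectHeader, no list entry
struct Handle { ~Handle() { /* ... */ } };  // non-trivial: allocateBytes(..., hasDisposer = true)
                                            // followed by setDestructor()

void demo(kj::Arena& arena) {
  Pod& p = arena.allocate<Pod>();        // plain bump allocation
  Handle& h = arena.allocate<Handle>();  // its destructor runs when `arena` is destroyed
  (void)p; (void)h;
}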
@@ -157,7 +146,7 @@ T& Arena::allocate(Params&&... params) const {
 }

 template <typename T>
-ArrayPtr<T> Arena::allocateArray(size_t size) const {
+ArrayPtr<T> Arena::allocateArray(size_t size) {
   if (__has_trivial_destructor(T)) {
     ArrayPtr<T> result = arrayPtr(reinterpret_cast<T*>(allocateBytes(
@@ -193,7 +182,7 @@ ArrayPtr<T> Arena::allocateArray(size_t size) const {
 }

 template <typename T, typename... Params>
-Own<T> Arena::allocateOwn(Params&&... params) const {
+Own<T> Arena::allocateOwn(Params&&... params) {
   T& result = *reinterpret_cast<T*>(allocateBytes(sizeof(T), alignof(T), false));
   if (!__has_trivial_constructor(T) || sizeof...(Params) > 0) {
     ctor(result, kj::fwd<Params>(params)...);
@@ -202,7 +191,7 @@ Own<T> Arena::allocateOwn(Params&&... params) const {
 }

 template <typename T>
-Array<T> Arena::allocateOwnArray(size_t size) const {
+Array<T> Arena::allocateOwnArray(size_t size) {
   ArrayBuilder<T> result = allocateOwnArrayBuilder<T>(size);
   for (size_t i = 0; i < size; i++) {
     result.add();
@@ -211,7 +200,7 @@ Array<T> Arena::allocateOwnArray(size_t size) const {
 }

 template <typename T>
-ArrayBuilder<T> Arena::allocateOwnArrayBuilder(size_t capacity) const {
+ArrayBuilder<T> Arena::allocateOwnArrayBuilder(size_t capacity) {
   return ArrayBuilder<T>(
       reinterpret_cast<T*>(allocateBytes(sizeof(T) * capacity, alignof(T), false)),
       capacity, DestructorOnlyArrayDisposer::instance);