Skip to content

Commit

Permalink
less indirection in StaticMeta::get
Browse files Browse the repository at this point in the history
Summary:
`ThreadLocalPtr::get` and `ThreadLocal::get` wrap `StaticMeta::get`.

We cache the static-meta's per-thread capacity and a pointer to the thread-entry in a thread-local cache, which `StaticMeta::get` looks at in the fast path to find the requested item. We currently get a pointer to the thread-vector from the thread-entry, and then get the item from the thread-vector.

But this is an indirection which presumably adds latency to the overall operation.

If we cache the static-meta's per-thread thread-vector as well, we eliminate this indirection and reduce the latency.

Differential Revision: D59246804

fbshipit-source-id: c3716eef84493dc8c8f859faae7a3af223d3c4eb
  • Loading branch information
yfeldblum authored and facebook-github-bot committed Jul 19, 2024
1 parent 80fdd93 commit 785c433
Show file tree
Hide file tree
Showing 2 changed files with 113 additions and 15 deletions.
17 changes: 17 additions & 0 deletions folly/detail/ThreadLocalDetail.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -208,6 +208,13 @@ void StaticMetaBase::cleanupThreadEntriesAndList(
// before issuing a delete.
DCHECK(tmp->meta->isThreadEntryRemovedFromAllInMap(tmp, true));

// Fail safe check to make sure that the ThreadEntry's local-caches are
// all cleared. Failure would indicate that something is using ThreadLocal
// instances for a given tag for the first time in a pthread-thread-specific
// destructor, such as in the destructor of some other ThreadLocal.
DCHECK(tmp->caches->tracking.rlock()->empty());

delete tmp->caches;
delete tmp;
}

Expand Down Expand Up @@ -421,6 +428,16 @@ void StaticMetaBase::reserve(EntryID* id) {

meta.totalElementWrappers_ += (newCapacity - prevCapacity);
free(reallocated);

threadEntry->caches->tracking.withRLock([&](auto& tracking) {
for (auto [_, cachep] : tracking) {
DCHECK(cachep);
DCHECK(!cachep->poison);
DCHECK_EQ(threadEntry, cachep->threadEntry);
cachep->capacity = newCapacity;
cachep->elements = threadEntry->elements;
}
});
}

/*
Expand Down
111 changes: 96 additions & 15 deletions folly/detail/ThreadLocalDetail.h
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,44 @@ struct ThreadEntryList;
* (under the lock).
*/
struct ThreadEntry {
struct LocalCache {
size_t capacity;
ElementWrapper* elements;
ThreadEntry* threadEntry;
bool poison;
};
static_assert(std::is_standard_layout_v<LocalCache>);
static_assert(std::is_trivial_v<LocalCache>);

struct LocalLifetime;

struct LocalSet {
using Map = std::unordered_map<LocalLifetime const*, LocalCache*>;
Synchronized<Map> tracking;
};

struct LocalLifetime {
LocalSet& caches;
explicit LocalLifetime(LocalSet& set, LocalCache& cache) noexcept
: caches{set} {
DCHECK(!cache.poison);
auto tracking = caches.tracking.wlock();
auto inserted = tracking->emplace(this, &cache).second;
DCHECK(inserted);
}
~LocalLifetime() {
auto tracking = caches.tracking.wlock();
auto it = tracking->find(this);
DCHECK(it != tracking->end());
DCHECK(it->second);
auto& cache = *it->second;
tracking->erase(it);
DCHECK(!cache.poison);
cache = {};
cache.poison = true;
}
};

ElementWrapper* elements{nullptr};
std::atomic<size_t> elementsCapacity{0};
ThreadEntryList* list{nullptr};
Expand All @@ -170,6 +208,7 @@ struct ThreadEntry {
bool removed_{false};
uint64_t tid_os{};
aligned_storage_for_t<std::thread::id> tid_data{};
LocalSet* caches{nullptr};

size_t getElementsCapacity() const noexcept {
return elementsCapacity.load(std::memory_order_relaxed);
Expand Down Expand Up @@ -604,15 +643,22 @@ struct FOLLY_EXPORT StaticMeta final : StaticMetaBase {
return detail::createGlobal<StaticMeta<Tag, AccessMode>, void>();
}

struct LocalCache {
ThreadEntry* threadEntry;
size_t capacity;
};
static_assert(std::is_standard_layout_v<LocalCache>);
static_assert(std::is_trivial_v<LocalCache>);
// All functions which participate in the local-cache/local-lifetime dance
// must be marked as hidden and as not-exported from the DSO or DLL. This
// covers all functions which may separately access the thread-locals or
// which are part of separately accessing the thread-locals. This does not
// cover a function which calls another function which separately accesses
// the thread-locals, though.

using LocalCache = ThreadEntry::LocalCache;
using LocalSet = ThreadEntry::LocalSet;
using LocalLifetime = ThreadEntry::LocalLifetime;

FOLLY_EXPORT FOLLY_ALWAYS_INLINE static LocalCache& getLocalCache() {
static thread_local LocalCache instance;
// Hidden: participates in the thread-local local-cache/local-lifetime dance.
FOLLY_ERASE static LocalCache& getLocalCache() {
constexpr auto align = // std::bit_ceil is c++20; nextPowTwo is !constexpr
size_t(1) << folly::constexpr_log2_ceil(sizeof(LocalCache));
/* library-local */ alignas(align) static thread_local LocalCache instance;
return instance;
}

Expand All @@ -621,11 +667,33 @@ struct FOLLY_EXPORT StaticMeta final : StaticMetaBase {
// cached fast path, leaving only one branch here and one indirection
// below.

ThreadEntry* te = getThreadEntry(ent);
ElementWrapper* vec = getThreadVector(ent);
uint32_t id = ent->getOrInvalid();
// Only valid index into the the elements array
DCHECK_NE(id, kEntryIDInvalid);
return te->elements[id];
return vec[id];
}

// Hidden: participates in the thread-local local-cache/local-lifetime dance.
FOLLY_ERASE static ElementWrapper* getThreadVector(EntryID* ent) {
if (!kUseThreadLocal) {
return getThreadEntrySlowReserve(ent)->elements;
}

// Eliminate as many branches and as much extra code as possible in the
// cached fast path, leaving only one branch here and one indirection below.
uint32_t id = ent->getOrInvalid();
auto& cache = getLocalCache();
if (FOLLY_UNLIKELY(cache.capacity <= id)) {
return getThreadVectorSlowReserveAndCache(ent, cache);
}
return cache.elements;
}

// Hidden: participates in the thread-local local-cache/local-lifetime dance.
FOLLY_ERASE_NOINLINE static ElementWrapper*
getThreadVectorSlowReserveAndCache(EntryID* ent, LocalCache& cache) {
return getSlowReserveAndCache(ent, cache)->elements;
}

/*
Expand All @@ -636,7 +704,8 @@ struct FOLLY_EXPORT StaticMeta final : StaticMetaBase {
* StaticMetaBase::allId2ThreadEntrySets_ updated with ThreadEntry* whenever a
* ThreadLocal is set/released.
*/
FOLLY_ALWAYS_INLINE static ThreadEntry* getThreadEntry(EntryID* ent) {
// Hidden: participates in the thread-local local-cache/local-lifetime dance.
FOLLY_ERASE static ThreadEntry* getThreadEntry(EntryID* ent) {
if (!kUseThreadLocal) {
return getThreadEntrySlowReserve(ent);
}
Expand All @@ -646,16 +715,26 @@ struct FOLLY_EXPORT StaticMeta final : StaticMetaBase {
uint32_t id = ent->getOrInvalid();
auto& cache = getLocalCache();
if (FOLLY_UNLIKELY(cache.capacity <= id)) {
getSlowReserveAndCache(ent, cache);
return getSlowReserveAndCache(ent, cache);
}
return cache.threadEntry;
}

FOLLY_NOINLINE static void getSlowReserveAndCache(
// Hidden: participates in the thread-local local-cache/local-lifetime dance.
FOLLY_ERASE_NOINLINE static ThreadEntry* getSlowReserveAndCache(
EntryID* ent, LocalCache& cache) {
auto threadEntry = getThreadEntrySlowReserve(ent);
cache.capacity = threadEntry->getElementsCapacity();
cache.threadEntry = threadEntry;
if (!cache.poison && !dying()) {
FOLLY_PUSH_WARNING
FOLLY_CLANG_DISABLE_WARNING("-Wexit-time-destructors")
/* library-local */ static thread_local LocalLifetime lifetime{
*threadEntry->caches, cache};
FOLLY_POP_WARNING
cache.capacity = threadEntry->getElementsCapacity();
cache.elements = threadEntry->elements;
cache.threadEntry = threadEntry;
}
return threadEntry;
}

FOLLY_NOINLINE static ThreadEntry* getThreadEntrySlowReserve(EntryID* ent) {
Expand Down Expand Up @@ -695,6 +774,8 @@ struct FOLLY_EXPORT StaticMeta final : StaticMetaBase {
threadEntry->meta = &meta;
int ret = pthread_setspecific(key, threadEntry);
checkPosixError(ret, "pthread_setspecific failed");

threadEntry->caches = new ThreadEntry::LocalSet();
}
return threadEntry;
}
Expand Down

0 comments on commit 785c433

Please sign in to comment.