From 785c4331e20e00b00d02da91b654a42acefb0752 Mon Sep 17 00:00:00 2001
From: Yedidya Feldblum
Date: Thu, 18 Jul 2024 18:53:57 -0700
Subject: [PATCH] less indirection in StaticMeta::get

Summary:
`ThreadLocalPtr::get` and `ThreadLocal::get` wrap `StaticMeta::get`. We cache
the static-meta's per-thread capacity and a pointer to the thread-entry in a
thread-local cache, which `StaticMeta::get` looks at in the fast path to find
the requested item.

We currently get a pointer to the thread-vector from the thread-entry, and
then get the item from the thread-vector. But this is an indirection which
presumably adds latency to the overall operation.

If we cache the static-meta's per-thread thread-vector as well, we eliminate
this indirection and reduce the latency.

Differential Revision: D59246804

fbshipit-source-id: c3716eef84493dc8c8f859faae7a3af223d3c4eb
---
 folly/detail/ThreadLocalDetail.cpp |  17 +++++
 folly/detail/ThreadLocalDetail.h   | 111 +++++++++++++++++++++++++----
 2 files changed, 113 insertions(+), 15 deletions(-)

diff --git a/folly/detail/ThreadLocalDetail.cpp b/folly/detail/ThreadLocalDetail.cpp
index 13cce1f9f67..f5f463f7476 100644
--- a/folly/detail/ThreadLocalDetail.cpp
+++ b/folly/detail/ThreadLocalDetail.cpp
@@ -208,6 +208,13 @@ void StaticMetaBase::cleanupThreadEntriesAndList(
       // before issuing a delete.
       DCHECK(tmp->meta->isThreadEntryRemovedFromAllInMap(tmp, true));

+      // Fail safe check to make sure that the ThreadEntry's local-caches are
+      // all cleared. Failure would indicate that something is using ThreadLocal
+      // instances for a given tag for the first time in a pthread-thread-specific
+      // destructor, such as in the destructor of some other ThreadLocal.
+      DCHECK(tmp->caches->tracking.rlock()->empty());
+
+      delete tmp->caches;
       delete tmp;
     }

@@ -421,6 +428,16 @@ void StaticMetaBase::reserve(EntryID* id) {
   meta.totalElementWrappers_ += (newCapacity - prevCapacity);
   free(reallocated);
+
+  threadEntry->caches->tracking.withRLock([&](auto& tracking) {
+    for (auto [_, cachep] : tracking) {
+      DCHECK(cachep);
+      DCHECK(!cachep->poison);
+      DCHECK_EQ(threadEntry, cachep->threadEntry);
+      cachep->capacity = newCapacity;
+      cachep->elements = threadEntry->elements;
+    }
+  });
 }

 /*
diff --git a/folly/detail/ThreadLocalDetail.h b/folly/detail/ThreadLocalDetail.h
index 477d69086cb..2fd07bc367b 100644
--- a/folly/detail/ThreadLocalDetail.h
+++ b/folly/detail/ThreadLocalDetail.h
@@ -162,6 +162,44 @@ struct ThreadEntryList;
  * (under the lock).
  */
 struct ThreadEntry {
+  struct LocalCache {
+    size_t capacity;
+    ElementWrapper* elements;
+    ThreadEntry* threadEntry;
+    bool poison;
+  };
+  static_assert(std::is_standard_layout_v<LocalCache>);
+  static_assert(std::is_trivial_v<LocalCache>);
+
+  struct LocalLifetime;
+
+  struct LocalSet {
+    using Map = std::unordered_map<LocalLifetime*, LocalCache*>;
+    Synchronized<Map> tracking;
+  };
+
+  struct LocalLifetime {
+    LocalSet& caches;
+    explicit LocalLifetime(LocalSet& set, LocalCache& cache) noexcept
+        : caches{set} {
+      DCHECK(!cache.poison);
+      auto tracking = caches.tracking.wlock();
+      auto inserted = tracking->emplace(this, &cache).second;
+      DCHECK(inserted);
+    }
+    ~LocalLifetime() {
+      auto tracking = caches.tracking.wlock();
+      auto it = tracking->find(this);
+      DCHECK(it != tracking->end());
+      DCHECK(it->second);
+      auto& cache = *it->second;
+      tracking->erase(it);
+      DCHECK(!cache.poison);
+      cache = {};
+      cache.poison = true;
+    }
+  };
+
   ElementWrapper* elements{nullptr};
   std::atomic<size_t> elementsCapacity{0};
   ThreadEntryList* list{nullptr};
@@ -170,6 +208,7 @@ struct ThreadEntry {
   bool removed_{false};
   uint64_t tid_os{};
   aligned_storage_for_t<std::thread::id> tid_data{};
+  LocalSet* caches{nullptr};

   size_t getElementsCapacity() const noexcept {
     return elementsCapacity.load(std::memory_order_relaxed);
@@ -604,15 +643,22 @@ struct FOLLY_EXPORT StaticMeta final : StaticMetaBase {
     return detail::createGlobal<StaticMeta<Tag, AccessMode>, void>();
   }

-  struct LocalCache {
-    ThreadEntry* threadEntry;
-    size_t capacity;
-  };
-  static_assert(std::is_standard_layout_v<LocalCache>);
-  static_assert(std::is_trivial_v<LocalCache>);
+  // All functions which participate in the local-cache/local-lifetime dance
+  // must be marked as hidden and as not-exported from the DSO or DLL. This
+  // covers all functions which may separately access the thread-locals or
+  // which are part of separately accessing the thread-locals. This does not
+  // cover a function which calls another function which separately accesses
+  // the thread-locals, though.
+
+  using LocalCache = ThreadEntry::LocalCache;
+  using LocalSet = ThreadEntry::LocalSet;
+  using LocalLifetime = ThreadEntry::LocalLifetime;

-  FOLLY_EXPORT FOLLY_ALWAYS_INLINE static LocalCache& getLocalCache() {
-    static thread_local LocalCache instance;
+  // Hidden: participates in the thread-local local-cache/local-lifetime dance.
+  FOLLY_ERASE static LocalCache& getLocalCache() {
+    constexpr auto align = // std::bit_ceil is c++20; nextPowTwo is !constexpr
+        size_t(1) << folly::constexpr_log2_ceil(sizeof(LocalCache));
+    /* library-local */ alignas(align) static thread_local LocalCache instance;
     return instance;
   }

@@ -621,11 +667,33 @@ struct FOLLY_EXPORT StaticMeta final : StaticMetaBase {
     // cached fast path, leaving only one branch here and one indirection
     // below.
-    ThreadEntry* te = getThreadEntry(ent);
+    ElementWrapper* vec = getThreadVector(ent);
     uint32_t id = ent->getOrInvalid();
     // Only valid index into the the elements array
     DCHECK_NE(id, kEntryIDInvalid);
-    return te->elements[id];
+    return vec[id];
+  }
+
+  // Hidden: participates in the thread-local local-cache/local-lifetime dance.
+  FOLLY_ERASE static ElementWrapper* getThreadVector(EntryID* ent) {
+    if (!kUseThreadLocal) {
+      return getThreadEntrySlowReserve(ent)->elements;
+    }
+
+    // Eliminate as many branches and as much extra code as possible in the
+    // cached fast path, leaving only one branch here and one indirection below.
+    uint32_t id = ent->getOrInvalid();
+    auto& cache = getLocalCache();
+    if (FOLLY_UNLIKELY(cache.capacity <= id)) {
+      return getThreadVectorSlowReserveAndCache(ent, cache);
+    }
+    return cache.elements;
+  }
+
+  // Hidden: participates in the thread-local local-cache/local-lifetime dance.
+  FOLLY_ERASE_NOINLINE static ElementWrapper*
+  getThreadVectorSlowReserveAndCache(EntryID* ent, LocalCache& cache) {
+    return getSlowReserveAndCache(ent, cache)->elements;
   }

   /*
@@ -636,7 +704,8 @@ struct FOLLY_EXPORT StaticMeta final : StaticMetaBase {
    * StaticMetaBase::allId2ThreadEntrySets_ updated with ThreadEntry* whenever a
    * ThreadLocal is set/released.
    */
-  FOLLY_ALWAYS_INLINE static ThreadEntry* getThreadEntry(EntryID* ent) {
+  // Hidden: participates in the thread-local local-cache/local-lifetime dance.
+  FOLLY_ERASE static ThreadEntry* getThreadEntry(EntryID* ent) {
     if (!kUseThreadLocal) {
       return getThreadEntrySlowReserve(ent);
     }
@@ -646,16 +715,26 @@ struct FOLLY_EXPORT StaticMeta final : StaticMetaBase {
     uint32_t id = ent->getOrInvalid();
     auto& cache = getLocalCache();
     if (FOLLY_UNLIKELY(cache.capacity <= id)) {
-      getSlowReserveAndCache(ent, cache);
+      return getSlowReserveAndCache(ent, cache);
     }
     return cache.threadEntry;
   }

-  FOLLY_NOINLINE static void getSlowReserveAndCache(
+  // Hidden: participates in the thread-local local-cache/local-lifetime dance.
+  FOLLY_ERASE_NOINLINE static ThreadEntry* getSlowReserveAndCache(
       EntryID* ent, LocalCache& cache) {
     auto threadEntry = getThreadEntrySlowReserve(ent);
-    cache.capacity = threadEntry->getElementsCapacity();
-    cache.threadEntry = threadEntry;
+    if (!cache.poison && !dying()) {
+      FOLLY_PUSH_WARNING
+      FOLLY_CLANG_DISABLE_WARNING("-Wexit-time-destructors")
+      /* library-local */ static thread_local LocalLifetime lifetime{
+          *threadEntry->caches, cache};
+      FOLLY_POP_WARNING
+      cache.capacity = threadEntry->getElementsCapacity();
+      cache.elements = threadEntry->elements;
+      cache.threadEntry = threadEntry;
+    }
+    return threadEntry;
   }

   FOLLY_NOINLINE static ThreadEntry* getThreadEntrySlowReserve(EntryID* ent) {
@@ -695,6 +774,8 @@ struct FOLLY_EXPORT StaticMeta final : StaticMetaBase {
       threadEntry->meta = &meta;
       int ret = pthread_setspecific(key, threadEntry);
       checkPosixError(ret, "pthread_setspecific failed");
+
+      threadEntry->caches = new ThreadEntry::LocalSet();
     }
     return threadEntry;
   }
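
Editorial note, not part of the patch: the standalone sketch below models the fast-path change described in the summary. The names Entry, Cache, get, and get_slow are hypothetical stand-ins for ThreadEntry, LocalCache, StaticMeta::get, and getSlowReserveAndCache; the real implementation additionally tracks cache lifetimes (LocalSet/LocalLifetime) so caches can be repointed in reserve() and poisoned at thread exit, which the sketch omits.

// Minimal model of the optimization: cache the element pointer alongside the
// capacity so the hot path is one bounds check plus a direct index, instead of
// re-loading the pointer through the per-thread entry on every call.
#include <cstddef>
#include <cstdio>
#include <vector>

struct Slot {
  int value = 0;
};

// Stands in for ThreadEntry: owns the growable per-thread element array.
struct Entry {
  std::vector<Slot> slots;
};

// Stands in for LocalCache: holds both the capacity and a raw pointer to the
// element array, so the fast path needs no second load through the Entry.
struct Cache {
  std::size_t capacity = 0;
  Slot* elements = nullptr;
  Entry* entry = nullptr;
};

namespace {
thread_local Entry g_entry;  // per-thread storage, normally reached slowly
thread_local Cache g_cache;  // per-thread fast-path cache
}  // namespace

// Slow path: grow the per-thread array if needed, then refresh the cache so
// later calls for this id (and smaller ids) stay on the fast path.
Slot* get_slow(std::size_t id) {
  if (g_entry.slots.size() <= id) {
    g_entry.slots.resize(id + 1);
  }
  g_cache.capacity = g_entry.slots.size();
  g_cache.elements = g_entry.slots.data();
  g_cache.entry = &g_entry;
  return &g_cache.elements[id];
}

// Fast path, analogous to the patched StaticMeta::get: one branch on the
// cached capacity, then a direct index into the cached element pointer.
// Before the change, the equivalent code indexed through the cached entry
// (cache.entry->slots[id]), costing one extra pointer chase per call.
Slot* get(std::size_t id) {
  if (g_cache.capacity <= id) {
    return get_slow(id);
  }
  return &g_cache.elements[id];
}

int main() {
  get(3)->value = 42;                   // first call: slow path fills the cache
  std::printf("%d\n", get(3)->value);   // second call: cached direct index
  return 0;
}

The trade-off, as in the patch, is that a wider cache must be kept coherent: whenever the element array is reallocated, every cache pointing at it has to be repointed, which is what the new tracking loop in StaticMetaBase::reserve does.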