From e765c51668b0515ce318558f0c8e67f7b94212ea Mon Sep 17 00:00:00 2001
From: Matthew Parkinson
Date: Mon, 17 Jun 2024 20:22:40 +0100
Subject: [PATCH 1/4] Added lambda lock primitive

---
 src/snmalloc/backend/globalconfig.h      | 42 +++++++------
 src/snmalloc/backend_helpers/lockrange.h | 12 ++--
 src/snmalloc/ds/flaglock.h               |  7 +++
 src/snmalloc/ds/singleton.h              | 17 +++---
 src/snmalloc/mem/pool.h                  | 78 +++++++++++++-----------
 5 files changed, 90 insertions(+), 66 deletions(-)

diff --git a/src/snmalloc/backend/globalconfig.h b/src/snmalloc/backend/globalconfig.h
index 72a6c3d42..3918315bd 100644
--- a/src/snmalloc/backend/globalconfig.h
+++ b/src/snmalloc/backend/globalconfig.h
@@ -96,33 +96,37 @@ namespace snmalloc
     // of allocators.
     SNMALLOC_SLOW_PATH static void ensure_init_slow()
     {
-      FlagLock lock{initialisation_lock};
+      if (initialised)
+        return;
+
+      with(initialisation_lock, [&]() {
 #ifdef SNMALLOC_TRACING
-      message<1024>("Run init_impl");
+        message<1024>("Run init_impl");
 #endif
-      if (initialised)
-        return;
+        if (initialised)
+          return;

-      LocalEntropy entropy;
-      entropy.init();
-      // Initialise key for remote deallocation lists
-      RemoteAllocator::key_global = FreeListKey(entropy.get_free_list_key());
+        LocalEntropy entropy;
+        entropy.init();
+        // Initialise key for remote deallocation lists
+        RemoteAllocator::key_global = FreeListKey(entropy.get_free_list_key());

-      // Need to randomise pagemap location. If requested and not a
-      // StrictProvenance architecture, randomize its table's location within a
-      // significantly larger address space allocation.
-      static constexpr bool pagemap_randomize =
-        mitigations(random_pagemap) && !aal_supports<StrictProvenance>;
+        // Need to randomise pagemap location. If requested and not a
+        // StrictProvenance architecture, randomize its table's location within
+        // a significantly larger address space allocation.
+        static constexpr bool pagemap_randomize =
+          mitigations(random_pagemap) && !aal_supports<StrictProvenance>;

-      Pagemap::concretePagemap.template init<pagemap_randomize>();
+        Pagemap::concretePagemap.template init<pagemap_randomize>();

-      if constexpr (aal_supports<StrictProvenance>)
-      {
-        Authmap::init();
-      }
+        if constexpr (aal_supports<StrictProvenance>)
+        {
+          Authmap::init();
+        }

-      initialised.store(true, std::memory_order_release);
+        initialised.store(true, std::memory_order_release);
+      });
     }

   public:
diff --git a/src/snmalloc/backend_helpers/lockrange.h b/src/snmalloc/backend_helpers/lockrange.h
index ce91711cc..b5d000742 100644
--- a/src/snmalloc/backend_helpers/lockrange.h
+++ b/src/snmalloc/backend_helpers/lockrange.h
@@ -35,14 +35,18 @@ namespace snmalloc
     CapPtr<void, ChunkBounds> alloc_range(size_t size)
     {
-      FlagLock lock(spin_lock);
-      return parent.alloc_range(size);
+      CapPtr<void, ChunkBounds> result;
+      with(spin_lock, [&]() {
+        {
+          result = parent.alloc_range(size);
+        }
+      });
+      return result;
     }

     void dealloc_range(CapPtr<void, ChunkBounds> base, size_t size)
     {
-      FlagLock lock(spin_lock);
-      parent.dealloc_range(base, size);
+      with(spin_lock, [&]() { parent.dealloc_range(base, size); });
     }
   };
 };
diff --git a/src/snmalloc/ds/flaglock.h b/src/snmalloc/ds/flaglock.h
index 4a539e636..c3c0930ae 100644
--- a/src/snmalloc/ds/flaglock.h
+++ b/src/snmalloc/ds/flaglock.h
@@ -133,4 +133,11 @@ namespace snmalloc
       lock.flag.store(false, std::memory_order_release);
     }
   };
+
+  template<typename F>
+  inline void with(FlagWord& lock, F&& f)
+  {
+    FlagLock l(lock);
+    f();
+  }
 } // namespace snmalloc
diff --git a/src/snmalloc/ds/singleton.h b/src/snmalloc/ds/singleton.h
index c85635d39..375a82f87 100644
--- a/src/snmalloc/ds/singleton.h
+++ b/src/snmalloc/ds/singleton.h
@@ -35,14 +35,15 @@ namespace snmalloc
       if (SNMALLOC_UNLIKELY(!initialised.load(std::memory_order_acquire)))
       {
-        FlagLock lock(flag);
-        if (!initialised)
-        {
-          init(&obj);
-          initialised.store(true, std::memory_order_release);
-          if (first != nullptr)
-            *first = true;
-        }
+        with(flag, [&]() {
+          if (!initialised)
+          {
+            init(&obj);
+            initialised.store(true, std::memory_order_release);
+            if (first != nullptr)
+              *first = true;
+          }
+        });
       }
       return obj;
     }
diff --git a/src/snmalloc/mem/pool.h b/src/snmalloc/mem/pool.h
index 8db58eccc..0497d1ad9 100644
--- a/src/snmalloc/mem/pool.h
+++ b/src/snmalloc/mem/pool.h
@@ -100,8 +100,9 @@ namespace snmalloc
     static T* acquire()
     {
       PoolState<T>& pool = get_state();
-      {
-        FlagLock f(pool.lock);
+
+      T* result{nullptr};
+      with(pool.lock, [&]() {
         if (pool.front != nullptr)
         {
           auto p = pool.front;
@@ -112,17 +113,21 @@ namespace snmalloc
           }
           pool.front = next;
           p->set_in_use();
-          return p.unsafe_ptr();
+          result = p.unsafe_ptr();
         }
-      }
+      });
+
+      if (result != nullptr)
+        return result;

       auto p = ConstructT::make();
-      FlagLock f(pool.lock);
-      p->list_next = pool.list;
-      pool.list = p;
+      with(pool.lock, [&]() {
+        p->list_next = pool.list;
+        pool.list = p;

-      p->set_in_use();
+        p->set_in_use();
+      });

       return p.unsafe_ptr();
     }
@@ -146,11 +151,13 @@ namespace snmalloc
     // Returns a linked list of all objects in the stack, emptying the stack.
     if (p == nullptr)
     {
-      FlagLock f(pool.lock);
-      auto result = pool.front;
-      pool.front = nullptr;
-      pool.back = nullptr;
-      return result.unsafe_ptr();
+      T* result;
+      with(pool.lock, [&]() {
+        result = pool.front.unsafe_ptr();
+        pool.front = nullptr;
+        pool.back = nullptr;
+      });
+      return result;
     }

     return p->next.unsafe_ptr();
@@ -165,18 +172,18 @@ namespace snmalloc
     {
       PoolState<T>& pool = get_state();
       last->next = nullptr;
-      FlagLock f(pool.lock);
-
-      if (pool.front == nullptr)
-      {
-        pool.front = capptr::Alloc<T>::unsafe_from(first);
-      }
-      else
-      {
-        pool.back->next = capptr::Alloc<T>::unsafe_from(first);
-      }
+      with(pool.lock, [&]() {
+        if (pool.front == nullptr)
+        {
+          pool.front = capptr::Alloc<T>::unsafe_from(first);
+        }
+        else
+        {
+          pool.back->next = capptr::Alloc<T>::unsafe_from(first);
+        }

-      pool.back = capptr::Alloc<T>::unsafe_from(last);
+        pool.back = capptr::Alloc<T>::unsafe_from(last);
+      });
     }

     /**
@@ -188,18 +195,19 @@ namespace snmalloc
     {
       PoolState<T>& pool = get_state();
       last->next = nullptr;
-      FlagLock f(pool.lock);
-      if (pool.front == nullptr)
-      {
-        pool.back = capptr::Alloc<T>::unsafe_from(last);
-      }
-      else
-      {
-        last->next = pool.front;
-        pool.back->next = capptr::Alloc<T>::unsafe_from(first);
-      }
-      pool.front = capptr::Alloc<T>::unsafe_from(first);
+      with(pool.lock, [&]() {
+        if (pool.front == nullptr)
+        {
+          pool.back = capptr::Alloc<T>::unsafe_from(last);
+        }
+        else
+        {
+          last->next = pool.front;
+          pool.back->next = capptr::Alloc<T>::unsafe_from(first);
+        }
+        pool.front = capptr::Alloc<T>::unsafe_from(first);
+      });
     }

     static T* iterate(T* p = nullptr)

From e54fcdab7875c5c5ba68352f3caac05a08f4e384 Mon Sep 17 00:00:00 2001
From: Matthew Parkinson
Date: Sat, 22 Jun 2024 09:25:45 +0100
Subject: [PATCH 2/4] Implement MCS Combining lock

This is a hybrid of Flat Combining and the MCS queue lock. It uses the
queue like the MCS queue lock, but each item additionally contains a
thunk to perform the body of the lock. This enables threads other than
the one that initially issued the request to perform the work.
---
 src/snmalloc/backend_helpers/lockrange.h |   2 +-
 src/snmalloc/ds/combininglock.h          | 168 +++++++++++++++++++++++
 src/snmalloc/ds/ds.h                     |   1 +
 src/snmalloc/ds/flaglock.h               |   2 +-
 4 files changed, 171 insertions(+), 2 deletions(-)
 create mode 100644 src/snmalloc/ds/combininglock.h

diff --git a/src/snmalloc/backend_helpers/lockrange.h b/src/snmalloc/backend_helpers/lockrange.h
index b5d000742..2dc796ac6 100644
--- a/src/snmalloc/backend_helpers/lockrange.h
+++ b/src/snmalloc/backend_helpers/lockrange.h
@@ -22,7 +22,7 @@ namespace snmalloc
      * This is infrequently used code, a spin lock simplifies the code
      * considerably, and should never be on the fast path.
      */
-    FlagWord spin_lock{};
+    CombiningLock spin_lock{};

   public:
     static constexpr bool Aligned = ParentRange::Aligned;
diff --git a/src/snmalloc/ds/combininglock.h b/src/snmalloc/ds/combininglock.h
new file mode 100644
index 000000000..9d92e3b72
--- /dev/null
+++ b/src/snmalloc/ds/combininglock.h
@@ -0,0 +1,168 @@
+#pragma once
+
+#include "../aal/aal.h"
+#include "../pal/pal.h"
+
+#include <atomic>
+#include <utility>
+
+namespace snmalloc
+{
+  class CombineLockNode;
+  using CombiningLock = std::atomic<CombineLockNode*>;
+
+  /**
+   * @brief Combinations of MCS queue lock with Flat Combining
+   *
+   * Each element in the queue has a pointer to a work item.
+   * This means when under contention the thread holding the lock
+   * can perform the work.
+   *
+   * As the work items are arbitrary lambdas, there are no simplifications
+   * for combining related work items. E.g. the original Flat Combining
+   * paper might sort a collection of inserts, and perform them in a single
+   * traversal.
+   *
+   * Note that we should perhaps add a Futex/WakeOnAddress mode to improve
+   * performance in the contended case, rather than spinning.
+   */
+  class CombineLockNode
+  {
+    enum class LockStatus
+    {
+      // The work for this node has not been completed.
+      WAITING,
+
+      // The work for this thread has been completed, and it is not the
+      // last element in the queue.
+      DONE,
+
+      // The work for this thread has not been completed, and it is the
+      // head of the queue.
+      READY
+    };
+
+    // Status of this node; set by the thread at the head of the queue
+    // when it either makes the thread for this node the head of the queue
+    // or completes its work.
+    std::atomic<LockStatus> status{LockStatus::WAITING};
+
+    // Used to store the queue
+    std::atomic<CombineLockNode*> next{nullptr};
+
+    // Stores the C++ lambda associated with this node in the queue.
+    void (*f_raw)(CombineLockNode*);
+
+    void set_status(LockStatus s)
+    {
+      status.store(s, std::memory_order_release);
+    }
+
+  public:
+    constexpr CombineLockNode(void (*f)(CombineLockNode*)) : f_raw(f) {}
+
+    void attach(CombiningLock& lock)
+    {
+      // Add to the queue of pending work
+      auto prev = lock.exchange(this, std::memory_order_acq_rel);
+
+      if (prev != nullptr)
+      {
+        // If we aren't the head, link into predecessor
+        prev->next.store(this, std::memory_order_release);
+
+        // Wait for predecessor to complete
+        while (status.load(std::memory_order_relaxed) == LockStatus::WAITING)
+          Aal::pause();
+
+        // Determine if another thread completed our work.
+        if (status.load(std::memory_order_acquire) == LockStatus::DONE)
+          return;
+      }
+      // We could add an else branch here that could set
+      // status = LockStatus::Ready
+      // However, the subsequent state assumes it is Ready, and
+      // nothing would read it.
+
+      // We are the head of the queue, and responsible for
+      // waking/performing our and subsequent work.
+      auto curr = this;
+      while (true)
+      {
+        // Perform work for head of the queue
+        curr->f_raw(curr);
+
+        // Determine if there are more elements.
+        auto n = curr->next.load(std::memory_order_acquire);
+        if (n != nullptr)
+        {
+          // Signal this work was completed and move on to
+          // next item.
+          curr->set_status(LockStatus::DONE);
+          curr = n;
+          continue;
+        }
+
+        // This could be the end of the queue, attempt to close the
+        // queue.
+        auto curr_c = curr;
+        if (lock.compare_exchange_strong(curr_c, nullptr, std::memory_order_acq_rel))
+        {
+          // Queue was successfully closed.
+          // Notify last element the work was completed.
+          curr->set_status(LockStatus::DONE);
+          return;
+        }
+
+        // Failed to close the queue; wait for the next thread to be
+        // added.
+        while (curr->next.load(std::memory_order_relaxed) == nullptr)
+          Aal::pause();
+
+        // As we had to wait, give the job to the next thread
+        // to carry on performing the work.
+        n = curr->next.load(std::memory_order_acquire);
+        n->set_status(LockStatus::READY);
+
+        // Notify the thread that we completed its work.
+        // Note that this needs to be done last, as we can't read
+        // curr->next after setting curr->status
+        curr->set_status(LockStatus::DONE);
+        return;
+      }
+    }
+  };
+
+  template<typename F>
+  class CombineLockNodeTempl : CombineLockNode
+  {
+    template<typename FF>
+    friend void with(CombiningLock&, FF&&);
+
+    // This holds the closure for the lambda
+    F f;
+
+    // Untyped version of calling f to store in the node.
+    static void invoke(CombineLockNode* self)
+    {
+      auto self_templ = reinterpret_cast<CombineLockNodeTempl*>(self);
+      self_templ->f();
+    }
+
+    CombineLockNodeTempl(CombiningLock& lock, F&& f_)
+    : CombineLockNode(invoke), f(f_)
+    {
+      attach(lock);
+    }
+  };
+
+  /**
+   * Lock primitive. This takes a reference to a Lock, and a thunk to
+   * call when the lock is available. The thunk should be independent of
+   * the current thread as the thunk may be executed by a different thread.
+   */
+  template<typename F>
+  inline void with(CombiningLock& lock, F&& f)
+  {
+    CombineLockNodeTempl<F> node{lock, std::forward<F>(f)};
+  }
+} // namespace snmalloc
diff --git a/src/snmalloc/ds/ds.h b/src/snmalloc/ds/ds.h
index 4cfa22b9b..a26eb20de 100644
--- a/src/snmalloc/ds/ds.h
+++ b/src/snmalloc/ds/ds.h
@@ -6,6 +6,7 @@
 #include "../pal/pal.h"
 #include "aba.h"
 #include "allocconfig.h"
+#include "combininglock.h"
 #include "entropy.h"
 #include "flaglock.h"
 #include "mpmcstack.h"
diff --git a/src/snmalloc/ds/flaglock.h b/src/snmalloc/ds/flaglock.h
index c3c0930ae..cd314f90d 100644
--- a/src/snmalloc/ds/flaglock.h
+++ b/src/snmalloc/ds/flaglock.h
@@ -134,7 +134,7 @@ namespace snmalloc
     }
   };

-  template
+  template
   inline void with(FlagWord& lock, F&& f)
   {
     FlagLock l(lock);

From 2f9137907e1e4d955bc97b8ac685e327043d8000 Mon Sep 17 00:00:00 2001
From: Matthew Parkinson
Date: Wed, 19 Jun 2024 19:41:07 +0100
Subject: [PATCH 3/4] Add a fast path flag

This update adds a fast path flag for the uncontended case. This
reduces the number of atomic operations when there is no contention.
---
 src/snmalloc/ds/combininglock.h | 63 ++++++++++++++++++++++++++++-----
 1 file changed, 55 insertions(+), 8 deletions(-)

diff --git a/src/snmalloc/ds/combininglock.h b/src/snmalloc/ds/combininglock.h
index 9d92e3b72..45c40315b 100644
--- a/src/snmalloc/ds/combininglock.h
+++ b/src/snmalloc/ds/combininglock.h
@@ -9,7 +9,15 @@
 namespace snmalloc
 {
   class CombineLockNode;
-  using CombiningLock = std::atomic<CombineLockNode*>;
+
+  struct CombiningLock
+  {
+    // Fast path lock in case there is no contention.
+    std::atomic<bool> flag{false};
+
+    // MCS queue of work items
+    std::atomic<CombineLockNode*> head{nullptr};
+  };

   /**
    * @brief Combinations of MCS queue lock with Flat Combining
@@ -52,6 +60,11 @@ namespace snmalloc
     // Stores the C++ lambda associated with this node in the queue.
     void (*f_raw)(CombineLockNode*);

+    void release(CombiningLock& lock)
+    {
+      lock.flag.store(false, std::memory_order_release);
+    }
+
     void set_status(LockStatus s)
     {
       status.store(s, std::memory_order_release);
@@ -62,8 +75,25 @@ namespace snmalloc

     void attach(CombiningLock& lock)
     {
-      // Add to the queue of pending work
-      auto prev = lock.exchange(this, std::memory_order_acq_rel);
+      // Test if no one is waiting
+      if (lock.head.load(std::memory_order_relaxed) == nullptr)
+      {
+        // No one was waiting, so contention is low. Attempt to acquire the
+        // flag lock.
+        if (lock.flag.exchange(true, std::memory_order_acquire) == false)
+        {
+          // We grabbed the lock.
+          f_raw(this);
+
+          // Release the lock
+          release(lock);
+          return;
+        }
+      }
+
+      // There is contention for the lock, we need to add our work to the
+      // queue of pending work
+      auto prev = lock.head.exchange(this, std::memory_order_acq_rel);

       if (prev != nullptr)
       {
@@ -78,10 +108,25 @@ namespace snmalloc
         if (status.load(std::memory_order_acquire) == LockStatus::DONE)
           return;
       }
-      // We could add an else branch here that could set
-      // status = LockStatus::Ready
-      // However, the subsequent state assumes it is Ready, and
-      // nothing would read it.
+      else
+      {
+        // We are the head of the queue. Spin until we acquire the fast path
+        // lock. As we are in the queue, future requests shouldn't try to
+        // acquire the fast path lock, but stale views of the queue being empty
+        // could still be concurrent with this thread.
+        while (lock.flag.exchange(true, std::memory_order_acquire))
+        {
+          while (lock.flag.load(std::memory_order_relaxed))
+          {
+            Aal::pause();
+          }
+        }
+
+        // We could set
+        // status = LockStatus::Ready
+        // However, the subsequent state assumes it is Ready, and
+        // nothing would read it.
+      }

       // We are the head of the queue, and responsible for
       // waking/performing our and subsequent work.
       auto curr = this;
       while (true)
       {
@@ -105,11 +150,13 @@ namespace snmalloc
         // This could be the end of the queue, attempt to close the
         // queue.
         auto curr_c = curr;
-        if (lock.compare_exchange_strong(curr_c, nullptr, std::memory_order_acq_rel))
+        if (lock.head.compare_exchange_strong(
+          curr_c, nullptr, std::memory_order_acq_rel))
         {
           // Queue was successfully closed.
           // Notify last element the work was completed.
           curr->set_status(LockStatus::DONE);
+          release(lock);
           return;
         }

From 1fdf924b30db363bef88a0d59f981a5c07a464e9 Mon Sep 17 00:00:00 2001
From: Matthew Parkinson
Date: Mon, 24 Jun 2024 11:07:09 +0100
Subject: [PATCH 4/4] CR feedback

---
 src/snmalloc/ds/combininglock.h | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/src/snmalloc/ds/combininglock.h b/src/snmalloc/ds/combininglock.h
index 45c40315b..e5b79eaad 100644
--- a/src/snmalloc/ds/combininglock.h
+++ b/src/snmalloc/ds/combininglock.h
@@ -35,6 +35,9 @@ namespace snmalloc
    */
   class CombineLockNode
   {
+    template<typename F>
+    friend class CombineLockNodeTempl;
+
     enum class LockStatus
     {
       // The work for this node has not been completed.
       WAITING,
@@ -70,10 +73,9 @@ namespace snmalloc
       status.store(s, std::memory_order_release);
     }

-  public:
     constexpr CombineLockNode(void (*f)(CombineLockNode*)) : f_raw(f) {}

-    void attach(CombiningLock& lock)
+    SNMALLOC_FAST_PATH void attach(CombiningLock& lock)
     {
       // Test if no one is waiting
       if (lock.head.load(std::memory_order_relaxed) == nullptr)
@@ -90,7 +92,11 @@ namespace snmalloc
           return;
         }
       }
+      attach_slow(lock);
+    }

+    SNMALLOC_SLOW_PATH void attach_slow(CombiningLock& lock)
+    {
       // There is contention for the lock, we need to add our work to the
       // queue of pending work
       auto prev = lock.head.exchange(this, std::memory_order_acq_rel);
@@ -151,7 +157,10 @@ namespace snmalloc
         // queue.
         auto curr_c = curr;
         if (lock.head.compare_exchange_strong(
-          curr_c, nullptr, std::memory_order_acq_rel))
+          curr_c,
+          nullptr,
+          std::memory_order_release,
+          std::memory_order_relaxed))
         {
           // Queue was successfully closed.
           // Notify last element the work was completed.
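
Usage sketch (not part of the patches above): the `with` primitive introduced
by this series takes a lock and a thunk, and the thunk may be executed by a
different thread than the caller (the thread currently draining the MCS
queue), so it should only touch shared state rather than rely on the calling
thread's identity or thread-local storage. A minimal sketch, assuming the
header is reachable via "snmalloc/ds/ds.h" and using a made-up shared counter:

  #include "snmalloc/ds/ds.h"

  #include <cstddef>

  snmalloc::CombiningLock counter_lock;
  size_t counter = 0;

  void add(size_t x)
  {
    // The lambda runs while the lock is held, possibly on another thread
    // that is currently the head of the combining queue; it captures only
    // what it needs and does not depend on thread-local state.
    snmalloc::with(counter_lock, [&]() { counter += x; });
  }

In the uncontended case the call only takes the fast path flag (a single
atomic exchange); under contention it enqueues a CombineLockNode and either
waits for its work to be marked DONE by another thread or becomes the head of
the queue and performs the pending thunks itself.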