From e765c51668b0515ce318558f0c8e67f7b94212ea Mon Sep 17 00:00:00 2001
From: Matthew Parkinson
Date: Mon, 17 Jun 2024 20:22:40 +0100
Subject: [PATCH 1/4] Added lambda lock primitive

---
 src/snmalloc/backend/globalconfig.h      | 42 +++++++------
 src/snmalloc/backend_helpers/lockrange.h | 12 ++--
 src/snmalloc/ds/flaglock.h               |  7 +++
 src/snmalloc/ds/singleton.h              | 17 +++---
 src/snmalloc/mem/pool.h                  | 78 +++++++++++++-----------
 5 files changed, 90 insertions(+), 66 deletions(-)

diff --git a/src/snmalloc/backend/globalconfig.h b/src/snmalloc/backend/globalconfig.h
index 72a6c3d42..3918315bd 100644
--- a/src/snmalloc/backend/globalconfig.h
+++ b/src/snmalloc/backend/globalconfig.h
@@ -96,33 +96,37 @@ namespace snmalloc
     // of allocators.
     SNMALLOC_SLOW_PATH static void ensure_init_slow()
     {
-      FlagLock lock{initialisation_lock};
+      if (initialised)
+        return;
+
+      with(initialisation_lock, [&]() {
 #ifdef SNMALLOC_TRACING
-      message<1024>("Run init_impl");
+        message<1024>("Run init_impl");
 #endif
-      if (initialised)
-        return;
+        if (initialised)
+          return;

-      LocalEntropy entropy;
-      entropy.init();
-      // Initialise key for remote deallocation lists
-      RemoteAllocator::key_global = FreeListKey(entropy.get_free_list_key());
+        LocalEntropy entropy;
+        entropy.init();
+        // Initialise key for remote deallocation lists
+        RemoteAllocator::key_global = FreeListKey(entropy.get_free_list_key());

-      // Need to randomise pagemap location. If requested and not a
-      // StrictProvenance architecture, randomize its table's location within a
-      // significantly larger address space allocation.
-      static constexpr bool pagemap_randomize =
-        mitigations(random_pagemap) && !aal_supports<StrictProvenance>;
+        // Need to randomise pagemap location. If requested and not a
+        // StrictProvenance architecture, randomize its table's location within
+        // a significantly larger address space allocation.
+        static constexpr bool pagemap_randomize =
+          mitigations(random_pagemap) && !aal_supports<StrictProvenance>;

-      Pagemap::concretePagemap.template init<pagemap_randomize>();
+        Pagemap::concretePagemap.template init<pagemap_randomize>();

-      if constexpr (aal_supports<StrictProvenance>)
-      {
-        Authmap::init();
-      }
+        if constexpr (aal_supports<StrictProvenance>)
+        {
+          Authmap::init();
+        }

-      initialised.store(true, std::memory_order_release);
+        initialised.store(true, std::memory_order_release);
+      });
     }

   public:
diff --git a/src/snmalloc/backend_helpers/lockrange.h b/src/snmalloc/backend_helpers/lockrange.h
index ce91711cc..b5d000742 100644
--- a/src/snmalloc/backend_helpers/lockrange.h
+++ b/src/snmalloc/backend_helpers/lockrange.h
@@ -35,14 +35,18 @@ namespace snmalloc
     CapPtr<void, ChunkBounds> alloc_range(size_t size)
     {
-      FlagLock lock(spin_lock);
-      return parent.alloc_range(size);
+      CapPtr<void, ChunkBounds> result;
+      with(spin_lock, [&]() {
+        {
+          result = parent.alloc_range(size);
+        }
+      });
+      return result;
     }

     void dealloc_range(CapPtr<void, ChunkBounds> base, size_t size)
     {
-      FlagLock lock(spin_lock);
-      parent.dealloc_range(base, size);
+      with(spin_lock, [&]() { parent.dealloc_range(base, size); });
     }
   };
 };
diff --git a/src/snmalloc/ds/flaglock.h b/src/snmalloc/ds/flaglock.h
index 4a539e636..c3c0930ae 100644
--- a/src/snmalloc/ds/flaglock.h
+++ b/src/snmalloc/ds/flaglock.h
@@ -133,4 +133,11 @@ namespace snmalloc
       lock.flag.store(false, std::memory_order_release);
     }
   };
+
+  template<typename F>
+  inline void with(FlagWord& lock, F&& f)
+  {
+    FlagLock l(lock);
+    f();
+  }
 } // namespace snmalloc
diff --git a/src/snmalloc/ds/singleton.h b/src/snmalloc/ds/singleton.h
index c85635d39..375a82f87 100644
--- a/src/snmalloc/ds/singleton.h
+++ b/src/snmalloc/ds/singleton.h
@@ -35,14 +35,15 @@ namespace snmalloc
       if (SNMALLOC_UNLIKELY(!initialised.load(std::memory_order_acquire)))
       {
-        FlagLock lock(flag);
-        if (!initialised)
-        {
-          init(&obj);
-          initialised.store(true, std::memory_order_release);
-          if (first != nullptr)
-            *first = true;
-        }
+        with(flag, [&]() {
+          if (!initialised)
+          {
+            init(&obj);
+            initialised.store(true, std::memory_order_release);
+            if (first != nullptr)
+              *first = true;
+          }
+        });
       }
       return obj;
     }
diff --git a/src/snmalloc/mem/pool.h b/src/snmalloc/mem/pool.h
index 8db58eccc..0497d1ad9 100644
--- a/src/snmalloc/mem/pool.h
+++ b/src/snmalloc/mem/pool.h
@@ -100,8 +100,9 @@ namespace snmalloc
     static T* acquire()
     {
       PoolState<T>& pool = get_state();
-      {
-        FlagLock f(pool.lock);
+
+      T* result{nullptr};
+      with(pool.lock, [&]() {
         if (pool.front != nullptr)
         {
           auto p = pool.front;
@@ -112,17 +113,21 @@ namespace snmalloc
           }
           pool.front = next;
           p->set_in_use();
-          return p.unsafe_ptr();
+          result = p.unsafe_ptr();
         }
-      }
+      });
+
+      if (result != nullptr)
+        return result;

       auto p = ConstructT::make();
-      FlagLock f(pool.lock);
-      p->list_next = pool.list;
-      pool.list = p;
+      with(pool.lock, [&]() {
+        p->list_next = pool.list;
+        pool.list = p;

-      p->set_in_use();
+        p->set_in_use();
+      });

       return p.unsafe_ptr();
     }
@@ -146,11 +151,13 @@ namespace snmalloc
     // Returns a linked list of all objects in the stack, emptying the stack.
     if (p == nullptr)
     {
-      FlagLock f(pool.lock);
-      auto result = pool.front;
-      pool.front = nullptr;
-      pool.back = nullptr;
-      return result.unsafe_ptr();
+      T* result;
+      with(pool.lock, [&]() {
+        result = pool.front.unsafe_ptr();
+        pool.front = nullptr;
+        pool.back = nullptr;
+      });
+      return result;
     }

     return p->next.unsafe_ptr();
@@ -165,18 +172,18 @@ namespace snmalloc
     {
       PoolState<T>& pool = get_state();
       last->next = nullptr;
-      FlagLock f(pool.lock);
-
-      if (pool.front == nullptr)
-      {
-        pool.front = capptr::Alloc<T>::unsafe_from(first);
-      }
-      else
-      {
-        pool.back->next = capptr::Alloc<T>::unsafe_from(first);
-      }
+      with(pool.lock, [&]() {
+        if (pool.front == nullptr)
+        {
+          pool.front = capptr::Alloc<T>::unsafe_from(first);
+        }
+        else
+        {
+          pool.back->next = capptr::Alloc<T>::unsafe_from(first);
+        }

-      pool.back = capptr::Alloc<T>::unsafe_from(last);
+        pool.back = capptr::Alloc<T>::unsafe_from(last);
+      });
     }

     /**
@@ -188,18 +195,19 @@ namespace snmalloc
     {
       PoolState<T>& pool = get_state();
       last->next = nullptr;
-      FlagLock f(pool.lock);
-      if (pool.front == nullptr)
-      {
-        pool.back = capptr::Alloc<T>::unsafe_from(last);
-      }
-      else
-      {
-        last->next = pool.front;
-        pool.back->next = capptr::Alloc<T>::unsafe_from(first);
-      }
-      pool.front = capptr::Alloc<T>::unsafe_from(first);
+      with(pool.lock, [&]() {
+        if (pool.front == nullptr)
+        {
+          pool.back = capptr::Alloc<T>::unsafe_from(last);
+        }
+        else
+        {
+          last->next = pool.front;
+          pool.back->next = capptr::Alloc<T>::unsafe_from(first);
+        }
+        pool.front = capptr::Alloc<T>::unsafe_from(first);
+      });
     }

     static T* iterate(T* p = nullptr)

From e54fcdab7875c5c5ba68352f3caac05a08f4e384 Mon Sep 17 00:00:00 2001
From: Matthew Parkinson
Date: Sat, 22 Jun 2024 09:25:45 +0100
Subject: [PATCH 2/4] Implement MCS Combining lock

This is a hybrid of Flat Combining and the MCS queue lock. It uses the
queue like the MCS queue lock, but each item additionally contains a
thunk to perform the body of the lock. This enables threads other than
the one that initially issued the request to perform the work.
---
 src/snmalloc/backend_helpers/lockrange.h |   2 +-
 src/snmalloc/ds/combininglock.h          | 168 +++++++++++++++++++++++
 src/snmalloc/ds/ds.h                     |   1 +
 src/snmalloc/ds/flaglock.h               |   2 +-
 4 files changed, 171 insertions(+), 2 deletions(-)
 create mode 100644 src/snmalloc/ds/combininglock.h

diff --git a/src/snmalloc/backend_helpers/lockrange.h b/src/snmalloc/backend_helpers/lockrange.h
index b5d000742..2dc796ac6 100644
--- a/src/snmalloc/backend_helpers/lockrange.h
+++ b/src/snmalloc/backend_helpers/lockrange.h
@@ -22,7 +22,7 @@ namespace snmalloc
      * This is infrequently used code, a spin lock simplifies the code
      * considerably, and should never be on the fast path.
      */
-    FlagWord spin_lock{};
+    CombiningLock spin_lock{};

   public:
     static constexpr bool Aligned = ParentRange::Aligned;
diff --git a/src/snmalloc/ds/combininglock.h b/src/snmalloc/ds/combininglock.h
new file mode 100644
index 000000000..9d92e3b72
--- /dev/null
+++ b/src/snmalloc/ds/combininglock.h
@@ -0,0 +1,168 @@
+#pragma once
+
+#include "../aal/aal.h"
+#include "../pal/pal.h"
+
+#include <atomic>
+#include <utility>
+
+namespace snmalloc
+{
+  class CombineLockNode;
+  using CombiningLock = std::atomic<CombineLockNode*>;
+
+  /**
+   * @brief Combinations of MCS queue lock with Flat Combining
+   *
+   * Each element in the queue has a pointer to a work item.
+   * This means when under contention the thread holding the lock
+   * can perform the work.
+   *
+   * As the work items are arbitrary lambdas, there are no simplifications
+   * for combining related work items. E.g. the original Flat Combining
+   * paper might sort a collection of inserts, and perform them in a single
+   * traversal.
+   *
+   * Note that we should perhaps add a Futex/WakeOnAddress mode to improve
+   * performance in the contended case, rather than spinning.
+   */
+  class CombineLockNode
+  {
+    enum class LockStatus
+    {
+      // The work for this node has not been completed.
+      WAITING,
+
+      // The work for this thread has been completed, and it is not the
+      // last element in the queue.
+      DONE,
+
+      // The work for this thread has not been completed, and it is the
+      // head of the queue.
+      READY
+    };
+
+    // Status of this node; set by the thread at the head of the queue
+    // when it either makes the thread for this node the head of the queue
+    // or completes its work.
+    std::atomic<LockStatus> status{LockStatus::WAITING};
+
+    // Used to store the queue
+    std::atomic<CombineLockNode*> next{nullptr};
+
+    // Stores the C++ lambda associated with this node in the queue.
+    void (*f_raw)(CombineLockNode*);
+
+    void set_status(LockStatus s)
+    {
+      status.store(s, std::memory_order_release);
+    }
+
+  public:
+    constexpr CombineLockNode(void (*f)(CombineLockNode*)) : f_raw(f) {}
+
+    void attach(CombiningLock& lock)
+    {
+      // Add to the queue of pending work
+      auto prev = lock.exchange(this, std::memory_order_acq_rel);
+
+      if (prev != nullptr)
+      {
+        // If we aren't the head, link into predecessor
+        prev->next.store(this, std::memory_order_release);
+
+        // Wait for predecessor to complete
+        while (status.load(std::memory_order_relaxed) == LockStatus::WAITING)
+          Aal::pause();
+
+        // Determine if another thread completed our work.
+        if (status.load(std::memory_order_acquire) == LockStatus::DONE)
+          return;
+      }
+      // We could add an else branch here that could set
+      // status = LockStatus::Ready
+      // However, the subsequent state assumes it is Ready, and
+      // nothing would read it.
+
+      // We are the head of the queue, and responsible for
+      // waking/performing our and subsequent work.
+      auto curr = this;
+      while (true)
+      {
+        // Perform work for head of the queue
+        curr->f_raw(curr);
+
+        // Determine if there are more elements.
+        auto n = curr->next.load(std::memory_order_acquire);
+        if (n != nullptr)
+        {
+          // Signal this work was completed and move on to
+          // next item.
+          curr->set_status(LockStatus::DONE);
+          curr = n;
+          continue;
+        }
+
+        // This could be the end of the queue, attempt to close the
+        // queue.
+        auto curr_c = curr;
+        if (lock.compare_exchange_strong(curr_c, nullptr, std::memory_order_acq_rel))
+        {
+          // Queue was successfully closed.
+          // Notify last element the work was completed.
+          curr->set_status(LockStatus::DONE);
+          return;
+        }
+
+        // Failed to close the queue; wait for the next thread to be
+        // added.
+        while (curr->next.load(std::memory_order_relaxed) == nullptr)
+          Aal::pause();
+
+        // As we had to wait, give the job to the next thread
+        // to carry on performing the work.
+        n = curr->next.load(std::memory_order_acquire);
+        n->set_status(LockStatus::READY);
+
+        // Notify the thread that we completed its work.
+        // Note that this needs to be done last, as we can't read
+        // curr->next after setting curr->status
+        curr->set_status(LockStatus::DONE);
+        return;
+      }
+    }
+  };
+
+  template<typename F>
+  class CombineLockNodeTempl : CombineLockNode
+  {
+    template<typename FF>
+    friend void with(CombiningLock&, FF&&);
+
+    // This holds the closure for the lambda
+    F f;
+
+    // Untyped version of calling f to store in the node.
+    static void invoke(CombineLockNode* self)
+    {
+      auto self_templ = reinterpret_cast<CombineLockNodeTempl*>(self);
+      self_templ->f();
+    }
+
+    CombineLockNodeTempl(CombiningLock& lock, F&& f_)
+    : CombineLockNode(invoke), f(f_)
+    {
+      attach(lock);
+    }
+  };
+
+  /**
+   * Lock primitive. This takes a reference to a Lock, and a thunk to
+   * call when the lock is available. The thunk should be independent of
+   * the current thread as the thunk may be executed by a different thread.
+   */
+  template<typename F>
+  inline void with(CombiningLock& lock, F&& f)
+  {
+    CombineLockNodeTempl<F> node{lock, std::forward<F>(f)};
+  }
+} // namespace snmalloc
diff --git a/src/snmalloc/ds/ds.h b/src/snmalloc/ds/ds.h
index 4cfa22b9b..a26eb20de 100644
--- a/src/snmalloc/ds/ds.h
+++ b/src/snmalloc/ds/ds.h
@@ -6,6 +6,7 @@
 #include "../pal/pal.h"
 #include "aba.h"
 #include "allocconfig.h"
+#include "combininglock.h"
 #include "entropy.h"
 #include "flaglock.h"
 #include "mpmcstack.h"
diff --git a/src/snmalloc/ds/flaglock.h b/src/snmalloc/ds/flaglock.h
index c3c0930ae..cd314f90d 100644
--- a/src/snmalloc/ds/flaglock.h
+++ b/src/snmalloc/ds/flaglock.h
@@ -134,7 +134,7 @@ namespace snmalloc
     }
   };

-  template
+  template
   inline void with(FlagWord& lock, F&& f)
   {
     FlagLock l(lock);

From 2f9137907e1e4d955bc97b8ac685e327043d8000 Mon Sep 17 00:00:00 2001
From: Matthew Parkinson
Date: Wed, 19 Jun 2024 19:41:07 +0100
Subject: [PATCH 3/4] Add a fast path flag

This update adds a fast path flag for the uncontended case. This
reduces the number of atomic operations when there is no contention.
---
 src/snmalloc/ds/combininglock.h | 63 ++++++++++++++++++++++++++++-----
 1 file changed, 55 insertions(+), 8 deletions(-)

diff --git a/src/snmalloc/ds/combininglock.h b/src/snmalloc/ds/combininglock.h
index 9d92e3b72..45c40315b 100644
--- a/src/snmalloc/ds/combininglock.h
+++ b/src/snmalloc/ds/combininglock.h
@@ -9,7 +9,15 @@
 namespace snmalloc
 {
   class CombineLockNode;
-  using CombiningLock = std::atomic<CombineLockNode*>;
+
+  struct CombiningLock
+  {
+    // Fast path lock in case there is no contention.
+    std::atomic<bool> flag{false};
+
+    // MCS queue of work items
+    std::atomic<CombineLockNode*> head{nullptr};
+  };

   /**
    * @brief Combinations of MCS queue lock with Flat Combining
@@ -52,6 +60,11 @@ namespace snmalloc
     // Stores the C++ lambda associated with this node in the queue.
     void (*f_raw)(CombineLockNode*);

+    void release(CombiningLock& lock)
+    {
+      lock.flag.store(false, std::memory_order_release);
+    }
+
     void set_status(LockStatus s)
     {
       status.store(s, std::memory_order_release);
@@ -62,8 +75,25 @@ namespace snmalloc

     void attach(CombiningLock& lock)
     {
-      // Add to the queue of pending work
-      auto prev = lock.exchange(this, std::memory_order_acq_rel);
+      // Test if no one is waiting
+      if (lock.head.load(std::memory_order_relaxed) == nullptr)
+      {
+        // No one was waiting, so contention is low. Attempt to acquire the
+        // flag lock.
+        if (lock.flag.exchange(true, std::memory_order_acquire) == false)
+        {
+          // We grabbed the lock.
+          f_raw(this);
+
+          // Release the lock
+          release(lock);
+          return;
+        }
+      }
+
+      // There is contention for the lock, we need to add our work to the
+      // queue of pending work
+      auto prev = lock.head.exchange(this, std::memory_order_acq_rel);

       if (prev != nullptr)
       {
@@ -78,10 +108,25 @@ namespace snmalloc
         if (status.load(std::memory_order_acquire) == LockStatus::DONE)
           return;
       }
-      // We could add an else branch here that could set
-      // status = LockStatus::Ready
-      // However, the subsequent state assumes it is Ready, and
-      // nothing would read it.
+      else
+      {
+        // We are the head of the queue. Spin until we acquire the fast path
+        // lock. As we are in the queue, future requests shouldn't try to
+        // acquire the fast path lock, but stale views of the queue being empty
+        // could still be concurrent with this thread.
+        while (lock.flag.exchange(true, std::memory_order_acquire))
+        {
+          while (lock.flag.load(std::memory_order_relaxed))
+          {
+            Aal::pause();
+          }
+        }
+
+        // We could set
+        // status = LockStatus::Ready
+        // However, the subsequent state assumes it is Ready, and
+        // nothing would read it.
+      }

       // We are the head of the queue, and responsible for
       // waking/performing our and subsequent work.
       auto curr = this;
       while (true)
       {
@@ -105,11 +150,13 @@ namespace snmalloc
         // This could be the end of the queue, attempt to close the
         // queue.
         auto curr_c = curr;
-        if (lock.compare_exchange_strong(curr_c, nullptr, std::memory_order_acq_rel))
+        if (lock.head.compare_exchange_strong(
+          curr_c, nullptr, std::memory_order_acq_rel))
         {
           // Queue was successfully closed.
           // Notify last element the work was completed.
           curr->set_status(LockStatus::DONE);
+          release(lock);
           return;
         }

From 1fdf924b30db363bef88a0d59f981a5c07a464e9 Mon Sep 17 00:00:00 2001
From: Matthew Parkinson
Date: Mon, 24 Jun 2024 11:07:09 +0100
Subject: [PATCH 4/4] CR feedback

---
 src/snmalloc/ds/combininglock.h | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/src/snmalloc/ds/combininglock.h b/src/snmalloc/ds/combininglock.h
index 45c40315b..e5b79eaad 100644
--- a/src/snmalloc/ds/combininglock.h
+++ b/src/snmalloc/ds/combininglock.h
@@ -35,6 +35,9 @@ namespace snmalloc
    */
   class CombineLockNode
   {
+    template<typename F>
+    friend class CombineLockNodeTempl;
+
     enum class LockStatus
     {
       // The work for this node has not been completed.
       WAITING,
@@ -70,10 +73,9 @@ namespace snmalloc
       status.store(s, std::memory_order_release);
     }

-  public:
     constexpr CombineLockNode(void (*f)(CombineLockNode*)) : f_raw(f) {}

-    void attach(CombiningLock& lock)
+    SNMALLOC_FAST_PATH void attach(CombiningLock& lock)
     {
       // Test if no one is waiting
       if (lock.head.load(std::memory_order_relaxed) == nullptr)
@@ -90,7 +92,11 @@ namespace snmalloc
           return;
         }
       }
+      attach_slow(lock);
+    }

+    SNMALLOC_SLOW_PATH void attach_slow(CombiningLock& lock)
+    {
       // There is contention for the lock, we need to add our work to the
       // queue of pending work
       auto prev = lock.head.exchange(this, std::memory_order_acq_rel);
@@ -151,7 +157,10 @@ namespace snmalloc
         // queue.
         auto curr_c = curr;
         if (lock.head.compare_exchange_strong(
-          curr_c, nullptr, std::memory_order_acq_rel))
+          curr_c,
+          nullptr,
+          std::memory_order_release,
+          std::memory_order_relaxed))
         {
           // Queue was successfully closed.
           // Notify last element the work was completed.
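
Usage sketch (not part of the patches above): the `with` primitive introduced
by this series takes a lock and a thunk, and the thunk may be executed by a
different thread than the caller (the thread currently draining the MCS
queue), so it should only touch shared state rather than rely on the calling
thread's identity or thread-local storage. A minimal sketch, assuming the
header is reachable via "snmalloc/ds/ds.h" and using a made-up shared counter:

  #include "snmalloc/ds/ds.h"

  #include <cstddef>

  snmalloc::CombiningLock counter_lock;
  size_t counter = 0;

  void add(size_t x)
  {
    // The lambda runs while the lock is held, possibly on another thread
    // that is currently the head of the combining queue; it captures only
    // what it needs and does not depend on thread-local state.
    snmalloc::with(counter_lock, [&]() { counter += x; });
  }

In the uncontended case the call only takes the fast path flag (a single
atomic exchange); under contention it enqueues a CombineLockNode and either
waits for its work to be marked DONE by another thread or becomes the head of
the queue and performs the pending thunks itself.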