Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Paged attention #1460

Closed
wants to merge 42 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
42 commits
Select commit Hold shift + click to select a range
1b9c1f4
save current work
Bob-Chen222 Jul 25, 2024
62f1d45
WIP pagemanager
Bob-Chen222 Jul 27, 2024
a28d343
new ckpt
Bob-Chen222 Aug 3, 2024
05e6f73
add workable solution
Bob-Chen222 Aug 3, 2024
eb31ddb
style
Bob-Chen222 Aug 3, 2024
d452929
ckpt
Bob-Chen222 Aug 4, 2024
cf294a2
before debugging
Bob-Chen222 Aug 5, 2024
b599cad
ckpt: done cpu side implementation and seems workable solution //// n…
Bob-Chen222 Aug 5, 2024
bcebb43
ckpt
Bob-Chen222 Aug 7, 2024
76502af
ckpt before adding tmp block to batchconfig
Bob-Chen222 Aug 10, 2024
7424020
ckpt
Bob-Chen222 Aug 15, 2024
6baa2cc
fix compile error
Bob-Chen222 Aug 15, 2024
a3fc691
small modification
Bob-Chen222 Aug 16, 2024
cecb3d4
fix
Bob-Chen222 Aug 16, 2024
040da43
memory access error
Bob-Chen222 Aug 21, 2024
60eb6ff
fix index error on batchfuture, continue debugging
Bob-Chen222 Aug 22, 2024
0b7550d
init spec_num
Bob-Chen222 Aug 23, 2024
8e2b08c
workable solution!
Bob-Chen222 Aug 23, 2024
01b9534
commented printf version
Bob-Chen222 Aug 23, 2024
eaef494
commented out print
Bob-Chen222 Aug 23, 2024
d383de2
ckpt
Bob-Chen222 Aug 23, 2024
869f99a
ckpt
Bob-Chen222 Aug 24, 2024
37c0024
Revert "commented out print"
Bob-Chen222 Aug 26, 2024
bd14e8c
Revert "ckpt"
Bob-Chen222 Aug 26, 2024
222b2b3
Revert "ckpt"
Bob-Chen222 Aug 26, 2024
69f8ec4
fix error in index calculation
Bob-Chen222 Aug 26, 2024
6c61838
fix request manager erase page
Bob-Chen222 Aug 26, 2024
06e8469
erased page will get back
Bob-Chen222 Aug 27, 2024
5f68dbd
ckpt
Bob-Chen222 Aug 28, 2024
0f9a86f
script
Bob-Chen222 Aug 28, 2024
f2cdfcc
fix launch
Bob-Chen222 Aug 29, 2024
bd4f2c9
ckpt
Bob-Chen222 Aug 29, 2024
3e8a355
fix small error
Bob-Chen222 Sep 2, 2024
32a33f4
ckpt for being correct on single prompt
Bob-Chen222 Sep 3, 2024
59951a1
miner cleaning
Bob-Chen222 Sep 5, 2024
e712e6a
ckpt for getting correct code but still have weird error
Bob-Chen222 Sep 6, 2024
4da0ea2
rm some package
Bob-Chen222 Sep 23, 2024
311ed13
documentation attempt
Bob-Chen222 Sep 25, 2024
1297687
some further cleaning
Bob-Chen222 Sep 25, 2024
eb19d8e
cleaning
Bob-Chen222 Sep 25, 2024
3f703ac
cleaning
Bob-Chen222 Sep 25, 2024
17d54ae
cleaning
Bob-Chen222 Sep 25, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions include/flexflow/batch_config.h
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,11 @@ class BatchConfig {
int first_token_index_in_request = -1;
int first_token_offset_in_batch = -1;
int num_tokens_in_batch = 0;

// page attention: we need some additional attention information here to allocate physical blocks in load_batch_config
int32_t num_kv_pages; //number of kv pages used
int32_t kv_last_page_len; //last page length of kv
RequestGuid request_guid;
};

struct PerTokenInfo {
Expand All @@ -82,6 +87,8 @@ class BatchConfig {
int request_index = -1;
};

std::vector<int32_t> page_indices; //the physical block indices for each page

struct CommittedTokensInfo {
int index_in_kv_cache = -1; // the index in the temporary key-value cache
int request_index = -1; // request index in the batch
Expand Down Expand Up @@ -150,6 +157,7 @@ class BatchConfig {

BitMask causalMask[MAX_NUM_REQUESTS];
PerRequestInfo requestsInfo[MAX_NUM_REQUESTS];
std::vector<int32_t> requestsIndices[MAX_NUM_REQUESTS]; //for kv cache
PerTokenInfo tokensInfo[MAX_NUM_TOKENS];
CommittedTokensInfo committed_tokens[MAX_NUM_TOKENS];
bool request_available[MAX_NUM_REQUESTS];
Expand Down
3 changes: 2 additions & 1 deletion include/flexflow/config.h
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,8 @@ struct FFHandler {
size_t batch_config_metadata_size =
sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo) +
sizeof(BatchConfig::request_available) + sizeof(BatchConfig::causalMask) +
sizeof(BatchConfig::committed_tokens);
sizeof(BatchConfig::committed_tokens) + sizeof(int);

void *offload_reserve_space;
size_t offload_reserve_space_size;
DataType quantization_type;
Expand Down
141 changes: 141 additions & 0 deletions include/flexflow/page_manager.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
/* Copyright 2023 CMU, Stanford, Facebook, LANL
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include "flexflow/batch_config.h"
#include "flexflow/inference.h"
#include "flexflow/model.h"
#include "flexflow/config.h"
#include "flexflow/utils/file_loader.h"
#include <future>
#include <mutex>
#include <tokenizers_cpp.h>
#include <deque>

namespace FlexFlow {

using TokenId = BatchConfig::TokenId;

/**
 * @class LogicalTokenBlock
 * @brief A fixed-capacity logical block of tokens, analogous to a virtual
 * page in a virtual-memory system: it records which token slots of a
 * request are in use, without knowing where those tokens physically live
 * in the GPU KV cache (that mapping is kept by the PageManager).
 *
 * Tokens in a block are either "committed" (accepted into the sequence) or
 * "speculative" (tentatively appended during speculation and possibly rolled
 * back later). num_tokens == num_commit_tokens + num_spec_tokens is the
 * intended invariant — TODO confirm against the implementation.
 */
class LogicalTokenBlock {
public:
using TokenId = BatchConfig::TokenId;
// Construct an empty block with the given logical index and slot capacity.
LogicalTokenBlock(int block_number, uint32_t block_size);

// True if no tokens are stored in the block.
bool is_empty() const;

// Number of unused slots remaining (block_size - num_tokens).
int get_num_empty_slots() const;

// Number of occupied slots.
// NOTE(review): not marked const, unlike its sibling accessors — presumably
// an oversight; making it const would also require updating the definition.
int get_num_alloc_slots();

// True if every slot in the block is occupied.
bool is_full() const;

// Append tokens to the block; `committed` selects whether they are counted
// as committed or as speculative. Callers are expected to ensure the tokens
// fit (see get_num_empty_slots) — TODO confirm overflow handling in the .cc.
void append_tokens(const std::vector<TokenId>& token_ids_to_append, bool committed);

// Used to clean up the spec tokens in a block since these spec tokens may
// not be committed after use (i.e. roll back speculation after verification).
void reset_num_spec_tokens();

// Returns the token ids currently stored in the block.
std::vector<TokenId> get_token_ids() const;

int block_number; // the index of the logical token block
uint32_t block_size; // the capacity (in tokens) of the block
int num_tokens; // the number of tokens currently stored in the block
int num_commit_tokens; // the number of tokens inside this block that are already committed
int num_spec_tokens; // the number of tokens inside this block that are speculative tokens, which is stored temporarily

std::vector<TokenId> token_ids; // token ids stored in the order they appear in the inference sequence
};

/**
 * @class PhysicalTokenBlock
 * @brief A physical block of tokens, analogous to a physical page frame:
 * block_number identifies the slot in GPU memory where the tokens of one
 * logical block are stored.
 */
class PhysicalTokenBlock {
public:
// Construct a block with the given physical index and slot capacity.
PhysicalTokenBlock(int block_number, uint32_t block_size);

int ref_count; // reference count — presumably the number of logical owners; verify how BlockAllocator::free uses it
int block_number; // the index of the physical token block in GPU memory
uint32_t block_size; // the capacity (in tokens) of the block
};

/**
 * @class BlockAllocator
 * @brief A Block Manager that is responsible for maintaining a pool of free
 * physical blocks: hands one out on allocate() and returns it on free().
 */
class BlockAllocator {
public:
// Create a pool of num_blocks physical blocks, each holding block_size tokens.
BlockAllocator(uint32_t block_size, int num_blocks);

// Take a free block from the pool. Behavior when the pool is exhausted is
// defined by the implementation — TODO confirm (assert vs. error return).
PhysicalTokenBlock allocate();

// Return a block to the free pool.
void free(PhysicalTokenBlock& block);

// Number of blocks currently available for allocation.
size_t get_num_free_blocks() const;

private:
uint32_t block_size; // capacity (in tokens) of every block in the pool
int num_blocks; // total number of blocks managed by this allocator
std::deque<PhysicalTokenBlock> free_blocks; // pool of currently free blocks
};

/*
 * @class PageManager
 * @brief Singleton that tracks the KV-cache page allocation status for every
 * active request, mapping each request's logical blocks to physical blocks.
 * Notice that all the layers of the model share the same page manager,
 * because a token occupies the same KV-cache position in every layer.
 */
class PageManager {
public:
// Get the singleton instance of the PageManager as it will be shared in multiple places.
// NOTE(review): copy constructor/assignment are not deleted, so the
// singleton type can still be copied accidentally — consider deleting them.
static PageManager *get_page_manager();
using BlockTable = std::vector<PhysicalTokenBlock>;
using RequestGuid = BatchConfig::RequestGuid;
PageManager(uint32_t block_size, int num_total_blocks);

// Allocate pages for the request's prompt token ids during the LLM prefill
// stage. Returns whether allocation succeeded — TODO confirm failure semantics.
bool prefill(const RequestGuid& request_guid, const std::vector<int>& token_ids);
// Allocate one additional page for the request — presumably called during
// decode when the last page fills up; verify against the implementation.
bool allocate(const RequestGuid& request_guid);
// Release all pages owned by the request back to the allocator.
void free(const RequestGuid& request_guid);

// Number of physical blocks still available in the underlying allocator.
size_t get_num_free_blocks() const;
// Physical block indices of the request's block table, in logical order.
std::vector<int32_t> get_block_table_indices(const RequestGuid& request_guid) const;
// Number of physical blocks currently assigned to the request.
int get_num_allocated_blocks(const RequestGuid& request_guid) const;

// Release the last num_pages pages of the request (e.g. to roll back
// speculative tokens that were not committed).
void erase_last_pages(const RequestGuid& request_guid, int num_pages);

private:
uint32_t block_size; // the capacity (in tokens) of each block
int num_total_blocks; // the total number of blocks managed
BlockAllocator block_allocator; // pool of free physical blocks
// Per-request block tables.
// NOTE(review): keyed by int while the public API uses RequestGuid — this
// compiles only if RequestGuid converts to int, and risks guid truncation;
// consider keying by RequestGuid instead.
std::unordered_map<int, BlockTable> block_tables;
};

}; // namespace FlexFlow
11 changes: 11 additions & 0 deletions include/flexflow/request_manager.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#include "flexflow/inference.h"
#include "flexflow/model.h"
#include "flexflow/utils/file_loader.h"
#include "flexflow/page_manager.h"
#include <future>
#include <mutex>
#include <tokenizers_cpp.h>
Expand Down Expand Up @@ -75,6 +76,10 @@ struct Request {
Status status = PENDING;
std::vector<BatchConfig::TokenId> tokens;

// Used for keeping track of the block information
std::vector<LogicalTokenBlock> blocks;
int32_t page_id_commit;

// TokenTree speculative_token_tree;
std::vector<TokenTree> speculative_token_trees;
// To make request manager stateful, we need to store the causal mask here
Expand Down Expand Up @@ -393,6 +398,12 @@ class RequestManager {
double total_request_run_time;
void load_pending_request_to_batch();
void request_complete_clean_up(int batch_index);

/* ---------- Page Attention Helper Functions ---------- */
void _append_logical_block_to_request(Request &request, bool is_commit);
void _append_tokens_to_blocks(Request &request, std::vector<TokenId> const &tokens, bool is_commit, int start = 0, int end = -1);
/* ---------- Page Attention Helper Functions ---------- */

/* ---------- Incremental Decoding Helper Functions ---------- */
bool update_llm_prefill_results(InferenceResult const &result);
bool update_llm_decode_results(InferenceResult const &result);
Expand Down
81 changes: 54 additions & 27 deletions src/ops/tree_inc_multihead_self_attention.cu
Original file line number Diff line number Diff line change
Expand Up @@ -97,26 +97,27 @@ using flashinfer::PageStorage;
using flashinfer::PosEncodingMode;
using flashinfer::QKVLayout;

__device__ __forceinline__ size_t get_k_entry_offset(int const req_idx,
int const token_idx,
int const max_num_pages,
// page_idx: the assigned physical block index for this token
// token_idx: can be absolute index in the sequence but in there we just use it as an offset
// hidden_size: the size of the hidden dimension
__device__ __forceinline__ size_t get_k_entry_offset(int const token_idx,
int const page_idx,
int const hidden_size) {
return ((req_idx * max_num_pages + token_idx / kPagesize) * kPagesize * 2 +
token_idx % kPagesize) *
hidden_size;
size_t index = ((page_idx) * kPagesize * 2 + (token_idx % kPagesize)) * hidden_size;
return index;
}

__device__ __forceinline__ size_t get_v_entry_offset(int const req_idx,
int const token_idx,
int const max_num_pages,
__device__ __forceinline__ size_t get_v_entry_offset(int const token_idx,
int const page_idx,
int const hidden_size) {
return ((req_idx * max_num_pages + token_idx / kPagesize) * kPagesize * 2 +
kPagesize + token_idx % kPagesize) *
hidden_size;
size_t index = ((page_idx) * kPagesize * 2 + kPagesize + (token_idx % kPagesize)) * hidden_size;
return index;
}

__global__ void commit_tokens_kernel(
half *kCache_ptr,
int32_t *kv_indptr,
int32_t *kv_page_indices,
BatchConfig::CommittedTokensInfo const *committedTokenInfos,
bool const *request_available,
int num_requests,
Expand All @@ -135,26 +136,28 @@ __global__ void commit_tokens_kernel(
cnt_1++;
}
}

// get the starting index of kv page
int start = kv_indptr[requext_idx_in_batch];
int end = kv_indptr[requext_idx_in_batch + 1] - 1;
for (int i = 0; i < num_committed_tokens; i++) {
if (committedTokenInfos[i].request_index == requext_idx_in_batch) {
int const index_in_kv_cache = committedTokenInfos[i].index_in_kv_cache;
if (index_in_kv_cache == -1) {
continue;
}

int const req_id = committedTokenInfos[i].request_index;
// int const req_id = committedTokenInfos[i].request_index;
int const tok_id = committedTokenInfos[i].token_depth;
int const page_to_idx = committedTokenInfos[i].token_depth / kPagesize;
int const page_from_idx = kv_page_indices[start + (tok_id / kPagesize)];

// page attention: since we cannot store temporary tokens in the cache, we need to figure out another way
size_t from_k_idx = get_k_entry_offset(index_in_kv_cache, page_from_idx, hidden_size),
from_v_idx = get_v_entry_offset(index_in_kv_cache, page_from_idx, hidden_size);

size_t from_k_idx = get_k_entry_offset(
req_id, index_in_kv_cache, max_num_pages, hidden_size),
from_v_idx = get_v_entry_offset(
req_id, index_in_kv_cache, max_num_pages, hidden_size);
size_t to_k_idx =
get_k_entry_offset(req_id, tok_id, max_num_pages, hidden_size),
to_v_idx =
get_v_entry_offset(req_id, tok_id, max_num_pages, hidden_size);
assert(to_k_idx <= from_k_idx);
// page attention: copy the token to the new position
size_t to_k_idx =get_k_entry_offset(tok_id, page_to_idx, hidden_size),
to_v_idx =get_v_entry_offset(tok_id, page_to_idx, hidden_size);

kCache_ptr[to_k_idx + offset] = kCache_ptr[from_k_idx + offset];
kCache_ptr[to_v_idx + offset] = kCache_ptr[from_v_idx + offset];
Expand All @@ -181,6 +184,8 @@ void commit_tokens(TreeIncMultiHeadSelfAttentionMeta const *m,
min(CUDA_NUM_THREADS, parallelism),
0,
stream>>>(static_cast<half *>(m->keyCache),
m->handle.tree_verify_attention_metadata->kv_indptr,
m->handle.tree_verify_attention_metadata->kv_indices,
m->committed_token_infos,
m->request_available,
num_requests,
Expand Down Expand Up @@ -277,6 +282,8 @@ __global__ void
update_qkv_cache_kernel(DT *devQKVProjArray,
half *qTmp_ptr,
half *kCache_ptr,
int32_t *kv_indptr,
int32_t *kv_page_indices,
BatchConfig::PerTokenInfo const *tokenInfos,
BatchConfig::PerRequestInfo *request_infos,
int const max_num_pages,
Expand All @@ -292,11 +299,12 @@ __global__ void
int const req_idx = tokenInfos[token_idx].request_index;
int const token_abs_idx = tokenInfos[token_idx].abs_index_in_request;

// compute the starting index of kv page
int start = kv_indptr[req_idx];
int page_idx = kv_page_indices[start + (token_abs_idx / kPagesize)];
size_t from_idx = token_idx * QKV_WEIGHT_NUM * hidden_size;
size_t to_k_idx = get_k_entry_offset(
req_idx, token_abs_idx, max_num_pages, hidden_size),
to_v_idx = get_v_entry_offset(
req_idx, token_abs_idx, max_num_pages, hidden_size);
size_t to_k_idx = get_k_entry_offset(token_abs_idx, page_idx, hidden_size),
to_v_idx = get_v_entry_offset(token_abs_idx, page_idx, hidden_size);

// key and value cache should be stored interleaved
kCache_ptr[to_k_idx + offset] =
Expand Down Expand Up @@ -324,6 +332,8 @@ void update_qkv_cache(TreeIncMultiHeadSelfAttentionMeta const *m,
stream>>>(static_cast<DT *>(m->devQKVProjArray),
static_cast<half *>(m->queryTmp),
static_cast<half *>(m->keyCache),
m->handle.tree_verify_attention_metadata->kv_indptr,
m->handle.tree_verify_attention_metadata->kv_indices,
m->token_infos,
m->request_infos,
max_num_pages,
Expand Down Expand Up @@ -439,6 +449,23 @@ void tree_verify_attention(TreeIncMultiHeadSelfAttentionMeta const *m,
half *q = static_cast<half *>(m->queryTmp),
*kv = static_cast<half *>(m->keyCache),
*o = static_cast<half *>(m->outputTmp);

static int32_t kv_indices_tmp[BatchConfig::MAX_NUM_REQUESTS * BatchConfig::MAX_NUM_TOKENS];
static int32_t kv_indptr_tmp[BatchConfig::MAX_NUM_REQUESTS + 1];
static int32_t kv_last_page_len_tmp[BatchConfig::MAX_NUM_REQUESTS];
// copy data from device to host
cudaMemcpy(kv_indices_tmp,
m->handle.tree_verify_attention_metadata->kv_indices,
sizeof(int32_t) * BatchConfig::MAX_NUM_REQUESTS * BatchConfig::MAX_NUM_TOKENS,
cudaMemcpyDeviceToHost);
cudaMemcpy(kv_indptr_tmp,
m->handle.tree_verify_attention_metadata->kv_indptr,
sizeof(int32_t) * (BatchConfig::MAX_NUM_REQUESTS + 1),
cudaMemcpyDeviceToHost);
cudaMemcpy(kv_last_page_len_tmp,
m->handle.tree_verify_attention_metadata->kv_last_page_len,
sizeof(int32_t) * BatchConfig::MAX_NUM_REQUESTS,
cudaMemcpyDeviceToHost);
paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv(
num_kv_heads,
kPagesize,
Expand Down
Loading
Loading