From 6451a3bc152f6dd784e3f043def0f57900a753d0 Mon Sep 17 00:00:00 2001 From: Zeyu Wang Date: Fri, 18 Aug 2023 00:20:43 -0400 Subject: [PATCH 01/30] Reorder pipeline. --- include/flexflow/model.h | 2 +- src/mapper/mapper.cc | 2 +- src/runtime/request_manager.cc | 289 +++++++++++++++++---------------- 3 files changed, 149 insertions(+), 144 deletions(-) diff --git a/include/flexflow/model.h b/include/flexflow/model.h index bc3c7e6545..60ab164e08 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -229,8 +229,8 @@ enum TaskIDs { RM_LOAD_TOKENS_TASK_ID, RM_LOAD_POSITION_TASK_ID, RM_PREPARE_NEXT_BATCH_TASK_ID, - RM_PREPARE_NEXT_BATCH_BEAM_TASK_ID, RM_PREPARE_NEXT_BATCH_INIT_TASK_ID, + RM_PREPARE_NEXT_BATCH_BEAM_TASK_ID, RM_PREPARE_NEXT_BATCH_VERIFY_TASK_ID, // Custom tasks CUSTOM_GPU_TASK_ID_FIRST, diff --git a/src/mapper/mapper.cc b/src/mapper/mapper.cc index 3d08eb0bcc..a86a6167a6 100644 --- a/src/mapper/mapper.cc +++ b/src/mapper/mapper.cc @@ -284,8 +284,8 @@ void FFMapper::select_task_options(const MapperContext ctx, return; } if ((task.task_id == RM_PREPARE_NEXT_BATCH_TASK_ID) || - (task.task_id == RM_PREPARE_NEXT_BATCH_BEAM_TASK_ID) || (task.task_id == RM_PREPARE_NEXT_BATCH_INIT_TASK_ID) || + (task.task_id == RM_PREPARE_NEXT_BATCH_BEAM_TASK_ID) || (task.task_id == RM_PREPARE_NEXT_BATCH_VERIFY_TASK_ID)) { output.initial_proc = all_cpus[0]; return; diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 348272a69b..7a1bdf8e7e 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -458,149 +458,8 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, } /* ----- Speculative Inference Specific functions ----- */ -BeamSearchBatchConfigFuture RequestManager::prepare_next_batch_beam( - BeamSearchBatchConfigFuture const &old_bc, - BeamInferenceResultFuture const &result) { - Runtime *runtime = Runtime::get_runtime(); - Context ctx = Runtime::get_context(); - - RequestManager *rm = this; - TaskLauncher launcher(RM_PREPARE_NEXT_BATCH_BEAM_TASK_ID, - TaskArgument(&rm, sizeof(RequestManager *))); - launcher.add_future(old_bc); - launcher.add_future(result); - return runtime->execute_task(ctx, launcher); -} - -BeamSearchBatchConfig RequestManager::prepare_next_batch_beam_task( - Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - RequestManager *rm = *((RequestManager **)task->args); - BeamSearchBatchConfig const &bc = - Future(task->futures[0]).get_result(); - BeamInferenceResult const &result = - Future(task->futures[1]).get_result(); - return rm->prepare_next_batch_beam(bc, result); -} - -// update beam search metadata -BeamSearchBatchConfig - RequestManager::prepare_next_batch_beam(BeamSearchBatchConfig const &old_bc, - BeamInferenceResult const &result) { - const std::lock_guard lock(request_queue_mutex); - if (verbose) { - std::cout << "\n############### prepare_next_batch_beam ###############\n"; - } - if (verbose) { - std::cout << "print all results" - << "\n"; - for (int i = 0; i < 40; i++) { - std::cout << result.token_ids[i] << ", "; - } - std::cout << "Current Beam Depth: " - << old_bc.beamRequestsInfo[0].current_depth << "\n"; - } - - // Step 1: Store result to the beam tree struct - store_beam_metadata(old_bc, result); - - // Step 2: preparing the next batch for existing requests - BeamSearchBatchConfig new_bc; - new_bc.max_init_length = 0; - new_bc.model_id = old_bc.model_id; - // std::cout << "old_bc.model_id: " << old_bc.model_id << "\n"; - - for (int i = 0; 
i < BatchConfig::MAX_NUM_REQUESTS; i++) { - if (old_bc.request_completed[i]) { - continue; - } - // Comment out this assertion since num_tokens_in_batch can be - // zero when beam search has reached required sequence length - // assert(old_bc.requestsInfo[i].num_tokens_in_batch > 0); - Request &request = all_requests[old_bc.requestsInfo[i].request_guid]; - int processed_tokens = old_bc.requestsInfo[i].token_start_offset + - old_bc.requestsInfo[i].num_tokens_in_batch; - - // assert(processed_tokens < request.tokens.size()); - log_req_mgr.debug() << "processed_tokens: " << processed_tokens << "\n"; - if (processed_tokens > - old_bc.beamRequestsInfo[i].max_depth + request.tokens.size() - // || ir.results[t] == 0 TODO: replace this with - ) { - log_req_mgr.print("[Done] guid(%zu) with spec_tree_depth(%d)", - old_bc.requestsInfo[i].request_guid, - old_bc.beamRequestsInfo[i].max_depth); - // new_bc.request_completed[i] = true; - new_bc.request_completed[i] = false; - new_bc.requestsInfo[i].token_start_offset = processed_tokens; - new_bc.requestsInfo[i].request_guid = old_bc.requestsInfo[i].request_guid; - new_bc.requestsInfo[i].max_sequence_length = - old_bc.requestsInfo[i].max_sequence_length; - } else { - log_req_mgr.debug() << "num tokens: " << old_bc.num_tokens << ", " - << new_bc.num_tokens; - new_bc.request_completed[i] = false; - new_bc.requestsInfo[i].token_start_offset = processed_tokens; - new_bc.requestsInfo[i].request_guid = old_bc.requestsInfo[i].request_guid; - new_bc.requestsInfo[i].max_sequence_length = - old_bc.requestsInfo[i].max_sequence_length; - - // update the beam search metadata - // how many sub request in current request - // why is sub_requests has MAX_NUM_REQUESTS * MAX_BEAM_WIDTH entries? - new_bc.sub_requests[i] = old_bc.beamRequestsInfo[i].beam_size; - // update the parentid, accumalated_probs, depth, and token_ids - new_bc.beamRequestsInfo[i].current_depth = - old_bc.beamRequestsInfo[i].current_depth + 1; - new_bc.beamRequestsInfo[i].beam_size = - old_bc.beamRequestsInfo[i].beam_size; - new_bc.beamRequestsInfo[i].max_depth = - old_bc.beamRequestsInfo[i].max_depth; - - // do the slot exchange to minimize the cache exchange in kernel. 
- // std::cout << "update metadata" << std::endl; - update_beam_metadata(new_bc, request.beam_trees.at(old_bc.model_id), i); - - if (new_bc.requestsInfo[i].token_start_offset + 1 >= - request.tokens.size()) { - // Incremental phase - new_bc.requestsInfo[i].num_tokens_in_batch = 1; - } else { - // Prompt phase - new_bc.requestsInfo[i].num_tokens_in_batch = - std::min(BatchConfig::MAX_NUM_TOKENS - new_bc.num_tokens, - (int)request.tokens.size() - - new_bc.requestsInfo[i].token_start_offset); - } - - // register more tokens due to the beam width - for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) { - int depth = new_bc.requestsInfo[i].token_start_offset + j; - for (int k = 0; k < new_bc.sub_requests[i]; k++) { - new_bc.tokensInfo[new_bc.num_tokens].request_index = i; - new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = depth; - - // get value from requestinfo - new_bc.tokensInfo[new_bc.num_tokens].token_id = - new_bc.beamRequestsInfo[i].tokens[k]; - // request.tokens[depth]; - new_bc.beamTokenInfo[new_bc.num_tokens].sub_request_index = k; - new_bc.num_tokens++; - } - } - } - } - if (verbose) { - std::cout << "prepare_next_batch_beam OLD vs NEW batchconfigs:" - << std::endl; - old_bc.print(); - new_bc.print(); - } - return new_bc; -} +/***** Request Init Phase *****/ BeamSearchBatchConfigFuture RequestManager::prepare_next_batch_init( TreeVerifyBatchConfigFuture const &old_bc, InferenceResultFuture const &result, @@ -894,6 +753,152 @@ BeamSearchBatchConfig return new_bc; } +/***** Beam Search Phase *****/ +BeamSearchBatchConfigFuture RequestManager::prepare_next_batch_beam( + BeamSearchBatchConfigFuture const &old_bc, + BeamInferenceResultFuture const &result) { + Runtime *runtime = Runtime::get_runtime(); + Context ctx = Runtime::get_context(); + + RequestManager *rm = this; + TaskLauncher launcher(RM_PREPARE_NEXT_BATCH_BEAM_TASK_ID, + TaskArgument(&rm, sizeof(RequestManager *))); + launcher.add_future(old_bc); + launcher.add_future(result); + return runtime->execute_task(ctx, launcher); +} + +BeamSearchBatchConfig RequestManager::prepare_next_batch_beam_task( + Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + RequestManager *rm = *((RequestManager **)task->args); + BeamSearchBatchConfig const &bc = + Future(task->futures[0]).get_result(); + BeamInferenceResult const &result = + Future(task->futures[1]).get_result(); + return rm->prepare_next_batch_beam(bc, result); +} + +// update beam search metadata +BeamSearchBatchConfig + RequestManager::prepare_next_batch_beam(BeamSearchBatchConfig const &old_bc, + BeamInferenceResult const &result) { + const std::lock_guard lock(request_queue_mutex); + if (verbose) { + std::cout << "\n############### prepare_next_batch_beam ###############\n"; + } + if (verbose) { + std::cout << "print all results" + << "\n"; + for (int i = 0; i < 40; i++) { + std::cout << result.token_ids[i] << ", "; + } + std::cout << "Current Beam Depth: " + << old_bc.beamRequestsInfo[0].current_depth << "\n"; + } + + // Step 1: Store result to the beam tree struct + store_beam_metadata(old_bc, result); + + // Step 2: preparing the next batch for existing requests + BeamSearchBatchConfig new_bc; + new_bc.max_init_length = 0; + new_bc.model_id = old_bc.model_id; + // std::cout << "old_bc.model_id: " << old_bc.model_id << "\n"; + + for (int i = 0; i < BatchConfig::MAX_NUM_REQUESTS; i++) { + if (old_bc.request_completed[i]) { + continue; + } + // Comment out this assertion since num_tokens_in_batch can be + // zero when 
beam search has reached required sequence length + // assert(old_bc.requestsInfo[i].num_tokens_in_batch > 0); + Request &request = all_requests[old_bc.requestsInfo[i].request_guid]; + int processed_tokens = old_bc.requestsInfo[i].token_start_offset + + old_bc.requestsInfo[i].num_tokens_in_batch; + + // assert(processed_tokens < request.tokens.size()); + log_req_mgr.debug() << "processed_tokens: " << processed_tokens << "\n"; + if (processed_tokens > + old_bc.beamRequestsInfo[i].max_depth + request.tokens.size() + // || ir.results[t] == 0 TODO: replace this with + ) { + log_req_mgr.print("[Done] guid(%zu) with spec_tree_depth(%d)", + old_bc.requestsInfo[i].request_guid, + old_bc.beamRequestsInfo[i].max_depth); + // new_bc.request_completed[i] = true; + new_bc.request_completed[i] = false; + new_bc.requestsInfo[i].token_start_offset = processed_tokens; + new_bc.requestsInfo[i].request_guid = old_bc.requestsInfo[i].request_guid; + new_bc.requestsInfo[i].max_sequence_length = + old_bc.requestsInfo[i].max_sequence_length; + } else { + log_req_mgr.debug() << "num tokens: " << old_bc.num_tokens << ", " + << new_bc.num_tokens; + new_bc.request_completed[i] = false; + new_bc.requestsInfo[i].token_start_offset = processed_tokens; + new_bc.requestsInfo[i].request_guid = old_bc.requestsInfo[i].request_guid; + new_bc.requestsInfo[i].max_sequence_length = + old_bc.requestsInfo[i].max_sequence_length; + + // update the beam search metadata + // how many sub request in current request + // why is sub_requests has MAX_NUM_REQUESTS * MAX_BEAM_WIDTH entries? + new_bc.sub_requests[i] = old_bc.beamRequestsInfo[i].beam_size; + // update the parentid, accumalated_probs, depth, and token_ids + new_bc.beamRequestsInfo[i].current_depth = + old_bc.beamRequestsInfo[i].current_depth + 1; + new_bc.beamRequestsInfo[i].beam_size = + old_bc.beamRequestsInfo[i].beam_size; + new_bc.beamRequestsInfo[i].max_depth = + old_bc.beamRequestsInfo[i].max_depth; + + // do the slot exchange to minimize the cache exchange in kernel. 
+ // std::cout << "update metadata" << std::endl; + update_beam_metadata(new_bc, request.beam_trees.at(old_bc.model_id), i); + + if (new_bc.requestsInfo[i].token_start_offset + 1 >= + request.tokens.size()) { + // Incremental phase + new_bc.requestsInfo[i].num_tokens_in_batch = 1; + } else { + // Prompt phase + new_bc.requestsInfo[i].num_tokens_in_batch = + std::min(BatchConfig::MAX_NUM_TOKENS - new_bc.num_tokens, + (int)request.tokens.size() - + new_bc.requestsInfo[i].token_start_offset); + } + + // register more tokens due to the beam width + for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) { + int depth = new_bc.requestsInfo[i].token_start_offset + j; + for (int k = 0; k < new_bc.sub_requests[i]; k++) { + new_bc.tokensInfo[new_bc.num_tokens].request_index = i; + new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = depth; + + // get value from requestinfo + new_bc.tokensInfo[new_bc.num_tokens].token_id = + new_bc.beamRequestsInfo[i].tokens[k]; + // request.tokens[depth]; + new_bc.beamTokenInfo[new_bc.num_tokens].sub_request_index = k; + new_bc.num_tokens++; + } + } + } + } + if (verbose) { + std::cout << "prepare_next_batch_beam OLD vs NEW batchconfigs:" + << std::endl; + old_bc.print(); + new_bc.print(); + } + return new_bc; +} + +/***** Verify Phase *****/ + TreeVerifyBatchConfigFuture RequestManager::prepare_next_batch_verify( std::vector const &old_batches) { Runtime *runtime = Runtime::get_runtime(); From 1de3e2169fa15a789bdc6c66c158b07aac5b7b56 Mon Sep 17 00:00:00 2001 From: Zeyu Wang Date: Tue, 22 Aug 2023 01:23:04 -0400 Subject: [PATCH 02/30] refactor and small fixes. --- CMakeLists.txt | 1 + include/flexflow/batch_config.h | 2 +- src/runtime/request_manager.cc | 166 +++++++++++++++----------------- 3 files changed, 82 insertions(+), 87 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 9fced69cf8..95e4066037 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -307,6 +307,7 @@ list(APPEND FF_NVCC_FLAGS list(APPEND FF_LD_FLAGS -lrt -ldl + -lstdc++fs -rdynamic) # Set FF FLAGS diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index ce331d3e41..a8e974841c 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -53,6 +53,7 @@ class BatchConfig { // These are set by update int num_tokens; + bool loading_prompt = false; struct PerRequestInfo { int token_start_offset; @@ -113,7 +114,6 @@ class BeamSearchBatchConfig : public BatchConfig { inline static int const MAX_BEAM_DEPTH = 8; int model_id; - int max_init_length = 0; struct BeamSearchPerRequestInfo { int beam_size; diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 7a1bdf8e7e..e251fb4638 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -136,7 +136,7 @@ RequestManager::RequestGuid // Add a new request Request request; request.guid = next_available_guid++; - request.max_sequence_length = max_sequence_length; + request.max_sequence_length = max_sequence_length; if (prompt.size() > BatchConfig::MAX_PROMPT_LENGTH) { std::cout << "Warning: too many tokens in prompt, only load up to " @@ -498,6 +498,7 @@ BeamSearchBatchConfig if (verbose) { std::cout << "\n############### prepare_next_batch_init ###############\n"; } + // Step 1: use result to update requests BeamSearchBatchConfig new_bc; new_bc.num_tokens = 0; @@ -517,62 +518,52 @@ BeamSearchBatchConfig assert(old_bc.num_tokens > 0); - int start_depth = old_bc.tokensInfo[result_index].abs_depth_in_request; - if 
(committed_tokens.find(guid) == committed_tokens.end()) { - committed_tokens[guid] = std::vector>(); + // int start_depth = old_bc.tokensInfo[result_index].abs_depth_in_request; + + // reset committed_tokens + if (committed_tokens.count(guid) == 0) { + committed_tokens[guid] = {}; } else { - committed_tokens.at(guid).clear(); + committed_tokens[guid].clear(); } + + // iterate through all the tokens that belong to request i - while (result_index < old_bc.num_tokens && - old_bc.tokensInfo[result_index].request_index == i) { - // new tokens have not been appended yet, so the last appended token is - // the root of the beam search token tree - int root_abs_depth = request.tokens.size() - 1; - if (old_bc.tokensInfo[result_index].abs_depth_in_request >= - root_abs_depth) { - // append to tree_outputs a pair consisting of (token id, depth) - tree_outputs.push_back(std::make_pair( - result.token_ids[result_index], - old_bc.tokensInfo[result_index].abs_depth_in_request + 1)); - // append (depth, index of the token in result) to committed_tokens - // array - committed_tokens.at(guid).push_back( - std::make_pair(old_bc.tokensInfo[result_index].abs_depth_in_request, - result_index)); + int root_abs_depth = request.tokens.size() - 1; - if (verbose) { - std::cout << "Index within old batch: " << result_index << std::endl; - printf(" Input: [%d] %d ---> [%d] %d \n", - old_bc.tokensInfo[result_index].abs_depth_in_request, - old_bc.tokensInfo[result_index].token_id, - tree_outputs.back().second, - tree_outputs.back().first); + while (result_index < old_bc.num_tokens && + old_bc.tokensInfo[result_index].request_index == i) { + int abs_depth = old_bc.tokensInfo[result_index].abs_depth_in_request; + int token_id = result.token_ids[result_index]; + + if (abs_depth >= root_abs_depth) { + tree_outputs.emplace_back(token_id, abs_depth + 1); + committed_tokens[guid].emplace_back(abs_depth, result_index); + + if (verbose) { + std::cout << "Index within old batch: " << result_index << std::endl; + printf(" Input: [%d] %d ---> [%d] %d \n", + abs_depth, old_bc.tokensInfo[result_index].token_id, + tree_outputs.back().second, token_id); + } } - // std::cout << " Input: " << old_bc.tokensInfo[result_index].token_id - // << "" - // << old_bc.tokensInfo[result_index].abs_depth_in_request << - // std::endl; - // std::cout << " Result: " << result.token_ids[result_index] << ", - // depth: " - // << old_bc.tokensInfo[result_index].abs_depth_in_request + 1 << - // std::endl; - } - result_index++; + result_index++; } + std::vector> verified_tokens = traverse_verify_tree(guid, dfs_tree_inputs.at(guid), tree_outputs); log_req_mgr.print("Number of Verified Tokens = %zu", verified_tokens.size()); + // check if the request is finished if (verified_tokens.size() + request.tokens.size() >= request.max_sequence_length) { // Append all verified tokens to the request - for (int j = 0; j < verified_tokens.size(); j++) { - if (verified_tokens[j].second < request.max_sequence_length) { - request.tokens.push_back(verified_tokens[j].first); - } + for (const auto &token_pair : verified_tokens) { + if (token_pair.second < request.max_sequence_length) { + request.tokens.push_back(token_pair.first); + } } log_req_mgr.print("[Done] guid(%zu) with final length(%zu)", @@ -587,8 +578,11 @@ BeamSearchBatchConfig gr.output_text = output; } log_req_mgr.print("Final output: %s", output.c_str()); + new_bc.request_completed[i] = true; num_processed_requests++; + + // Log profiling info ProfileInfo profile_info = profiling_requests[request.guid]; 
profile_info.finish_time = Realm::Clock::current_time_in_microseconds(); total_request_run_time += @@ -632,68 +626,69 @@ BeamSearchBatchConfig // delete the old input tree from cache dfs_tree_inputs.erase(request.guid); - continue; - } + } else { // Request not finished, pass verified_tokens to next iteration - new_bc.request_completed[i] = false; + new_bc.request_completed[i] = false; - // Normal Request Info - new_bc.requestsInfo[i].token_start_offset = verified_tokens.front().second; - new_bc.requestsInfo[i].request_guid = old_bc.requestsInfo[i].request_guid; - new_bc.requestsInfo[i].max_sequence_length = - old_bc.requestsInfo[i].max_sequence_length; - new_bc.requestsInfo[i].num_tokens_in_batch = verified_tokens.size(); - - // TODO: Beam Request Info, missing from VerifyTreeBatchConfig - int new_max_depth = new_bc.requestsInfo[i].max_sequence_length - - new_bc.requestsInfo[i].token_start_offset - - verified_tokens.size(); - new_bc.beamRequestsInfo[i].current_depth = 1; - new_bc.beamRequestsInfo[i].beam_size = - BeamSearchBatchConfig::MAX_BEAM_WIDTH; - new_bc.beamRequestsInfo[i].max_depth = - std::min(new_max_depth, BeamSearchBatchConfig::MAX_BEAM_DEPTH); - for (int j = 0; j < BeamSearchBatchConfig::MAX_BEAM_WIDTH; j++) { - new_bc.beamRequestsInfo[i].parent_id[j] = 0; - new_bc.beamRequestsInfo[i].probs[j] = 1; - } + // Normal Request Info + new_bc.requestsInfo[i].token_start_offset = verified_tokens.front().second; + new_bc.requestsInfo[i].request_guid = old_bc.requestsInfo[i].request_guid; + new_bc.requestsInfo[i].max_sequence_length = + old_bc.requestsInfo[i].max_sequence_length; + new_bc.requestsInfo[i].num_tokens_in_batch = verified_tokens.size(); - new_bc.sub_requests[i] = 1; + // TODO: Beam Request Info, missing from VerifyTreeBatchConfig + int new_max_depth = new_bc.requestsInfo[i].max_sequence_length - + new_bc.requestsInfo[i].token_start_offset - + verified_tokens.size(); + new_bc.beamRequestsInfo[i].current_depth = 1; + new_bc.beamRequestsInfo[i].beam_size = + BeamSearchBatchConfig::MAX_BEAM_WIDTH; + new_bc.beamRequestsInfo[i].max_depth = + std::min(new_max_depth, BeamSearchBatchConfig::MAX_BEAM_DEPTH); + for (int j = 0; j < BeamSearchBatchConfig::MAX_BEAM_WIDTH; j++) { + new_bc.beamRequestsInfo[i].parent_id[j] = 0; + new_bc.beamRequestsInfo[i].probs[j] = 1; + } - // Token Info - for (int j = 0; j < verified_tokens.size(); j++) { - auto token = verified_tokens.at(j); + new_bc.sub_requests[i] = 1; - // Normal Token Info - new_bc.tokensInfo[new_bc.num_tokens].request_index = i; - new_bc.tokensInfo[new_bc.num_tokens].token_id = token.first; - new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = token.second; + // Token Info + for (int j = 0; j < verified_tokens.size(); j++) { + auto token = verified_tokens.at(j); - // Beam Token Info - new_bc.beamTokenInfo[new_bc.num_tokens].sub_request_index = 0; - new_bc.num_tokens++; + // Normal Token Info + new_bc.tokensInfo[new_bc.num_tokens].request_index = i; + new_bc.tokensInfo[new_bc.num_tokens].token_id = token.first; + new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = token.second; - // Add verified token to request's token list - request.tokens.push_back(token.first); + // Beam Token Info + new_bc.beamTokenInfo[new_bc.num_tokens].sub_request_index = 0; + new_bc.num_tokens++; - if (new_bc.num_tokens == BatchConfig::MAX_NUM_TOKENS) { - break; + // Add verified token to request's token list + request.tokens.push_back(token.first); + + if (new_bc.num_tokens == BatchConfig::MAX_NUM_TOKENS) { + break; + } } + std::string 
output = this->tokenizer_->Decode(request.tokens); + log_req_mgr.print("Output: %s", output.c_str()); } - std::string output = this->tokenizer_->Decode(request.tokens); - log_req_mgr.print("Output: %s", output.c_str()); } + + + + // Step 2: Initialize new request - new_bc.max_init_length = 0; for (int i = 0; i < BeamSearchBatchConfig::MAX_NUM_REQUESTS; i++) { if (new_bc.request_completed[i]) { if (!pending_request_queue.empty() && new_bc.num_tokens < BeamSearchBatchConfig::MAX_NUM_TOKENS) { Request new_request = pending_request_queue.front(); pending_request_queue.pop(); - new_bc.max_init_length = - std::max(new_bc.max_init_length, new_request.initial_len); // all_requests[new_request.guid] = new_request; new_bc.requestsInfo[i].token_start_offset = 0; new_bc.requestsInfo[i].request_guid = new_request.guid; @@ -804,7 +799,6 @@ BeamSearchBatchConfig // Step 2: preparing the next batch for existing requests BeamSearchBatchConfig new_bc; - new_bc.max_init_length = 0; new_bc.model_id = old_bc.model_id; // std::cout << "old_bc.model_id: " << old_bc.model_id << "\n"; From c37235fd5d264ec6790b9435406e0ab668760a88 Mon Sep 17 00:00:00 2001 From: Zeyu Wang Date: Thu, 24 Aug 2023 21:21:05 -0400 Subject: [PATCH 03/30] Update --- src/runtime/request_manager.cc | 79 ++++++++++++++++++---------------- 1 file changed, 41 insertions(+), 38 deletions(-) diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index e251fb4638..cc1322334c 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -136,7 +136,7 @@ RequestManager::RequestGuid // Add a new request Request request; request.guid = next_available_guid++; - request.max_sequence_length = max_sequence_length; + request.max_sequence_length = max_sequence_length; if (prompt.size() > BatchConfig::MAX_PROMPT_LENGTH) { std::cout << "Warning: too many tokens in prompt, only load up to " @@ -518,39 +518,43 @@ BeamSearchBatchConfig assert(old_bc.num_tokens > 0); - // int start_depth = old_bc.tokensInfo[result_index].abs_depth_in_request; - // reset committed_tokens if (committed_tokens.count(guid) == 0) { - committed_tokens[guid] = {}; + committed_tokens[guid] = {}; } else { - committed_tokens[guid].clear(); + committed_tokens[guid].clear(); } - // iterate through all the tokens that belong to request i int root_abs_depth = request.tokens.size() - 1; while (result_index < old_bc.num_tokens && - old_bc.tokensInfo[result_index].request_index == i) { - int abs_depth = old_bc.tokensInfo[result_index].abs_depth_in_request; - int token_id = result.token_ids[result_index]; - - if (abs_depth >= root_abs_depth) { - tree_outputs.emplace_back(token_id, abs_depth + 1); - committed_tokens[guid].emplace_back(abs_depth, result_index); - - if (verbose) { - std::cout << "Index within old batch: " << result_index << std::endl; - printf(" Input: [%d] %d ---> [%d] %d \n", - abs_depth, old_bc.tokensInfo[result_index].token_id, - tree_outputs.back().second, token_id); - } + old_bc.tokensInfo[result_index].request_index == i) { + int abs_depth = old_bc.tokensInfo[result_index].abs_depth_in_request; + int token_id = result.token_ids[result_index]; + + if (abs_depth >= root_abs_depth) { + tree_outputs.emplace_back(token_id, abs_depth + 1); + committed_tokens[guid].emplace_back(abs_depth, result_index); + + if (verbose) { + std::cout << "Index within old batch: " << result_index << std::endl; + printf(" Input: [%d] %d ---> [%d] %d \n", + abs_depth, + old_bc.tokensInfo[result_index].token_id, + tree_outputs.back().second, + token_id); } - 
result_index++; + std::cout << "Index within old batch: " << result_index << std::endl; + printf(" Input: [%d] %d ---> [%d] %d \n", + abs_depth, + old_bc.tokensInfo[result_index].token_id, + tree_outputs.back().second, + token_id); + } + result_index++; } - std::vector> verified_tokens = traverse_verify_tree(guid, dfs_tree_inputs.at(guid), tree_outputs); log_req_mgr.print("Number of Verified Tokens = %zu", @@ -560,10 +564,10 @@ BeamSearchBatchConfig if (verified_tokens.size() + request.tokens.size() >= request.max_sequence_length) { // Append all verified tokens to the request - for (const auto &token_pair : verified_tokens) { - if (token_pair.second < request.max_sequence_length) { - request.tokens.push_back(token_pair.first); - } + for (auto const &token_pair : verified_tokens) { + if (token_pair.second < request.max_sequence_length) { + request.tokens.push_back(token_pair.first); + } } log_req_mgr.print("[Done] guid(%zu) with final length(%zu)", @@ -631,7 +635,8 @@ BeamSearchBatchConfig new_bc.request_completed[i] = false; // Normal Request Info - new_bc.requestsInfo[i].token_start_offset = verified_tokens.front().second; + new_bc.requestsInfo[i].token_start_offset = + verified_tokens.front().second; new_bc.requestsInfo[i].request_guid = old_bc.requestsInfo[i].request_guid; new_bc.requestsInfo[i].max_sequence_length = old_bc.requestsInfo[i].max_sequence_length; @@ -660,7 +665,8 @@ BeamSearchBatchConfig // Normal Token Info new_bc.tokensInfo[new_bc.num_tokens].request_index = i; new_bc.tokensInfo[new_bc.num_tokens].token_id = token.first; - new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = token.second; + new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = + token.second; // Beam Token Info new_bc.beamTokenInfo[new_bc.num_tokens].sub_request_index = 0; @@ -678,10 +684,6 @@ BeamSearchBatchConfig } } - - - - // Step 2: Initialize new request for (int i = 0; i < BeamSearchBatchConfig::MAX_NUM_REQUESTS; i++) { if (new_bc.request_completed[i]) { @@ -850,7 +852,6 @@ BeamSearchBatchConfig old_bc.beamRequestsInfo[i].max_depth; // do the slot exchange to minimize the cache exchange in kernel. 
- // std::cout << "update metadata" << std::endl; update_beam_metadata(new_bc, request.beam_trees.at(old_bc.model_id), i); if (new_bc.requestsInfo[i].token_start_offset + 1 >= @@ -974,7 +975,8 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( // Profiling profiling_requests[new_bc.requestsInfo[i].request_guid].decoding_steps += 1; - // TODO: Add prompt token first in first verify iteration + + // Add prompt token first in first verify iteration if (request.tokens.size() == request.initial_len) { // Initialization (prompt) phase for (int j = 0; j < request.initial_len; j++) { @@ -1020,8 +1022,9 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( // add prompt to the dfs tree if (committed_tokens.find(guid) != committed_tokens.end()) { - if (dfs_tree_inputs.at(0).second == - request.initial_len + committed_tokens.at(guid).size() - 1) { + if (dfs_tree_inputs.at(0).second == request.initial_len + + committed_tokens.at(guid).size() - + 1) { // commit prompt for (int j = 0; j < request.initial_len; j++) { new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_index = j; new_bc.committed_tokens[new_bc.num_tokens_to_commit].request_index = @@ -1034,7 +1037,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( } new_bc.num_tokens_to_commit++; } - } else { + } else { // commit the last token // only add the root token auto committed_token = committed_tokens.at(guid).at(0); new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_index = @@ -1101,7 +1104,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( new_bc.num_tokens++; new_bc.requestsInfo[i].num_tokens_in_batch++; - if (new_bc.num_tokens == BatchConfig::MAX_NUM_TOKENS - 1) { + if (new_bc.num_tokens == BatchConfig::MAX_NUM_TOKENS) { break; } } From d18926f61013c7b4543279c611957847c683b79b Mon Sep 17 00:00:00 2001 From: Zeyu Wang Date: Mon, 4 Sep 2023 20:00:01 -0400 Subject: [PATCH 04/30] Refactor backup. 
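
Track per-request running state in the batch config and begin handling prompts that do not fit in a single batch: such requests remain PENDING while their prompt tokens are loaded, and the beam-search and verify phases branch on Request::RUNNING versus Request::PENDING.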
--- include/flexflow/batch_config.h | 1 + src/runtime/request_manager.cc | 359 ++++++++++++++++++++------------ 2 files changed, 223 insertions(+), 137 deletions(-) diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index a8e974841c..ee3d5ac183 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -70,6 +70,7 @@ class BatchConfig { PerTokenInfo tokensInfo[MAX_NUM_TOKENS]; bool request_completed[MAX_NUM_REQUESTS]; + bool request_running[MAX_NUM_TOKENS]; }; class TreeVerifyBatchConfig : public BatchConfig { diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 3d5780418e..215d7eb610 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -740,6 +740,26 @@ BeamSearchBatchConfig new_bc.beamTokenInfo[new_bc.num_tokens].sub_request_index = 0; new_bc.num_tokens++; } + + if (new_bc.requestsInfo[i].num_tokens_in_batch < new_request.initial_len) { + all_requests[new_request.guid].status = Request::PENDING; + new_bc.request_running[i] = false; + std::cout << "Request " << new_request.guid << " is pending" + << std::endl; + } else { + all_requests[new_request.guid].status = Request::RUNNING; + new_bc.request_running[i] = true; + std::cout << "Request " << new_request.guid << " is running" + << std::endl; + } + + std::cout << "load " << new_bc.requestsInfo[i].num_tokens_in_batch + << " tokens for request " << new_request.guid << std::endl; + std::cout << "total prompt in request: " << new_request.initial_len + << std::endl; + + + if (new_bc.num_tokens == BatchConfig::MAX_NUM_TOKENS) { break; } @@ -849,16 +869,30 @@ BeamSearchBatchConfig // how many sub request in current request // why is sub_requests has MAX_NUM_REQUESTS * MAX_BEAM_WIDTH entries? new_bc.sub_requests[i] = old_bc.beamRequestsInfo[i].beam_size; + // update the parentid, accumalated_probs, depth, and token_ids - new_bc.beamRequestsInfo[i].current_depth = - old_bc.beamRequestsInfo[i].current_depth + 1; new_bc.beamRequestsInfo[i].beam_size = - old_bc.beamRequestsInfo[i].beam_size; + old_bc.beamRequestsInfo[i].beam_size; new_bc.beamRequestsInfo[i].max_depth = - old_bc.beamRequestsInfo[i].max_depth; + old_bc.beamRequestsInfo[i].max_depth; + if (request.status == Request::RUNNING) { + new_bc.beamRequestsInfo[i].current_depth = + old_bc.beamRequestsInfo[i].current_depth + 1; + new_bc.request_running[i] = true; + // do the slot exchange to minimize the cache exchange in kernel. + update_beam_metadata(new_bc, request.beam_trees.at(old_bc.model_id), i); + } else { + // if the request is pending, we need to update the beam search + // metadata based on the initial length + new_bc.beamRequestsInfo[i].current_depth = + old_bc.beamRequestsInfo[i].current_depth; + new_bc.request_running[i] = false; + } + + // do the slot exchange to minimize the cache exchange in kernel. 
- update_beam_metadata(new_bc, request.beam_trees.at(old_bc.model_id), i); + // update_beam_metadata(new_bc, request.beam_trees.at(old_bc.model_id), i); if (new_bc.requestsInfo[i].token_start_offset + 1 >= request.tokens.size()) { @@ -880,9 +914,14 @@ BeamSearchBatchConfig new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = depth; // get value from requestinfo - new_bc.tokensInfo[new_bc.num_tokens].token_id = - new_bc.beamRequestsInfo[i].tokens[k]; - // request.tokens[depth]; + if (request.status == Request::RUNNING) { + new_bc.tokensInfo[new_bc.num_tokens].token_id = + new_bc.beamRequestsInfo[i].tokens[k]; + } else { + new_bc.tokensInfo[new_bc.num_tokens].token_id = + request.tokens[depth]; + } + new_bc.beamTokenInfo[new_bc.num_tokens].sub_request_index = k; new_bc.num_tokens++; } @@ -948,174 +987,220 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( size_t guid = old_batches.at(0).requestsInfo[i].request_guid; Request &request = all_requests[guid]; - // Get the dfs tree - std::vector>> - all_dfs_trees; - - for (int j = 0; j < old_batches.size(); j++) { - std::vector> new_tree = - traverse_beam_tree(old_batches.at(j), i, request.tokens.size() - 1); - all_dfs_trees.push_back(new_tree); - } - assert(all_dfs_trees.size() == old_batches.size()); - std::vector> dfs_tree_inputs = - merge_dfs_trees(all_dfs_trees, request.tokens.size() - 1, guid); - - if (verbose) { - std::cout << "Request Tokens Size: " << request.tokens.size() - << std::endl; - for (int k = 0; k < request.tokens.size(); k++) { - std::cout << k << ": " << request.tokens[k] << std::endl; - } - } - - // Normal Request Info - new_bc.requestsInfo[i].token_start_offset = dfs_tree_inputs.front().second; - new_bc.requestsInfo[i].request_guid = - old_batches.at(0).requestsInfo[i].request_guid; - new_bc.requestsInfo[i].max_sequence_length = - old_batches.at(0).requestsInfo[i].max_sequence_length; - // TODO: Check this - new_bc.requestsInfo[i].num_tokens_in_batch = 0; - new_bc.request_completed[i] = false; - // Profiling profiling_requests[new_bc.requestsInfo[i].request_guid].decoding_steps += 1; - // Add prompt token first in first verify iteration - if (request.tokens.size() == request.initial_len) { - // Initialization (prompt) phase - for (int j = 0; j < request.initial_len; j++) { - new_bc.tokensInfo[new_bc.num_tokens].request_index = i; - new_bc.tokensInfo[new_bc.num_tokens].token_id = request.tokens[j]; - new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = j; + if(request.status == Request::RUNNING) { + new_bc.request_running[i] = true; + std::cout << "[Verify] Request " << request.guid << " is running" << std::endl; - new_bc.num_tokens++; - new_bc.requestsInfo[i].num_tokens_in_batch++; - } + // Get the dfs tree + std::vector>> + all_dfs_trees; - std::cout << "new_bc.num_tokens: " << new_bc.num_tokens << std::endl; - if (new_bc.num_tokens >= BatchConfig::MAX_NUM_TOKENS) { - assert(false && - "Exceeding the space available in the TreeVerify batch"); - break; + for (int j = 0; j < old_batches.size(); j++) { + std::vector> new_tree = + traverse_beam_tree(old_batches.at(j), i, request.tokens.size() - 1); + all_dfs_trees.push_back(new_tree); } + assert(all_dfs_trees.size() == old_batches.size()); + std::vector> dfs_tree_inputs = + merge_dfs_trees(all_dfs_trees, request.tokens.size() - 1, guid); - new_bc.requestsInfo[i].token_start_offset = 0; - } else { - // Incremental phase: only add the last committed token - new_bc.tokensInfo[new_bc.num_tokens].request_index = i; - 
new_bc.tokensInfo[new_bc.num_tokens].token_id = request.tokens.back(); - new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = - request.tokens.size() - 1; - - new_bc.num_tokens++; - new_bc.requestsInfo[i].num_tokens_in_batch++; - - if (new_bc.num_tokens == BatchConfig::MAX_NUM_TOKENS) { - assert(false && - "Exceeding the space available in the TreeVerify batch"); - break; + if (verbose) { + std::cout << "Request Tokens Size: " << request.tokens.size() + << std::endl; + for (int k = 0; k < request.tokens.size(); k++) { + std::cout << k << ": " << request.tokens[k] << std::endl; + } } - new_bc.requestsInfo[i].token_start_offset = request.tokens.size() - 1; - } - - if (verbose) { - std::cout << "dfs_tree_inputs.size(): " << dfs_tree_inputs.size() - << std::endl; - } + // Normal Request Info + new_bc.requestsInfo[i].token_start_offset = dfs_tree_inputs.front().second; + new_bc.requestsInfo[i].request_guid = old_batches.at(0).requestsInfo[i].request_guid; + new_bc.requestsInfo[i].max_sequence_length = old_batches.at(0).requestsInfo[i].max_sequence_length; + // TODO: Check this + new_bc.requestsInfo[i].num_tokens_in_batch = 0; + new_bc.request_completed[i] = false; - // add prompt to the dfs tree - if (committed_tokens.find(guid) != committed_tokens.end()) { - if (dfs_tree_inputs.at(0).second == request.initial_len + - committed_tokens.at(guid).size() - - 1) { // commit prompt + // Add prompt token first in first verify iteration + if (request.tokens.size() == request.initial_len) { + // Initialization (prompt) phase for (int j = 0; j < request.initial_len; j++) { - new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_index = j; - new_bc.committed_tokens[new_bc.num_tokens_to_commit].request_index = - i; - new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_depth = j; - if (verbose) { - std::cout << new_bc.num_tokens_to_commit - << "- committed_token.token_depth: " << j - << ", token_index: " << j << std::endl; - } - new_bc.num_tokens_to_commit++; + new_bc.tokensInfo[new_bc.num_tokens].request_index = i; + new_bc.tokensInfo[new_bc.num_tokens].token_id = request.tokens[j]; + new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = j; + + new_bc.num_tokens++; + new_bc.requestsInfo[i].num_tokens_in_batch++; } - } else { // commit the last token - // only add the root token - auto committed_token = committed_tokens.at(guid).at(0); - new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_index = - committed_token.second; - new_bc.committed_tokens[new_bc.num_tokens_to_commit].request_index = i; - new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_depth = - committed_token.first; - if (verbose) { - std::cout << new_bc.num_tokens_to_commit - << "- committed_token.token_depth: " - << committed_token.first - << ", token_index: " << committed_token.second << std::endl; + + std::cout << "new_bc.num_tokens: " << new_bc.num_tokens << std::endl; + if (new_bc.num_tokens >= BatchConfig::MAX_NUM_TOKENS) { + assert(false && + "Exceeding the space available in the TreeVerify batch"); + break; } - new_bc.num_tokens_to_commit++; - } - if (verbose) { - std::cout << "new_bc.num_tokens_to_commit: " - << new_bc.num_tokens_to_commit << std::endl; - } - } - // Token Info - for (int j = 1; j < dfs_tree_inputs.size(); j++) { - auto token = dfs_tree_inputs.at(j); - if (verbose) { - std::cout << "[" << j << "] Token: " << token.first - << ", Depth:" << token.second << std::endl; - } - // Normal Token Info - new_bc.tokensInfo[new_bc.num_tokens].request_index = i; - 
new_bc.tokensInfo[new_bc.num_tokens].token_id = token.first; - new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = token.second; + new_bc.requestsInfo[i].token_start_offset = 0; + } else { + // Incremental phase: only add the last committed token + new_bc.tokensInfo[new_bc.num_tokens].request_index = i; + new_bc.tokensInfo[new_bc.num_tokens].token_id = request.tokens.back(); + new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = + request.tokens.size() - 1; - // TODO: Add committed token info - if (verbose) { - std::cout << "committed_tokens.size(): " << new_bc.num_tokens_to_commit - << std::endl; + new_bc.num_tokens++; + new_bc.requestsInfo[i].num_tokens_in_batch++; + + if (new_bc.num_tokens == BatchConfig::MAX_NUM_TOKENS) { + assert(false && + "Exceeding the space available in the TreeVerify batch"); + break; + } + + new_bc.requestsInfo[i].token_start_offset = request.tokens.size() - 1; } if (committed_tokens.find(guid) != committed_tokens.end()) { - if (j < committed_tokens.at(guid).size()) { - auto committed_token = committed_tokens.at(guid).at(j); + if (dfs_tree_inputs.at(0).second == request.initial_len + + committed_tokens.at(guid).size() - + 1) { // commit prompt + for (int j = 0; j < request.initial_len; j++) { + new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_index = j; + new_bc.committed_tokens[new_bc.num_tokens_to_commit].request_index = + i; + new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_depth = j; + if (verbose) { + std::cout << new_bc.num_tokens_to_commit + << "- committed_token.token_depth: " << j + << ", token_index: " << j << std::endl; + } + new_bc.num_tokens_to_commit++; + } + } else { // commit the last token + // only add the root token + auto committed_token = committed_tokens.at(guid).at(0); new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_index = committed_token.second; - new_bc.committed_tokens[new_bc.num_tokens_to_commit].request_index = - i; + new_bc.committed_tokens[new_bc.num_tokens_to_commit].request_index = i; new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_depth = committed_token.first; if (verbose) { std::cout << new_bc.num_tokens_to_commit << "- committed_token.token_depth: " << committed_token.first - << ", token_index: " << committed_token.second - << std::endl; + << ", token_index: " << committed_token.second << std::endl; } new_bc.num_tokens_to_commit++; } + if (verbose) { + std::cout << "new_bc.num_tokens_to_commit: " + << new_bc.num_tokens_to_commit << std::endl; + } + + + for (int j = 1; j < dfs_tree_inputs.size(); j++) { + if (j < committed_tokens.at(guid).size()) { + auto committed_token = committed_tokens.at(guid).at(j); + new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_index = + committed_token.second; + new_bc.committed_tokens[new_bc.num_tokens_to_commit].request_index = + i; + new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_depth = + committed_token.first; + if (verbose) { + std::cout << new_bc.num_tokens_to_commit + << "- committed_token.token_depth: " + << committed_token.first + << ", token_index: " << committed_token.second + << std::endl; + } + new_bc.num_tokens_to_commit++; + } + } } if (verbose) { std::cout << "new_bc.num_tokens_to_commit: " << new_bc.num_tokens_to_commit << std::endl; } - new_bc.num_tokens++; - new_bc.requestsInfo[i].num_tokens_in_batch++; - if (new_bc.num_tokens == BatchConfig::MAX_NUM_TOKENS) { - break; + // add prompt to the dfs tree + // if (committed_tokens.find(guid) != committed_tokens.end()) { + // if 
(dfs_tree_inputs.at(0).second == request.initial_len + + // committed_tokens.at(guid).size() - + // 1) { // commit prompt + // for (int j = 0; j < request.initial_len; j++) { + // new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_index = j; + // new_bc.committed_tokens[new_bc.num_tokens_to_commit].request_index = + // i; + // new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_depth = j; + // if (verbose) { + // std::cout << new_bc.num_tokens_to_commit + // << "- committed_token.token_depth: " << j + // << ", token_index: " << j << std::endl; + // } + // new_bc.num_tokens_to_commit++; + // } + // } else { // commit the last token + // // only add the root token + // auto committed_token = committed_tokens.at(guid).at(0); + // new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_index = + // committed_token.second; + // new_bc.committed_tokens[new_bc.num_tokens_to_commit].request_index = i; + // new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_depth = + // committed_token.first; + // if (verbose) { + // std::cout << new_bc.num_tokens_to_commit + // << "- committed_token.token_depth: " + // << committed_token.first + // << ", token_index: " << committed_token.second << std::endl; + // } + // new_bc.num_tokens_to_commit++; + // } + // if (verbose) { + // std::cout << "new_bc.num_tokens_to_commit: " + // << new_bc.num_tokens_to_commit << std::endl; + // } + // } + + // Add Tokens from the DFS Tree to the next batch + for (int j = 1; j < dfs_tree_inputs.size(); j++) { + auto token = dfs_tree_inputs.at(j); + if (verbose) { + std::cout << "[" << j << "] Token: " << token.first + << ", Depth:" << token.second << std::endl; + } + // Normal Token Info + new_bc.tokensInfo[new_bc.num_tokens].request_index = i; + new_bc.tokensInfo[new_bc.num_tokens].token_id = token.first; + new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = token.second; + + // TODO: Add committed token info + if (verbose) { + std::cout << "committed_tokens.size(): " << new_bc.num_tokens_to_commit + << std::endl; + } + + new_bc.num_tokens++; + new_bc.requestsInfo[i].num_tokens_in_batch++; + + if (new_bc.num_tokens == BatchConfig::MAX_NUM_TOKENS - 1) { + break; + } } + + std::cout << "new_bc.num_tokens: " << new_bc.num_tokens << std::endl; + } else if (request.status == Request::PENDING) { + new_bc.request_running[i] = false; + std::cout << "[Verify] Request " << request.guid << " is pending" << std::endl; + } else { + assert(false && "Request status is not RUNNING or PENDING"); } - std::cout << "new_bc.num_tokens: " << new_bc.num_tokens << std::endl; + + } if (verbose) { From 99bb6963c495f0cc9cc8cd3d4f366792ac0a9dfc Mon Sep 17 00:00:00 2001 From: Zeyu Wang Date: Tue, 5 Sep 2023 00:23:58 -0400 Subject: [PATCH 05/30] pipeline update. 
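
Add ssm_cache_size, llm_cache_size, and an explicit status field to Request, and rework prepare_next_batch_init/beam/verify so that long prompts are streamed into the KV caches over multiple iterations; new requests now start as PENDING, and the verify phase loads any remaining prompt tokens for pending requests.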
--- include/flexflow/request_manager.h | 4 + src/ops/tree_inc_multihead_self_attention.cu | 3 + src/runtime/request_manager.cc | 391 ++++++++++++------- 3 files changed, 258 insertions(+), 140 deletions(-) diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index e444402dd0..3156801368 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -59,6 +59,10 @@ struct Request { BatchConfig::RequestGuid guid; int max_sequence_length; int initial_len; + int ssm_cache_size = 0; + int llm_cache_size = 0; + + Status status = PENDING; std::vector tokens; diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu index 69f085d3eb..455797e9b3 100644 --- a/src/ops/tree_inc_multihead_self_attention.cu +++ b/src/ops/tree_inc_multihead_self_attention.cu @@ -500,6 +500,9 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, m->oProjSize); } + std::cout << "processed_tokens_in_batch: " << processed_tokens_in_batch + << std::endl; + std::cout << "num_active_tokens: " << bc->num_active_tokens() << std::endl; assert(processed_tokens_in_batch == bc->num_active_tokens()); } diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 215d7eb610..2afbab2f87 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -506,6 +506,8 @@ BeamSearchBatchConfig std::cout << "\n############### prepare_next_batch_init ###############\n"; } + std::cout << "\n############### prepare_next_batch_init ###############\n"; + // Step 1: use result to update requests BeamSearchBatchConfig new_bc; new_bc.num_tokens = 0; @@ -540,7 +542,9 @@ BeamSearchBatchConfig int abs_depth = old_bc.tokensInfo[result_index].abs_depth_in_request; int token_id = result.token_ids[result_index]; - if (abs_depth >= root_abs_depth) { + if (request.status == Request::PENDING) { + committed_tokens[guid].emplace_back(abs_depth, result_index); + } else if (abs_depth >= root_abs_depth) { tree_outputs.emplace_back(token_id, abs_depth + 1); committed_tokens[guid].emplace_back(abs_depth, result_index); @@ -558,105 +562,157 @@ BeamSearchBatchConfig old_bc.tokensInfo[result_index].token_id, tree_outputs.back().second, token_id); - } + } result_index++; } + + if (request.status == Request::RUNNING) { + std::vector> verified_tokens = + traverse_verify_tree(guid, dfs_tree_inputs.at(guid), tree_outputs); + log_req_mgr.print("Number of Verified Tokens = %zu", + verified_tokens.size()); + + // check if the request is finished + if (verified_tokens.size() + request.tokens.size() >= + request.max_sequence_length) { + // Append all verified tokens to the request + for (auto const &token_pair : verified_tokens) { + if (token_pair.second < request.max_sequence_length) { + request.tokens.push_back(token_pair.first); + } + } + request.status = Request::COMPLETED; + log_req_mgr.print("[Done] guid(%zu) with final length(%zu)", + request.guid, + request.tokens.size()); + std::string output = this->tokenizer_->Decode(request.tokens); + { + // update generation result and trigger future + GenerationResult &gr = request_generation_results[request.guid]; + assert(gr.guid == request.guid); + gr.output_tokens = request.tokens; + gr.output_text = output; + } + log_req_mgr.print("Final output: %s", output.c_str()); - std::vector> verified_tokens = - traverse_verify_tree(guid, dfs_tree_inputs.at(guid), tree_outputs); - log_req_mgr.print("Number of Verified Tokens = %zu", - verified_tokens.size()); - - // 
check if the request is finished - if (verified_tokens.size() + request.tokens.size() >= - request.max_sequence_length) { - // Append all verified tokens to the request - for (auto const &token_pair : verified_tokens) { - if (token_pair.second < request.max_sequence_length) { - request.tokens.push_back(token_pair.first); + new_bc.request_completed[i] = true; + new_bc.request_running[i] = false; + num_processed_requests++; + + // Log profiling info + ProfileInfo profile_info = profiling_requests[request.guid]; + profile_info.finish_time = Realm::Clock::current_time_in_microseconds(); + total_request_run_time += + profile_info.finish_time - profile_info.start_time; + profiling_requests[request.guid] = profile_info; + log_req_mgr.print("[Profile] guid(%zu) decoding_steps(%d) start(%.1lf) " + "finish(%.1lf) latency(%.1lf)", + request.guid, + profile_info.decoding_steps, + profile_info.start_time, + profile_info.finish_time, + profile_info.finish_time - profile_info.start_time); + + // Write output to file if needed: + if (!output_filepath.empty()) { + std::ofstream outputFile(output_filepath); + if (outputFile.is_open()) { + outputFile << "end-to-end latency: " << std::fixed + << std::setprecision(3) << total_request_run_time + << std::endl; + outputFile << "num decoding steps: " << profile_info.decoding_steps + << std::endl; + outputFile << "token IDs: "; + for (int i = 0; i < request.tokens.size(); i++) { + outputFile << request.tokens[i]; + if (i < request.tokens.size() - 1) { + outputFile << ","; + } + } + outputFile << std::endl; + outputFile << output; + outputFile.close(); + } else { + std::cout << "Unable to open the output file: " << output_filepath + << std::endl; + assert(false); + } } - } - request.status = Request::COMPLETED; - log_req_mgr.print("[Done] guid(%zu) with final length(%zu)", - request.guid, - request.tokens.size()); - std::string output = this->tokenizer_->Decode(request.tokens); - { - // update generation result and trigger future - GenerationResult &gr = request_generation_results[request.guid]; - assert(gr.guid == request.guid); - gr.output_tokens = request.tokens; - gr.output_text = output; - } - log_req_mgr.print("Final output: %s", output.c_str()); - new_bc.request_completed[i] = true; - num_processed_requests++; + // delete the old input tree from cache + dfs_tree_inputs.erase(request.guid); - // Log profiling info - ProfileInfo profile_info = profiling_requests[request.guid]; - profile_info.finish_time = Realm::Clock::current_time_in_microseconds(); - total_request_run_time += - profile_info.finish_time - profile_info.start_time; - profiling_requests[request.guid] = profile_info; - log_req_mgr.print("[Profile] guid(%zu) decoding_steps(%d) start(%.1lf) " - "finish(%.1lf) latency(%.1lf)", - request.guid, - profile_info.decoding_steps, - profile_info.start_time, - profile_info.finish_time, - profile_info.finish_time - profile_info.start_time); + } else { // Request not finished, pass verified_tokens to next iteration - // Write output to file if needed: - if (!output_filepath.empty()) { - std::ofstream outputFile(output_filepath); - if (outputFile.is_open()) { - outputFile << "end-to-end latency: " << std::fixed - << std::setprecision(3) << total_request_run_time - << std::endl; - outputFile << "num decoding steps: " << profile_info.decoding_steps - << std::endl; - outputFile << "token IDs: "; - for (int i = 0; i < request.tokens.size(); i++) { - outputFile << request.tokens[i]; - if (i < request.tokens.size() - 1) { - outputFile << ","; - } - } - outputFile << 
std::endl; - outputFile << output; - outputFile.close(); - } else { - std::cout << "Unable to open the output file: " << output_filepath - << std::endl; - assert(false); + new_bc.request_completed[i] = false; + new_bc.request_running[i] = true; + + // Normal Request Info + new_bc.requestsInfo[i].token_start_offset = + verified_tokens.front().second; + new_bc.requestsInfo[i].request_guid = old_bc.requestsInfo[i].request_guid; + new_bc.requestsInfo[i].max_sequence_length = + old_bc.requestsInfo[i].max_sequence_length; + new_bc.requestsInfo[i].num_tokens_in_batch = verified_tokens.size(); + + // TODO: Beam Request Info, missing from VerifyTreeBatchConfig + int new_max_depth = new_bc.requestsInfo[i].max_sequence_length - + new_bc.requestsInfo[i].token_start_offset - + verified_tokens.size(); + new_bc.beamRequestsInfo[i].current_depth = 1; + new_bc.beamRequestsInfo[i].beam_size = + BeamSearchBatchConfig::MAX_BEAM_WIDTH; + new_bc.beamRequestsInfo[i].max_depth = + std::min(new_max_depth, BeamSearchBatchConfig::MAX_BEAM_DEPTH); + for (int j = 0; j < BeamSearchBatchConfig::MAX_BEAM_WIDTH; j++) { + new_bc.beamRequestsInfo[i].parent_id[j] = 0; + new_bc.beamRequestsInfo[i].probs[j] = 1; } - } - // delete the old input tree from cache - dfs_tree_inputs.erase(request.guid); + new_bc.sub_requests[i] = 1; - } else { // Request not finished, pass verified_tokens to next iteration + // Token Info + for (int j = 0; j < verified_tokens.size(); j++) { + auto token = verified_tokens.at(j); + + // Normal Token Info + new_bc.tokensInfo[new_bc.num_tokens].request_index = i; + new_bc.tokensInfo[new_bc.num_tokens].token_id = token.first; + new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = + token.second; + // Beam Token Info + new_bc.beamTokenInfo[new_bc.num_tokens].sub_request_index = 0; + new_bc.num_tokens++; + + // Add verified token to request's token list + request.tokens.push_back(token.first); + + if (new_bc.num_tokens == BatchConfig::MAX_NUM_TOKENS) { + break; + } + } + std::string output = this->tokenizer_->Decode(request.tokens); + log_req_mgr.print("Output: %s", output.c_str()); + } + } else if (request.status == Request::PENDING) { new_bc.request_completed[i] = false; + new_bc.request_running[i] = false; + + assert(request.ssm_cache_size == request.initial_len); // Normal Request Info - new_bc.requestsInfo[i].token_start_offset = - verified_tokens.front().second; + new_bc.requestsInfo[i].token_start_offset = request.ssm_cache_size; new_bc.requestsInfo[i].request_guid = old_bc.requestsInfo[i].request_guid; new_bc.requestsInfo[i].max_sequence_length = old_bc.requestsInfo[i].max_sequence_length; - new_bc.requestsInfo[i].num_tokens_in_batch = verified_tokens.size(); + new_bc.requestsInfo[i].num_tokens_in_batch = 0; // TODO: Beam Request Info, missing from VerifyTreeBatchConfig - int new_max_depth = new_bc.requestsInfo[i].max_sequence_length - - new_bc.requestsInfo[i].token_start_offset - - verified_tokens.size(); new_bc.beamRequestsInfo[i].current_depth = 1; new_bc.beamRequestsInfo[i].beam_size = BeamSearchBatchConfig::MAX_BEAM_WIDTH; - new_bc.beamRequestsInfo[i].max_depth = - std::min(new_max_depth, BeamSearchBatchConfig::MAX_BEAM_DEPTH); + new_bc.beamRequestsInfo[i].max_depth = 0; for (int j = 0; j < BeamSearchBatchConfig::MAX_BEAM_WIDTH; j++) { new_bc.beamRequestsInfo[i].parent_id[j] = 0; new_bc.beamRequestsInfo[i].probs[j] = 1; @@ -665,31 +721,13 @@ BeamSearchBatchConfig new_bc.sub_requests[i] = 1; // Token Info - for (int j = 0; j < verified_tokens.size(); j++) { - auto token = 
verified_tokens.at(j); - - // Normal Token Info - new_bc.tokensInfo[new_bc.num_tokens].request_index = i; - new_bc.tokensInfo[new_bc.num_tokens].token_id = token.first; - new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = - token.second; - - // Beam Token Info - new_bc.beamTokenInfo[new_bc.num_tokens].sub_request_index = 0; - new_bc.num_tokens++; - - // Add verified token to request's token list - request.tokens.push_back(token.first); - - if (new_bc.num_tokens == BatchConfig::MAX_NUM_TOKENS) { - break; - } - } std::string output = this->tokenizer_->Decode(request.tokens); log_req_mgr.print("Output: %s", output.c_str()); + } else { + assert(false); } } - + // Step 2: Initialize new request for (int i = 0; i < BeamSearchBatchConfig::MAX_NUM_REQUESTS; i++) { if (new_bc.request_completed[i]) { @@ -741,17 +779,22 @@ BeamSearchBatchConfig new_bc.num_tokens++; } - if (new_bc.requestsInfo[i].num_tokens_in_batch < new_request.initial_len) { - all_requests[new_request.guid].status = Request::PENDING; - new_bc.request_running[i] = false; - std::cout << "Request " << new_request.guid << " is pending" - << std::endl; - } else { - all_requests[new_request.guid].status = Request::RUNNING; - new_bc.request_running[i] = true; - std::cout << "Request " << new_request.guid << " is running" - << std::endl; - } + // if (new_bc.requestsInfo[i].num_tokens_in_batch < new_request.initial_len) { + // all_requests[new_request.guid].status = Request::PENDING; + // new_bc.request_running[i] = false; + // std::cout << "Request " << new_request.guid << " is pending" + // << std::endl; + // } else { + // all_requests[new_request.guid].status = Request::RUNNING; + // new_bc.request_running[i] = true; + // std::cout << "Request " << new_request.guid << " is running" + // << std::endl; + // } + all_requests[new_request.guid].status = Request::PENDING; + all_requests[new_request.guid].ssm_cache_size = new_bc.requestsInfo[i].num_tokens_in_batch; + new_bc.request_running[i] = false; + std::cout << "SSM KV Cache Size: " << all_requests[new_request.guid].ssm_cache_size << std::endl; + std::cout << "LLM KV Cache Size: " << all_requests[new_request.guid].llm_cache_size << std::endl; std::cout << "load " << new_bc.requestsInfo[i].num_tokens_in_batch << " tokens for request " << new_request.guid << std::endl; @@ -904,7 +947,10 @@ BeamSearchBatchConfig std::min(BatchConfig::MAX_NUM_TOKENS - new_bc.num_tokens, (int)request.tokens.size() - new_bc.requestsInfo[i].token_start_offset); + request.ssm_cache_size += new_bc.requestsInfo[i].num_tokens_in_batch; } + std::cout << "SSM KV Cache Size: " << request.ssm_cache_size << std::endl; + std::cout << "LLM KV Cache Size: " << request.llm_cache_size << std::endl; // register more tokens due to the beam width for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) { @@ -1026,23 +1072,23 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( // Add prompt token first in first verify iteration if (request.tokens.size() == request.initial_len) { // Initialization (prompt) phase - for (int j = 0; j < request.initial_len; j++) { - new_bc.tokensInfo[new_bc.num_tokens].request_index = i; - new_bc.tokensInfo[new_bc.num_tokens].token_id = request.tokens[j]; - new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = j; - - new_bc.num_tokens++; - new_bc.requestsInfo[i].num_tokens_in_batch++; - } - - std::cout << "new_bc.num_tokens: " << new_bc.num_tokens << std::endl; - if (new_bc.num_tokens >= BatchConfig::MAX_NUM_TOKENS) { - assert(false && - "Exceeding the space 
available in the TreeVerify batch"); - break; - } - - new_bc.requestsInfo[i].token_start_offset = 0; + // for (int j = 0; j < request.initial_len; j++) { + // new_bc.tokensInfo[new_bc.num_tokens].request_index = i; + // new_bc.tokensInfo[new_bc.num_tokens].token_id = request.tokens[j]; + // new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = j; + + // new_bc.num_tokens++; + // new_bc.requestsInfo[i].num_tokens_in_batch++; + // } + + // std::cout << "new_bc.num_tokens: " << new_bc.num_tokens << std::endl; + // if (new_bc.num_tokens >= BatchConfig::MAX_NUM_TOKENS) { + // assert(false && + // "Exceeding the space available in the TreeVerify batch"); + // break; + // } + + // new_bc.requestsInfo[i].token_start_offset = 0; } else { // Incremental phase: only add the last committed token new_bc.tokensInfo[new_bc.num_tokens].request_index = i; @@ -1062,22 +1108,24 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( new_bc.requestsInfo[i].token_start_offset = request.tokens.size() - 1; } + + // Committed Tokens if (committed_tokens.find(guid) != committed_tokens.end()) { if (dfs_tree_inputs.at(0).second == request.initial_len + committed_tokens.at(guid).size() - 1) { // commit prompt - for (int j = 0; j < request.initial_len; j++) { - new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_index = j; - new_bc.committed_tokens[new_bc.num_tokens_to_commit].request_index = - i; - new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_depth = j; - if (verbose) { - std::cout << new_bc.num_tokens_to_commit - << "- committed_token.token_depth: " << j - << ", token_index: " << j << std::endl; - } - new_bc.num_tokens_to_commit++; - } + // for (int j = 0; j < request.initial_len; j++) { + // new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_index = j; + // new_bc.committed_tokens[new_bc.num_tokens_to_commit].request_index = + // i; + // new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_depth = j; + // if (verbose) { + // std::cout << new_bc.num_tokens_to_commit + // << "- committed_token.token_depth: " << j + // << ", token_index: " << j << std::endl; + // } + // new_bc.num_tokens_to_commit++; + // } } else { // commit the last token // only add the root token auto committed_token = committed_tokens.at(guid).at(0); @@ -1099,7 +1147,6 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( << new_bc.num_tokens_to_commit << std::endl; } - for (int j = 1; j < dfs_tree_inputs.size(); j++) { if (j < committed_tokens.at(guid).size()) { auto committed_token = committed_tokens.at(guid).at(j); @@ -1195,6 +1242,70 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( } else if (request.status == Request::PENDING) { new_bc.request_running[i] = false; std::cout << "[Verify] Request " << request.guid << " is pending" << std::endl; + std::cout << "SSM KV Cache Size: " << request.ssm_cache_size << std::endl; + std::cout << "LLM KV Cache Size: " << request.llm_cache_size << std::endl; + + + // Normal Request Info + new_bc.requestsInfo[i].token_start_offset = request.llm_cache_size; + new_bc.requestsInfo[i].request_guid = old_batches.at(0).requestsInfo[i].request_guid; + new_bc.requestsInfo[i].max_sequence_length = old_batches.at(0).requestsInfo[i].max_sequence_length; + + new_bc.request_completed[i] = false; + new_bc.requestsInfo[i].num_tokens_in_batch = std::min( + BatchConfig::MAX_NUM_TOKENS - new_bc.num_tokens, + (int)request.initial_len - new_bc.requestsInfo[i].token_start_offset); + + + if (committed_tokens.find(guid) != committed_tokens.end()) 
{ + for (int j = 0; j < committed_tokens.at(guid).size(); j++) { + auto token = committed_tokens.at(guid).at(j); + new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_index = token.second; + new_bc.committed_tokens[new_bc.num_tokens_to_commit].request_index = + i; + new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_depth = token.first; + + new_bc.num_tokens_to_commit++; + request.llm_cache_size++; + } + std::cout << "[Verify] Committed Tokens from last loading batch: " << new_bc.num_tokens_to_commit << std::endl; + } + + if (request.llm_cache_size < request.initial_len) { + // Initialization (prompt) phase + for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) { + new_bc.tokensInfo[new_bc.num_tokens].request_index = i; + new_bc.tokensInfo[new_bc.num_tokens].token_id = request.tokens[request.llm_cache_size + j]; + new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = request.llm_cache_size + j; + + new_bc.num_tokens++; + } + + std::cout << "new_bc.num_tokens: " << new_bc.num_tokens << std::endl; + if (new_bc.num_tokens >= BatchConfig::MAX_NUM_TOKENS) { + assert(false && + "Exceeding the space available in the TreeVerify batch"); + break; + } + } else { + if (BatchConfig::MAX_NUM_TOKENS - new_bc.num_tokens > 0) { + request.status = Request::RUNNING; + new_bc.request_running[i] = true; + + new_bc.tokensInfo[new_bc.num_tokens].request_index = i; + new_bc.tokensInfo[new_bc.num_tokens].token_id = request.tokens.back(); + new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = request.tokens.size() - 1; + + new_bc.num_tokens++; + new_bc.requestsInfo[i].num_tokens_in_batch++; + std::cout << "new_bc.num_tokens: " << new_bc.num_tokens << std::endl; + std::cout << "new_bc.requestsInfo[i].num_tokens_in_batch: " << new_bc.requestsInfo[i].num_tokens_in_batch << std::endl; + + + dfs_tree_inputs[guid] = std::vector>{ std::make_pair(request.tokens.back(), request.tokens.size() - 1) }; + } + } + } else { assert(false && "Request status is not RUNNING or PENDING"); } From e6f24744de140eb46915b2bc8021d2af2647bc47 Mon Sep 17 00:00:00 2001 From: Zeyu Wang Date: Tue, 5 Sep 2023 00:46:14 -0400 Subject: [PATCH 06/30] Format. 
--- include/flexflow/request_manager.h | 1 - src/runtime/request_manager.cc | 153 ++++++++++++++++------------- 2 files changed, 86 insertions(+), 68 deletions(-) diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index 3156801368..d137f9421e 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -62,7 +62,6 @@ struct Request { int ssm_cache_size = 0; int llm_cache_size = 0; - Status status = PENDING; std::vector tokens; diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 2afbab2f87..2b6187ba5b 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -562,10 +562,10 @@ BeamSearchBatchConfig old_bc.tokensInfo[result_index].token_id, tree_outputs.back().second, token_id); - } + } result_index++; } - + if (request.status == Request::RUNNING) { std::vector> verified_tokens = traverse_verify_tree(guid, dfs_tree_inputs.at(guid), tree_outputs); @@ -618,10 +618,10 @@ BeamSearchBatchConfig std::ofstream outputFile(output_filepath); if (outputFile.is_open()) { outputFile << "end-to-end latency: " << std::fixed - << std::setprecision(3) << total_request_run_time - << std::endl; + << std::setprecision(3) << total_request_run_time + << std::endl; outputFile << "num decoding steps: " << profile_info.decoding_steps - << std::endl; + << std::endl; outputFile << "token IDs: "; for (int i = 0; i < request.tokens.size(); i++) { outputFile << request.tokens[i]; @@ -650,7 +650,8 @@ BeamSearchBatchConfig // Normal Request Info new_bc.requestsInfo[i].token_start_offset = verified_tokens.front().second; - new_bc.requestsInfo[i].request_guid = old_bc.requestsInfo[i].request_guid; + new_bc.requestsInfo[i].request_guid = + old_bc.requestsInfo[i].request_guid; new_bc.requestsInfo[i].max_sequence_length = old_bc.requestsInfo[i].max_sequence_length; new_bc.requestsInfo[i].num_tokens_in_batch = verified_tokens.size(); @@ -727,7 +728,7 @@ BeamSearchBatchConfig assert(false); } } - + // Step 2: Initialize new request for (int i = 0; i < BeamSearchBatchConfig::MAX_NUM_REQUESTS; i++) { if (new_bc.request_completed[i]) { @@ -779,7 +780,8 @@ BeamSearchBatchConfig new_bc.num_tokens++; } - // if (new_bc.requestsInfo[i].num_tokens_in_batch < new_request.initial_len) { + // if (new_bc.requestsInfo[i].num_tokens_in_batch < + // new_request.initial_len) { // all_requests[new_request.guid].status = Request::PENDING; // new_bc.request_running[i] = false; // std::cout << "Request " << new_request.guid << " is pending" @@ -791,18 +793,19 @@ BeamSearchBatchConfig // << std::endl; // } all_requests[new_request.guid].status = Request::PENDING; - all_requests[new_request.guid].ssm_cache_size = new_bc.requestsInfo[i].num_tokens_in_batch; + all_requests[new_request.guid].ssm_cache_size = + new_bc.requestsInfo[i].num_tokens_in_batch; new_bc.request_running[i] = false; - std::cout << "SSM KV Cache Size: " << all_requests[new_request.guid].ssm_cache_size << std::endl; - std::cout << "LLM KV Cache Size: " << all_requests[new_request.guid].llm_cache_size << std::endl; + std::cout << "SSM KV Cache Size: " + << all_requests[new_request.guid].ssm_cache_size << std::endl; + std::cout << "LLM KV Cache Size: " + << all_requests[new_request.guid].llm_cache_size << std::endl; std::cout << "load " << new_bc.requestsInfo[i].num_tokens_in_batch << " tokens for request " << new_request.guid << std::endl; std::cout << "total prompt in request: " << new_request.initial_len << std::endl; - - if (new_bc.num_tokens == 
BatchConfig::MAX_NUM_TOKENS) { break; } @@ -912,15 +915,15 @@ BeamSearchBatchConfig // how many sub request in current request // why is sub_requests has MAX_NUM_REQUESTS * MAX_BEAM_WIDTH entries? new_bc.sub_requests[i] = old_bc.beamRequestsInfo[i].beam_size; - + // update the parentid, accumalated_probs, depth, and token_ids new_bc.beamRequestsInfo[i].beam_size = - old_bc.beamRequestsInfo[i].beam_size; + old_bc.beamRequestsInfo[i].beam_size; new_bc.beamRequestsInfo[i].max_depth = - old_bc.beamRequestsInfo[i].max_depth; + old_bc.beamRequestsInfo[i].max_depth; if (request.status == Request::RUNNING) { new_bc.beamRequestsInfo[i].current_depth = - old_bc.beamRequestsInfo[i].current_depth + 1; + old_bc.beamRequestsInfo[i].current_depth + 1; new_bc.request_running[i] = true; // do the slot exchange to minimize the cache exchange in kernel. update_beam_metadata(new_bc, request.beam_trees.at(old_bc.model_id), i); @@ -928,14 +931,13 @@ BeamSearchBatchConfig // if the request is pending, we need to update the beam search // metadata based on the initial length new_bc.beamRequestsInfo[i].current_depth = - old_bc.beamRequestsInfo[i].current_depth; + old_bc.beamRequestsInfo[i].current_depth; new_bc.request_running[i] = false; } - - // do the slot exchange to minimize the cache exchange in kernel. - // update_beam_metadata(new_bc, request.beam_trees.at(old_bc.model_id), i); + // update_beam_metadata(new_bc, request.beam_trees.at(old_bc.model_id), + // i); if (new_bc.requestsInfo[i].token_start_offset + 1 >= request.tokens.size()) { @@ -1036,9 +1038,10 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( // Profiling profiling_requests[new_bc.requestsInfo[i].request_guid].decoding_steps += 1; - if(request.status == Request::RUNNING) { + if (request.status == Request::RUNNING) { new_bc.request_running[i] = true; - std::cout << "[Verify] Request " << request.guid << " is running" << std::endl; + std::cout << "[Verify] Request " << request.guid << " is running" + << std::endl; // Get the dfs tree std::vector>> @@ -1062,9 +1065,12 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( } // Normal Request Info - new_bc.requestsInfo[i].token_start_offset = dfs_tree_inputs.front().second; - new_bc.requestsInfo[i].request_guid = old_batches.at(0).requestsInfo[i].request_guid; - new_bc.requestsInfo[i].max_sequence_length = old_batches.at(0).requestsInfo[i].max_sequence_length; + new_bc.requestsInfo[i].token_start_offset = + dfs_tree_inputs.front().second; + new_bc.requestsInfo[i].request_guid = + old_batches.at(0).requestsInfo[i].request_guid; + new_bc.requestsInfo[i].max_sequence_length = + old_batches.at(0).requestsInfo[i].max_sequence_length; // TODO: Check this new_bc.requestsInfo[i].num_tokens_in_batch = 0; new_bc.request_completed[i] = false; @@ -1101,25 +1107,26 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( if (new_bc.num_tokens == BatchConfig::MAX_NUM_TOKENS) { assert(false && - "Exceeding the space available in the TreeVerify batch"); + "Exceeding the space available in the TreeVerify batch"); break; } new_bc.requestsInfo[i].token_start_offset = request.tokens.size() - 1; } - // Committed Tokens if (committed_tokens.find(guid) != committed_tokens.end()) { - if (dfs_tree_inputs.at(0).second == request.initial_len + - committed_tokens.at(guid).size() - - 1) { // commit prompt + if (dfs_tree_inputs.at(0).second == + request.initial_len + committed_tokens.at(guid).size() - + 1) { // commit prompt // for (int j = 0; j < request.initial_len; j++) { - // 
new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_index = j; - // new_bc.committed_tokens[new_bc.num_tokens_to_commit].request_index = + // new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_index + // = j; + // new_bc.committed_tokens[new_bc.num_tokens_to_commit].request_index + // = // i; - // new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_depth = j; - // if (verbose) { + // new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_depth + // = j; if (verbose) { // std::cout << new_bc.num_tokens_to_commit // << "- committed_token.token_depth: " << j // << ", token_index: " << j << std::endl; @@ -1131,14 +1138,16 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( auto committed_token = committed_tokens.at(guid).at(0); new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_index = committed_token.second; - new_bc.committed_tokens[new_bc.num_tokens_to_commit].request_index = i; + new_bc.committed_tokens[new_bc.num_tokens_to_commit].request_index = + i; new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_depth = committed_token.first; if (verbose) { std::cout << new_bc.num_tokens_to_commit << "- committed_token.token_depth: " << committed_token.first - << ", token_index: " << committed_token.second << std::endl; + << ", token_index: " << committed_token.second + << std::endl; } new_bc.num_tokens_to_commit++; } @@ -1172,18 +1181,19 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( << new_bc.num_tokens_to_commit << std::endl; } - // add prompt to the dfs tree // if (committed_tokens.find(guid) != committed_tokens.end()) { // if (dfs_tree_inputs.at(0).second == request.initial_len + - // committed_tokens.at(guid).size() - - // 1) { // commit prompt + // committed_tokens.at(guid).size() + // - 1) { // commit prompt // for (int j = 0; j < request.initial_len; j++) { - // new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_index = j; - // new_bc.committed_tokens[new_bc.num_tokens_to_commit].request_index = + // new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_index + // = j; + // new_bc.committed_tokens[new_bc.num_tokens_to_commit].request_index + // = // i; - // new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_depth = j; - // if (verbose) { + // new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_depth + // = j; if (verbose) { // std::cout << new_bc.num_tokens_to_commit // << "- committed_token.token_depth: " << j // << ", token_index: " << j << std::endl; @@ -1195,14 +1205,16 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( // auto committed_token = committed_tokens.at(guid).at(0); // new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_index = // committed_token.second; - // new_bc.committed_tokens[new_bc.num_tokens_to_commit].request_index = i; + // new_bc.committed_tokens[new_bc.num_tokens_to_commit].request_index + // = i; // new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_depth = // committed_token.first; // if (verbose) { // std::cout << new_bc.num_tokens_to_commit // << "- committed_token.token_depth: " // << committed_token.first - // << ", token_index: " << committed_token.second << std::endl; + // << ", token_index: " << committed_token.second << + // std::endl; // } // new_bc.num_tokens_to_commit++; // } @@ -1222,12 +1234,13 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( // Normal Token Info new_bc.tokensInfo[new_bc.num_tokens].request_index = i; new_bc.tokensInfo[new_bc.num_tokens].token_id = token.first; - 
new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = token.second; + new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = + token.second; // TODO: Add committed token info if (verbose) { - std::cout << "committed_tokens.size(): " << new_bc.num_tokens_to_commit - << std::endl; + std::cout << "committed_tokens.size(): " + << new_bc.num_tokens_to_commit << std::endl; } new_bc.num_tokens++; @@ -1241,42 +1254,48 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( std::cout << "new_bc.num_tokens: " << new_bc.num_tokens << std::endl; } else if (request.status == Request::PENDING) { new_bc.request_running[i] = false; - std::cout << "[Verify] Request " << request.guid << " is pending" << std::endl; + std::cout << "[Verify] Request " << request.guid << " is pending" + << std::endl; std::cout << "SSM KV Cache Size: " << request.ssm_cache_size << std::endl; std::cout << "LLM KV Cache Size: " << request.llm_cache_size << std::endl; - // Normal Request Info new_bc.requestsInfo[i].token_start_offset = request.llm_cache_size; - new_bc.requestsInfo[i].request_guid = old_batches.at(0).requestsInfo[i].request_guid; - new_bc.requestsInfo[i].max_sequence_length = old_batches.at(0).requestsInfo[i].max_sequence_length; + new_bc.requestsInfo[i].request_guid = + old_batches.at(0).requestsInfo[i].request_guid; + new_bc.requestsInfo[i].max_sequence_length = + old_batches.at(0).requestsInfo[i].max_sequence_length; new_bc.request_completed[i] = false; new_bc.requestsInfo[i].num_tokens_in_batch = std::min( BatchConfig::MAX_NUM_TOKENS - new_bc.num_tokens, (int)request.initial_len - new_bc.requestsInfo[i].token_start_offset); - if (committed_tokens.find(guid) != committed_tokens.end()) { for (int j = 0; j < committed_tokens.at(guid).size(); j++) { auto token = committed_tokens.at(guid).at(j); - new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_index = token.second; + new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_index = + token.second; new_bc.committed_tokens[new_bc.num_tokens_to_commit].request_index = i; - new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_depth = token.first; + new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_depth = + token.first; new_bc.num_tokens_to_commit++; request.llm_cache_size++; } - std::cout << "[Verify] Committed Tokens from last loading batch: " << new_bc.num_tokens_to_commit << std::endl; + std::cout << "[Verify] Committed Tokens from last loading batch: " + << new_bc.num_tokens_to_commit << std::endl; } if (request.llm_cache_size < request.initial_len) { // Initialization (prompt) phase for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) { new_bc.tokensInfo[new_bc.num_tokens].request_index = i; - new_bc.tokensInfo[new_bc.num_tokens].token_id = request.tokens[request.llm_cache_size + j]; - new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = request.llm_cache_size + j; + new_bc.tokensInfo[new_bc.num_tokens].token_id = + request.tokens[request.llm_cache_size + j]; + new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = + request.llm_cache_size + j; new_bc.num_tokens++; } @@ -1284,7 +1303,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( std::cout << "new_bc.num_tokens: " << new_bc.num_tokens << std::endl; if (new_bc.num_tokens >= BatchConfig::MAX_NUM_TOKENS) { assert(false && - "Exceeding the space available in the TreeVerify batch"); + "Exceeding the space available in the TreeVerify batch"); break; } } else { @@ -1294,24 +1313,24 @@ TreeVerifyBatchConfig 
RequestManager::prepare_next_batch_verify( new_bc.tokensInfo[new_bc.num_tokens].request_index = i; new_bc.tokensInfo[new_bc.num_tokens].token_id = request.tokens.back(); - new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = request.tokens.size() - 1; + new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = + request.tokens.size() - 1; new_bc.num_tokens++; new_bc.requestsInfo[i].num_tokens_in_batch++; std::cout << "new_bc.num_tokens: " << new_bc.num_tokens << std::endl; - std::cout << "new_bc.requestsInfo[i].num_tokens_in_batch: " << new_bc.requestsInfo[i].num_tokens_in_batch << std::endl; + std::cout << "new_bc.requestsInfo[i].num_tokens_in_batch: " + << new_bc.requestsInfo[i].num_tokens_in_batch << std::endl; - - dfs_tree_inputs[guid] = std::vector>{ std::make_pair(request.tokens.back(), request.tokens.size() - 1) }; + dfs_tree_inputs[guid] = + std::vector>{std::make_pair( + request.tokens.back(), request.tokens.size() - 1)}; } } } else { assert(false && "Request status is not RUNNING or PENDING"); } - - - } if (verbose) { From c758c9ffa16016490ee74456a9e441a7917f022d Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Thu, 7 Sep 2023 14:39:49 -0400 Subject: [PATCH 07/30] fix --- src/runtime/request_manager.cc | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 2b6187ba5b..dfdbc1e822 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -1259,18 +1259,6 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( std::cout << "SSM KV Cache Size: " << request.ssm_cache_size << std::endl; std::cout << "LLM KV Cache Size: " << request.llm_cache_size << std::endl; - // Normal Request Info - new_bc.requestsInfo[i].token_start_offset = request.llm_cache_size; - new_bc.requestsInfo[i].request_guid = - old_batches.at(0).requestsInfo[i].request_guid; - new_bc.requestsInfo[i].max_sequence_length = - old_batches.at(0).requestsInfo[i].max_sequence_length; - - new_bc.request_completed[i] = false; - new_bc.requestsInfo[i].num_tokens_in_batch = std::min( - BatchConfig::MAX_NUM_TOKENS - new_bc.num_tokens, - (int)request.initial_len - new_bc.requestsInfo[i].token_start_offset); - if (committed_tokens.find(guid) != committed_tokens.end()) { for (int j = 0; j < committed_tokens.at(guid).size(); j++) { auto token = committed_tokens.at(guid).at(j); @@ -1287,6 +1275,19 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( std::cout << "[Verify] Committed Tokens from last loading batch: " << new_bc.num_tokens_to_commit << std::endl; } + + // Normal Request Info + new_bc.requestsInfo[i].token_start_offset = request.llm_cache_size; + new_bc.requestsInfo[i].request_guid = + old_batches.at(0).requestsInfo[i].request_guid; + new_bc.requestsInfo[i].max_sequence_length = + old_batches.at(0).requestsInfo[i].max_sequence_length; + + new_bc.request_completed[i] = false; + new_bc.requestsInfo[i].num_tokens_in_batch = std::min( + BatchConfig::MAX_NUM_TOKENS - new_bc.num_tokens, + (int)request.initial_len - new_bc.requestsInfo[i].token_start_offset); + if (request.llm_cache_size < request.initial_len) { // Initialization (prompt) phase From 0b6b14650c2b2a5f06858e47d663878580cbd348 Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Thu, 7 Sep 2023 14:44:30 -0400 Subject: [PATCH 08/30] . 
--- src/runtime/request_manager.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index dfdbc1e822..5f3d9a42d3 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -1275,7 +1275,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( std::cout << "[Verify] Committed Tokens from last loading batch: " << new_bc.num_tokens_to_commit << std::endl; } - + // Normal Request Info new_bc.requestsInfo[i].token_start_offset = request.llm_cache_size; new_bc.requestsInfo[i].request_guid = @@ -1288,7 +1288,6 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( BatchConfig::MAX_NUM_TOKENS - new_bc.num_tokens, (int)request.initial_len - new_bc.requestsInfo[i].token_start_offset); - if (request.llm_cache_size < request.initial_len) { // Initialization (prompt) phase for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) { From 683c2833e22329af2278c3b3042d0791ad1b5548 Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Sun, 10 Sep 2023 21:27:19 -0400 Subject: [PATCH 09/30] fix --- include/flexflow/batch_config.h | 2 +- include/flexflow/model.h | 3 ++- include/flexflow/request_manager.h | 4 ++-- inference/incr_decoding/incr_decoding.cc | 6 ++++-- inference/spec_infer/spec_infer.cc | 6 +++++- src/c/flexflow_c.cc | 4 +++- 6 files changed, 17 insertions(+), 8 deletions(-) diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index ee3d5ac183..8aa69a3cad 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -46,7 +46,7 @@ class BatchConfig { void print() const; virtual InferenceMode get_mode() const; static BatchConfig const *from_future(BatchConfigFuture const &future); - static int const MAX_NUM_REQUESTS = 1; + static int const MAX_NUM_REQUESTS = 4; static int const MAX_NUM_TOKENS = 64; static int const MAX_PROMPT_LENGTH = 62; static int const MAX_SEQ_LENGTH = 256; diff --git a/include/flexflow/model.h b/include/flexflow/model.h index 747c639933..3dc9e79116 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -751,7 +751,8 @@ class FFModel { // ======================================== // Inference APIs // ======================================== - GenerationResult generate(std::string const &text, int max_seq_length); + GenerationResult generate(std::vector &prompts, + int max_seq_length); Tensor create_tensor_legion_ordering(int num_dim, int const dims[], diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index d137f9421e..8b0e24dee1 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -105,10 +105,10 @@ class RequestManager { FFModel *get_model(int model_id); GenerationResult generate_incr_decoding(FFModel *model, - std::string const &text, + std::vector &prompts, int max_seq_length); GenerationResult generate_spec_infer(FFModel *model, - std::string const &text, + std::vector &prompts, int max_seq_length); GenerationResult get_generation_result(RequestGuid const &guid); RequestGuid register_new_request(std::string const &prompt, diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index 19cd8726e2..3f913e4573 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -242,13 +242,15 @@ void FlexFlow::top_level_task(Task const *task, /*parser_callback_t */ nullptr, /*allow_exceptions */ true, /*ignore_comments */ true); + 
std::vector prompts; for (auto &prompt : prompt_json) { std::string text = prompt.get(); printf("Prompt[%d]: %s\n", total_num_requests, text.c_str()); total_num_requests++; - GenerationResult result = - model.generate(text, 128 /*max_sequence_length*/); + prompts.push_back(text); } + GenerationResult result = + model.generate(prompts, 128 /*max_sequence_length*/); } // Execution fence diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc index 9d139997f7..2b1fb6e817 100644 --- a/inference/spec_infer/spec_infer.cc +++ b/inference/spec_infer/spec_infer.cc @@ -384,12 +384,16 @@ void FlexFlow::top_level_task(Task const *task, /*parser_callback_t */ nullptr, /*allow_exceptions */ true, /*ignore_comments */ true); + + std::vector prompts; for (auto &prompt : prompt_json) { std::string text = prompt.get(); printf("Prompt[%d]: %s\n", total_num_requests, text.c_str()); total_num_requests++; - tree_model.generate(text, 128 /*max_sequence_length*/); + prompts.push_back(text); + // tree_model.generate(text, 128 /*max_sequence_length*/); } + tree_model.generate(prompts, 128 /*max_sequence_length*/); } // Execution fence diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc index 0c1fad17df..81a41a3adc 100644 --- a/src/c/flexflow_c.cc +++ b/src/c/flexflow_c.cc @@ -1442,8 +1442,10 @@ flexflow_generation_result_t int max_seq_length, int *output_length_and_tokens) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); + std::vector prompts; std::string const text_str(input_text); - GenerationResult result = handle->generate(text_str, max_seq_length); + prompts.push_back(input_text); + GenerationResult result = handle->generate(prompts, max_seq_length); DEBUG_PRINT("[Model] generate %p %s %i", handle, text_str, max_seq_length); assert(result.output_tokens.size() <= max_seq_length); output_length_and_tokens[0] = result.output_tokens.size(); From d44c1a1df1a9a6bb9bc1dfa2e0c1af953fa54b06 Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Sun, 10 Sep 2023 21:27:51 -0400 Subject: [PATCH 10/30] fix --- src/runtime/request_manager.cc | 99 +++++++++++++++++++--------------- 1 file changed, 57 insertions(+), 42 deletions(-) diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 75d25b0cd7..5ed0045cf9 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -620,7 +620,8 @@ BeamSearchBatchConfig std::ofstream outputFile(output_filepath); if (outputFile.is_open()) { outputFile << "end-to-end latency: " << std::fixed - << std::setprecision(3) << total_request_run_time + << std::setprecision(3) + << profile_info.finish_time - profile_info.start_time << std::endl; outputFile << "num decoding steps: " << profile_info.decoding_steps << std::endl; @@ -798,9 +799,9 @@ BeamSearchBatchConfig all_requests[new_request.guid].ssm_cache_size = new_bc.requestsInfo[i].num_tokens_in_batch; new_bc.request_running[i] = false; - std::cout << "SSM KV Cache Size: " + std::cout << "SSM KV Cache Size init: " << all_requests[new_request.guid].ssm_cache_size << std::endl; - std::cout << "LLM KV Cache Size: " + std::cout << "LLM KV Cache Size init: " << all_requests[new_request.guid].llm_cache_size << std::endl; std::cout << "load " << new_bc.requestsInfo[i].num_tokens_in_batch @@ -869,7 +870,6 @@ BeamSearchBatchConfig std::cout << "Current Beam Depth: " << old_bc.beamRequestsInfo[0].current_depth << "\n"; } - // Step 1: Store result to the beam tree struct store_beam_metadata(old_bc, result); @@ -891,20 +891,29 @@ BeamSearchBatchConfig // assert(processed_tokens < 
request.tokens.size()); log_req_mgr.debug() << "processed_tokens: " << processed_tokens << "\n"; - if (processed_tokens > - old_bc.beamRequestsInfo[i].max_depth + request.tokens.size() - // || ir.results[t] == 0 TODO: replace this with - ) { - log_req_mgr.print("[Done] guid(%zu) with spec_tree_depth(%d)", - old_bc.requestsInfo[i].request_guid, - old_bc.beamRequestsInfo[i].max_depth); - // new_bc.request_completed[i] = true; - new_bc.request_completed[i] = false; - new_bc.requestsInfo[i].token_start_offset = processed_tokens; - new_bc.requestsInfo[i].request_guid = old_bc.requestsInfo[i].request_guid; - new_bc.requestsInfo[i].max_sequence_length = - old_bc.requestsInfo[i].max_sequence_length; - } else { + // if (processed_tokens > + // old_bc.beamRequestsInfo[i].max_depth + request.tokens.size() && + // request.status == Request::RUNNING + // // || ir.results[t] == 0 TODO: replace this with + // ) { + // // log_req_mgr.print("[Done] guid(%zu) with spec_tree_depth(%d)", + // // old_bc.requestsInfo[i].request_guid, + // // old_bc.beamRequestsInfo[i].max_depth); + // // // new_bc.request_completed[i] = true; + // // new_bc.request_completed[i] = false; + // // new_bc.requestsInfo[i].token_start_offset = processed_tokens; + // // new_bc.requestsInfo[i].request_guid = + // // old_bc.requestsInfo[i].request_guid; + // // new_bc.requestsInfo[i].max_sequence_length = + // // old_bc.requestsInfo[i].max_sequence_length; + // // new_bc.beamRequestsInfo[i].current_depth = + // // old_bc.beamRequestsInfo[i].current_depth; + // // new_bc.request_running[i] = false; + // std::cout << "beam search end:" << request.status << i << ", " + // << new_bc.requestsInfo[i].num_tokens_in_batch << "\n"; + // } + // else + { log_req_mgr.debug() << "num tokens: " << old_bc.num_tokens << ", " << new_bc.num_tokens; new_bc.request_completed[i] = false; @@ -953,8 +962,10 @@ BeamSearchBatchConfig new_bc.requestsInfo[i].token_start_offset); request.ssm_cache_size += new_bc.requestsInfo[i].num_tokens_in_batch; } - std::cout << "SSM KV Cache Size: " << request.ssm_cache_size << std::endl; - std::cout << "LLM KV Cache Size: " << request.llm_cache_size << std::endl; + std::cout << "SSM KV Cache Size beam: " << request.ssm_cache_size + << std::endl; + std::cout << "LLM KV Cache Size beam: " << request.llm_cache_size + << std::endl; // register more tokens due to the beam width for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) { @@ -1258,8 +1269,10 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( new_bc.request_running[i] = false; std::cout << "[Verify] Request " << request.guid << " is pending" << std::endl; - std::cout << "SSM KV Cache Size: " << request.ssm_cache_size << std::endl; - std::cout << "LLM KV Cache Size: " << request.llm_cache_size << std::endl; + std::cout << "SSM KV Cache Size verify: " << request.ssm_cache_size + << std::endl; + std::cout << "LLM KV Cache Size verify: " << request.llm_cache_size + << std::endl; if (committed_tokens.find(guid) != committed_tokens.end()) { for (int j = 0; j < committed_tokens.at(guid).size(); j++) { @@ -1361,28 +1374,25 @@ void RequestManager::store_beam_metadata(BeamSearchBatchConfig const &old_bc, << " tokens in the current batch.\n"; } - for (int i = 0; i <= old_bc.num_tokens; i++) { + for (int i = 0; i < old_bc.num_tokens; i++) { int request_index = old_bc.tokensInfo[i].request_index; - - // End of the request - if (i == old_bc.num_tokens || - old_bc.requestsInfo[request_index].request_guid != guid) { + if (i == 0 || 
old_bc.requestsInfo[request_index].request_guid != guid) { // Each token yields (beam_width) results int beam_width = old_bc.beamRequestsInfo[request_index].beam_size; // Count tokens sent to model in this request to find the final token's // index - result_index += - (old_bc.tokensInfo[i - 1].abs_depth_in_request - start_depth) * - beam_width; + // result_index += + // (old_bc.tokensInfo[i - 1].abs_depth_in_request - start_depth) * + // beam_width; if (verbose) { std::cout << "i = " << i << ", result index = " << result_index << ", value: " << result.token_ids[result_index] << "\n"; } - int index = old_bc.tokensInfo[i - 1].request_index; + int index = old_bc.tokensInfo[i].request_index; int beam_size = old_bc.beamRequestsInfo[index].beam_size; int depth = old_bc.beamRequestsInfo[index].current_depth; @@ -1802,24 +1812,27 @@ std::vector> return merged_tree; } -GenerationResult FFModel::generate(std::string const &text, +GenerationResult FFModel::generate(std::vector &prompts, int max_seq_length) { RequestManager *rm = RequestManager::get_request_manager(); if (rm->get_num_ssms() == 0) { // No SSMs: perform incremental decoding - return rm->generate_incr_decoding(this, text, max_seq_length); + return rm->generate_incr_decoding(this, prompts, max_seq_length); } else { // Registered SSMs: perform speculative inference - return rm->generate_spec_infer(this, text, max_seq_length); + return rm->generate_spec_infer(this, prompts, max_seq_length); } } /*static*/ -GenerationResult RequestManager::generate_incr_decoding(FFModel *llm, - std::string const &text, - int max_seq_length) { +GenerationResult RequestManager::generate_incr_decoding( + FFModel *llm, std::vector &prompts, int max_seq_length) { InferenceManager *im = InferenceManager::get_inference_manager(); - RequestGuid guid = register_new_request(text, max_seq_length); + RequestGuid guid; + for (int i = 0; i < prompts.size(); i++) { + guid = register_new_request(prompts.at(i), max_seq_length); + } + if (guid == 0) { std::cout << "=========== Discard request exceed prompt maximum... ===========" @@ -1869,11 +1882,13 @@ GenerationResult RequestManager::generate_incr_decoding(FFModel *llm, } /*static*/ -GenerationResult RequestManager::generate_spec_infer(FFModel *llm, - std::string const &text, - int max_seq_length) { +GenerationResult RequestManager::generate_spec_infer( + FFModel *llm, std::vector &prompts, int max_seq_length) { InferenceManager *im = InferenceManager::get_inference_manager(); - RequestGuid guid = register_new_request(text, max_seq_length); + RequestGuid guid; + for (int i = 0; i < prompts.size(); i++) { + guid = register_new_request(prompts.at(i), max_seq_length); + } if (guid == 0) { std::cout << "=========== Discard request exceed prompt maximum... ===========" From 35a33e5a48c6ee422d199bea27506bb088c0af1e Mon Sep 17 00:00:00 2001 From: Zeyu Wang Date: Sun, 10 Sep 2023 23:23:36 -0400 Subject: [PATCH 11/30] fix. 
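Skip batch slots that are not actively running inside the speculative attention kernel, relax the TreeVerify capacity asserts from "== MAX_NUM_TOKENS" to "> MAX_NUM_TOKENS" so an exactly full batch is still accepted, and leave one spare slot when sizing a pending request's share of the verify batch. A minimal sketch of the relaxed capacity check, using an illustrative standalone helper rather than the real RequestManager code:

    #include <cassert>

    constexpr int kMaxNumTokens = 64; // stand-in for BatchConfig::MAX_NUM_TOKENS

    // The previous check fired once the batch became exactly full, which is
    // still a legal batch; only exceeding the capacity should trip the assert.
    void check_tree_verify_capacity(int num_tokens) {
      assert(num_tokens <= kMaxNumTokens &&
             "Exceeding the space available in the TreeVerify batch");
    }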
--- src/ops/spec_inc_multihead_self_attention.cu | 7 +++++-- src/runtime/request_manager.cc | 4 ++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu index af70a07e83..5aa5266f15 100644 --- a/src/ops/spec_inc_multihead_self_attention.cu +++ b/src/ops/spec_inc_multihead_self_attention.cu @@ -248,10 +248,13 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, assert(m->qProjSize == m->kProjSize); for (int i = 0; i < bc->MAX_NUM_REQUESTS; i++) { - if (bc->request_completed[i]) { + if (bc->request_completed[i] || !bc->request_running[i]) { continue; } + std::cout << "[Verifying batch]: " << i << std::endl; + for (int sub_req_id = 0; sub_req_id < bc->sub_requests[i]; sub_req_id++) { + std::cout << "[Verifying sub request]: " << sub_req_id << std::endl; // int num_new_tokens = bc->num_processing_tokens[i]; // int total_tokens = bc->token_last_available_idx[i] + 1; @@ -543,7 +546,7 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, output_ptr, bias_ptr, num_tokens, qkv_weight_size, m->oProjSize); } - assert(tokens_previous_requests == num_tokens); + // assert(tokens_previous_requests == num_tokens); } template diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 5ed0045cf9..7c41ef1a80 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -1118,7 +1118,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( new_bc.num_tokens++; new_bc.requestsInfo[i].num_tokens_in_batch++; - if (new_bc.num_tokens == BatchConfig::MAX_NUM_TOKENS) { + if (new_bc.num_tokens > BatchConfig::MAX_NUM_TOKENS) { assert(false && "Exceeding the space available in the TreeVerify batch"); break; @@ -1300,7 +1300,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( new_bc.request_completed[i] = false; new_bc.requestsInfo[i].num_tokens_in_batch = std::min( - BatchConfig::MAX_NUM_TOKENS - new_bc.num_tokens, + BatchConfig::MAX_NUM_TOKENS - 1 - new_bc.num_tokens, (int)request.initial_len - new_bc.requestsInfo[i].token_start_offset); if (request.llm_cache_size < request.initial_len) { From 0d7524a200dd29b8e313bf0083494bec2451d3e2 Mon Sep 17 00:00:00 2001 From: Zeyu Wang Date: Mon, 11 Sep 2023 00:00:54 -0400 Subject: [PATCH 12/30] Fix reloading new request with long prompts. 
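A pending request whose prompt does not fit in one batch previously tried to load it all at once and overflowed the TreeVerify batch. Introduce a max_prompt_load_size budget: reserve room for the speculative tokens of every running request first, then let each pending request load at most the remaining budget per step and carry the rest of its prompt over to later batches. Roughly, the budgeting behaves like the sketch below (a standalone illustration with simplified types; kMaxBeamDepth is an assumed value standing in for BeamSearchBatchConfig::MAX_BEAM_DEPTH):

    #include <algorithm>
    #include <vector>

    constexpr int kMaxNumTokens = 64; // BatchConfig::MAX_NUM_TOKENS
    constexpr int kMaxBeamDepth = 8;  // assumed illustrative value

    struct Slot {
      bool completed;
      bool running;
      int prompt_remaining; // initial_len - token_start_offset
    };

    // How many prompt tokens each pending request may load this step.
    std::vector<int> plan_prompt_loads(std::vector<Slot> const &slots) {
      int budget = kMaxNumTokens;
      for (Slot const &s : slots) {
        if (!s.completed && s.running) {
          budget -= kMaxBeamDepth; // keep room for the running request's tree
        }
      }
      std::vector<int> loads(slots.size(), 0);
      for (size_t i = 0; i < slots.size(); i++) {
        if (slots[i].completed || slots[i].running) {
          continue;
        }
        loads[i] = std::max(0, std::min(budget, slots[i].prompt_remaining));
        budget -= loads[i];
      }
      return loads;
    }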
--- src/ops/spec_inc_multihead_self_attention.cu | 24 ++++++++++---------- src/runtime/request_manager.cc | 18 +++++++++++++-- 2 files changed, 28 insertions(+), 14 deletions(-) diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu index 5aa5266f15..d3eb88bd43 100644 --- a/src/ops/spec_inc_multihead_self_attention.cu +++ b/src/ops/spec_inc_multihead_self_attention.cu @@ -353,18 +353,18 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, } // add alibi position bias to qk production // add alibi position bias to qk production - if (*m->position_bias) { - size_t parallelism = m->num_q_heads * total_tokens * num_new_tokens; - apply_position_bias_qkprd<<>>(C, - num_new_tokens, - total_tokens, - m->num_q_heads, - m->global_num_q_heads, - shard_id); - } + if (*m->position_bias) { + size_t parallelism = m->num_q_heads * total_tokens * num_new_tokens; + apply_position_bias_qkprd<<>>(C, + num_new_tokens, + total_tokens, + m->num_q_heads, + m->global_num_q_heads, + shard_id); + } // Fill all elements above diagonal in qk prods with -inf to force // causal attention. assert(num_new_tokens <= total_tokens); diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 7c41ef1a80..9d8c7f6c2b 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -1041,6 +1041,15 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( new_bc.num_tokens_to_commit = 0; new_bc.num_tokens = 0; + int max_prompt_load_size = BatchConfig::MAX_NUM_TOKENS; + for (int i = 0; i < TreeVerifyBatchConfig::MAX_NUM_REQUESTS; i++) { + if (old_batches.at(0).request_completed[i]) { + continue; + } else if (old_batches.at(0).request_running[i]) { + max_prompt_load_size -= BeamSearchBatchConfig::MAX_BEAM_DEPTH; + } + } + for (int i = 0; i < TreeVerifyBatchConfig::MAX_NUM_REQUESTS; i++) { if (old_batches.at(0).request_completed[i]) { continue; @@ -1299,9 +1308,14 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( old_batches.at(0).requestsInfo[i].max_sequence_length; new_bc.request_completed[i] = false; + // new_bc.requestsInfo[i].num_tokens_in_batch = std::min( + // BatchConfig::MAX_NUM_TOKENS - 1 - new_bc.num_tokens, + // (int)request.initial_len - + // new_bc.requestsInfo[i].token_start_offset); new_bc.requestsInfo[i].num_tokens_in_batch = std::min( - BatchConfig::MAX_NUM_TOKENS - 1 - new_bc.num_tokens, + max_prompt_load_size, (int)request.initial_len - new_bc.requestsInfo[i].token_start_offset); + max_prompt_load_size -= new_bc.requestsInfo[i].num_tokens_in_batch; if (request.llm_cache_size < request.initial_len) { // Initialization (prompt) phase @@ -1316,7 +1330,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( } std::cout << "new_bc.num_tokens: " << new_bc.num_tokens << std::endl; - if (new_bc.num_tokens >= BatchConfig::MAX_NUM_TOKENS) { + if (new_bc.num_tokens > BatchConfig::MAX_NUM_TOKENS) { assert(false && "Exceeding the space available in the TreeVerify batch"); break; From 7c8227d0b0ecf16fc4372b8dc2be654137ce8511 Mon Sep 17 00:00:00 2001 From: Zeyu Wang Date: Mon, 11 Sep 2023 00:45:37 -0400 Subject: [PATCH 13/30] Fix edge cases. 
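A running request can occupy one slot beyond its speculated tree in the verify batch (presumably the committed root token), so the prompt-load budget now reserves MAX_BEAM_DEPTH + 1 tokens per running request, and the beam-search incremental-phase test becomes token_start_offset >= request.tokens.size(). A worked example of the reservation, with MAX_NUM_TOKENS = 64 (the current default) and MAX_BEAM_DEPTH = 8 assumed purely for illustration:

    two running requests reserve 2 * (8 + 1) = 18 tokens,
    leaving 64 - 18 = 46 slots of prompt that pending requests may
    load in this verify batch; the remainder of a longer prompt is
    deferred to the next batch.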
--- src/ops/tree_inc_multihead_self_attention.cu | 3 --- src/runtime/request_manager.cc | 23 ++++++++++++++++---- 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu index c41312cc3c..f916bdb925 100644 --- a/src/ops/tree_inc_multihead_self_attention.cu +++ b/src/ops/tree_inc_multihead_self_attention.cu @@ -515,9 +515,6 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, m->oProjSize); } - std::cout << "processed_tokens_in_batch: " << processed_tokens_in_batch - << std::endl; - std::cout << "num_active_tokens: " << bc->num_active_tokens() << std::endl; assert(processed_tokens_in_batch == bc->num_active_tokens()); } diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 9d8c7f6c2b..0505e8bfab 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -238,6 +238,7 @@ RequestManager::RequestGuid all_requests[request.guid] = request; { std::string output = "New request tokens:"; + output = "[" + std::to_string(request.guid) + "]" + output; for (int i = 0; i < request.tokens.size(); i++) { output = output + " " + std::to_string(request.tokens[i]); } @@ -523,6 +524,8 @@ BeamSearchBatchConfig size_t guid = old_bc.requestsInfo[i].request_guid; Request &request = all_requests[guid]; + std::cout << "[ " << guid << " ]" << std::endl; + // Verify this: get verified tokens from result std::vector> tree_outputs = std::vector>(); @@ -703,6 +706,8 @@ BeamSearchBatchConfig new_bc.request_completed[i] = false; new_bc.request_running[i] = false; + std::cout << "ssm_cache_size: " << request.ssm_cache_size << ", " + << "initial_len: " << request.initial_len << std::endl; assert(request.ssm_cache_size == request.initial_len); // Normal Request Info @@ -949,11 +954,12 @@ BeamSearchBatchConfig // do the slot exchange to minimize the cache exchange in kernel. 
// update_beam_metadata(new_bc, request.beam_trees.at(old_bc.model_id), // i); - - if (new_bc.requestsInfo[i].token_start_offset + 1 >= - request.tokens.size()) { + std::cout << request.guid << std::endl; + if (new_bc.requestsInfo[i].token_start_offset >= request.tokens.size()) { // Incremental phase new_bc.requestsInfo[i].num_tokens_in_batch = 1; + std::cout << "Incremental phase: " << request.tokens.size() + << std::endl; } else { // Prompt phase new_bc.requestsInfo[i].num_tokens_in_batch = @@ -961,7 +967,11 @@ BeamSearchBatchConfig (int)request.tokens.size() - new_bc.requestsInfo[i].token_start_offset); request.ssm_cache_size += new_bc.requestsInfo[i].num_tokens_in_batch; + std::cout << "Prompt phase: " << request.tokens.size() << std::endl; + std::cout << "Update ssm cache size: " << request.ssm_cache_size + << std::endl; } + std::cout << "SSM KV Cache Size beam: " << request.ssm_cache_size << std::endl; std::cout << "LLM KV Cache Size beam: " << request.llm_cache_size @@ -1046,7 +1056,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( if (old_batches.at(0).request_completed[i]) { continue; } else if (old_batches.at(0).request_running[i]) { - max_prompt_load_size -= BeamSearchBatchConfig::MAX_BEAM_DEPTH; + max_prompt_load_size -= (BeamSearchBatchConfig::MAX_BEAM_DEPTH + 1); } } @@ -1317,6 +1327,11 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( (int)request.initial_len - new_bc.requestsInfo[i].token_start_offset); max_prompt_load_size -= new_bc.requestsInfo[i].num_tokens_in_batch; + std::cout << "max_prompt_load_size: " << max_prompt_load_size + << std::endl; + std::cout << "new_bc.requestsInfo[i].num_tokens_in_batch: " << i << ", " + << new_bc.requestsInfo[i].num_tokens_in_batch << std::endl; + if (request.llm_cache_size < request.initial_len) { // Initialization (prompt) phase for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) { From 230e0bc64f90279f6c34ebe4f192ea4bca30fc6d Mon Sep 17 00:00:00 2001 From: Zeyu Wang Date: Mon, 11 Sep 2023 01:05:25 -0400 Subject: [PATCH 14/30] Fix edge case --- src/runtime/request_manager.cc | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 0505e8bfab..a928468026 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -143,17 +143,12 @@ RequestManager::RequestGuid request.guid = next_available_guid++; request.max_sequence_length = max_sequence_length; - if (prompt.size() > BatchConfig::MAX_PROMPT_LENGTH) { + if (prompt.size() >= BatchConfig::MAX_SEQ_LENGTH) { std::cout << "Warning: too many tokens in prompt, only load up to " - << BatchConfig::MAX_PROMPT_LENGTH << " tokens, but got " + << BatchConfig::MAX_SEQ_LENGTH << " tokens, but got " << prompt.size() << ".\n"; - // Truncate the prompt to MAX_NUM_TOKENS - // request.tokens.insert(request.tokens.end(), - // prompt.begin(), - // prompt.begin() + BatchConfig::MAX_PROMPT_LENGTH); - // request.initial_len = BatchConfig::MAX_PROMPT_LENGTH; + printf("tokens size: %zu\n", request.tokens.size()); - // assert(false); return 0; } else { request.initial_len = prompt.size(); @@ -206,14 +201,12 @@ RequestManager::RequestGuid request.tokens.push_back(bos_token_id); } std::vector tokens = this->tokenizer_->Encode(prompt); - if (tokens.size() > BatchConfig::MAX_PROMPT_LENGTH) { + if (tokens.size() >= BatchConfig::MAX_SEQ_LENGTH) { std::cout << "Warning: too many tokens in prompt, only load up to " - << 
BatchConfig::MAX_PROMPT_LENGTH << " tokens, but got " + << BatchConfig::MAX_SEQ_LENGTH << " tokens, but got " << tokens.size() << ".\n"; - // Truncate the prompt to MAX_NUM_TOKENS - // tokens.resize(BatchConfig::MAX_PROMPT_LENGTH); + printf("tokens size: %zu\n", tokens.size()); - // assert(false); return 0; } for (int i = 0; i < tokens.size(); i++) { @@ -959,15 +952,20 @@ BeamSearchBatchConfig // Incremental phase new_bc.requestsInfo[i].num_tokens_in_batch = 1; std::cout << "Incremental phase: " << request.tokens.size() - << std::endl; + << ", num_tokens_in_batch: " + << new_bc.requestsInfo[i].num_tokens_in_batch << std::endl; } else { // Prompt phase new_bc.requestsInfo[i].num_tokens_in_batch = - std::min(BatchConfig::MAX_NUM_TOKENS - new_bc.num_tokens, + // std::min(BatchConfig::MAX_NUM_TOKENS - new_bc.num_tokens, + std::min(BatchConfig::MAX_NUM_TOKENS - new_bc.num_tokens - + BatchConfig::MAX_NUM_REQUESTS + i, (int)request.tokens.size() - new_bc.requestsInfo[i].token_start_offset); request.ssm_cache_size += new_bc.requestsInfo[i].num_tokens_in_batch; - std::cout << "Prompt phase: " << request.tokens.size() << std::endl; + std::cout << "Prompt phase: " << request.tokens.size() + << ", num_tokens_in_batch:" + << new_bc.requestsInfo[i].num_tokens_in_batch << std::endl; std::cout << "Update ssm cache size: " << request.ssm_cache_size << std::endl; } From 9ed2684ca5539b778c7c42a35c0cf753f25cb16e Mon Sep 17 00:00:00 2001 From: Zeyu Wang Date: Mon, 11 Sep 2023 01:14:58 -0400 Subject: [PATCH 15/30] fix --- src/runtime/request_manager.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index a928468026..08357ae06b 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -1055,6 +1055,8 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( continue; } else if (old_batches.at(0).request_running[i]) { max_prompt_load_size -= (BeamSearchBatchConfig::MAX_BEAM_DEPTH + 1); + } else { + max_prompt_load_size -= 1; } } From 87ef9cb20a85d603c8237f9b585825a3ebe32e27 Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Mon, 11 Sep 2023 23:59:14 -0400 Subject: [PATCH 16/30] try a fix to CI --- include/flexflow/ops/kernels/softmax_kernels.h | 2 ++ src/ops/kernels/softmax.cpp | 5 ++++- src/ops/kernels/softmax.cu | 12 ++++++++++-- 3 files changed, 16 insertions(+), 3 deletions(-) diff --git a/include/flexflow/ops/kernels/softmax_kernels.h b/include/flexflow/ops/kernels/softmax_kernels.h index 14c07414e9..987a546459 100644 --- a/include/flexflow/ops/kernels/softmax_kernels.h +++ b/include/flexflow/ops/kernels/softmax_kernels.h @@ -15,8 +15,10 @@ class SoftmaxMeta : public OpMeta { Legion::Domain const &input_domain); #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) cudnnTensorDescriptor_t inputTensor; + cudnnTensorDescriptor_t outputTensor; #else miopenTensorDescriptor_t inputTensor; + miopenTensorDescriptor_t outputTensor; #endif bool profiling; int dim; diff --git a/src/ops/kernels/softmax.cpp b/src/ops/kernels/softmax.cpp index bd8b46116d..d2b35cb48a 100644 --- a/src/ops/kernels/softmax.cpp +++ b/src/ops/kernels/softmax.cpp @@ -29,6 +29,9 @@ SoftmaxMeta::SoftmaxMeta(FFHandler handler, checkCUDNN(miopenCreateTensorDescriptor(&inputTensor)); checkCUDNN( cudnnSetTensorDescriptorFromDomain4SoftMax(inputTensor, input_domain)); + checkCUDNN(cudnnCreateTensorDescriptor(&outputTensor)); + checkCUDNN(cudnnSetTensorDescriptorFromDomain4SoftMax( + outputTensor, input_domain, softmax->data_type)); dim = softmax->dim; 
profiling = softmax->profiling; std::strcpy(op_name, softmax->name); @@ -127,7 +130,7 @@ void forward_kernel(SoftmaxMeta const *m, m->inputTensor, input_ptr, &beta, - m->inputTensor, + m->outputTensor, output_ptr, MIOPEN_SOFTMAX_ACCURATE, MIOPEN_SOFTMAX_MODE_CHANNEL)); diff --git a/src/ops/kernels/softmax.cu b/src/ops/kernels/softmax.cu index 15130c19a7..f399f43ac1 100644 --- a/src/ops/kernels/softmax.cu +++ b/src/ops/kernels/softmax.cu @@ -28,6 +28,9 @@ SoftmaxMeta::SoftmaxMeta(FFHandler handler, checkCUDNN(cudnnCreateTensorDescriptor(&inputTensor)); checkCUDNN(cudnnSetTensorDescriptorFromDomain4SoftMax( inputTensor, input_domain, softmax->data_type)); + checkCUDNN(cudnnCreateTensorDescriptor(&outputTensor)); + checkCUDNN(cudnnSetTensorDescriptorFromDomain4SoftMax( + outputTensor, input_domain, softmax->data_type)); dim = softmax->dim; profiling = softmax->profiling; std::strcpy(op_name, softmax->name); @@ -42,8 +45,9 @@ void forward_kernel_wrapper(SoftmaxMeta const *m, DT *output_ptr) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); - cudaEvent_t t_start, t_end; + std::cout << "softmax kernel: " + << "\n"; if (m->profiling) { cudaEventCreate(&t_start); cudaEventCreate(&t_end); @@ -63,6 +67,10 @@ void forward_kernel_wrapper(SoftmaxMeta const *m, log_measure.debug( "%s [Softmax] forward time = %.2fms\n", m->op_name, elapsed); } + std::cout << "softmax kernel end: " + << "\n"; + + print_tensor((float *)output_ptr, 32, "softmax output"); } template @@ -127,7 +135,7 @@ void forward_kernel(SoftmaxMeta const *m, m->inputTensor, input_ptr, &beta, - m->inputTensor, + m->outputTensor, output_ptr)); } From 8898493cb227903c0c981b766a205dad6519c58f Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Tue, 12 Sep 2023 01:32:15 -0400 Subject: [PATCH 17/30] . 
--- src/ops/kernels/softmax.cu | 6 --- .../python_test_configs/generate_configs.py | 48 +++++++++---------- 2 files changed, 24 insertions(+), 30 deletions(-) diff --git a/src/ops/kernels/softmax.cu b/src/ops/kernels/softmax.cu index f399f43ac1..67a9c21038 100644 --- a/src/ops/kernels/softmax.cu +++ b/src/ops/kernels/softmax.cu @@ -46,8 +46,6 @@ void forward_kernel_wrapper(SoftmaxMeta const *m, cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); cudaEvent_t t_start, t_end; - std::cout << "softmax kernel: " - << "\n"; if (m->profiling) { cudaEventCreate(&t_start); cudaEventCreate(&t_end); @@ -67,10 +65,6 @@ void forward_kernel_wrapper(SoftmaxMeta const *m, log_measure.debug( "%s [Softmax] forward time = %.2fms\n", m->op_name, elapsed); } - std::cout << "softmax kernel end: " - << "\n"; - - print_tensor((float *)output_ptr, 32, "softmax output"); } template diff --git a/tests/inference/python_test_configs/generate_configs.py b/tests/inference/python_test_configs/generate_configs.py index e780bc17b0..0d30a3edad 100644 --- a/tests/inference/python_test_configs/generate_configs.py +++ b/tests/inference/python_test_configs/generate_configs.py @@ -63,35 +63,35 @@ os.chdir(dname) -# Generate incremental decoding configs -all_models = llama_models + opt_models + falcon_models + mpt_models -for model_name in all_models: - for full_precision in (True, False): - for parallelism_degrees in parallelism_settings: +# # Generate incremental decoding configs +# all_models = llama_models + opt_models + falcon_models + mpt_models +# for model_name in all_models: +# for full_precision in (True, False): +# for parallelism_degrees in parallelism_settings: - tp, pp = parallelism_degrees +# tp, pp = parallelism_degrees - # Tensor parallelism not supported by small Falcon model atm - if tp > 1 and ("falcon" in model_name or "starcoder" in model_name): - continue - # skip tp=4 for big models - if tp > 2 and ("7b" in model_name or "6.7b" in model_name): - continue +# # Tensor parallelism not supported by small Falcon model atm +# if tp > 1 and ("falcon" in model_name or "starcoder" in model_name): +# continue +# # skip tp=4 for big models +# if tp > 2 and ("7b" in model_name or "6.7b" in model_name): +# continue - _, after_slash = model_name.rsplit("/", maxsplit=1) - filename = "incr_dec-" + "python-" + after_slash + ("-full_prec-" if full_precision else "-half_prec-") + f"{tp}_tp_{pp}_pp" - test_configs_file = "./" + filename + ".json" - output_file = os.path.join(output_folder, filename+".txt") +# _, after_slash = model_name.rsplit("/", maxsplit=1) +# filename = "incr_dec-" + "python-" + after_slash + ("-full_prec-" if full_precision else "-half_prec-") + f"{tp}_tp_{pp}_pp" +# test_configs_file = "./" + filename + ".json" +# output_file = os.path.join(output_folder, filename+".txt") - ff_init_configs["tensor_parallelism_degree"] = tp - ff_init_configs["pipeline_parallelism_degree"] = pp - ff_init_configs["llm_model"] = model_name - ff_init_configs["full_precision"] = full_precision - ff_init_configs["output_file"] = output_file - ff_init_configs["prompt"] = prompt_file +# ff_init_configs["tensor_parallelism_degree"] = tp +# ff_init_configs["pipeline_parallelism_degree"] = pp +# ff_init_configs["llm_model"] = model_name +# ff_init_configs["full_precision"] = full_precision +# ff_init_configs["output_file"] = output_file +# ff_init_configs["prompt"] = prompt_file - with open(test_configs_file, "w+") as outfile: - json.dump(ff_init_configs, outfile, indent=4) +# with open(test_configs_file, "w+") as outfile: 
+# json.dump(ff_init_configs, outfile, indent=4) # Generate speculative inference configs model_pairs = [llama_models, opt_models] From e328e2dc89576f4621f110abd37224ec337b7747 Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Tue, 12 Sep 2023 09:26:28 -0400 Subject: [PATCH 18/30] fix --- src/ops/argmax.cpp | 2 +- src/ops/argmax.cu | 6 ++++-- src/ops/kernels/softmax.cpp | 4 ++-- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/src/ops/argmax.cpp b/src/ops/argmax.cpp index ec5ea6c36a..8a1cf0b3b0 100644 --- a/src/ops/argmax.cpp +++ b/src/ops/argmax.cpp @@ -393,7 +393,7 @@ void ArgMax::forward_kernel(ArgMaxMeta const *m, if (m->beam_search) { // set all parents id zero in arg top1 case. - checkCUDA(hipMemset(parent, 0, batch_size * sizeof(int))); + checkCUDA(hipMemsetAsync(parent, 0, batch_size * sizeof(int), stream)); } int num_shards = 0; int k = 1; diff --git a/src/ops/argmax.cu b/src/ops/argmax.cu index 37e067006c..b951dbda86 100644 --- a/src/ops/argmax.cu +++ b/src/ops/argmax.cu @@ -59,7 +59,7 @@ void ArgMax::forward_kernel(ArgMaxMeta const *m, DT alpha = 1.0f, beta = 0.0f; if (m->beam_search) { // set all parents id zero in arg top1 case. - checkCUDA(cudaMemset(parent, 0, batch_size * sizeof(int))); + checkCUDA(cudaMemsetAsync(parent, 0, batch_size * sizeof(int), stream)); } size_t temp_storage_bytes = m->temp_storage_bytes; // use cub @@ -91,9 +91,10 @@ void ArgMax::forward_kernel_wrapper(ArgMaxMeta const *m, GenericTensorAccessorW const &indices, GenericTensorAccessorW const &parent, int batch_size) { + std::cout << "argmax kernel start" << "\n"; cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); - + std::cout << "argmax kernel" << "\n"; cudaEvent_t t_start, t_end; if (m->profiling) { cudaEventCreate(&t_start); @@ -136,6 +137,7 @@ void ArgMax::forward_kernel_wrapper(ArgMaxMeta const *m, cudaEventDestroy(t_end); printf("[ArgMax] forward time = %.2lfms\n", elapsed); } + std::cout << "argmax kernel end" << "\n"; } ArgMaxMeta::ArgMaxMeta(FFHandler handler, diff --git a/src/ops/kernels/softmax.cpp b/src/ops/kernels/softmax.cpp index d2b35cb48a..edfc5bb544 100644 --- a/src/ops/kernels/softmax.cpp +++ b/src/ops/kernels/softmax.cpp @@ -30,8 +30,8 @@ SoftmaxMeta::SoftmaxMeta(FFHandler handler, checkCUDNN( cudnnSetTensorDescriptorFromDomain4SoftMax(inputTensor, input_domain)); checkCUDNN(cudnnCreateTensorDescriptor(&outputTensor)); - checkCUDNN(cudnnSetTensorDescriptorFromDomain4SoftMax( - outputTensor, input_domain, softmax->data_type)); + checkCUDNN( + cudnnSetTensorDescriptorFromDomain4SoftMax(outputTensor, input_domain)); dim = softmax->dim; profiling = softmax->profiling; std::strcpy(op_name, softmax->name); From 3a25189541b11eceb56b71ccb0b2671d4027bfef Mon Sep 17 00:00:00 2001 From: Zeyu Wang Date: Wed, 13 Sep 2023 15:28:58 -0400 Subject: [PATCH 19/30] Fix: clean up code and fix decoding_steps. 
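Request::Status gains a FINISHING state so a request that is wrapping up but not yet verified can be told apart from one that is fully COMPLETED, verbose-only logging is gated again, and the per-request decoding_steps counter is incremented through request.guid so it is attributed to the right request. The state set, with a small assumed helper for illustration (the helper is a sketch, not code from this patch):

    // From the enum comments added here:
    //   PENDING   - loading prompt
    //   RUNNING   - running inference
    //   COMPLETED - finished and verified
    //   FINISHING - finishing request, but not yet verified
    enum Status {
      PENDING = 101,
      RUNNING = 102,
      COMPLETED = 103,
      FINISHING = 104,
    };

    // Assumed helper: a request keeps its batch slot until fully verified.
    inline bool still_in_flight(Status s) {
      return s != COMPLETED;
    }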
--- include/flexflow/request_manager.h | 7 +- src/ops/spec_inc_multihead_self_attention.cu | 2 - src/runtime/request_manager.cc | 210 +++++-------------- 3 files changed, 54 insertions(+), 165 deletions(-) diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index 8b0e24dee1..8515d8a04b 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -52,9 +52,10 @@ class InferenceManager { struct Request { enum Status { - PENDING = 101, - RUNNING = 102, - COMPLETED = 103, + PENDING = 101, // loading prompt + RUNNING = 102, // running inference + COMPLETED = 103, // finished and verified + FINISHING = 104, // finishing request, but not yet verified }; BatchConfig::RequestGuid guid; int max_sequence_length; diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu index d3eb88bd43..080565c00a 100644 --- a/src/ops/spec_inc_multihead_self_attention.cu +++ b/src/ops/spec_inc_multihead_self_attention.cu @@ -251,10 +251,8 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, if (bc->request_completed[i] || !bc->request_running[i]) { continue; } - std::cout << "[Verifying batch]: " << i << std::endl; for (int sub_req_id = 0; sub_req_id < bc->sub_requests[i]; sub_req_id++) { - std::cout << "[Verifying sub request]: " << sub_req_id << std::endl; // int num_new_tokens = bc->num_processing_tokens[i]; // int total_tokens = bc->token_last_available_idx[i] + 1; diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 08357ae06b..6a08df4efa 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -947,13 +947,16 @@ BeamSearchBatchConfig // do the slot exchange to minimize the cache exchange in kernel. 
// update_beam_metadata(new_bc, request.beam_trees.at(old_bc.model_id), // i); - std::cout << request.guid << std::endl; if (new_bc.requestsInfo[i].token_start_offset >= request.tokens.size()) { // Incremental phase new_bc.requestsInfo[i].num_tokens_in_batch = 1; - std::cout << "Incremental phase: " << request.tokens.size() - << ", num_tokens_in_batch: " - << new_bc.requestsInfo[i].num_tokens_in_batch << std::endl; + + if (verbose) { + std::cout << "[ Beam Spec] " << request.guid << std::endl; + std::cout << "Incremental phase: " << request.tokens.size() + << ", num_tokens_in_batch: " + << new_bc.requestsInfo[i].num_tokens_in_batch << std::endl; + } } else { // Prompt phase new_bc.requestsInfo[i].num_tokens_in_batch = @@ -963,17 +966,22 @@ BeamSearchBatchConfig (int)request.tokens.size() - new_bc.requestsInfo[i].token_start_offset); request.ssm_cache_size += new_bc.requestsInfo[i].num_tokens_in_batch; - std::cout << "Prompt phase: " << request.tokens.size() - << ", num_tokens_in_batch:" - << new_bc.requestsInfo[i].num_tokens_in_batch << std::endl; - std::cout << "Update ssm cache size: " << request.ssm_cache_size - << std::endl; + if (verbose) { + std::cout << "[ Beam Spec] " << request.guid << std::endl; + std::cout << "Prompt phase: " << request.tokens.size() + << ", num_tokens_in_batch:" + << new_bc.requestsInfo[i].num_tokens_in_batch << std::endl; + std::cout << "Update ssm cache size: " << request.ssm_cache_size + << std::endl; + } } - std::cout << "SSM KV Cache Size beam: " << request.ssm_cache_size - << std::endl; - std::cout << "LLM KV Cache Size beam: " << request.llm_cache_size - << std::endl; + if (verbose) { + std::cout << "SSM KV Cache Size beam: " << request.ssm_cache_size + << std::endl; + std::cout << "LLM KV Cache Size beam: " << request.llm_cache_size + << std::endl; + } // register more tokens due to the beam width for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) { @@ -1068,7 +1076,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( Request &request = all_requests[guid]; // Profiling - profiling_requests[new_bc.requestsInfo[i].request_guid].decoding_steps += 1; + profiling_requests[request.guid].decoding_steps += 1; if (request.status == Request::RUNNING) { new_bc.request_running[i] = true; @@ -1107,88 +1115,9 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( new_bc.requestsInfo[i].num_tokens_in_batch = 0; new_bc.request_completed[i] = false; - // Add prompt token first in first verify iteration - if (request.tokens.size() == request.initial_len) { - // Initialization (prompt) phase - // for (int j = 0; j < request.initial_len; j++) { - // new_bc.tokensInfo[new_bc.num_tokens].request_index = i; - // new_bc.tokensInfo[new_bc.num_tokens].token_id = request.tokens[j]; - // new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = j; - - // new_bc.num_tokens++; - // new_bc.requestsInfo[i].num_tokens_in_batch++; - // } - - // std::cout << "new_bc.num_tokens: " << new_bc.num_tokens << std::endl; - // if (new_bc.num_tokens >= BatchConfig::MAX_NUM_TOKENS) { - // assert(false && - // "Exceeding the space available in the TreeVerify batch"); - // break; - // } - - // new_bc.requestsInfo[i].token_start_offset = 0; - } else { - // Incremental phase: only add the last committed token - new_bc.tokensInfo[new_bc.num_tokens].request_index = i; - new_bc.tokensInfo[new_bc.num_tokens].token_id = request.tokens.back(); - new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = - request.tokens.size() - 1; - - new_bc.num_tokens++; 
- new_bc.requestsInfo[i].num_tokens_in_batch++; - - if (new_bc.num_tokens > BatchConfig::MAX_NUM_TOKENS) { - assert(false && - "Exceeding the space available in the TreeVerify batch"); - break; - } - - new_bc.requestsInfo[i].token_start_offset = request.tokens.size() - 1; - } - // Committed Tokens if (committed_tokens.find(guid) != committed_tokens.end()) { - if (dfs_tree_inputs.at(0).second == - request.initial_len + committed_tokens.at(guid).size() - - 1) { // commit prompt - // for (int j = 0; j < request.initial_len; j++) { - // new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_index - // = j; - // new_bc.committed_tokens[new_bc.num_tokens_to_commit].request_index - // = - // i; - // new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_depth - // = j; if (verbose) { - // std::cout << new_bc.num_tokens_to_commit - // << "- committed_token.token_depth: " << j - // << ", token_index: " << j << std::endl; - // } - // new_bc.num_tokens_to_commit++; - // } - } else { // commit the last token - // only add the root token - auto committed_token = committed_tokens.at(guid).at(0); - new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_index = - committed_token.second; - new_bc.committed_tokens[new_bc.num_tokens_to_commit].request_index = - i; - new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_depth = - committed_token.first; - if (verbose) { - std::cout << new_bc.num_tokens_to_commit - << "- committed_token.token_depth: " - << committed_token.first - << ", token_index: " << committed_token.second - << std::endl; - } - new_bc.num_tokens_to_commit++; - } - if (verbose) { - std::cout << "new_bc.num_tokens_to_commit: " - << new_bc.num_tokens_to_commit << std::endl; - } - - for (int j = 1; j < dfs_tree_inputs.size(); j++) { + for (int j = 0; j < dfs_tree_inputs.size(); j++) { if (j < committed_tokens.at(guid).size()) { auto committed_token = committed_tokens.at(guid).at(j); new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_index = @@ -1213,48 +1142,22 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( << new_bc.num_tokens_to_commit << std::endl; } - // add prompt to the dfs tree - // if (committed_tokens.find(guid) != committed_tokens.end()) { - // if (dfs_tree_inputs.at(0).second == request.initial_len + - // committed_tokens.at(guid).size() - // - 1) { // commit prompt - // for (int j = 0; j < request.initial_len; j++) { - // new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_index - // = j; - // new_bc.committed_tokens[new_bc.num_tokens_to_commit].request_index - // = - // i; - // new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_depth - // = j; if (verbose) { - // std::cout << new_bc.num_tokens_to_commit - // << "- committed_token.token_depth: " << j - // << ", token_index: " << j << std::endl; - // } - // new_bc.num_tokens_to_commit++; - // } - // } else { // commit the last token - // // only add the root token - // auto committed_token = committed_tokens.at(guid).at(0); - // new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_index = - // committed_token.second; - // new_bc.committed_tokens[new_bc.num_tokens_to_commit].request_index - // = i; - // new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_depth = - // committed_token.first; - // if (verbose) { - // std::cout << new_bc.num_tokens_to_commit - // << "- committed_token.token_depth: " - // << committed_token.first - // << ", token_index: " << committed_token.second << - // std::endl; - // } - // new_bc.num_tokens_to_commit++; - // } - // if (verbose) { 
- // std::cout << "new_bc.num_tokens_to_commit: " - // << new_bc.num_tokens_to_commit << std::endl; - // } - // } + // Incremental phase: only add the last committed token + new_bc.tokensInfo[new_bc.num_tokens].request_index = i; + new_bc.tokensInfo[new_bc.num_tokens].token_id = request.tokens.back(); + new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = + request.tokens.size() - 1; + + new_bc.num_tokens++; + new_bc.requestsInfo[i].num_tokens_in_batch++; + + if (new_bc.num_tokens > BatchConfig::MAX_NUM_TOKENS) { + assert(false && + "Exceeding the space available in the TreeVerify batch"); + break; + } + + new_bc.requestsInfo[i].token_start_offset = request.tokens.size() - 1; // Add Tokens from the DFS Tree to the next batch for (int j = 1; j < dfs_tree_inputs.size(); j++) { @@ -1269,12 +1172,6 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = token.second; - // TODO: Add committed token info - if (verbose) { - std::cout << "committed_tokens.size(): " - << new_bc.num_tokens_to_commit << std::endl; - } - new_bc.num_tokens++; new_bc.requestsInfo[i].num_tokens_in_batch++; @@ -1286,13 +1183,16 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( std::cout << "new_bc.num_tokens: " << new_bc.num_tokens << std::endl; } else if (request.status == Request::PENDING) { new_bc.request_running[i] = false; - std::cout << "[Verify] Request " << request.guid << " is pending" - << std::endl; - std::cout << "SSM KV Cache Size verify: " << request.ssm_cache_size - << std::endl; - std::cout << "LLM KV Cache Size verify: " << request.llm_cache_size - << std::endl; + if (verbose) { + std::cout << "[Verify] Request " << request.guid + << " is pending in loading prompt phase" << std::endl; + std::cout << "SSM KV Cache Size verify: " << request.ssm_cache_size + << std::endl; + std::cout << "LLM KV Cache Size verify: " << request.llm_cache_size + << std::endl; + } + // Commit all tokens from the last loading batch if (committed_tokens.find(guid) != committed_tokens.end()) { for (int j = 0; j < committed_tokens.at(guid).size(); j++) { auto token = committed_tokens.at(guid).at(j); @@ -1318,10 +1218,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( old_batches.at(0).requestsInfo[i].max_sequence_length; new_bc.request_completed[i] = false; - // new_bc.requestsInfo[i].num_tokens_in_batch = std::min( - // BatchConfig::MAX_NUM_TOKENS - 1 - new_bc.num_tokens, - // (int)request.initial_len - - // new_bc.requestsInfo[i].token_start_offset); + new_bc.requestsInfo[i].num_tokens_in_batch = std::min( max_prompt_load_size, (int)request.initial_len - new_bc.requestsInfo[i].token_start_offset); @@ -1350,7 +1247,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( "Exceeding the space available in the TreeVerify batch"); break; } - } else { + } else { // launch the request into running phase after loading all prompt if (BatchConfig::MAX_NUM_TOKENS - new_bc.num_tokens > 0) { request.status = Request::RUNNING; new_bc.request_running[i] = true; @@ -1377,13 +1274,6 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( } } - if (verbose) { - std::cout << "prepare_next_batch_verify OLD vs NEW batchconfigs below:" - << std::endl; - // old_batches.print(); - // new_bc.print(); - } - return new_bc; } From c7f1b9e18f2366d22e3d2e520baeb0c42b6091d5 Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Thu, 14 Sep 2023 19:31:02 -0400 Subject: [PATCH 20/30] try 1 try --- src/ops/argmax.cu | 5 ++--- 
src/runtime/request_manager.cc | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/src/ops/argmax.cu b/src/ops/argmax.cu index b951dbda86..5a31269606 100644 --- a/src/ops/argmax.cu +++ b/src/ops/argmax.cu @@ -83,6 +83,8 @@ void ArgMax::forward_kernel(ArgMaxMeta const *m, prob_ptr, batch_size, m->beam_search); + print_tensor(indices_ptr, 32, "argmax op"); + } /*static*/ @@ -91,10 +93,8 @@ void ArgMax::forward_kernel_wrapper(ArgMaxMeta const *m, GenericTensorAccessorW const &indices, GenericTensorAccessorW const &parent, int batch_size) { - std::cout << "argmax kernel start" << "\n"; cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); - std::cout << "argmax kernel" << "\n"; cudaEvent_t t_start, t_end; if (m->profiling) { cudaEventCreate(&t_start); @@ -137,7 +137,6 @@ void ArgMax::forward_kernel_wrapper(ArgMaxMeta const *m, cudaEventDestroy(t_end); printf("[ArgMax] forward time = %.2lfms\n", elapsed); } - std::cout << "argmax kernel end" << "\n"; } ArgMaxMeta::ArgMaxMeta(FFHandler handler, diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 6a08df4efa..a4ada0e34c 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -996,7 +996,7 @@ BeamSearchBatchConfig new_bc.beamRequestsInfo[i].tokens[k]; } else { new_bc.tokensInfo[new_bc.num_tokens].token_id = - request.tokens[depth]; + request.tokens[request.tokens.size() - 1]; } new_bc.beamTokenInfo[new_bc.num_tokens].sub_request_index = k; From 55eb9139371d39e522e2a6139501dc291ba8b445 Mon Sep 17 00:00:00 2001 From: Zeyu Wang Date: Sat, 16 Sep 2023 15:50:16 -0400 Subject: [PATCH 21/30] fix: allow parse 0 tokens for pending request. --- src/runtime/request_manager.cc | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index a4ada0e34c..4fa52535ae 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -949,7 +949,11 @@ BeamSearchBatchConfig // i); if (new_bc.requestsInfo[i].token_start_offset >= request.tokens.size()) { // Incremental phase - new_bc.requestsInfo[i].num_tokens_in_batch = 1; + if (request.status == Request::RUNNING) { + new_bc.requestsInfo[i].num_tokens_in_batch = 1; + } else { + new_bc.requestsInfo[i].num_tokens_in_batch = 0; + } if (verbose) { std::cout << "[ Beam Spec] " << request.guid << std::endl; @@ -992,9 +996,11 @@ BeamSearchBatchConfig // get value from requestinfo if (request.status == Request::RUNNING) { + std::cout << "[running ]Num of token in batch: " << new_bc.requestsInfo[i].num_tokens_in_batch << std::endl; new_bc.tokensInfo[new_bc.num_tokens].token_id = new_bc.beamRequestsInfo[i].tokens[k]; } else { + std::cout << "[pending ]Num of token in batch: " << new_bc.requestsInfo[i].num_tokens_in_batch << std::endl; new_bc.tokensInfo[new_bc.num_tokens].token_id = request.tokens[request.tokens.size() - 1]; } From b88c4de5f5df10f0fb295544307f80a5c1b02d75 Mon Sep 17 00:00:00 2001 From: Zeyu Wang Date: Sat, 16 Sep 2023 15:52:18 -0400 Subject: [PATCH 22/30] format. 
--- src/ops/argmax.cu | 1 - src/runtime/request_manager.cc | 8 ++++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/src/ops/argmax.cu b/src/ops/argmax.cu index 5a31269606..8fbf84f2a3 100644 --- a/src/ops/argmax.cu +++ b/src/ops/argmax.cu @@ -84,7 +84,6 @@ void ArgMax::forward_kernel(ArgMaxMeta const *m, batch_size, m->beam_search); print_tensor(indices_ptr, 32, "argmax op"); - } /*static*/ diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 4fa52535ae..6aa4782c3c 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -996,11 +996,15 @@ BeamSearchBatchConfig // get value from requestinfo if (request.status == Request::RUNNING) { - std::cout << "[running ]Num of token in batch: " << new_bc.requestsInfo[i].num_tokens_in_batch << std::endl; + std::cout << "[running ]Num of token in batch: " + << new_bc.requestsInfo[i].num_tokens_in_batch + << std::endl; new_bc.tokensInfo[new_bc.num_tokens].token_id = new_bc.beamRequestsInfo[i].tokens[k]; } else { - std::cout << "[pending ]Num of token in batch: " << new_bc.requestsInfo[i].num_tokens_in_batch << std::endl; + std::cout << "[pending ]Num of token in batch: " + << new_bc.requestsInfo[i].num_tokens_in_batch + << std::endl; new_bc.tokensInfo[new_bc.num_tokens].token_id = request.tokens[request.tokens.size() - 1]; } From abcf94f8c18c39bd6242598ff1893091ae673bea Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Sat, 16 Sep 2023 17:55:06 -0400 Subject: [PATCH 23/30] remove comment tests --- .../python_test_configs/generate_configs.py | 48 +++++++++---------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/tests/inference/python_test_configs/generate_configs.py b/tests/inference/python_test_configs/generate_configs.py index 0d30a3edad..e780bc17b0 100644 --- a/tests/inference/python_test_configs/generate_configs.py +++ b/tests/inference/python_test_configs/generate_configs.py @@ -63,35 +63,35 @@ os.chdir(dname) -# # Generate incremental decoding configs -# all_models = llama_models + opt_models + falcon_models + mpt_models -# for model_name in all_models: -# for full_precision in (True, False): -# for parallelism_degrees in parallelism_settings: +# Generate incremental decoding configs +all_models = llama_models + opt_models + falcon_models + mpt_models +for model_name in all_models: + for full_precision in (True, False): + for parallelism_degrees in parallelism_settings: -# tp, pp = parallelism_degrees + tp, pp = parallelism_degrees -# # Tensor parallelism not supported by small Falcon model atm -# if tp > 1 and ("falcon" in model_name or "starcoder" in model_name): -# continue -# # skip tp=4 for big models -# if tp > 2 and ("7b" in model_name or "6.7b" in model_name): -# continue + # Tensor parallelism not supported by small Falcon model atm + if tp > 1 and ("falcon" in model_name or "starcoder" in model_name): + continue + # skip tp=4 for big models + if tp > 2 and ("7b" in model_name or "6.7b" in model_name): + continue -# _, after_slash = model_name.rsplit("/", maxsplit=1) -# filename = "incr_dec-" + "python-" + after_slash + ("-full_prec-" if full_precision else "-half_prec-") + f"{tp}_tp_{pp}_pp" -# test_configs_file = "./" + filename + ".json" -# output_file = os.path.join(output_folder, filename+".txt") + _, after_slash = model_name.rsplit("/", maxsplit=1) + filename = "incr_dec-" + "python-" + after_slash + ("-full_prec-" if full_precision else "-half_prec-") + f"{tp}_tp_{pp}_pp" + test_configs_file = "./" + filename + ".json" + output_file = 
os.path.join(output_folder, filename+".txt") -# ff_init_configs["tensor_parallelism_degree"] = tp -# ff_init_configs["pipeline_parallelism_degree"] = pp -# ff_init_configs["llm_model"] = model_name -# ff_init_configs["full_precision"] = full_precision -# ff_init_configs["output_file"] = output_file -# ff_init_configs["prompt"] = prompt_file + ff_init_configs["tensor_parallelism_degree"] = tp + ff_init_configs["pipeline_parallelism_degree"] = pp + ff_init_configs["llm_model"] = model_name + ff_init_configs["full_precision"] = full_precision + ff_init_configs["output_file"] = output_file + ff_init_configs["prompt"] = prompt_file -# with open(test_configs_file, "w+") as outfile: -# json.dump(ff_init_configs, outfile, indent=4) + with open(test_configs_file, "w+") as outfile: + json.dump(ff_init_configs, outfile, indent=4) # Generate speculative inference configs model_pairs = [llama_models, opt_models] From 66ee3674fd6a7723c0d0855d8d84d71cf66f984a Mon Sep 17 00:00:00 2001 From: Zeyu Wang Date: Tue, 19 Sep 2023 13:22:03 -0400 Subject: [PATCH 24/30] remove print. --- src/ops/argmax.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ops/argmax.cu b/src/ops/argmax.cu index 8fbf84f2a3..05c84719c1 100644 --- a/src/ops/argmax.cu +++ b/src/ops/argmax.cu @@ -83,7 +83,7 @@ void ArgMax::forward_kernel(ArgMaxMeta const *m, prob_ptr, batch_size, m->beam_search); - print_tensor(indices_ptr, 32, "argmax op"); + // print_tensor(indices_ptr, 32, "argmax op"); } /*static*/ From 801c56c7c8ccd262bf61eccf4d34d7e6cbf1dbf1 Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Sun, 24 Sep 2023 18:32:40 -0400 Subject: [PATCH 25/30] fix decoding steps --- src/ops/spec_inc_multihead_self_attention.cu | 2 +- src/runtime/request_manager.cc | 19 ++++++++++--------- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu index 881d517cf6..39c6ff9259 100644 --- a/src/ops/spec_inc_multihead_self_attention.cu +++ b/src/ops/spec_inc_multihead_self_attention.cu @@ -248,7 +248,7 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, assert(m->qProjSize == m->kProjSize); for (int i = 0; i < bc->MAX_NUM_REQUESTS; i++) { - if (bc->request_completed[i] || !bc->request_running[i]) { + if (bc->request_completed[i]) { continue; } diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index eea3cd4d13..bf64a17c1e 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -1190,7 +1190,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( } } - std::cout << "new_bc.num_tokens: " << new_bc.num_tokens << std::endl; + std::cout << "new_bc.num_tokens after dfs: " << new_bc.num_tokens << std::endl; } else if (request.status == Request::PENDING) { new_bc.request_running[i] = false; if (verbose) { @@ -1251,7 +1251,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( new_bc.num_tokens++; } - std::cout << "new_bc.num_tokens: " << new_bc.num_tokens << std::endl; + std::cout << "new_bc.num_tokens init: " << new_bc.num_tokens << std::endl; if (new_bc.num_tokens > BatchConfig::MAX_NUM_TOKENS) { assert(false && "Exceeding the space available in the TreeVerify batch"); @@ -1269,7 +1269,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( new_bc.num_tokens++; new_bc.requestsInfo[i].num_tokens_in_batch++; - std::cout << "new_bc.num_tokens: " << new_bc.num_tokens << std::endl; + std::cout << "new_bc.num_tokens running: " << 
new_bc.num_tokens << std::endl; std::cout << "new_bc.requestsInfo[i].num_tokens_in_batch: " << new_bc.requestsInfo[i].num_tokens_in_batch << std::endl; @@ -1303,25 +1303,26 @@ void RequestManager::store_beam_metadata(BeamSearchBatchConfig const &old_bc, << " tokens in the current batch.\n"; } - for (int i = 0; i < old_bc.num_tokens; i++) { + for (int i = 0; i <= old_bc.num_tokens; i++) { int request_index = old_bc.tokensInfo[i].request_index; - if (i == 0 || old_bc.requestsInfo[request_index].request_guid != guid) { + if (i == old_bc.num_tokens || + old_bc.requestsInfo[request_index].request_guid != guid) { // Each token yields (beam_width) results int beam_width = old_bc.beamRequestsInfo[request_index].beam_size; // Count tokens sent to model in this request to find the final token's // index - // result_index += - // (old_bc.tokensInfo[i - 1].abs_depth_in_request - start_depth) * - // beam_width; + result_index += + (old_bc.tokensInfo[i - 1].abs_depth_in_request - start_depth) * + beam_width; if (verbose) { std::cout << "i = " << i << ", result index = " << result_index << ", value: " << result.token_ids[result_index] << "\n"; } - int index = old_bc.tokensInfo[i].request_index; + int index = old_bc.tokensInfo[i - 1].request_index; int beam_size = old_bc.beamRequestsInfo[index].beam_size; int depth = old_bc.beamRequestsInfo[index].current_depth; From 1d18fce570b4be09dc0ff2ab47134a64751d76a2 Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Sun, 24 Sep 2023 18:37:54 -0400 Subject: [PATCH 26/30] . --- src/runtime/request_manager.cc | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index bf64a17c1e..4ee5f44261 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -1190,7 +1190,8 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( } } - std::cout << "new_bc.num_tokens after dfs: " << new_bc.num_tokens << std::endl; + std::cout << "new_bc.num_tokens after dfs: " << new_bc.num_tokens + << std::endl; } else if (request.status == Request::PENDING) { new_bc.request_running[i] = false; if (verbose) { @@ -1251,7 +1252,8 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( new_bc.num_tokens++; } - std::cout << "new_bc.num_tokens init: " << new_bc.num_tokens << std::endl; + std::cout << "new_bc.num_tokens init: " << new_bc.num_tokens + << std::endl; if (new_bc.num_tokens > BatchConfig::MAX_NUM_TOKENS) { assert(false && "Exceeding the space available in the TreeVerify batch"); @@ -1269,7 +1271,8 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( new_bc.num_tokens++; new_bc.requestsInfo[i].num_tokens_in_batch++; - std::cout << "new_bc.num_tokens running: " << new_bc.num_tokens << std::endl; + std::cout << "new_bc.num_tokens running: " << new_bc.num_tokens + << std::endl; std::cout << "new_bc.requestsInfo[i].num_tokens_in_batch: " << new_bc.requestsInfo[i].num_tokens_in_batch << std::endl; From aed8850b4355b284c7abd683efc690560254eaa9 Mon Sep 17 00:00:00 2001 From: Zeyu Wang Date: Sun, 24 Sep 2023 22:32:50 -0400 Subject: [PATCH 27/30] quick fix. 
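
Guard the speculative attention loop against request slots that contribute no
new tokens this step. Now that pending requests can be parsed with
num_tokens_in_batch == 0 and the loop no longer skips non-running requests, a
slot can legitimately show up empty; bailing out early presumably avoids
launching the per-request attention GEMMs with an empty m dimension.

A stripped-down sketch of the control flow, with toy types standing in for
BatchConfig (illustrative only; the real loop also tracks total_tokens and
issues the cuBLAS calls):

    struct SlotInfo {
      bool completed;
      int num_tokens_in_batch;
    };

    void compute_attention_sketch(SlotInfo const *slots, int max_requests) {
      for (int i = 0; i < max_requests; i++) {
        if (slots[i].completed) {
          continue; // slot not in use
        }
        int num_new_tokens = slots[i].num_tokens_in_batch;
        if (num_new_tokens <= 0) {
          continue; // pending request that added no tokens this iteration
        }
        // ... per-request attention computation would go here ...
      }
    }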
--- src/ops/spec_inc_multihead_self_attention.cu | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu index 39c6ff9259..b4cdc77e2a 100644 --- a/src/ops/spec_inc_multihead_self_attention.cu +++ b/src/ops/spec_inc_multihead_self_attention.cu @@ -260,6 +260,11 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch; int total_tokens = bc->requestsInfo[i].token_start_offset + bc->requestsInfo[i].num_tokens_in_batch; + + if (num_new_tokens <= 0) { + continue; + } + // Compute (QK^T/sqrt(d_k)) int m_ = num_new_tokens; int n = total_tokens; From a39fb5b52e59e53f5b746b64cf69eafbba32e4ef Mon Sep 17 00:00:00 2001 From: Zeyu Wang Date: Sun, 24 Sep 2023 22:38:43 -0400 Subject: [PATCH 28/30] remove debugging prints. --- src/runtime/request_manager.cc | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 4ee5f44261..c30e1a68cf 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -1190,8 +1190,6 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( } } - std::cout << "new_bc.num_tokens after dfs: " << new_bc.num_tokens - << std::endl; } else if (request.status == Request::PENDING) { new_bc.request_running[i] = false; if (verbose) { @@ -1252,8 +1250,6 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( new_bc.num_tokens++; } - std::cout << "new_bc.num_tokens init: " << new_bc.num_tokens - << std::endl; if (new_bc.num_tokens > BatchConfig::MAX_NUM_TOKENS) { assert(false && "Exceeding the space available in the TreeVerify batch"); @@ -1271,8 +1267,6 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( new_bc.num_tokens++; new_bc.requestsInfo[i].num_tokens_in_batch++; - std::cout << "new_bc.num_tokens running: " << new_bc.num_tokens - << std::endl; std::cout << "new_bc.requestsInfo[i].num_tokens_in_batch: " << new_bc.requestsInfo[i].num_tokens_in_batch << std::endl; From 84a6fbad47dc493f31bf26a4831fecd0c59cde15 Mon Sep 17 00:00:00 2001 From: Zeyu Wang Date: Mon, 25 Sep 2023 00:54:01 -0400 Subject: [PATCH 29/30] fix store_beam_metadata. 
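
store_beam_metadata walks the flattened token list of the old batch and
flushes one request's beam results whenever it crosses a request boundary,
i.e. when the guid changes or the end of the list is reached. Read the
per-request metadata (request index, beam size, current depth) from
tokensInfo[i - 1], the last token of the group that just ended, instead of
tokensInfo[i], which at the i == num_tokens sentinel points past the valid
tokens. The noisy per-token prints in prepare_next_batch_beam are commented
out as well.

A small self-contained sketch of the boundary-walk pattern with toy types in
place of BeamSearchBatchConfig (names and the flush action are placeholders,
not the real bookkeeping):

    #include <cstdio>

    struct TokenInfo {
      int request_index;
    };

    int main() {
      // Three tokens for request 0 followed by two tokens for request 1
      // (assumes at least one token overall).
      TokenInfo tokens[] = {{0}, {0}, {0}, {1}, {1}};
      int num_tokens = 5;
      int group_start = 0;
      for (int i = 0; i <= num_tokens; i++) {
        // tokens[i] is only touched when i is a valid index; the final
        // iteration (i == num_tokens) just flushes the last group.
        if (i == num_tokens ||
            tokens[i].request_index != tokens[group_start].request_index) {
          int last = i - 1; // metadata for the finished group: its last token
          std::printf("request %d owned tokens [%d, %d]\n",
                      tokens[last].request_index, group_start, last);
          group_start = i;
        }
      }
      return 0;
    }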
--- src/runtime/request_manager.cc | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index c30e1a68cf..5489c9b06d 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -996,15 +996,15 @@ BeamSearchBatchConfig // get value from requestinfo if (request.status == Request::RUNNING) { - std::cout << "[running ]Num of token in batch: " - << new_bc.requestsInfo[i].num_tokens_in_batch - << std::endl; + // std::cout << "[running ]Num of token in batch: " + // << new_bc.requestsInfo[i].num_tokens_in_batch + // << std::endl; new_bc.tokensInfo[new_bc.num_tokens].token_id = new_bc.beamRequestsInfo[i].tokens[k]; } else { - std::cout << "[pending ]Num of token in batch: " - << new_bc.requestsInfo[i].num_tokens_in_batch - << std::endl; + // std::cout << "[pending ]Num of token in batch: " + // << new_bc.requestsInfo[i].num_tokens_in_batch + // << std::endl; new_bc.tokensInfo[new_bc.num_tokens].token_id = request.tokens[request.tokens.size() - 1]; } @@ -1301,12 +1301,16 @@ void RequestManager::store_beam_metadata(BeamSearchBatchConfig const &old_bc, } for (int i = 0; i <= old_bc.num_tokens; i++) { - int request_index = old_bc.tokensInfo[i].request_index; if (i == old_bc.num_tokens || - old_bc.requestsInfo[request_index].request_guid != guid) { + old_bc.requestsInfo[old_bc.tokensInfo[i].request_index].request_guid != + guid) { + + int index = old_bc.tokensInfo[i - 1].request_index; + int beam_size = old_bc.beamRequestsInfo[index].beam_size; + int depth = old_bc.beamRequestsInfo[index].current_depth; // Each token yields (beam_width) results - int beam_width = old_bc.beamRequestsInfo[request_index].beam_size; + int beam_width = old_bc.beamRequestsInfo[index].beam_size; // Count tokens sent to model in this request to find the final token's // index @@ -1319,10 +1323,6 @@ void RequestManager::store_beam_metadata(BeamSearchBatchConfig const &old_bc, << ", value: " << result.token_ids[result_index] << "\n"; } - int index = old_bc.tokensInfo[i - 1].request_index; - int beam_size = old_bc.beamRequestsInfo[index].beam_size; - int depth = old_bc.beamRequestsInfo[index].current_depth; - Request &request = all_requests[old_bc.requestsInfo[index].request_guid]; if (depth == 1) { @@ -1366,7 +1366,7 @@ void RequestManager::store_beam_metadata(BeamSearchBatchConfig const &old_bc, // update the guid and start_depth for current request if (i < old_bc.num_tokens) { - guid = old_bc.requestsInfo[request_index].request_guid; + guid = old_bc.requestsInfo[index].request_guid; start_depth = old_bc.tokensInfo[i].abs_depth_in_request; } } From 59acaeb0f33c57851d9e3bc14d3eada6f484d8e8 Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Mon, 25 Sep 2023 09:14:51 -0400 Subject: [PATCH 30/30] hip --- src/ops/kernels/softmax.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ops/kernels/softmax.cpp b/src/ops/kernels/softmax.cpp index edfc5bb544..ca4872d51b 100644 --- a/src/ops/kernels/softmax.cpp +++ b/src/ops/kernels/softmax.cpp @@ -29,7 +29,7 @@ SoftmaxMeta::SoftmaxMeta(FFHandler handler, checkCUDNN(miopenCreateTensorDescriptor(&inputTensor)); checkCUDNN( cudnnSetTensorDescriptorFromDomain4SoftMax(inputTensor, input_domain)); - checkCUDNN(cudnnCreateTensorDescriptor(&outputTensor)); + checkCUDNN(miopenCreateTensorDescriptor(&outputTensor)); checkCUDNN( cudnnSetTensorDescriptorFromDomain4SoftMax(outputTensor, input_domain)); dim = softmax->dim;