
Commit

update
sfc-gh-goliaro committed Oct 18, 2024
1 parent 3b9d8bb commit 0114211
Showing 5 changed files with 126 additions and 38 deletions.
2 changes: 1 addition & 1 deletion CMakeLists.txt
@@ -300,7 +300,7 @@ if(NOT BUILD_LEGION_ONLY)
include(FetchContent)
FetchContent_Declare(
suffix_decoding
GIT_REPOSITORY git@github.com:Snowflake-Labs/suffix-tree-decoding.git
GIT_REPOSITORY https://github.com/Snowflake-Labs/suffix-tree-decoding.git
GIT_TAG main # or a specific tag/commit hash
)
FetchContent_MakeAvailable(suffix_decoding)
1 change: 1 addition & 0 deletions include/flexflow/request_manager.h
@@ -70,6 +70,7 @@ struct Request {
PEFTModelID peft_model_id = PEFTModelID::NO_ID;
int max_length = -1;
int max_new_tokens = -1;
bool add_special_tokens = true;
int initial_len;
int ssm_cache_size = 0;
int llm_cache_size = 0;
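
The new add_special_tokens flag lets a caller opt out of the automatic BOS/EOS handling for a single request. Below is a minimal sketch (not the real FlexFlow struct; RequestSketch and make_trace_request are illustrative names) of how a trace-replay caller might populate the fields shown above:

#include <string>

// Minimal stand-in for the fields of FlexFlow's Request that this commit touches.
struct RequestSketch {
  std::string prompt;
  int max_length = -1;
  int max_new_tokens = -1;
  bool add_special_tokens = true; // new field added by this commit
};

// Build a request that replays a trace entry: cap generation at the recorded
// response length and skip BOS insertion so tokenization matches the trace.
RequestSketch make_trace_request(std::string const &prompt, int response_length) {
  RequestSketch req;
  req.prompt = prompt;
  req.max_new_tokens = response_length;
  req.add_special_tokens = false;
  return req;
}
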
93 changes: 74 additions & 19 deletions inference/incr_decoding/incr_decoding.cc
@@ -47,7 +47,9 @@ void parse_input_args(char **argv,
float &topp,
int &max_requests_per_batch,
int &max_tokens_per_batch,
int &max_sequence_length) {
int &max_sequence_length,
std::string &target_partition,
std::string &output_trace_path) {
for (int i = 1; i < argc; i++) {
// llm model type
if (!strcmp(argv[i], "-llm-model")) {
@@ -105,6 +107,14 @@ void parse_input_args(char **argv,
max_sequence_length = std::stoi(argv[++i]);
continue;
}
if (!strcmp(argv[i], "-target-partition")) {
target_partition = std::string(argv[++i]);
continue;
}
if (!strcmp(argv[i], "-output-trace-path")) {
output_trace_path = std::string(argv[++i]);
continue;
}
}
if (paths.cache_folder_path.empty()) {
char const *ff_cache_path = std::getenv("FF_CACHE_PATH");
Expand Down Expand Up @@ -136,6 +146,8 @@ void FlexFlow::top_level_task(Task const *task,
int max_requests_per_batch = 8;
int max_tokens_per_batch = 128;
int max_sequence_length = 256;
std::string target_partition = "FEATURE_EXTRACTION";
std::string output_trace_path = "/home/yak/goliaro/FlexFlow/suffix_decoding/modified_json_file.json";

InputArgs const &command_args = HighLevelRuntime::get_input_args();
char **argv = command_args.argv;
@@ -151,7 +163,9 @@ void FlexFlow::top_level_task(Task const *task,
topp,
max_requests_per_batch,
max_tokens_per_batch,
max_sequence_length);
max_sequence_length,
target_partition,
output_trace_path);

assert(ffconfig.data_parallelism_degree * ffconfig.tensor_parallelism_degree *
ffconfig.pipeline_parallelism_degree ==
@@ -257,25 +271,66 @@ void FlexFlow::top_level_task(Task const *task,

int total_num_requests = 0;
{
using json = nlohmann::json;
std::ifstream file_handle(file_paths.prompt_file_path);
assert(file_handle.good() && "Prompt file does not exist.");
json prompt_json = json::parse(file_handle,
/*parser_callback_t */ nullptr,
/*allow_exceptions */ true,
/*ignore_comments */ true);
// using json = nlohmann::json;
using json = nlohmann::ordered_json;
std::ifstream input_file(file_paths.prompt_file_path);
assert(input_file.good() && "Prompt file does not exist.");
json j;
input_file >> j;
input_file.close();

// Find the partition whose name matches target_partition
auto& partitions = j["partitions"];
auto it = std::find_if(partitions.begin(), partitions.end(),
[target_partition](const json& partition) {
return partition["partition_name"] == target_partition;
});

std::vector<Request> requests;
for (auto &prompt : prompt_json) {
std::string text = prompt.get<std::string>();
printf("Prompt[%d]: %s\n", total_num_requests, text.c_str());
Request inference_req;
inference_req.prompt = text;
inference_req.max_length = 128;
requests.push_back(inference_req);
total_num_requests++;
if (it != partitions.end()) {
// We found the partition
json& feature_extraction_partition = *it;

// Iterate through eval_entries
std::vector<Request> requests;
for (auto& entry : feature_extraction_partition["eval_entries"]) {
std::string text = entry["prompt"];
int max_new_tokens_ = entry["response_length"];
printf("Prompt[%d]: %s\n", total_num_requests, text.c_str());
Request inference_req;
inference_req.prompt = text;
inference_req.max_new_tokens = max_new_tokens_;
inference_req.add_special_tokens = false;
requests.push_back(inference_req);
total_num_requests++;
// break;
}
std::vector<GenerationResult> result = model.generate(requests);
assert(result.size() == requests.size());
assert(result.size() == total_num_requests);
assert(result.size() == feature_extraction_partition["eval_entries"].size());
int i = 0;
for (auto& entry : feature_extraction_partition["eval_entries"]) {
entry["original_response"] = entry["response"];
entry["original_response_length"] = entry["response_length"];
std::string ff_out = result[i].output_text;
int tot_length = result[i].output_text.length();
entry["response"] = ff_out;
entry["response_length"] = result[i].output_tokens.size();
i++;
}

// Write the modified JSON to a file
std::ofstream output_file(output_trace_path);
if (output_file.is_open()) {
output_file << j.dump(2);
output_file.close();
std::cout << "Modified JSON has been saved to " << output_trace_path << std::endl;
} else {
std::cerr << "Unable to open file for writing." << std::endl;
}
} else {
std::cout << target_partition << " partition not found." << std::endl;
}
std::vector<GenerationResult> result = model.generate(requests);
}

// terminate the request manager by stopping the background thread
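
The rewritten driver above loads a trace JSON, selects the partition named by -target-partition, replays its eval_entries as requests, and writes the trace back out with the generated responses substituted in (the originals are kept under original_response / original_response_length). The following condensed, standalone sketch of that round-trip uses nlohmann::ordered_json with the same field names as the diff; generate_text is a hypothetical stand-in for model.generate(), and response_length is approximated by character count here because the sketch has no tokenizer (the commit stores the output token count instead):

#include <nlohmann/json.hpp>
#include <algorithm>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>

using json = nlohmann::ordered_json; // preserves key order when re-serializing

// Hypothetical stand-in for model.generate(): returns placeholder outputs.
std::vector<std::string> generate_text(std::vector<std::string> const &prompts) {
  return std::vector<std::string>(prompts.size(), "<generated text>");
}

int rewrite_trace(std::string const &input_path,
                  std::string const &target_partition,
                  std::string const &output_path) {
  std::ifstream input_file(input_path);
  if (!input_file.good()) {
    std::cerr << "Trace file does not exist: " << input_path << std::endl;
    return 1;
  }
  json trace;
  input_file >> trace;

  // Locate the partition whose partition_name matches the requested one.
  auto &partitions = trace["partitions"];
  auto it = std::find_if(partitions.begin(), partitions.end(),
                         [&](json const &p) {
                           return p["partition_name"] == target_partition;
                         });
  if (it == partitions.end()) {
    std::cerr << target_partition << " partition not found." << std::endl;
    return 1;
  }

  // Collect prompts, generate, then splice outputs back into the trace while
  // preserving the original responses under separate keys.
  std::vector<std::string> prompts;
  for (auto &entry : (*it)["eval_entries"]) {
    prompts.push_back(entry["prompt"].get<std::string>());
  }
  std::vector<std::string> outputs = generate_text(prompts);

  size_t i = 0;
  for (auto &entry : (*it)["eval_entries"]) {
    entry["original_response"] = entry["response"];
    entry["original_response_length"] = entry["response_length"];
    entry["response"] = outputs[i];
    entry["response_length"] = static_cast<int>(outputs[i].size());
    ++i;
  }

  std::ofstream output_file(output_path);
  output_file << trace.dump(2) << std::endl;
  return 0;
}

A call such as rewrite_trace(prompt_file, "FEATURE_EXTRACTION", output_trace_path) mirrors the flow in top_level_task above, minus the FlexFlow request plumbing.
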
36 changes: 28 additions & 8 deletions src/runtime/request_manager.cc
@@ -56,6 +56,7 @@ std::ostream &operator<<(std::ostream &os, Request const &req) {
os << " peft_model_id: " << req.peft_model_id << "\n";
os << " max_length: " << req.max_length << "\n";
os << " max_new_tokens: " << req.max_new_tokens << "\n";
os << " add_special_tokens: " << req.add_special_tokens << "\n";
os << " initial_len: " << req.initial_len << "\n";
os << " ssm_cache_size: " << req.ssm_cache_size << "\n";
os << " llm_cache_size: " << req.llm_cache_size << "\n";
@@ -413,6 +414,7 @@ RequestManager::RequestGuid
request.guid = next_available_guid++;
request.max_length = request_.max_length;
request.max_new_tokens = request_.max_new_tokens;
request.add_special_tokens = request_.add_special_tokens;
// both unset
if (request.max_length == -1 && request.max_new_tokens == -1) {
request.max_length = get_max_sequence_length() - 1;
@@ -427,7 +429,7 @@ RequestManager::RequestGuid
}
request.peft_model_id = request_.peft_model_id;
request.warmup = request_.warmup;
if (bos_token_id >= 0 && model_type != ModelType::FALCON) {
if (bos_token_id >= 0 && request.add_special_tokens && model_type != ModelType::FALCON) {
request.tokens.push_back(bos_token_id);
}
if (request_.benchmarking_tokens >= 0) {
@@ -520,6 +522,7 @@ RequestManager::RequestGuid
request.initial_len = 0;
request.max_length = request_.max_length;
request.max_new_tokens = request_.max_new_tokens;
request.add_special_tokens = request_.add_special_tokens;
if (request.max_new_tokens != -1) {
std::cerr
<< "Error: max_new_tokens is not allowed for PEFT finetuning requests"
@@ -544,7 +547,7 @@ RequestManager::RequestGuid
request.benchmarking_tokens = request_.benchmarking_tokens;
std::vector<int32_t> input_tokens;
std::vector<int32_t> output_tokens;
bool bos_added = (bos_token_id >= 0 && model_type != ModelType::FALCON);
bool bos_added = (bos_token_id >= 0 && request.add_special_tokens && model_type != ModelType::FALCON);
if (bos_added) {
input_tokens.push_back(bos_token_id);
}
@@ -566,7 +569,7 @@ RequestManager::RequestGuid
std::string output_text("");
std::vector<int32_t> input_tokens;
input_tokens = this->tokenizer_->Encode(text);
if (bos_token_id >= 0 && model_type != ModelType::FALCON) {
if (bos_token_id >= 0 && request.add_special_tokens && model_type != ModelType::FALCON) {
input_tokens.insert(input_tokens.begin(), bos_token_id);
}
std::vector<int32_t> output_tokens =
@@ -755,6 +758,19 @@ void RequestManager::check_batch(BatchConfig const &old_bc,
}
}

bool isPrefixAndRemove(const std::vector<int>& prefix, std::vector<int>& vec) {
if (prefix.size() > vec.size()) {
return false;
}

if (std::equal(prefix.begin(), prefix.end(), vec.begin())) {
vec.erase(vec.begin(), vec.begin() + prefix.size());
return true;
}

return false;
}

BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc,
InferenceResult const &result) {
const std::lock_guard<std::mutex> lock(request_queue_mutex);
@@ -814,7 +830,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc,
std::string output = this->tokenizer_->Decode(request.tokens);
// Unlike Huggingface, the sentencepiece C++ library automatically
// removes the BOS token
if (model_type == ModelType::LLAMA && old_llama_tokenizer &&
if (model_type == ModelType::LLAMA && old_llama_tokenizer && request.add_special_tokens &&
request.tokens.at(0) == bos_token_id) {
output = "<s> " + output;
}
@@ -823,7 +839,11 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc,
GenerationResult &gr = request_generation_results[request.guid];
assert(gr.guid == request.guid);
gr.output_tokens = request.tokens;
gr.output_text = output;
assert(isPrefixAndRemove(gr.input_tokens, gr.output_tokens));
if (gr.output_tokens.size() > 0 && gr.output_tokens[gr.output_tokens.size() - 1] == eos_token_id && !request.add_special_tokens) {
gr.output_tokens.pop_back();
}
gr.output_text = this->tokenizer_->Decode(gr.output_tokens);
}
request.status = Request::COMPLETED;
trigger_request_completion_future(request.guid);
@@ -1275,7 +1295,7 @@ BeamSearchBatchConfig
std::string output = this->tokenizer_->Decode(request.tokens);
// Unlike Huggingface, the sentencepiece C++ library automatically
// removes the BOS token
if (model_type == ModelType::LLAMA && old_llama_tokenizer &&
if (model_type == ModelType::LLAMA && old_llama_tokenizer && request.add_special_tokens &&
request.tokens.at(0) == bos_token_id) {
output = "<s> " + output;
}
@@ -1418,7 +1438,7 @@ BeamSearchBatchConfig
std::string output = this->tokenizer_->Decode(request.tokens);
// Unlike Huggingface, the sentencepiece C++ library automatically
// removes the BOS token
if (model_type == ModelType::LLAMA && old_llama_tokenizer &&
if (model_type == ModelType::LLAMA && old_llama_tokenizer && request.add_special_tokens &&
request.tokens.at(0) == bos_token_id) {
output = "<s> " + output;
}
@@ -1466,7 +1486,7 @@ BeamSearchBatchConfig
std::string output = this->tokenizer_->Decode(request.tokens);
// Unlike Huggingface, the sentencepiece C++ library automatically removes
// the BOS token
if (model_type == ModelType::LLAMA && old_llama_tokenizer &&
if (model_type == ModelType::LLAMA && old_llama_tokenizer && request.add_special_tokens &&
request.tokens.at(0) == bos_token_id) {
output = "<s> " + output;
}
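
On completion, the request manager now strips the prompt tokens off the front of the generated sequence with isPrefixAndRemove, drops a trailing EOS token when add_special_tokens is false, and decodes only the remaining tokens. Here is a small standalone sketch of that post-processing, assuming a hypothetical decode() in place of tokenizer_->Decode():

#include <algorithm>
#include <cassert>
#include <string>
#include <vector>

// Same helper as in the diff: if prefix is a prefix of vec, remove it in place.
bool isPrefixAndRemove(std::vector<int> const &prefix, std::vector<int> &vec) {
  if (prefix.size() > vec.size()) {
    return false;
  }
  if (std::equal(prefix.begin(), prefix.end(), vec.begin())) {
    vec.erase(vec.begin(), vec.begin() + prefix.size());
    return true;
  }
  return false;
}

// Hypothetical stand-in for tokenizer_->Decode(): a real implementation maps
// token ids back to text; here we just join the ids for illustration.
std::string decode(std::vector<int> const &tokens) {
  std::string out;
  for (int t : tokens) {
    out += std::to_string(t) + " ";
  }
  return out;
}

// Sketch of the completion-time post-processing added by this commit: keep only
// the generated suffix, trim a trailing EOS when special tokens are disabled,
// and decode just that suffix instead of the whole sequence.
std::string finalize_output(std::vector<int> const &prompt_tokens,
                            std::vector<int> all_tokens,
                            int eos_token_id,
                            bool add_special_tokens) {
  bool prompt_found = isPrefixAndRemove(prompt_tokens, all_tokens);
  assert(prompt_found && "generated sequence does not start with the prompt");
  if (!add_special_tokens && !all_tokens.empty() &&
      all_tokens.back() == eos_token_id) {
    all_tokens.pop_back();
  }
  return decode(all_tokens);
}

Unlike the diff, the sketch calls isPrefixAndRemove outside the assert; wrapping a call with side effects inside assert means the prefix would never be stripped in an NDEBUG build.
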
32 changes: 22 additions & 10 deletions suffix_decoding/run_suffix_decoding.sh
@@ -6,16 +6,28 @@ set -x
cd "${BASH_SOURCE[0]%/*}/../build"

# Download models
python ../inference/utils/download_hf_model.py --half-precision-only meta-llama/Meta-Llama-3-8B Felladrin/Llama-160M-Chat-v1
export RUST_BACKTRACE=1
python ../inference/utils/download_hf_model.py meta-llama/Meta-Llama-3-8B-Instruct Felladrin/Llama-160M-Chat-v1
# export RUST_BACKTRACE=1

gdb -ex run -ex bt --args ./inference/suffix_decoding/suffix_decoding \
# gdb -ex run -ex bt --args ./inference/suffix_decoding/suffix_decoding \
# -ll:gpu 4 -ll:cpu 4 -ll:util 4 \
# -tensor-parallelism-degree 4 \
# -ll:fsize 20000 -ll:zsize 30000 \
# -llm-model meta-llama/Meta-Llama-3-8B \
# -ssm-model Felladrin/Llama-160M-Chat-v1 \
# -partition-name "" \
# -prompt ../../suffix-tree-decoding/trace/spider_v2.json \
# -output-file ../inference/output/spider_v2.out

./inference/incr_decoding/incr_decoding \
-ll:gpu 4 -ll:cpu 4 -ll:util 4 \
-tensor-parallelism-degree 4 \
-ll:fsize 20000 -ll:zsize 30000 \
-llm-model meta-llama/Meta-Llama-3-8B \
-ssm-model Felladrin/Llama-160M-Chat-v1 \
-partition-name "" \
-prompt ../../suffix-tree-decoding/trace/spider_v2.json \
-output-file ../inference/output/spider_v2.out

-ll:fsize 20000 -ll:zsize 60000 \
--max-sequence-length 1200 \
--max-requests-per-batch 1 \
--max-tokens-per-batch 256 \
-llm-model meta-llama/Meta-Llama-3-8B-Instruct \
-prompt /home/yak/goliaro/suffix-tree-decoding/trace/cortex_v2.json \
-output-file /home/yak/goliaro/FlexFlow/suffix_decoding/test.out \
-target-partition FEATURE_EXTRACTION \
-output-trace-path /home/yak/goliaro/suffix-tree-decoding/trace/flexflow/cortex_ff_FEATURE_EXTRACTION.json
