diff --git a/CMakeLists.txt b/CMakeLists.txt
index d0a1c3a4fc..d1cb58b343 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -300,7 +300,7 @@ if(NOT BUILD_LEGION_ONLY)
   include(FetchContent)
   FetchContent_Declare(
     suffix_decoding
-    GIT_REPOSITORY git@github.com:Snowflake-Labs/suffix-tree-decoding.git
+    GIT_REPOSITORY https://github.com/Snowflake-Labs/suffix-tree-decoding.git
     GIT_TAG main # or a specific tag/commit hash
   )
   FetchContent_MakeAvailable(suffix_decoding)
diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index 7aa11e8845..415ad0b586 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -70,6 +70,7 @@ struct Request {
   PEFTModelID peft_model_id = PEFTModelID::NO_ID;
   int max_length = -1;
   int max_new_tokens = -1;
+  bool add_special_tokens = true;
   int initial_len;
   int ssm_cache_size = 0;
   int llm_cache_size = 0;
diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc
index f8e16f24fa..26266ab3d3 100644
--- a/inference/incr_decoding/incr_decoding.cc
+++ b/inference/incr_decoding/incr_decoding.cc
@@ -47,7 +47,9 @@ void parse_input_args(char **argv,
                       float &topp,
                       int &max_requests_per_batch,
                       int &max_tokens_per_batch,
-                      int &max_sequence_length) {
+                      int &max_sequence_length,
+                      std::string &target_partition,
+                      std::string &output_trace_path) {
   for (int i = 1; i < argc; i++) {
     // llm model type
     if (!strcmp(argv[i], "-llm-model")) {
@@ -105,6 +107,14 @@ void parse_input_args(char **argv,
       max_sequence_length = std::stoi(argv[++i]);
       continue;
     }
+    if (!strcmp(argv[i], "-target-partition")) {
+      target_partition = std::string(argv[++i]);
+      continue;
+    }
+    if (!strcmp(argv[i], "-output-trace-path")) {
+      output_trace_path = std::string(argv[++i]);
+      continue;
+    }
   }
   if (paths.cache_folder_path.empty()) {
     char const *ff_cache_path = std::getenv("FF_CACHE_PATH");
@@ -136,6 +146,8 @@ void FlexFlow::top_level_task(Task const *task,
   int max_requests_per_batch = 8;
   int max_tokens_per_batch = 128;
   int max_sequence_length = 256;
+  std::string target_partition = "FEATURE_EXTRACTION";
+  std::string output_trace_path = "/home/yak/goliaro/FlexFlow/suffix_decoding/modified_json_file.json";
 
   InputArgs const &command_args = HighLevelRuntime::get_input_args();
   char **argv = command_args.argv;
@@ -151,7 +163,9 @@ void FlexFlow::top_level_task(Task const *task,
                    topp,
                    max_requests_per_batch,
                    max_tokens_per_batch,
-                   max_sequence_length);
+                   max_sequence_length,
+                   target_partition,
+                   output_trace_path);
   assert(ffconfig.data_parallelism_degree * ffconfig.tensor_parallelism_degree *
              ffconfig.pipeline_parallelism_degree ==
@@ -257,25 +271,66 @@ void FlexFlow::top_level_task(Task const *task,
   int total_num_requests = 0;
   {
-    using json = nlohmann::json;
-    std::ifstream file_handle(file_paths.prompt_file_path);
-    assert(file_handle.good() && "Prompt file does not exist.");
-    json prompt_json = json::parse(file_handle,
-                                   /*parser_callback_t */ nullptr,
-                                   /*allow_exceptions */ true,
-                                   /*ignore_comments */ true);
+    // using json = nlohmann::json;
+    using json = nlohmann::ordered_json;
+    std::ifstream input_file(file_paths.prompt_file_path);
+    assert(input_file.good() && "Prompt file does not exist.");
+    json j;
+    input_file >> j;
+    input_file.close();
+
+    // Find the partition with name "FEATURE_EXTRACTION"
+    auto& partitions = j["partitions"];
+    auto it = std::find_if(partitions.begin(), partitions.end(),
+        [target_partition](const json& partition) {
+            return partition["partition_name"] == target_partition;
+        });
 
-    std::vector<Request> requests;
-    for (auto &prompt : prompt_json) {
-      std::string text = prompt.get<std::string>();
-      printf("Prompt[%d]: %s\n", total_num_requests, text.c_str());
-      Request inference_req;
-      inference_req.prompt = text;
-      inference_req.max_length = 128;
-      requests.push_back(inference_req);
-      total_num_requests++;
+    if (it != partitions.end()) {
+      // We found the partition
+      json& feature_extraction_partition = *it;
+
+      // Iterate through eval_entries
+      std::vector<Request> requests;
+      for (auto& entry : feature_extraction_partition["eval_entries"]) {
+        std::string text = entry["prompt"];
+        int max_new_tokens_ = entry["response_length"];
+        printf("Prompt[%d]: %s\n", total_num_requests, text.c_str());
+        Request inference_req;
+        inference_req.prompt = text;
+        inference_req.max_new_tokens = max_new_tokens_;
+        inference_req.add_special_tokens = false;
+        requests.push_back(inference_req);
+        total_num_requests++;
+        // break;
+      }
+      std::vector<GenerationResult> result = model.generate(requests);
+      assert(result.size() == requests.size());
+      assert(result.size() == total_num_requests);
+      assert(result.size() == feature_extraction_partition["eval_entries"].size());
+      int i = 0;
+      for (auto& entry : feature_extraction_partition["eval_entries"]) {
+        entry["original_response"] = entry["response"];
+        entry["original_response_length"] = entry["response_length"];
+        std::string ff_out = result[i].output_text;
+        int tot_length = result[i].output_text.length();
+        entry["response"] = ff_out;
+        entry["response_length"] = result[i].output_tokens.size();
+        i++;
+      }
+
+      // Write the modified JSON to a file
+      std::ofstream output_file(output_trace_path);
+      if (output_file.is_open()) {
+        output_file << j.dump(2);
+        output_file.close();
+        std::cout << "Modified JSON has been saved to " << output_trace_path << std::endl;
+      } else {
+        std::cerr << "Unable to open file for writing." << std::endl;
+      }
+    } else {
+      std::cout << target_partition << " partition not found."
+                << std::endl;
     }
-    std::vector<GenerationResult> result = model.generate(requests);
   }
 
   // terminate the request manager by stopping the background thread
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 634794320d..7a301438ed 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -56,6 +56,7 @@ std::ostream &operator<<(std::ostream &os, Request const &req) {
   os << " peft_model_id: " << req.peft_model_id << "\n";
   os << " max_length: " << req.max_length << "\n";
   os << " max_new_tokens: " << req.max_new_tokens << "\n";
+  os << " add_special_tokens: " << req.add_special_tokens << "\n";
   os << " initial_len: " << req.initial_len << "\n";
   os << " ssm_cache_size: " << req.ssm_cache_size << "\n";
   os << " llm_cache_size: " << req.llm_cache_size << "\n";
@@ -413,6 +414,7 @@ RequestManager::RequestGuid
   request.guid = next_available_guid++;
   request.max_length = request_.max_length;
   request.max_new_tokens = request_.max_new_tokens;
+  request.add_special_tokens = request_.add_special_tokens;
   // both unset
   if (request.max_length == -1 && request.max_new_tokens == -1) {
     request.max_length = get_max_sequence_length() - 1;
@@ -427,7 +429,7 @@ RequestManager::RequestGuid
   }
   request.peft_model_id = request_.peft_model_id;
   request.warmup = request_.warmup;
-  if (bos_token_id >= 0 && model_type != ModelType::FALCON) {
+  if (bos_token_id >= 0 && request.add_special_tokens && model_type != ModelType::FALCON) {
     request.tokens.push_back(bos_token_id);
   }
   if (request_.benchmarking_tokens >= 0) {
@@ -520,6 +522,7 @@ RequestManager::RequestGuid
   request.initial_len = 0;
   request.max_length = request_.max_length;
   request.max_new_tokens = request_.max_new_tokens;
+  request.add_special_tokens = request_.add_special_tokens;
   if (request.max_new_tokens != -1) {
     std::cerr
         << "Error: max_new_tokens is not allowed for PEFT finetuning requests"
@@ -544,7 +547,7 @@ RequestManager::RequestGuid
   request.benchmarking_tokens = request_.benchmarking_tokens;
   std::vector<int32_t> input_tokens;
   std::vector<int32_t> output_tokens;
-  bool bos_added = (bos_token_id >= 0 && model_type != ModelType::FALCON);
+  bool bos_added = (bos_token_id >= 0 && request.add_special_tokens && model_type != ModelType::FALCON);
   if (bos_added) {
     input_tokens.push_back(bos_token_id);
   }
@@ -566,7 +569,7 @@ RequestManager::RequestGuid
   std::string output_text("");
   std::vector<int32_t> input_tokens;
   input_tokens = this->tokenizer_->Encode(text);
-  if (bos_token_id >= 0 && model_type != ModelType::FALCON) {
+  if (bos_token_id >= 0 && request.add_special_tokens && model_type != ModelType::FALCON) {
     input_tokens.insert(input_tokens.begin(), bos_token_id);
   }
   std::vector<int32_t> output_tokens =
@@ -755,6 +758,19 @@ void RequestManager::check_batch(BatchConfig const &old_bc,
   }
 }
 
+bool isPrefixAndRemove(const std::vector<BatchConfig::TokenId>& prefix, std::vector<BatchConfig::TokenId>& vec) {
+  if (prefix.size() > vec.size()) {
+    return false;
+  }
+
+  if (std::equal(prefix.begin(), prefix.end(), vec.begin())) {
+    vec.erase(vec.begin(), vec.begin() + prefix.size());
+    return true;
+  }
+
+  return false;
+}
+
 BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc,
                                                InferenceResult const &result) {
   const std::lock_guard<std::mutex> lock(request_queue_mutex);
@@ -814,7 +830,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc,
       std::string output = this->tokenizer_->Decode(request.tokens);
       // Unlike Huggingface, the sentencepiece C++ library automatically
       // removes the BOS token
-      if (model_type == ModelType::LLAMA && old_llama_tokenizer &&
+      if (model_type == ModelType::LLAMA && old_llama_tokenizer && request.add_special_tokens &&
          request.tokens.at(0) == bos_token_id) {
        output = " " + output;
      }
@@ -823,7 +839,11 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc,
      GenerationResult &gr = request_generation_results[request.guid];
      assert(gr.guid == request.guid);
      gr.output_tokens = request.tokens;
-      gr.output_text = output;
+      assert(isPrefixAndRemove(gr.input_tokens, gr.output_tokens));
+      if (gr.output_tokens.size() > 0 && gr.output_tokens[gr.output_tokens.size() - 1] == eos_token_id && !request.add_special_tokens) {
+        gr.output_tokens.pop_back();
+      }
+      gr.output_text = this->tokenizer_->Decode(gr.output_tokens);
      }
      request.status = Request::COMPLETED;
      trigger_request_completion_future(request.guid);
@@ -1275,7 +1295,7 @@ BeamSearchBatchConfig
      std::string output = this->tokenizer_->Decode(request.tokens);
      // Unlike Huggingface, the sentencepiece C++ library automatically
      // removes the BOS token
-      if (model_type == ModelType::LLAMA && old_llama_tokenizer &&
+      if (model_type == ModelType::LLAMA && old_llama_tokenizer && request.add_special_tokens &&
          request.tokens.at(0) == bos_token_id) {
        output = " " + output;
      }
@@ -1418,7 +1438,7 @@ BeamSearchBatchConfig
      std::string output = this->tokenizer_->Decode(request.tokens);
      // Unlike Huggingface, the sentencepiece C++ library automatically
      // removes the BOS token
-      if (model_type == ModelType::LLAMA && old_llama_tokenizer &&
+      if (model_type == ModelType::LLAMA && old_llama_tokenizer && request.add_special_tokens &&
          request.tokens.at(0) == bos_token_id) {
        output = " " + output;
      }
@@ -1466,7 +1486,7 @@ BeamSearchBatchConfig
      std::string output = this->tokenizer_->Decode(request.tokens);
      // Unlike Huggingface, the sentencepiece C++ library automatically removes
      // the BOS token
-      if (model_type == ModelType::LLAMA && old_llama_tokenizer &&
+      if (model_type == ModelType::LLAMA && old_llama_tokenizer && request.add_special_tokens &&
          request.tokens.at(0) == bos_token_id) {
        output = " " + output;
      }
diff --git a/suffix_decoding/run_suffix_decoding.sh b/suffix_decoding/run_suffix_decoding.sh
index bd0480426a..b2c6cf293c 100755
--- a/suffix_decoding/run_suffix_decoding.sh
+++ b/suffix_decoding/run_suffix_decoding.sh
@@ -6,16 +6,28 @@ set -x
 cd "${BASH_SOURCE[0]%/*}/../build"
 
 # Download models
-python ../inference/utils/download_hf_model.py --half-precision-only meta-llama/Meta-Llama-3-8B Felladrin/Llama-160M-Chat-v1
-export RUST_BACKTRACE=1
+python ../inference/utils/download_hf_model.py meta-llama/Meta-Llama-3-8B-Instruct Felladrin/Llama-160M-Chat-v1
+# export RUST_BACKTRACE=1
 
-gdb -ex run -ex bt --args ./inference/suffix_decoding/suffix_decoding \
+# gdb -ex run -ex bt --args ./inference/suffix_decoding/suffix_decoding \
+#     -ll:gpu 4 -ll:cpu 4 -ll:util 4 \
+#     -tensor-parallelism-degree 4 \
+#     -ll:fsize 20000 -ll:zsize 30000 \
+#     -llm-model meta-llama/Meta-Llama-3-8B \
+#     -ssm-model Felladrin/Llama-160M-Chat-v1 \
+#     -partition-name "" \
+#     -prompt ../../suffix-tree-decoding/trace/spider_v2.json \
+#     -output-file ../inference/output/spider_v2.out
+
+./inference/incr_decoding/incr_decoding \
     -ll:gpu 4 -ll:cpu 4 -ll:util 4 \
     -tensor-parallelism-degree 4 \
-    -ll:fsize 20000 -ll:zsize 30000 \
-    -llm-model meta-llama/Meta-Llama-3-8B \
-    -ssm-model Felladrin/Llama-160M-Chat-v1 \
-    -partition-name "" \
-    -prompt ../../suffix-tree-decoding/trace/spider_v2.json \
-    -output-file ../inference/output/spider_v2.out
-
+    -ll:fsize 20000 -ll:zsize 60000 \
+    --max-sequence-length 1200 \
+    --max-requests-per-batch 1 \
+    --max-tokens-per-batch 256 \
+    -llm-model meta-llama/Meta-Llama-3-8B-Instruct \
+    -prompt /home/yak/goliaro/suffix-tree-decoding/trace/cortex_v2.json \
+    -output-file /home/yak/goliaro/FlexFlow/suffix_decoding/test.out \
+    -target-partition FEATURE_EXTRACTION \
+    -output-trace-path /home/yak/goliaro/suffix-tree-decoding/trace/flexflow/cortex_ff_FEATURE_EXTRACTION.json
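
For reference, the block below is a minimal standalone sketch (not part of the patch) of the trace layout that the new reader in incr_decoding.cc appears to assume: a top-level "partitions" array whose entries carry a "partition_name" and a list of "eval_entries" with "prompt", "response", and "response_length" fields. The field names are taken from the diff above; the sample prompt, response values, and output file name are invented for illustration. It mirrors the same rewrite steps (keep the original response, then overwrite "response" and "response_length") using nlohmann::ordered_json.

// trace_sketch.cc: illustrative only; field names follow the patch above,
// sample values and the output path are made up.
#include <fstream>
#include <iostream>
#include <string>
#include <nlohmann/json.hpp>

int main() {
  using json = nlohmann::ordered_json; // preserves key order, as in the patch
  json trace = json::parse(R"({
    "partitions": [
      {
        "partition_name": "FEATURE_EXTRACTION",
        "eval_entries": [
          { "prompt": "Extract the entities ...", "response": "...", "response_length": 42 }
        ]
      }
    ]
  })");

  std::string target_partition = "FEATURE_EXTRACTION";
  for (auto &partition : trace["partitions"]) {
    if (partition["partition_name"] != target_partition) {
      continue;
    }
    for (auto &entry : partition["eval_entries"]) {
      // Keep the reference output, then overwrite "response" and
      // "response_length" with the generated output, mirroring the patch.
      entry["original_response"] = entry["response"];
      entry["original_response_length"] = entry["response_length"];
      entry["response"] = "generated text would go here";
      entry["response_length"] = 7; // number of generated tokens
    }
  }

  std::ofstream("modified_trace.json") << trace.dump(2) << std::endl;
  std::cout << trace.dump(2) << std::endl;
  return 0;
}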