
Commit

update
sfc-gh-goliaro committed Oct 18, 2024
1 parent 3b9d8bb commit 0114211
Showing 5 changed files with 126 additions and 38 deletions.
2 changes: 1 addition & 1 deletion CMakeLists.txt
@@ -300,7 +300,7 @@ if(NOT BUILD_LEGION_ONLY)
include(FetchContent)
FetchContent_Declare(
suffix_decoding
GIT_REPOSITORY git@github.com:Snowflake-Labs/suffix-tree-decoding.git
GIT_REPOSITORY https://github.com/Snowflake-Labs/suffix-tree-decoding.git
GIT_TAG main # or a specific tag/commit hash
)
FetchContent_MakeAvailable(suffix_decoding)
1 change: 1 addition & 0 deletions include/flexflow/request_manager.h
@@ -70,6 +70,7 @@ struct Request {
PEFTModelID peft_model_id = PEFTModelID::NO_ID;
int max_length = -1;
int max_new_tokens = -1;
bool add_special_tokens = true;
int initial_len;
int ssm_cache_size = 0;
int llm_cache_size = 0;
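
The new add_special_tokens flag lets a caller opt out of the automatic BOS/EOS handling for a single request. Below is a minimal sketch (not the real FlexFlow struct; RequestSketch and make_trace_request are illustrative names) of how a trace-replay caller might populate the fields shown above:

#include <string>

// Minimal stand-in for the fields of FlexFlow's Request that this commit touches.
struct RequestSketch {
  std::string prompt;
  int max_length = -1;
  int max_new_tokens = -1;
  bool add_special_tokens = true; // new field added by this commit
};

// Build a request that replays a trace entry: cap generation at the recorded
// response length and skip BOS insertion so tokenization matches the trace.
RequestSketch make_trace_request(std::string const &prompt, int response_length) {
  RequestSketch req;
  req.prompt = prompt;
  req.max_new_tokens = response_length;
  req.add_special_tokens = false;
  return req;
}
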
93 changes: 74 additions & 19 deletions inference/incr_decoding/incr_decoding.cc
@@ -47,7 +47,9 @@ void parse_input_args(char **argv,
float &topp,
int &max_requests_per_batch,
int &max_tokens_per_batch,
int &max_sequence_length) {
int &max_sequence_length,
std::string &target_partition,
std::string &output_trace_path) {
for (int i = 1; i < argc; i++) {
// llm model type
if (!strcmp(argv[i], "-llm-model")) {
@@ -105,6 +107,14 @@ void parse_input_args(char **argv,
max_sequence_length = std::stoi(argv[++i]);
continue;
}
if (!strcmp(argv[i], "-target-partition")) {
target_partition = std::string(argv[++i]);
continue;
}
if (!strcmp(argv[i], "-output-trace-path")) {
output_trace_path = std::string(argv[++i]);
continue;
}
}
if (paths.cache_folder_path.empty()) {
char const *ff_cache_path = std::getenv("FF_CACHE_PATH");
Expand Down Expand Up @@ -136,6 +146,8 @@ void FlexFlow::top_level_task(Task const *task,
int max_requests_per_batch = 8;
int max_tokens_per_batch = 128;
int max_sequence_length = 256;
std::string target_partition = "FEATURE_EXTRACTION";
std::string output_trace_path = "/home/yak/goliaro/FlexFlow/suffix_decoding/modified_json_file.json";

InputArgs const &command_args = HighLevelRuntime::get_input_args();
char **argv = command_args.argv;
@@ -151,7 +163,9 @@ void FlexFlow::top_level_task(Task const *task,
topp,
max_requests_per_batch,
max_tokens_per_batch,
max_sequence_length);
max_sequence_length,
target_partition,
output_trace_path);

assert(ffconfig.data_parallelism_degree * ffconfig.tensor_parallelism_degree *
ffconfig.pipeline_parallelism_degree ==
@@ -257,25 +271,66 @@ void FlexFlow::top_level_task(Task const *task,

int total_num_requests = 0;
{
using json = nlohmann::json;
std::ifstream file_handle(file_paths.prompt_file_path);
assert(file_handle.good() && "Prompt file does not exist.");
json prompt_json = json::parse(file_handle,
/*parser_callback_t */ nullptr,
/*allow_exceptions */ true,
/*ignore_comments */ true);
// using json = nlohmann::json;
using json = nlohmann::ordered_json;
std::ifstream input_file(file_paths.prompt_file_path);
assert(input_file.good() && "Prompt file does not exist.");
json j;
input_file >> j;
input_file.close();

// Find the partition whose name matches target_partition
auto& partitions = j["partitions"];
auto it = std::find_if(partitions.begin(), partitions.end(),
[target_partition](const json& partition) {
return partition["partition_name"] == target_partition;
});

std::vector<Request> requests;
for (auto &prompt : prompt_json) {
std::string text = prompt.get<std::string>();
printf("Prompt[%d]: %s\n", total_num_requests, text.c_str());
Request inference_req;
inference_req.prompt = text;
inference_req.max_length = 128;
requests.push_back(inference_req);
total_num_requests++;
if (it != partitions.end()) {
// We found the partition
json& feature_extraction_partition = *it;

// Iterate through eval_entries
std::vector<Request> requests;
for (auto& entry : feature_extraction_partition["eval_entries"]) {
std::string text = entry["prompt"];
int max_new_tokens_ = entry["response_length"];
printf("Prompt[%d]: %s\n", total_num_requests, text.c_str());
Request inference_req;
inference_req.prompt = text;
inference_req.max_new_tokens = max_new_tokens_;
inference_req.add_special_tokens = false;
requests.push_back(inference_req);
total_num_requests++;
// break;
}
std::vector<GenerationResult> result = model.generate(requests);
assert(result.size() == requests.size());
assert(result.size() == total_num_requests);
assert(result.size() == feature_extraction_partition["eval_entries"].size());
int i = 0;
for (auto& entry : feature_extraction_partition["eval_entries"]) {
entry["original_response"] = entry["response"];
entry["original_response_length"] = entry["response_length"];
std::string ff_out = result[i].output_text;
int tot_length = result[i].output_text.length();
entry["response"] = ff_out;
entry["response_length"] = result[i].output_tokens.size();
i++;
}

// Write the modified JSON to a file
std::ofstream output_file(output_trace_path);
if (output_file.is_open()) {
output_file << j.dump(2);
output_file.close();
std::cout << "Modified JSON has been saved to " << output_trace_path << std::endl;
} else {
std::cerr << "Unable to open file for writing." << std::endl;
}
} else {
std::cout << target_partition << " partition not found." << std::endl;
}
std::vector<GenerationResult> result = model.generate(requests);
}

// terminate the request manager by stopping the background thread
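
The rewritten driver above loads a trace JSON, selects the partition named by -target-partition, replays its eval_entries as requests, and writes the trace back out with the generated responses substituted in (the originals are kept under original_response / original_response_length). The following condensed, standalone sketch of that round-trip uses nlohmann::ordered_json with the same field names as the diff; generate_text is a hypothetical stand-in for model.generate(), and response_length is approximated by character count here because the sketch has no tokenizer (the commit stores the output token count instead):

#include <nlohmann/json.hpp>
#include <algorithm>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>

using json = nlohmann::ordered_json; // preserves key order when re-serializing

// Hypothetical stand-in for model.generate(): returns placeholder outputs.
std::vector<std::string> generate_text(std::vector<std::string> const &prompts) {
  return std::vector<std::string>(prompts.size(), "<generated text>");
}

int rewrite_trace(std::string const &input_path,
                  std::string const &target_partition,
                  std::string const &output_path) {
  std::ifstream input_file(input_path);
  if (!input_file.good()) {
    std::cerr << "Trace file does not exist: " << input_path << std::endl;
    return 1;
  }
  json trace;
  input_file >> trace;

  // Locate the partition whose partition_name matches the requested one.
  auto &partitions = trace["partitions"];
  auto it = std::find_if(partitions.begin(), partitions.end(),
                         [&](json const &p) {
                           return p["partition_name"] == target_partition;
                         });
  if (it == partitions.end()) {
    std::cerr << target_partition << " partition not found." << std::endl;
    return 1;
  }

  // Collect prompts, generate, then splice outputs back into the trace while
  // preserving the original responses under separate keys.
  std::vector<std::string> prompts;
  for (auto &entry : (*it)["eval_entries"]) {
    prompts.push_back(entry["prompt"].get<std::string>());
  }
  std::vector<std::string> outputs = generate_text(prompts);

  size_t i = 0;
  for (auto &entry : (*it)["eval_entries"]) {
    entry["original_response"] = entry["response"];
    entry["original_response_length"] = entry["response_length"];
    entry["response"] = outputs[i];
    entry["response_length"] = static_cast<int>(outputs[i].size());
    ++i;
  }

  std::ofstream output_file(output_path);
  output_file << trace.dump(2) << std::endl;
  return 0;
}

A call such as rewrite_trace(prompt_file, "FEATURE_EXTRACTION", output_trace_path) mirrors the flow in top_level_task above, minus the FlexFlow request plumbing.
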
36 changes: 28 additions & 8 deletions src/runtime/request_manager.cc
@@ -56,6 +56,7 @@ std::ostream &operator<<(std::ostream &os, Request const &req) {
os << " peft_model_id: " << req.peft_model_id << "\n";
os << " max_length: " << req.max_length << "\n";
os << " max_new_tokens: " << req.max_new_tokens << "\n";
os << " add_special_tokens: " << req.add_special_tokens << "\n";
os << " initial_len: " << req.initial_len << "\n";
os << " ssm_cache_size: " << req.ssm_cache_size << "\n";
os << " llm_cache_size: " << req.llm_cache_size << "\n";
@@ -413,6 +414,7 @@ RequestManager::RequestGuid
request.guid = next_available_guid++;
request.max_length = request_.max_length;
request.max_new_tokens = request_.max_new_tokens;
request.add_special_tokens = request_.add_special_tokens;
// both unset
if (request.max_length == -1 && request.max_new_tokens == -1) {
request.max_length = get_max_sequence_length() - 1;
@@ -427,7 +429,7 @@ RequestManager::RequestGuid
}
request.peft_model_id = request_.peft_model_id;
request.warmup = request_.warmup;
if (bos_token_id >= 0 && model_type != ModelType::FALCON) {
if (bos_token_id >= 0 && request.add_special_tokens && model_type != ModelType::FALCON) {
request.tokens.push_back(bos_token_id);
}
if (request_.benchmarking_tokens >= 0) {
@@ -520,6 +522,7 @@ RequestManager::RequestGuid
request.initial_len = 0;
request.max_length = request_.max_length;
request.max_new_tokens = request_.max_new_tokens;
request.add_special_tokens = request_.add_special_tokens;
if (request.max_new_tokens != -1) {
std::cerr
<< "Error: max_new_tokens is not allowed for PEFT finetuning requests"
@@ -544,7 +547,7 @@ RequestManager::RequestGuid
request.benchmarking_tokens = request_.benchmarking_tokens;
std::vector<int32_t> input_tokens;
std::vector<int32_t> output_tokens;
bool bos_added = (bos_token_id >= 0 && model_type != ModelType::FALCON);
bool bos_added = (bos_token_id >= 0 && request.add_special_tokens && model_type != ModelType::FALCON);
if (bos_added) {
input_tokens.push_back(bos_token_id);
}
@@ -566,7 +569,7 @@ RequestManager::RequestGuid
std::string output_text("");
std::vector<int32_t> input_tokens;
input_tokens = this->tokenizer_->Encode(text);
if (bos_token_id >= 0 && model_type != ModelType::FALCON) {
if (bos_token_id >= 0 && request.add_special_tokens && model_type != ModelType::FALCON) {
input_tokens.insert(input_tokens.begin(), bos_token_id);
}
std::vector<int32_t> output_tokens =
@@ -755,6 +758,19 @@ void RequestManager::check_batch(BatchConfig const &old_bc,
}
}

bool isPrefixAndRemove(const std::vector<int>& prefix, std::vector<int>& vec) {
if (prefix.size() > vec.size()) {
return false;
}

if (std::equal(prefix.begin(), prefix.end(), vec.begin())) {
vec.erase(vec.begin(), vec.begin() + prefix.size());
return true;
}

return false;
}

BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc,
InferenceResult const &result) {
const std::lock_guard<std::mutex> lock(request_queue_mutex);
@@ -814,7 +830,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc,
std::string output = this->tokenizer_->Decode(request.tokens);
// Unlike Huggingface, the sentencepiece C++ library automatically
// removes the BOS token
if (model_type == ModelType::LLAMA && old_llama_tokenizer &&
if (model_type == ModelType::LLAMA && old_llama_tokenizer && request.add_special_tokens &&
request.tokens.at(0) == bos_token_id) {
output = "<s> " + output;
}
@@ -823,7 +839,11 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc,
GenerationResult &gr = request_generation_results[request.guid];
assert(gr.guid == request.guid);
gr.output_tokens = request.tokens;
gr.output_text = output;
assert(isPrefixAndRemove(gr.input_tokens, gr.output_tokens));
if (gr.output_tokens.size() > 0 && gr.output_tokens[gr.output_tokens.size() - 1] == eos_token_id && !request.add_special_tokens) {
gr.output_tokens.pop_back();
}
gr.output_text = this->tokenizer_->Decode(gr.output_tokens);
}
request.status = Request::COMPLETED;
trigger_request_completion_future(request.guid);
@@ -1275,7 +1295,7 @@ BeamSearchBatchConfig
std::string output = this->tokenizer_->Decode(request.tokens);
// Unlike Huggingface, the sentencepiece C++ library automatically
// removes the BOS token
if (model_type == ModelType::LLAMA && old_llama_tokenizer &&
if (model_type == ModelType::LLAMA && old_llama_tokenizer && request.add_special_tokens &&
request.tokens.at(0) == bos_token_id) {
output = "<s> " + output;
}
@@ -1418,7 +1438,7 @@ BeamSearchBatchConfig
std::string output = this->tokenizer_->Decode(request.tokens);
// Unlike Huggingface, the sentencepiece C++ library automatically
// removes the BOS token
if (model_type == ModelType::LLAMA && old_llama_tokenizer &&
if (model_type == ModelType::LLAMA && old_llama_tokenizer && request.add_special_tokens &&
request.tokens.at(0) == bos_token_id) {
output = "<s> " + output;
}
@@ -1466,7 +1486,7 @@ BeamSearchBatchConfig
std::string output = this->tokenizer_->Decode(request.tokens);
// Unlike Huggingface, the sentencepiece C++ library automatically removes
// the BOS token
if (model_type == ModelType::LLAMA && old_llama_tokenizer &&
if (model_type == ModelType::LLAMA && old_llama_tokenizer && request.add_special_tokens &&
request.tokens.at(0) == bos_token_id) {
output = "<s> " + output;
}
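
On completion, the request manager now strips the prompt tokens off the front of the generated sequence with isPrefixAndRemove, drops a trailing EOS token when add_special_tokens is false, and decodes only the remaining tokens. Here is a small standalone sketch of that post-processing, assuming a hypothetical decode() in place of tokenizer_->Decode():

#include <algorithm>
#include <cassert>
#include <string>
#include <vector>

// Same helper as in the diff: if prefix is a prefix of vec, remove it in place.
bool isPrefixAndRemove(std::vector<int> const &prefix, std::vector<int> &vec) {
  if (prefix.size() > vec.size()) {
    return false;
  }
  if (std::equal(prefix.begin(), prefix.end(), vec.begin())) {
    vec.erase(vec.begin(), vec.begin() + prefix.size());
    return true;
  }
  return false;
}

// Hypothetical stand-in for tokenizer_->Decode(): a real implementation maps
// token ids back to text; here we just join the ids for illustration.
std::string decode(std::vector<int> const &tokens) {
  std::string out;
  for (int t : tokens) {
    out += std::to_string(t) + " ";
  }
  return out;
}

// Sketch of the completion-time post-processing added by this commit: keep only
// the generated suffix, trim a trailing EOS when special tokens are disabled,
// and decode just that suffix instead of the whole sequence.
std::string finalize_output(std::vector<int> const &prompt_tokens,
                            std::vector<int> all_tokens,
                            int eos_token_id,
                            bool add_special_tokens) {
  bool prompt_found = isPrefixAndRemove(prompt_tokens, all_tokens);
  assert(prompt_found && "generated sequence does not start with the prompt");
  if (!add_special_tokens && !all_tokens.empty() &&
      all_tokens.back() == eos_token_id) {
    all_tokens.pop_back();
  }
  return decode(all_tokens);
}

Unlike the diff, the sketch calls isPrefixAndRemove outside the assert; wrapping a call with side effects inside assert means the prefix would never be stripped in an NDEBUG build.
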
32 changes: 22 additions & 10 deletions suffix_decoding/run_suffix_decoding.sh
@@ -6,16 +6,28 @@ set -x
cd "${BASH_SOURCE[0]%/*}/../build"

# Download models
python ../inference/utils/download_hf_model.py --half-precision-only meta-llama/Meta-Llama-3-8B Felladrin/Llama-160M-Chat-v1
export RUST_BACKTRACE=1
python ../inference/utils/download_hf_model.py meta-llama/Meta-Llama-3-8B-Instruct Felladrin/Llama-160M-Chat-v1
# export RUST_BACKTRACE=1

gdb -ex run -ex bt --args ./inference/suffix_decoding/suffix_decoding \
# gdb -ex run -ex bt --args ./inference/suffix_decoding/suffix_decoding \
# -ll:gpu 4 -ll:cpu 4 -ll:util 4 \
# -tensor-parallelism-degree 4 \
# -ll:fsize 20000 -ll:zsize 30000 \
# -llm-model meta-llama/Meta-Llama-3-8B \
# -ssm-model Felladrin/Llama-160M-Chat-v1 \
# -partition-name "" \
# -prompt ../../suffix-tree-decoding/trace/spider_v2.json \
# -output-file ../inference/output/spider_v2.out

./inference/incr_decoding/incr_decoding \
-ll:gpu 4 -ll:cpu 4 -ll:util 4 \
-tensor-parallelism-degree 4 \
-ll:fsize 20000 -ll:zsize 30000 \
-llm-model meta-llama/Meta-Llama-3-8B \
-ssm-model Felladrin/Llama-160M-Chat-v1 \
-partition-name "" \
-prompt ../../suffix-tree-decoding/trace/spider_v2.json \
-output-file ../inference/output/spider_v2.out

-ll:fsize 20000 -ll:zsize 60000 \
--max-sequence-length 1200 \
--max-requests-per-batch 1 \
--max-tokens-per-batch 256 \
-llm-model meta-llama/Meta-Llama-3-8B-Instruct \
-prompt /home/yak/goliaro/suffix-tree-decoding/trace/cortex_v2.json \
-output-file /home/yak/goliaro/FlexFlow/suffix_decoding/test.out \
-target-partition FEATURE_EXTRACTION \
-output-trace-path /home/yak/goliaro/suffix-tree-decoding/trace/flexflow/cortex_ff_FEATURE_EXTRACTION.json
