From 6451a3bc152f6dd784e3f043def0f57900a753d0 Mon Sep 17 00:00:00 2001 From: Zeyu Wang Date: Fri, 18 Aug 2023 00:20:43 -0400 Subject: [PATCH 01/30] Reorder pipeline. --- include/flexflow/model.h | 2 +- src/mapper/mapper.cc | 2 +- src/runtime/request_manager.cc | 289 +++++++++++++++++---------------- 3 files changed, 149 insertions(+), 144 deletions(-) diff --git a/include/flexflow/model.h b/include/flexflow/model.h index bc3c7e6545..60ab164e08 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -229,8 +229,8 @@ enum TaskIDs { RM_LOAD_TOKENS_TASK_ID, RM_LOAD_POSITION_TASK_ID, RM_PREPARE_NEXT_BATCH_TASK_ID, - RM_PREPARE_NEXT_BATCH_BEAM_TASK_ID, RM_PREPARE_NEXT_BATCH_INIT_TASK_ID, + RM_PREPARE_NEXT_BATCH_BEAM_TASK_ID, RM_PREPARE_NEXT_BATCH_VERIFY_TASK_ID, // Custom tasks CUSTOM_GPU_TASK_ID_FIRST, diff --git a/src/mapper/mapper.cc b/src/mapper/mapper.cc index 3d08eb0bcc..a86a6167a6 100644 --- a/src/mapper/mapper.cc +++ b/src/mapper/mapper.cc @@ -284,8 +284,8 @@ void FFMapper::select_task_options(const MapperContext ctx, return; } if ((task.task_id == RM_PREPARE_NEXT_BATCH_TASK_ID) || - (task.task_id == RM_PREPARE_NEXT_BATCH_BEAM_TASK_ID) || (task.task_id == RM_PREPARE_NEXT_BATCH_INIT_TASK_ID) || + (task.task_id == RM_PREPARE_NEXT_BATCH_BEAM_TASK_ID) || (task.task_id == RM_PREPARE_NEXT_BATCH_VERIFY_TASK_ID)) { output.initial_proc = all_cpus[0]; return; diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 348272a69b..7a1bdf8e7e 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -458,149 +458,8 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, } /* ----- Speculative Inference Specific functions ----- */ -BeamSearchBatchConfigFuture RequestManager::prepare_next_batch_beam( - BeamSearchBatchConfigFuture const &old_bc, - BeamInferenceResultFuture const &result) { - Runtime *runtime = Runtime::get_runtime(); - Context ctx = Runtime::get_context(); - - RequestManager *rm = this; - TaskLauncher launcher(RM_PREPARE_NEXT_BATCH_BEAM_TASK_ID, - TaskArgument(&rm, sizeof(RequestManager *))); - launcher.add_future(old_bc); - launcher.add_future(result); - return runtime->execute_task(ctx, launcher); -} - -BeamSearchBatchConfig RequestManager::prepare_next_batch_beam_task( - Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - RequestManager *rm = *((RequestManager **)task->args); - BeamSearchBatchConfig const &bc = - Future(task->futures[0]).get_result(); - BeamInferenceResult const &result = - Future(task->futures[1]).get_result(); - return rm->prepare_next_batch_beam(bc, result); -} - -// update beam search metadata -BeamSearchBatchConfig - RequestManager::prepare_next_batch_beam(BeamSearchBatchConfig const &old_bc, - BeamInferenceResult const &result) { - const std::lock_guard lock(request_queue_mutex); - if (verbose) { - std::cout << "\n############### prepare_next_batch_beam ###############\n"; - } - if (verbose) { - std::cout << "print all results" - << "\n"; - for (int i = 0; i < 40; i++) { - std::cout << result.token_ids[i] << ", "; - } - std::cout << "Current Beam Depth: " - << old_bc.beamRequestsInfo[0].current_depth << "\n"; - } - - // Step 1: Store result to the beam tree struct - store_beam_metadata(old_bc, result); - - // Step 2: preparing the next batch for existing requests - BeamSearchBatchConfig new_bc; - new_bc.max_init_length = 0; - new_bc.model_id = old_bc.model_id; - // std::cout << "old_bc.model_id: " << old_bc.model_id << "\n"; - - for (int i = 0; 
i < BatchConfig::MAX_NUM_REQUESTS; i++) { - if (old_bc.request_completed[i]) { - continue; - } - // Comment out this assertion since num_tokens_in_batch can be - // zero when beam search has reached required sequence length - // assert(old_bc.requestsInfo[i].num_tokens_in_batch > 0); - Request &request = all_requests[old_bc.requestsInfo[i].request_guid]; - int processed_tokens = old_bc.requestsInfo[i].token_start_offset + - old_bc.requestsInfo[i].num_tokens_in_batch; - - // assert(processed_tokens < request.tokens.size()); - log_req_mgr.debug() << "processed_tokens: " << processed_tokens << "\n"; - if (processed_tokens > - old_bc.beamRequestsInfo[i].max_depth + request.tokens.size() - // || ir.results[t] == 0 TODO: replace this with - ) { - log_req_mgr.print("[Done] guid(%zu) with spec_tree_depth(%d)", - old_bc.requestsInfo[i].request_guid, - old_bc.beamRequestsInfo[i].max_depth); - // new_bc.request_completed[i] = true; - new_bc.request_completed[i] = false; - new_bc.requestsInfo[i].token_start_offset = processed_tokens; - new_bc.requestsInfo[i].request_guid = old_bc.requestsInfo[i].request_guid; - new_bc.requestsInfo[i].max_sequence_length = - old_bc.requestsInfo[i].max_sequence_length; - } else { - log_req_mgr.debug() << "num tokens: " << old_bc.num_tokens << ", " - << new_bc.num_tokens; - new_bc.request_completed[i] = false; - new_bc.requestsInfo[i].token_start_offset = processed_tokens; - new_bc.requestsInfo[i].request_guid = old_bc.requestsInfo[i].request_guid; - new_bc.requestsInfo[i].max_sequence_length = - old_bc.requestsInfo[i].max_sequence_length; - - // update the beam search metadata - // how many sub request in current request - // why is sub_requests has MAX_NUM_REQUESTS * MAX_BEAM_WIDTH entries? - new_bc.sub_requests[i] = old_bc.beamRequestsInfo[i].beam_size; - // update the parentid, accumalated_probs, depth, and token_ids - new_bc.beamRequestsInfo[i].current_depth = - old_bc.beamRequestsInfo[i].current_depth + 1; - new_bc.beamRequestsInfo[i].beam_size = - old_bc.beamRequestsInfo[i].beam_size; - new_bc.beamRequestsInfo[i].max_depth = - old_bc.beamRequestsInfo[i].max_depth; - - // do the slot exchange to minimize the cache exchange in kernel. 
- // std::cout << "update metadata" << std::endl; - update_beam_metadata(new_bc, request.beam_trees.at(old_bc.model_id), i); - - if (new_bc.requestsInfo[i].token_start_offset + 1 >= - request.tokens.size()) { - // Incremental phase - new_bc.requestsInfo[i].num_tokens_in_batch = 1; - } else { - // Prompt phase - new_bc.requestsInfo[i].num_tokens_in_batch = - std::min(BatchConfig::MAX_NUM_TOKENS - new_bc.num_tokens, - (int)request.tokens.size() - - new_bc.requestsInfo[i].token_start_offset); - } - - // register more tokens due to the beam width - for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) { - int depth = new_bc.requestsInfo[i].token_start_offset + j; - for (int k = 0; k < new_bc.sub_requests[i]; k++) { - new_bc.tokensInfo[new_bc.num_tokens].request_index = i; - new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = depth; - - // get value from requestinfo - new_bc.tokensInfo[new_bc.num_tokens].token_id = - new_bc.beamRequestsInfo[i].tokens[k]; - // request.tokens[depth]; - new_bc.beamTokenInfo[new_bc.num_tokens].sub_request_index = k; - new_bc.num_tokens++; - } - } - } - } - if (verbose) { - std::cout << "prepare_next_batch_beam OLD vs NEW batchconfigs:" - << std::endl; - old_bc.print(); - new_bc.print(); - } - return new_bc; -} +/***** Request Init Phase *****/ BeamSearchBatchConfigFuture RequestManager::prepare_next_batch_init( TreeVerifyBatchConfigFuture const &old_bc, InferenceResultFuture const &result, @@ -894,6 +753,152 @@ BeamSearchBatchConfig return new_bc; } +/***** Beam Search Phase *****/ +BeamSearchBatchConfigFuture RequestManager::prepare_next_batch_beam( + BeamSearchBatchConfigFuture const &old_bc, + BeamInferenceResultFuture const &result) { + Runtime *runtime = Runtime::get_runtime(); + Context ctx = Runtime::get_context(); + + RequestManager *rm = this; + TaskLauncher launcher(RM_PREPARE_NEXT_BATCH_BEAM_TASK_ID, + TaskArgument(&rm, sizeof(RequestManager *))); + launcher.add_future(old_bc); + launcher.add_future(result); + return runtime->execute_task(ctx, launcher); +} + +BeamSearchBatchConfig RequestManager::prepare_next_batch_beam_task( + Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + RequestManager *rm = *((RequestManager **)task->args); + BeamSearchBatchConfig const &bc = + Future(task->futures[0]).get_result(); + BeamInferenceResult const &result = + Future(task->futures[1]).get_result(); + return rm->prepare_next_batch_beam(bc, result); +} + +// update beam search metadata +BeamSearchBatchConfig + RequestManager::prepare_next_batch_beam(BeamSearchBatchConfig const &old_bc, + BeamInferenceResult const &result) { + const std::lock_guard lock(request_queue_mutex); + if (verbose) { + std::cout << "\n############### prepare_next_batch_beam ###############\n"; + } + if (verbose) { + std::cout << "print all results" + << "\n"; + for (int i = 0; i < 40; i++) { + std::cout << result.token_ids[i] << ", "; + } + std::cout << "Current Beam Depth: " + << old_bc.beamRequestsInfo[0].current_depth << "\n"; + } + + // Step 1: Store result to the beam tree struct + store_beam_metadata(old_bc, result); + + // Step 2: preparing the next batch for existing requests + BeamSearchBatchConfig new_bc; + new_bc.max_init_length = 0; + new_bc.model_id = old_bc.model_id; + // std::cout << "old_bc.model_id: " << old_bc.model_id << "\n"; + + for (int i = 0; i < BatchConfig::MAX_NUM_REQUESTS; i++) { + if (old_bc.request_completed[i]) { + continue; + } + // Comment out this assertion since num_tokens_in_batch can be + // zero when 
beam search has reached required sequence length + // assert(old_bc.requestsInfo[i].num_tokens_in_batch > 0); + Request &request = all_requests[old_bc.requestsInfo[i].request_guid]; + int processed_tokens = old_bc.requestsInfo[i].token_start_offset + + old_bc.requestsInfo[i].num_tokens_in_batch; + + // assert(processed_tokens < request.tokens.size()); + log_req_mgr.debug() << "processed_tokens: " << processed_tokens << "\n"; + if (processed_tokens > + old_bc.beamRequestsInfo[i].max_depth + request.tokens.size() + // || ir.results[t] == 0 TODO: replace this with + ) { + log_req_mgr.print("[Done] guid(%zu) with spec_tree_depth(%d)", + old_bc.requestsInfo[i].request_guid, + old_bc.beamRequestsInfo[i].max_depth); + // new_bc.request_completed[i] = true; + new_bc.request_completed[i] = false; + new_bc.requestsInfo[i].token_start_offset = processed_tokens; + new_bc.requestsInfo[i].request_guid = old_bc.requestsInfo[i].request_guid; + new_bc.requestsInfo[i].max_sequence_length = + old_bc.requestsInfo[i].max_sequence_length; + } else { + log_req_mgr.debug() << "num tokens: " << old_bc.num_tokens << ", " + << new_bc.num_tokens; + new_bc.request_completed[i] = false; + new_bc.requestsInfo[i].token_start_offset = processed_tokens; + new_bc.requestsInfo[i].request_guid = old_bc.requestsInfo[i].request_guid; + new_bc.requestsInfo[i].max_sequence_length = + old_bc.requestsInfo[i].max_sequence_length; + + // update the beam search metadata + // how many sub request in current request + // why is sub_requests has MAX_NUM_REQUESTS * MAX_BEAM_WIDTH entries? + new_bc.sub_requests[i] = old_bc.beamRequestsInfo[i].beam_size; + // update the parentid, accumalated_probs, depth, and token_ids + new_bc.beamRequestsInfo[i].current_depth = + old_bc.beamRequestsInfo[i].current_depth + 1; + new_bc.beamRequestsInfo[i].beam_size = + old_bc.beamRequestsInfo[i].beam_size; + new_bc.beamRequestsInfo[i].max_depth = + old_bc.beamRequestsInfo[i].max_depth; + + // do the slot exchange to minimize the cache exchange in kernel. 
+ // std::cout << "update metadata" << std::endl; + update_beam_metadata(new_bc, request.beam_trees.at(old_bc.model_id), i); + + if (new_bc.requestsInfo[i].token_start_offset + 1 >= + request.tokens.size()) { + // Incremental phase + new_bc.requestsInfo[i].num_tokens_in_batch = 1; + } else { + // Prompt phase + new_bc.requestsInfo[i].num_tokens_in_batch = + std::min(BatchConfig::MAX_NUM_TOKENS - new_bc.num_tokens, + (int)request.tokens.size() - + new_bc.requestsInfo[i].token_start_offset); + } + + // register more tokens due to the beam width + for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) { + int depth = new_bc.requestsInfo[i].token_start_offset + j; + for (int k = 0; k < new_bc.sub_requests[i]; k++) { + new_bc.tokensInfo[new_bc.num_tokens].request_index = i; + new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = depth; + + // get value from requestinfo + new_bc.tokensInfo[new_bc.num_tokens].token_id = + new_bc.beamRequestsInfo[i].tokens[k]; + // request.tokens[depth]; + new_bc.beamTokenInfo[new_bc.num_tokens].sub_request_index = k; + new_bc.num_tokens++; + } + } + } + } + if (verbose) { + std::cout << "prepare_next_batch_beam OLD vs NEW batchconfigs:" + << std::endl; + old_bc.print(); + new_bc.print(); + } + return new_bc; +} + +/***** Verify Phase *****/ + TreeVerifyBatchConfigFuture RequestManager::prepare_next_batch_verify( std::vector const &old_batches) { Runtime *runtime = Runtime::get_runtime(); From 1de3e2169fa15a789bdc6c66c158b07aac5b7b56 Mon Sep 17 00:00:00 2001 From: Zeyu Wang Date: Tue, 22 Aug 2023 01:23:04 -0400 Subject: [PATCH 02/30] refactor and small fixes. --- CMakeLists.txt | 1 + include/flexflow/batch_config.h | 2 +- src/runtime/request_manager.cc | 166 +++++++++++++++----------------- 3 files changed, 82 insertions(+), 87 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 9fced69cf8..95e4066037 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -307,6 +307,7 @@ list(APPEND FF_NVCC_FLAGS list(APPEND FF_LD_FLAGS -lrt -ldl + -lstdc++fs -rdynamic) # Set FF FLAGS diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index ce331d3e41..a8e974841c 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -53,6 +53,7 @@ class BatchConfig { // These are set by update int num_tokens; + bool loading_prompt = false; struct PerRequestInfo { int token_start_offset; @@ -113,7 +114,6 @@ class BeamSearchBatchConfig : public BatchConfig { inline static int const MAX_BEAM_DEPTH = 8; int model_id; - int max_init_length = 0; struct BeamSearchPerRequestInfo { int beam_size; diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 7a1bdf8e7e..e251fb4638 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -136,7 +136,7 @@ RequestManager::RequestGuid // Add a new request Request request; request.guid = next_available_guid++; - request.max_sequence_length = max_sequence_length; + request.max_sequence_length = max_sequence_length; if (prompt.size() > BatchConfig::MAX_PROMPT_LENGTH) { std::cout << "Warning: too many tokens in prompt, only load up to " @@ -498,6 +498,7 @@ BeamSearchBatchConfig if (verbose) { std::cout << "\n############### prepare_next_batch_init ###############\n"; } + // Step 1: use result to update requests BeamSearchBatchConfig new_bc; new_bc.num_tokens = 0; @@ -517,62 +518,52 @@ BeamSearchBatchConfig assert(old_bc.num_tokens > 0); - int start_depth = old_bc.tokensInfo[result_index].abs_depth_in_request; - if 
(committed_tokens.find(guid) == committed_tokens.end()) { - committed_tokens[guid] = std::vector>(); + // int start_depth = old_bc.tokensInfo[result_index].abs_depth_in_request; + + // reset committed_tokens + if (committed_tokens.count(guid) == 0) { + committed_tokens[guid] = {}; } else { - committed_tokens.at(guid).clear(); + committed_tokens[guid].clear(); } + + // iterate through all the tokens that belong to request i - while (result_index < old_bc.num_tokens && - old_bc.tokensInfo[result_index].request_index == i) { - // new tokens have not been appended yet, so the last appended token is - // the root of the beam search token tree - int root_abs_depth = request.tokens.size() - 1; - if (old_bc.tokensInfo[result_index].abs_depth_in_request >= - root_abs_depth) { - // append to tree_outputs a pair consisting of (token id, depth) - tree_outputs.push_back(std::make_pair( - result.token_ids[result_index], - old_bc.tokensInfo[result_index].abs_depth_in_request + 1)); - // append (depth, index of the token in result) to committed_tokens - // array - committed_tokens.at(guid).push_back( - std::make_pair(old_bc.tokensInfo[result_index].abs_depth_in_request, - result_index)); + int root_abs_depth = request.tokens.size() - 1; - if (verbose) { - std::cout << "Index within old batch: " << result_index << std::endl; - printf(" Input: [%d] %d ---> [%d] %d \n", - old_bc.tokensInfo[result_index].abs_depth_in_request, - old_bc.tokensInfo[result_index].token_id, - tree_outputs.back().second, - tree_outputs.back().first); + while (result_index < old_bc.num_tokens && + old_bc.tokensInfo[result_index].request_index == i) { + int abs_depth = old_bc.tokensInfo[result_index].abs_depth_in_request; + int token_id = result.token_ids[result_index]; + + if (abs_depth >= root_abs_depth) { + tree_outputs.emplace_back(token_id, abs_depth + 1); + committed_tokens[guid].emplace_back(abs_depth, result_index); + + if (verbose) { + std::cout << "Index within old batch: " << result_index << std::endl; + printf(" Input: [%d] %d ---> [%d] %d \n", + abs_depth, old_bc.tokensInfo[result_index].token_id, + tree_outputs.back().second, token_id); + } } - // std::cout << " Input: " << old_bc.tokensInfo[result_index].token_id - // << "" - // << old_bc.tokensInfo[result_index].abs_depth_in_request << - // std::endl; - // std::cout << " Result: " << result.token_ids[result_index] << ", - // depth: " - // << old_bc.tokensInfo[result_index].abs_depth_in_request + 1 << - // std::endl; - } - result_index++; + result_index++; } + std::vector> verified_tokens = traverse_verify_tree(guid, dfs_tree_inputs.at(guid), tree_outputs); log_req_mgr.print("Number of Verified Tokens = %zu", verified_tokens.size()); + // check if the request is finished if (verified_tokens.size() + request.tokens.size() >= request.max_sequence_length) { // Append all verified tokens to the request - for (int j = 0; j < verified_tokens.size(); j++) { - if (verified_tokens[j].second < request.max_sequence_length) { - request.tokens.push_back(verified_tokens[j].first); - } + for (const auto &token_pair : verified_tokens) { + if (token_pair.second < request.max_sequence_length) { + request.tokens.push_back(token_pair.first); + } } log_req_mgr.print("[Done] guid(%zu) with final length(%zu)", @@ -587,8 +578,11 @@ BeamSearchBatchConfig gr.output_text = output; } log_req_mgr.print("Final output: %s", output.c_str()); + new_bc.request_completed[i] = true; num_processed_requests++; + + // Log profiling info ProfileInfo profile_info = profiling_requests[request.guid]; 
profile_info.finish_time = Realm::Clock::current_time_in_microseconds(); total_request_run_time += @@ -632,68 +626,69 @@ BeamSearchBatchConfig // delete the old input tree from cache dfs_tree_inputs.erase(request.guid); - continue; - } + } else { // Request not finished, pass verified_tokens to next iteration - new_bc.request_completed[i] = false; + new_bc.request_completed[i] = false; - // Normal Request Info - new_bc.requestsInfo[i].token_start_offset = verified_tokens.front().second; - new_bc.requestsInfo[i].request_guid = old_bc.requestsInfo[i].request_guid; - new_bc.requestsInfo[i].max_sequence_length = - old_bc.requestsInfo[i].max_sequence_length; - new_bc.requestsInfo[i].num_tokens_in_batch = verified_tokens.size(); - - // TODO: Beam Request Info, missing from VerifyTreeBatchConfig - int new_max_depth = new_bc.requestsInfo[i].max_sequence_length - - new_bc.requestsInfo[i].token_start_offset - - verified_tokens.size(); - new_bc.beamRequestsInfo[i].current_depth = 1; - new_bc.beamRequestsInfo[i].beam_size = - BeamSearchBatchConfig::MAX_BEAM_WIDTH; - new_bc.beamRequestsInfo[i].max_depth = - std::min(new_max_depth, BeamSearchBatchConfig::MAX_BEAM_DEPTH); - for (int j = 0; j < BeamSearchBatchConfig::MAX_BEAM_WIDTH; j++) { - new_bc.beamRequestsInfo[i].parent_id[j] = 0; - new_bc.beamRequestsInfo[i].probs[j] = 1; - } + // Normal Request Info + new_bc.requestsInfo[i].token_start_offset = verified_tokens.front().second; + new_bc.requestsInfo[i].request_guid = old_bc.requestsInfo[i].request_guid; + new_bc.requestsInfo[i].max_sequence_length = + old_bc.requestsInfo[i].max_sequence_length; + new_bc.requestsInfo[i].num_tokens_in_batch = verified_tokens.size(); - new_bc.sub_requests[i] = 1; + // TODO: Beam Request Info, missing from VerifyTreeBatchConfig + int new_max_depth = new_bc.requestsInfo[i].max_sequence_length - + new_bc.requestsInfo[i].token_start_offset - + verified_tokens.size(); + new_bc.beamRequestsInfo[i].current_depth = 1; + new_bc.beamRequestsInfo[i].beam_size = + BeamSearchBatchConfig::MAX_BEAM_WIDTH; + new_bc.beamRequestsInfo[i].max_depth = + std::min(new_max_depth, BeamSearchBatchConfig::MAX_BEAM_DEPTH); + for (int j = 0; j < BeamSearchBatchConfig::MAX_BEAM_WIDTH; j++) { + new_bc.beamRequestsInfo[i].parent_id[j] = 0; + new_bc.beamRequestsInfo[i].probs[j] = 1; + } - // Token Info - for (int j = 0; j < verified_tokens.size(); j++) { - auto token = verified_tokens.at(j); + new_bc.sub_requests[i] = 1; - // Normal Token Info - new_bc.tokensInfo[new_bc.num_tokens].request_index = i; - new_bc.tokensInfo[new_bc.num_tokens].token_id = token.first; - new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = token.second; + // Token Info + for (int j = 0; j < verified_tokens.size(); j++) { + auto token = verified_tokens.at(j); - // Beam Token Info - new_bc.beamTokenInfo[new_bc.num_tokens].sub_request_index = 0; - new_bc.num_tokens++; + // Normal Token Info + new_bc.tokensInfo[new_bc.num_tokens].request_index = i; + new_bc.tokensInfo[new_bc.num_tokens].token_id = token.first; + new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = token.second; - // Add verified token to request's token list - request.tokens.push_back(token.first); + // Beam Token Info + new_bc.beamTokenInfo[new_bc.num_tokens].sub_request_index = 0; + new_bc.num_tokens++; - if (new_bc.num_tokens == BatchConfig::MAX_NUM_TOKENS) { - break; + // Add verified token to request's token list + request.tokens.push_back(token.first); + + if (new_bc.num_tokens == BatchConfig::MAX_NUM_TOKENS) { + break; + } } + std::string 
output = this->tokenizer_->Decode(request.tokens); + log_req_mgr.print("Output: %s", output.c_str()); } - std::string output = this->tokenizer_->Decode(request.tokens); - log_req_mgr.print("Output: %s", output.c_str()); } + + + + // Step 2: Initialize new request - new_bc.max_init_length = 0; for (int i = 0; i < BeamSearchBatchConfig::MAX_NUM_REQUESTS; i++) { if (new_bc.request_completed[i]) { if (!pending_request_queue.empty() && new_bc.num_tokens < BeamSearchBatchConfig::MAX_NUM_TOKENS) { Request new_request = pending_request_queue.front(); pending_request_queue.pop(); - new_bc.max_init_length = - std::max(new_bc.max_init_length, new_request.initial_len); // all_requests[new_request.guid] = new_request; new_bc.requestsInfo[i].token_start_offset = 0; new_bc.requestsInfo[i].request_guid = new_request.guid; @@ -804,7 +799,6 @@ BeamSearchBatchConfig // Step 2: preparing the next batch for existing requests BeamSearchBatchConfig new_bc; - new_bc.max_init_length = 0; new_bc.model_id = old_bc.model_id; // std::cout << "old_bc.model_id: " << old_bc.model_id << "\n"; From c37235fd5d264ec6790b9435406e0ab668760a88 Mon Sep 17 00:00:00 2001 From: Zeyu Wang Date: Thu, 24 Aug 2023 21:21:05 -0400 Subject: [PATCH 03/30] Update --- src/runtime/request_manager.cc | 79 ++++++++++++++++++---------------- 1 file changed, 41 insertions(+), 38 deletions(-) diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index e251fb4638..cc1322334c 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -136,7 +136,7 @@ RequestManager::RequestGuid // Add a new request Request request; request.guid = next_available_guid++; - request.max_sequence_length = max_sequence_length; + request.max_sequence_length = max_sequence_length; if (prompt.size() > BatchConfig::MAX_PROMPT_LENGTH) { std::cout << "Warning: too many tokens in prompt, only load up to " @@ -518,39 +518,43 @@ BeamSearchBatchConfig assert(old_bc.num_tokens > 0); - // int start_depth = old_bc.tokensInfo[result_index].abs_depth_in_request; - // reset committed_tokens if (committed_tokens.count(guid) == 0) { - committed_tokens[guid] = {}; + committed_tokens[guid] = {}; } else { - committed_tokens[guid].clear(); + committed_tokens[guid].clear(); } - // iterate through all the tokens that belong to request i int root_abs_depth = request.tokens.size() - 1; while (result_index < old_bc.num_tokens && - old_bc.tokensInfo[result_index].request_index == i) { - int abs_depth = old_bc.tokensInfo[result_index].abs_depth_in_request; - int token_id = result.token_ids[result_index]; - - if (abs_depth >= root_abs_depth) { - tree_outputs.emplace_back(token_id, abs_depth + 1); - committed_tokens[guid].emplace_back(abs_depth, result_index); - - if (verbose) { - std::cout << "Index within old batch: " << result_index << std::endl; - printf(" Input: [%d] %d ---> [%d] %d \n", - abs_depth, old_bc.tokensInfo[result_index].token_id, - tree_outputs.back().second, token_id); - } + old_bc.tokensInfo[result_index].request_index == i) { + int abs_depth = old_bc.tokensInfo[result_index].abs_depth_in_request; + int token_id = result.token_ids[result_index]; + + if (abs_depth >= root_abs_depth) { + tree_outputs.emplace_back(token_id, abs_depth + 1); + committed_tokens[guid].emplace_back(abs_depth, result_index); + + if (verbose) { + std::cout << "Index within old batch: " << result_index << std::endl; + printf(" Input: [%d] %d ---> [%d] %d \n", + abs_depth, + old_bc.tokensInfo[result_index].token_id, + tree_outputs.back().second, + token_id); } - 
result_index++; + std::cout << "Index within old batch: " << result_index << std::endl; + printf(" Input: [%d] %d ---> [%d] %d \n", + abs_depth, + old_bc.tokensInfo[result_index].token_id, + tree_outputs.back().second, + token_id); + } + result_index++; } - std::vector> verified_tokens = traverse_verify_tree(guid, dfs_tree_inputs.at(guid), tree_outputs); log_req_mgr.print("Number of Verified Tokens = %zu", @@ -560,10 +564,10 @@ BeamSearchBatchConfig if (verified_tokens.size() + request.tokens.size() >= request.max_sequence_length) { // Append all verified tokens to the request - for (const auto &token_pair : verified_tokens) { - if (token_pair.second < request.max_sequence_length) { - request.tokens.push_back(token_pair.first); - } + for (auto const &token_pair : verified_tokens) { + if (token_pair.second < request.max_sequence_length) { + request.tokens.push_back(token_pair.first); + } } log_req_mgr.print("[Done] guid(%zu) with final length(%zu)", @@ -631,7 +635,8 @@ BeamSearchBatchConfig new_bc.request_completed[i] = false; // Normal Request Info - new_bc.requestsInfo[i].token_start_offset = verified_tokens.front().second; + new_bc.requestsInfo[i].token_start_offset = + verified_tokens.front().second; new_bc.requestsInfo[i].request_guid = old_bc.requestsInfo[i].request_guid; new_bc.requestsInfo[i].max_sequence_length = old_bc.requestsInfo[i].max_sequence_length; @@ -660,7 +665,8 @@ BeamSearchBatchConfig // Normal Token Info new_bc.tokensInfo[new_bc.num_tokens].request_index = i; new_bc.tokensInfo[new_bc.num_tokens].token_id = token.first; - new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = token.second; + new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = + token.second; // Beam Token Info new_bc.beamTokenInfo[new_bc.num_tokens].sub_request_index = 0; @@ -678,10 +684,6 @@ BeamSearchBatchConfig } } - - - - // Step 2: Initialize new request for (int i = 0; i < BeamSearchBatchConfig::MAX_NUM_REQUESTS; i++) { if (new_bc.request_completed[i]) { @@ -850,7 +852,6 @@ BeamSearchBatchConfig old_bc.beamRequestsInfo[i].max_depth; // do the slot exchange to minimize the cache exchange in kernel. 
- // std::cout << "update metadata" << std::endl; update_beam_metadata(new_bc, request.beam_trees.at(old_bc.model_id), i); if (new_bc.requestsInfo[i].token_start_offset + 1 >= @@ -974,7 +975,8 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( // Profiling profiling_requests[new_bc.requestsInfo[i].request_guid].decoding_steps += 1; - // TODO: Add prompt token first in first verify iteration + + // Add prompt token first in first verify iteration if (request.tokens.size() == request.initial_len) { // Initialization (prompt) phase for (int j = 0; j < request.initial_len; j++) { @@ -1020,8 +1022,9 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( // add prompt to the dfs tree if (committed_tokens.find(guid) != committed_tokens.end()) { - if (dfs_tree_inputs.at(0).second == - request.initial_len + committed_tokens.at(guid).size() - 1) { + if (dfs_tree_inputs.at(0).second == request.initial_len + + committed_tokens.at(guid).size() - + 1) { // commit prompt for (int j = 0; j < request.initial_len; j++) { new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_index = j; new_bc.committed_tokens[new_bc.num_tokens_to_commit].request_index = @@ -1034,7 +1037,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( } new_bc.num_tokens_to_commit++; } - } else { + } else { // commit the last token // only add the root token auto committed_token = committed_tokens.at(guid).at(0); new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_index = @@ -1101,7 +1104,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( new_bc.num_tokens++; new_bc.requestsInfo[i].num_tokens_in_batch++; - if (new_bc.num_tokens == BatchConfig::MAX_NUM_TOKENS - 1) { + if (new_bc.num_tokens == BatchConfig::MAX_NUM_TOKENS) { break; } } From d18926f61013c7b4543279c611957847c683b79b Mon Sep 17 00:00:00 2001 From: Zeyu Wang Date: Mon, 4 Sep 2023 20:00:01 -0400 Subject: [PATCH 04/30] Refactor backup. 
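
Track per-request running state in the batch config and begin handling prompts that do not fit in a single batch: such requests remain PENDING while their prompt tokens are loaded, and the beam-search and verify phases branch on Request::RUNNING versus Request::PENDING.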
--- include/flexflow/batch_config.h | 1 + src/runtime/request_manager.cc | 359 ++++++++++++++++++++------------ 2 files changed, 223 insertions(+), 137 deletions(-) diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index a8e974841c..ee3d5ac183 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -70,6 +70,7 @@ class BatchConfig { PerTokenInfo tokensInfo[MAX_NUM_TOKENS]; bool request_completed[MAX_NUM_REQUESTS]; + bool request_running[MAX_NUM_TOKENS]; }; class TreeVerifyBatchConfig : public BatchConfig { diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 3d5780418e..215d7eb610 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -740,6 +740,26 @@ BeamSearchBatchConfig new_bc.beamTokenInfo[new_bc.num_tokens].sub_request_index = 0; new_bc.num_tokens++; } + + if (new_bc.requestsInfo[i].num_tokens_in_batch < new_request.initial_len) { + all_requests[new_request.guid].status = Request::PENDING; + new_bc.request_running[i] = false; + std::cout << "Request " << new_request.guid << " is pending" + << std::endl; + } else { + all_requests[new_request.guid].status = Request::RUNNING; + new_bc.request_running[i] = true; + std::cout << "Request " << new_request.guid << " is running" + << std::endl; + } + + std::cout << "load " << new_bc.requestsInfo[i].num_tokens_in_batch + << " tokens for request " << new_request.guid << std::endl; + std::cout << "total prompt in request: " << new_request.initial_len + << std::endl; + + + if (new_bc.num_tokens == BatchConfig::MAX_NUM_TOKENS) { break; } @@ -849,16 +869,30 @@ BeamSearchBatchConfig // how many sub request in current request // why is sub_requests has MAX_NUM_REQUESTS * MAX_BEAM_WIDTH entries? new_bc.sub_requests[i] = old_bc.beamRequestsInfo[i].beam_size; + // update the parentid, accumalated_probs, depth, and token_ids - new_bc.beamRequestsInfo[i].current_depth = - old_bc.beamRequestsInfo[i].current_depth + 1; new_bc.beamRequestsInfo[i].beam_size = - old_bc.beamRequestsInfo[i].beam_size; + old_bc.beamRequestsInfo[i].beam_size; new_bc.beamRequestsInfo[i].max_depth = - old_bc.beamRequestsInfo[i].max_depth; + old_bc.beamRequestsInfo[i].max_depth; + if (request.status == Request::RUNNING) { + new_bc.beamRequestsInfo[i].current_depth = + old_bc.beamRequestsInfo[i].current_depth + 1; + new_bc.request_running[i] = true; + // do the slot exchange to minimize the cache exchange in kernel. + update_beam_metadata(new_bc, request.beam_trees.at(old_bc.model_id), i); + } else { + // if the request is pending, we need to update the beam search + // metadata based on the initial length + new_bc.beamRequestsInfo[i].current_depth = + old_bc.beamRequestsInfo[i].current_depth; + new_bc.request_running[i] = false; + } + + // do the slot exchange to minimize the cache exchange in kernel. 
- update_beam_metadata(new_bc, request.beam_trees.at(old_bc.model_id), i); + // update_beam_metadata(new_bc, request.beam_trees.at(old_bc.model_id), i); if (new_bc.requestsInfo[i].token_start_offset + 1 >= request.tokens.size()) { @@ -880,9 +914,14 @@ BeamSearchBatchConfig new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = depth; // get value from requestinfo - new_bc.tokensInfo[new_bc.num_tokens].token_id = - new_bc.beamRequestsInfo[i].tokens[k]; - // request.tokens[depth]; + if (request.status == Request::RUNNING) { + new_bc.tokensInfo[new_bc.num_tokens].token_id = + new_bc.beamRequestsInfo[i].tokens[k]; + } else { + new_bc.tokensInfo[new_bc.num_tokens].token_id = + request.tokens[depth]; + } + new_bc.beamTokenInfo[new_bc.num_tokens].sub_request_index = k; new_bc.num_tokens++; } @@ -948,174 +987,220 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( size_t guid = old_batches.at(0).requestsInfo[i].request_guid; Request &request = all_requests[guid]; - // Get the dfs tree - std::vector>> - all_dfs_trees; - - for (int j = 0; j < old_batches.size(); j++) { - std::vector> new_tree = - traverse_beam_tree(old_batches.at(j), i, request.tokens.size() - 1); - all_dfs_trees.push_back(new_tree); - } - assert(all_dfs_trees.size() == old_batches.size()); - std::vector> dfs_tree_inputs = - merge_dfs_trees(all_dfs_trees, request.tokens.size() - 1, guid); - - if (verbose) { - std::cout << "Request Tokens Size: " << request.tokens.size() - << std::endl; - for (int k = 0; k < request.tokens.size(); k++) { - std::cout << k << ": " << request.tokens[k] << std::endl; - } - } - - // Normal Request Info - new_bc.requestsInfo[i].token_start_offset = dfs_tree_inputs.front().second; - new_bc.requestsInfo[i].request_guid = - old_batches.at(0).requestsInfo[i].request_guid; - new_bc.requestsInfo[i].max_sequence_length = - old_batches.at(0).requestsInfo[i].max_sequence_length; - // TODO: Check this - new_bc.requestsInfo[i].num_tokens_in_batch = 0; - new_bc.request_completed[i] = false; - // Profiling profiling_requests[new_bc.requestsInfo[i].request_guid].decoding_steps += 1; - // Add prompt token first in first verify iteration - if (request.tokens.size() == request.initial_len) { - // Initialization (prompt) phase - for (int j = 0; j < request.initial_len; j++) { - new_bc.tokensInfo[new_bc.num_tokens].request_index = i; - new_bc.tokensInfo[new_bc.num_tokens].token_id = request.tokens[j]; - new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = j; + if(request.status == Request::RUNNING) { + new_bc.request_running[i] = true; + std::cout << "[Verify] Request " << request.guid << " is running" << std::endl; - new_bc.num_tokens++; - new_bc.requestsInfo[i].num_tokens_in_batch++; - } + // Get the dfs tree + std::vector>> + all_dfs_trees; - std::cout << "new_bc.num_tokens: " << new_bc.num_tokens << std::endl; - if (new_bc.num_tokens >= BatchConfig::MAX_NUM_TOKENS) { - assert(false && - "Exceeding the space available in the TreeVerify batch"); - break; + for (int j = 0; j < old_batches.size(); j++) { + std::vector> new_tree = + traverse_beam_tree(old_batches.at(j), i, request.tokens.size() - 1); + all_dfs_trees.push_back(new_tree); } + assert(all_dfs_trees.size() == old_batches.size()); + std::vector> dfs_tree_inputs = + merge_dfs_trees(all_dfs_trees, request.tokens.size() - 1, guid); - new_bc.requestsInfo[i].token_start_offset = 0; - } else { - // Incremental phase: only add the last committed token - new_bc.tokensInfo[new_bc.num_tokens].request_index = i; - 
new_bc.tokensInfo[new_bc.num_tokens].token_id = request.tokens.back(); - new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = - request.tokens.size() - 1; - - new_bc.num_tokens++; - new_bc.requestsInfo[i].num_tokens_in_batch++; - - if (new_bc.num_tokens == BatchConfig::MAX_NUM_TOKENS) { - assert(false && - "Exceeding the space available in the TreeVerify batch"); - break; + if (verbose) { + std::cout << "Request Tokens Size: " << request.tokens.size() + << std::endl; + for (int k = 0; k < request.tokens.size(); k++) { + std::cout << k << ": " << request.tokens[k] << std::endl; + } } - new_bc.requestsInfo[i].token_start_offset = request.tokens.size() - 1; - } - - if (verbose) { - std::cout << "dfs_tree_inputs.size(): " << dfs_tree_inputs.size() - << std::endl; - } + // Normal Request Info + new_bc.requestsInfo[i].token_start_offset = dfs_tree_inputs.front().second; + new_bc.requestsInfo[i].request_guid = old_batches.at(0).requestsInfo[i].request_guid; + new_bc.requestsInfo[i].max_sequence_length = old_batches.at(0).requestsInfo[i].max_sequence_length; + // TODO: Check this + new_bc.requestsInfo[i].num_tokens_in_batch = 0; + new_bc.request_completed[i] = false; - // add prompt to the dfs tree - if (committed_tokens.find(guid) != committed_tokens.end()) { - if (dfs_tree_inputs.at(0).second == request.initial_len + - committed_tokens.at(guid).size() - - 1) { // commit prompt + // Add prompt token first in first verify iteration + if (request.tokens.size() == request.initial_len) { + // Initialization (prompt) phase for (int j = 0; j < request.initial_len; j++) { - new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_index = j; - new_bc.committed_tokens[new_bc.num_tokens_to_commit].request_index = - i; - new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_depth = j; - if (verbose) { - std::cout << new_bc.num_tokens_to_commit - << "- committed_token.token_depth: " << j - << ", token_index: " << j << std::endl; - } - new_bc.num_tokens_to_commit++; + new_bc.tokensInfo[new_bc.num_tokens].request_index = i; + new_bc.tokensInfo[new_bc.num_tokens].token_id = request.tokens[j]; + new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = j; + + new_bc.num_tokens++; + new_bc.requestsInfo[i].num_tokens_in_batch++; } - } else { // commit the last token - // only add the root token - auto committed_token = committed_tokens.at(guid).at(0); - new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_index = - committed_token.second; - new_bc.committed_tokens[new_bc.num_tokens_to_commit].request_index = i; - new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_depth = - committed_token.first; - if (verbose) { - std::cout << new_bc.num_tokens_to_commit - << "- committed_token.token_depth: " - << committed_token.first - << ", token_index: " << committed_token.second << std::endl; + + std::cout << "new_bc.num_tokens: " << new_bc.num_tokens << std::endl; + if (new_bc.num_tokens >= BatchConfig::MAX_NUM_TOKENS) { + assert(false && + "Exceeding the space available in the TreeVerify batch"); + break; } - new_bc.num_tokens_to_commit++; - } - if (verbose) { - std::cout << "new_bc.num_tokens_to_commit: " - << new_bc.num_tokens_to_commit << std::endl; - } - } - // Token Info - for (int j = 1; j < dfs_tree_inputs.size(); j++) { - auto token = dfs_tree_inputs.at(j); - if (verbose) { - std::cout << "[" << j << "] Token: " << token.first - << ", Depth:" << token.second << std::endl; - } - // Normal Token Info - new_bc.tokensInfo[new_bc.num_tokens].request_index = i; - 
new_bc.tokensInfo[new_bc.num_tokens].token_id = token.first; - new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = token.second; + new_bc.requestsInfo[i].token_start_offset = 0; + } else { + // Incremental phase: only add the last committed token + new_bc.tokensInfo[new_bc.num_tokens].request_index = i; + new_bc.tokensInfo[new_bc.num_tokens].token_id = request.tokens.back(); + new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = + request.tokens.size() - 1; - // TODO: Add committed token info - if (verbose) { - std::cout << "committed_tokens.size(): " << new_bc.num_tokens_to_commit - << std::endl; + new_bc.num_tokens++; + new_bc.requestsInfo[i].num_tokens_in_batch++; + + if (new_bc.num_tokens == BatchConfig::MAX_NUM_TOKENS) { + assert(false && + "Exceeding the space available in the TreeVerify batch"); + break; + } + + new_bc.requestsInfo[i].token_start_offset = request.tokens.size() - 1; } if (committed_tokens.find(guid) != committed_tokens.end()) { - if (j < committed_tokens.at(guid).size()) { - auto committed_token = committed_tokens.at(guid).at(j); + if (dfs_tree_inputs.at(0).second == request.initial_len + + committed_tokens.at(guid).size() - + 1) { // commit prompt + for (int j = 0; j < request.initial_len; j++) { + new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_index = j; + new_bc.committed_tokens[new_bc.num_tokens_to_commit].request_index = + i; + new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_depth = j; + if (verbose) { + std::cout << new_bc.num_tokens_to_commit + << "- committed_token.token_depth: " << j + << ", token_index: " << j << std::endl; + } + new_bc.num_tokens_to_commit++; + } + } else { // commit the last token + // only add the root token + auto committed_token = committed_tokens.at(guid).at(0); new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_index = committed_token.second; - new_bc.committed_tokens[new_bc.num_tokens_to_commit].request_index = - i; + new_bc.committed_tokens[new_bc.num_tokens_to_commit].request_index = i; new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_depth = committed_token.first; if (verbose) { std::cout << new_bc.num_tokens_to_commit << "- committed_token.token_depth: " << committed_token.first - << ", token_index: " << committed_token.second - << std::endl; + << ", token_index: " << committed_token.second << std::endl; } new_bc.num_tokens_to_commit++; } + if (verbose) { + std::cout << "new_bc.num_tokens_to_commit: " + << new_bc.num_tokens_to_commit << std::endl; + } + + + for (int j = 1; j < dfs_tree_inputs.size(); j++) { + if (j < committed_tokens.at(guid).size()) { + auto committed_token = committed_tokens.at(guid).at(j); + new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_index = + committed_token.second; + new_bc.committed_tokens[new_bc.num_tokens_to_commit].request_index = + i; + new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_depth = + committed_token.first; + if (verbose) { + std::cout << new_bc.num_tokens_to_commit + << "- committed_token.token_depth: " + << committed_token.first + << ", token_index: " << committed_token.second + << std::endl; + } + new_bc.num_tokens_to_commit++; + } + } } if (verbose) { std::cout << "new_bc.num_tokens_to_commit: " << new_bc.num_tokens_to_commit << std::endl; } - new_bc.num_tokens++; - new_bc.requestsInfo[i].num_tokens_in_batch++; - if (new_bc.num_tokens == BatchConfig::MAX_NUM_TOKENS) { - break; + // add prompt to the dfs tree + // if (committed_tokens.find(guid) != committed_tokens.end()) { + // if 
(dfs_tree_inputs.at(0).second == request.initial_len + + // committed_tokens.at(guid).size() - + // 1) { // commit prompt + // for (int j = 0; j < request.initial_len; j++) { + // new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_index = j; + // new_bc.committed_tokens[new_bc.num_tokens_to_commit].request_index = + // i; + // new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_depth = j; + // if (verbose) { + // std::cout << new_bc.num_tokens_to_commit + // << "- committed_token.token_depth: " << j + // << ", token_index: " << j << std::endl; + // } + // new_bc.num_tokens_to_commit++; + // } + // } else { // commit the last token + // // only add the root token + // auto committed_token = committed_tokens.at(guid).at(0); + // new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_index = + // committed_token.second; + // new_bc.committed_tokens[new_bc.num_tokens_to_commit].request_index = i; + // new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_depth = + // committed_token.first; + // if (verbose) { + // std::cout << new_bc.num_tokens_to_commit + // << "- committed_token.token_depth: " + // << committed_token.first + // << ", token_index: " << committed_token.second << std::endl; + // } + // new_bc.num_tokens_to_commit++; + // } + // if (verbose) { + // std::cout << "new_bc.num_tokens_to_commit: " + // << new_bc.num_tokens_to_commit << std::endl; + // } + // } + + // Add Tokens from the DFS Tree to the next batch + for (int j = 1; j < dfs_tree_inputs.size(); j++) { + auto token = dfs_tree_inputs.at(j); + if (verbose) { + std::cout << "[" << j << "] Token: " << token.first + << ", Depth:" << token.second << std::endl; + } + // Normal Token Info + new_bc.tokensInfo[new_bc.num_tokens].request_index = i; + new_bc.tokensInfo[new_bc.num_tokens].token_id = token.first; + new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = token.second; + + // TODO: Add committed token info + if (verbose) { + std::cout << "committed_tokens.size(): " << new_bc.num_tokens_to_commit + << std::endl; + } + + new_bc.num_tokens++; + new_bc.requestsInfo[i].num_tokens_in_batch++; + + if (new_bc.num_tokens == BatchConfig::MAX_NUM_TOKENS - 1) { + break; + } } + + std::cout << "new_bc.num_tokens: " << new_bc.num_tokens << std::endl; + } else if (request.status == Request::PENDING) { + new_bc.request_running[i] = false; + std::cout << "[Verify] Request " << request.guid << " is pending" << std::endl; + } else { + assert(false && "Request status is not RUNNING or PENDING"); } - std::cout << "new_bc.num_tokens: " << new_bc.num_tokens << std::endl; + + } if (verbose) { From 99bb6963c495f0cc9cc8cd3d4f366792ac0a9dfc Mon Sep 17 00:00:00 2001 From: Zeyu Wang Date: Tue, 5 Sep 2023 00:23:58 -0400 Subject: [PATCH 05/30] pipeline update. 
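
Add ssm_cache_size, llm_cache_size, and an explicit status field to Request, and rework prepare_next_batch_init/beam/verify so that long prompts are streamed into the KV caches over multiple iterations; new requests now start as PENDING, and the verify phase loads any remaining prompt tokens for pending requests.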
--- include/flexflow/request_manager.h | 4 + src/ops/tree_inc_multihead_self_attention.cu | 3 + src/runtime/request_manager.cc | 391 ++++++++++++------- 3 files changed, 258 insertions(+), 140 deletions(-) diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index e444402dd0..3156801368 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -59,6 +59,10 @@ struct Request { BatchConfig::RequestGuid guid; int max_sequence_length; int initial_len; + int ssm_cache_size = 0; + int llm_cache_size = 0; + + Status status = PENDING; std::vector tokens; diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu index 69f085d3eb..455797e9b3 100644 --- a/src/ops/tree_inc_multihead_self_attention.cu +++ b/src/ops/tree_inc_multihead_self_attention.cu @@ -500,6 +500,9 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, m->oProjSize); } + std::cout << "processed_tokens_in_batch: " << processed_tokens_in_batch + << std::endl; + std::cout << "num_active_tokens: " << bc->num_active_tokens() << std::endl; assert(processed_tokens_in_batch == bc->num_active_tokens()); } diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 215d7eb610..2afbab2f87 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -506,6 +506,8 @@ BeamSearchBatchConfig std::cout << "\n############### prepare_next_batch_init ###############\n"; } + std::cout << "\n############### prepare_next_batch_init ###############\n"; + // Step 1: use result to update requests BeamSearchBatchConfig new_bc; new_bc.num_tokens = 0; @@ -540,7 +542,9 @@ BeamSearchBatchConfig int abs_depth = old_bc.tokensInfo[result_index].abs_depth_in_request; int token_id = result.token_ids[result_index]; - if (abs_depth >= root_abs_depth) { + if (request.status == Request::PENDING) { + committed_tokens[guid].emplace_back(abs_depth, result_index); + } else if (abs_depth >= root_abs_depth) { tree_outputs.emplace_back(token_id, abs_depth + 1); committed_tokens[guid].emplace_back(abs_depth, result_index); @@ -558,105 +562,157 @@ BeamSearchBatchConfig old_bc.tokensInfo[result_index].token_id, tree_outputs.back().second, token_id); - } + } result_index++; } + + if (request.status == Request::RUNNING) { + std::vector> verified_tokens = + traverse_verify_tree(guid, dfs_tree_inputs.at(guid), tree_outputs); + log_req_mgr.print("Number of Verified Tokens = %zu", + verified_tokens.size()); + + // check if the request is finished + if (verified_tokens.size() + request.tokens.size() >= + request.max_sequence_length) { + // Append all verified tokens to the request + for (auto const &token_pair : verified_tokens) { + if (token_pair.second < request.max_sequence_length) { + request.tokens.push_back(token_pair.first); + } + } + request.status = Request::COMPLETED; + log_req_mgr.print("[Done] guid(%zu) with final length(%zu)", + request.guid, + request.tokens.size()); + std::string output = this->tokenizer_->Decode(request.tokens); + { + // update generation result and trigger future + GenerationResult &gr = request_generation_results[request.guid]; + assert(gr.guid == request.guid); + gr.output_tokens = request.tokens; + gr.output_text = output; + } + log_req_mgr.print("Final output: %s", output.c_str()); - std::vector> verified_tokens = - traverse_verify_tree(guid, dfs_tree_inputs.at(guid), tree_outputs); - log_req_mgr.print("Number of Verified Tokens = %zu", - verified_tokens.size()); - - // 
check if the request is finished - if (verified_tokens.size() + request.tokens.size() >= - request.max_sequence_length) { - // Append all verified tokens to the request - for (auto const &token_pair : verified_tokens) { - if (token_pair.second < request.max_sequence_length) { - request.tokens.push_back(token_pair.first); + new_bc.request_completed[i] = true; + new_bc.request_running[i] = false; + num_processed_requests++; + + // Log profiling info + ProfileInfo profile_info = profiling_requests[request.guid]; + profile_info.finish_time = Realm::Clock::current_time_in_microseconds(); + total_request_run_time += + profile_info.finish_time - profile_info.start_time; + profiling_requests[request.guid] = profile_info; + log_req_mgr.print("[Profile] guid(%zu) decoding_steps(%d) start(%.1lf) " + "finish(%.1lf) latency(%.1lf)", + request.guid, + profile_info.decoding_steps, + profile_info.start_time, + profile_info.finish_time, + profile_info.finish_time - profile_info.start_time); + + // Write output to file if needed: + if (!output_filepath.empty()) { + std::ofstream outputFile(output_filepath); + if (outputFile.is_open()) { + outputFile << "end-to-end latency: " << std::fixed + << std::setprecision(3) << total_request_run_time + << std::endl; + outputFile << "num decoding steps: " << profile_info.decoding_steps + << std::endl; + outputFile << "token IDs: "; + for (int i = 0; i < request.tokens.size(); i++) { + outputFile << request.tokens[i]; + if (i < request.tokens.size() - 1) { + outputFile << ","; + } + } + outputFile << std::endl; + outputFile << output; + outputFile.close(); + } else { + std::cout << "Unable to open the output file: " << output_filepath + << std::endl; + assert(false); + } } - } - request.status = Request::COMPLETED; - log_req_mgr.print("[Done] guid(%zu) with final length(%zu)", - request.guid, - request.tokens.size()); - std::string output = this->tokenizer_->Decode(request.tokens); - { - // update generation result and trigger future - GenerationResult &gr = request_generation_results[request.guid]; - assert(gr.guid == request.guid); - gr.output_tokens = request.tokens; - gr.output_text = output; - } - log_req_mgr.print("Final output: %s", output.c_str()); - new_bc.request_completed[i] = true; - num_processed_requests++; + // delete the old input tree from cache + dfs_tree_inputs.erase(request.guid); - // Log profiling info - ProfileInfo profile_info = profiling_requests[request.guid]; - profile_info.finish_time = Realm::Clock::current_time_in_microseconds(); - total_request_run_time += - profile_info.finish_time - profile_info.start_time; - profiling_requests[request.guid] = profile_info; - log_req_mgr.print("[Profile] guid(%zu) decoding_steps(%d) start(%.1lf) " - "finish(%.1lf) latency(%.1lf)", - request.guid, - profile_info.decoding_steps, - profile_info.start_time, - profile_info.finish_time, - profile_info.finish_time - profile_info.start_time); + } else { // Request not finished, pass verified_tokens to next iteration - // Write output to file if needed: - if (!output_filepath.empty()) { - std::ofstream outputFile(output_filepath); - if (outputFile.is_open()) { - outputFile << "end-to-end latency: " << std::fixed - << std::setprecision(3) << total_request_run_time - << std::endl; - outputFile << "num decoding steps: " << profile_info.decoding_steps - << std::endl; - outputFile << "token IDs: "; - for (int i = 0; i < request.tokens.size(); i++) { - outputFile << request.tokens[i]; - if (i < request.tokens.size() - 1) { - outputFile << ","; - } - } - outputFile << 
std::endl; - outputFile << output; - outputFile.close(); - } else { - std::cout << "Unable to open the output file: " << output_filepath - << std::endl; - assert(false); + new_bc.request_completed[i] = false; + new_bc.request_running[i] = true; + + // Normal Request Info + new_bc.requestsInfo[i].token_start_offset = + verified_tokens.front().second; + new_bc.requestsInfo[i].request_guid = old_bc.requestsInfo[i].request_guid; + new_bc.requestsInfo[i].max_sequence_length = + old_bc.requestsInfo[i].max_sequence_length; + new_bc.requestsInfo[i].num_tokens_in_batch = verified_tokens.size(); + + // TODO: Beam Request Info, missing from VerifyTreeBatchConfig + int new_max_depth = new_bc.requestsInfo[i].max_sequence_length - + new_bc.requestsInfo[i].token_start_offset - + verified_tokens.size(); + new_bc.beamRequestsInfo[i].current_depth = 1; + new_bc.beamRequestsInfo[i].beam_size = + BeamSearchBatchConfig::MAX_BEAM_WIDTH; + new_bc.beamRequestsInfo[i].max_depth = + std::min(new_max_depth, BeamSearchBatchConfig::MAX_BEAM_DEPTH); + for (int j = 0; j < BeamSearchBatchConfig::MAX_BEAM_WIDTH; j++) { + new_bc.beamRequestsInfo[i].parent_id[j] = 0; + new_bc.beamRequestsInfo[i].probs[j] = 1; } - } - // delete the old input tree from cache - dfs_tree_inputs.erase(request.guid); + new_bc.sub_requests[i] = 1; - } else { // Request not finished, pass verified_tokens to next iteration + // Token Info + for (int j = 0; j < verified_tokens.size(); j++) { + auto token = verified_tokens.at(j); + + // Normal Token Info + new_bc.tokensInfo[new_bc.num_tokens].request_index = i; + new_bc.tokensInfo[new_bc.num_tokens].token_id = token.first; + new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = + token.second; + // Beam Token Info + new_bc.beamTokenInfo[new_bc.num_tokens].sub_request_index = 0; + new_bc.num_tokens++; + + // Add verified token to request's token list + request.tokens.push_back(token.first); + + if (new_bc.num_tokens == BatchConfig::MAX_NUM_TOKENS) { + break; + } + } + std::string output = this->tokenizer_->Decode(request.tokens); + log_req_mgr.print("Output: %s", output.c_str()); + } + } else if (request.status == Request::PENDING) { new_bc.request_completed[i] = false; + new_bc.request_running[i] = false; + + assert(request.ssm_cache_size == request.initial_len); // Normal Request Info - new_bc.requestsInfo[i].token_start_offset = - verified_tokens.front().second; + new_bc.requestsInfo[i].token_start_offset = request.ssm_cache_size; new_bc.requestsInfo[i].request_guid = old_bc.requestsInfo[i].request_guid; new_bc.requestsInfo[i].max_sequence_length = old_bc.requestsInfo[i].max_sequence_length; - new_bc.requestsInfo[i].num_tokens_in_batch = verified_tokens.size(); + new_bc.requestsInfo[i].num_tokens_in_batch = 0; // TODO: Beam Request Info, missing from VerifyTreeBatchConfig - int new_max_depth = new_bc.requestsInfo[i].max_sequence_length - - new_bc.requestsInfo[i].token_start_offset - - verified_tokens.size(); new_bc.beamRequestsInfo[i].current_depth = 1; new_bc.beamRequestsInfo[i].beam_size = BeamSearchBatchConfig::MAX_BEAM_WIDTH; - new_bc.beamRequestsInfo[i].max_depth = - std::min(new_max_depth, BeamSearchBatchConfig::MAX_BEAM_DEPTH); + new_bc.beamRequestsInfo[i].max_depth = 0; for (int j = 0; j < BeamSearchBatchConfig::MAX_BEAM_WIDTH; j++) { new_bc.beamRequestsInfo[i].parent_id[j] = 0; new_bc.beamRequestsInfo[i].probs[j] = 1; @@ -665,31 +721,13 @@ BeamSearchBatchConfig new_bc.sub_requests[i] = 1; // Token Info - for (int j = 0; j < verified_tokens.size(); j++) { - auto token = 
verified_tokens.at(j); - - // Normal Token Info - new_bc.tokensInfo[new_bc.num_tokens].request_index = i; - new_bc.tokensInfo[new_bc.num_tokens].token_id = token.first; - new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = - token.second; - - // Beam Token Info - new_bc.beamTokenInfo[new_bc.num_tokens].sub_request_index = 0; - new_bc.num_tokens++; - - // Add verified token to request's token list - request.tokens.push_back(token.first); - - if (new_bc.num_tokens == BatchConfig::MAX_NUM_TOKENS) { - break; - } - } std::string output = this->tokenizer_->Decode(request.tokens); log_req_mgr.print("Output: %s", output.c_str()); + } else { + assert(false); } } - + // Step 2: Initialize new request for (int i = 0; i < BeamSearchBatchConfig::MAX_NUM_REQUESTS; i++) { if (new_bc.request_completed[i]) { @@ -741,17 +779,22 @@ BeamSearchBatchConfig new_bc.num_tokens++; } - if (new_bc.requestsInfo[i].num_tokens_in_batch < new_request.initial_len) { - all_requests[new_request.guid].status = Request::PENDING; - new_bc.request_running[i] = false; - std::cout << "Request " << new_request.guid << " is pending" - << std::endl; - } else { - all_requests[new_request.guid].status = Request::RUNNING; - new_bc.request_running[i] = true; - std::cout << "Request " << new_request.guid << " is running" - << std::endl; - } + // if (new_bc.requestsInfo[i].num_tokens_in_batch < new_request.initial_len) { + // all_requests[new_request.guid].status = Request::PENDING; + // new_bc.request_running[i] = false; + // std::cout << "Request " << new_request.guid << " is pending" + // << std::endl; + // } else { + // all_requests[new_request.guid].status = Request::RUNNING; + // new_bc.request_running[i] = true; + // std::cout << "Request " << new_request.guid << " is running" + // << std::endl; + // } + all_requests[new_request.guid].status = Request::PENDING; + all_requests[new_request.guid].ssm_cache_size = new_bc.requestsInfo[i].num_tokens_in_batch; + new_bc.request_running[i] = false; + std::cout << "SSM KV Cache Size: " << all_requests[new_request.guid].ssm_cache_size << std::endl; + std::cout << "LLM KV Cache Size: " << all_requests[new_request.guid].llm_cache_size << std::endl; std::cout << "load " << new_bc.requestsInfo[i].num_tokens_in_batch << " tokens for request " << new_request.guid << std::endl; @@ -904,7 +947,10 @@ BeamSearchBatchConfig std::min(BatchConfig::MAX_NUM_TOKENS - new_bc.num_tokens, (int)request.tokens.size() - new_bc.requestsInfo[i].token_start_offset); + request.ssm_cache_size += new_bc.requestsInfo[i].num_tokens_in_batch; } + std::cout << "SSM KV Cache Size: " << request.ssm_cache_size << std::endl; + std::cout << "LLM KV Cache Size: " << request.llm_cache_size << std::endl; // register more tokens due to the beam width for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) { @@ -1026,23 +1072,23 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( // Add prompt token first in first verify iteration if (request.tokens.size() == request.initial_len) { // Initialization (prompt) phase - for (int j = 0; j < request.initial_len; j++) { - new_bc.tokensInfo[new_bc.num_tokens].request_index = i; - new_bc.tokensInfo[new_bc.num_tokens].token_id = request.tokens[j]; - new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = j; - - new_bc.num_tokens++; - new_bc.requestsInfo[i].num_tokens_in_batch++; - } - - std::cout << "new_bc.num_tokens: " << new_bc.num_tokens << std::endl; - if (new_bc.num_tokens >= BatchConfig::MAX_NUM_TOKENS) { - assert(false && - "Exceeding the space 
available in the TreeVerify batch"); - break; - } - - new_bc.requestsInfo[i].token_start_offset = 0; + // for (int j = 0; j < request.initial_len; j++) { + // new_bc.tokensInfo[new_bc.num_tokens].request_index = i; + // new_bc.tokensInfo[new_bc.num_tokens].token_id = request.tokens[j]; + // new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = j; + + // new_bc.num_tokens++; + // new_bc.requestsInfo[i].num_tokens_in_batch++; + // } + + // std::cout << "new_bc.num_tokens: " << new_bc.num_tokens << std::endl; + // if (new_bc.num_tokens >= BatchConfig::MAX_NUM_TOKENS) { + // assert(false && + // "Exceeding the space available in the TreeVerify batch"); + // break; + // } + + // new_bc.requestsInfo[i].token_start_offset = 0; } else { // Incremental phase: only add the last committed token new_bc.tokensInfo[new_bc.num_tokens].request_index = i; @@ -1062,22 +1108,24 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( new_bc.requestsInfo[i].token_start_offset = request.tokens.size() - 1; } + + // Committed Tokens if (committed_tokens.find(guid) != committed_tokens.end()) { if (dfs_tree_inputs.at(0).second == request.initial_len + committed_tokens.at(guid).size() - 1) { // commit prompt - for (int j = 0; j < request.initial_len; j++) { - new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_index = j; - new_bc.committed_tokens[new_bc.num_tokens_to_commit].request_index = - i; - new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_depth = j; - if (verbose) { - std::cout << new_bc.num_tokens_to_commit - << "- committed_token.token_depth: " << j - << ", token_index: " << j << std::endl; - } - new_bc.num_tokens_to_commit++; - } + // for (int j = 0; j < request.initial_len; j++) { + // new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_index = j; + // new_bc.committed_tokens[new_bc.num_tokens_to_commit].request_index = + // i; + // new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_depth = j; + // if (verbose) { + // std::cout << new_bc.num_tokens_to_commit + // << "- committed_token.token_depth: " << j + // << ", token_index: " << j << std::endl; + // } + // new_bc.num_tokens_to_commit++; + // } } else { // commit the last token // only add the root token auto committed_token = committed_tokens.at(guid).at(0); @@ -1099,7 +1147,6 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( << new_bc.num_tokens_to_commit << std::endl; } - for (int j = 1; j < dfs_tree_inputs.size(); j++) { if (j < committed_tokens.at(guid).size()) { auto committed_token = committed_tokens.at(guid).at(j); @@ -1195,6 +1242,70 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( } else if (request.status == Request::PENDING) { new_bc.request_running[i] = false; std::cout << "[Verify] Request " << request.guid << " is pending" << std::endl; + std::cout << "SSM KV Cache Size: " << request.ssm_cache_size << std::endl; + std::cout << "LLM KV Cache Size: " << request.llm_cache_size << std::endl; + + + // Normal Request Info + new_bc.requestsInfo[i].token_start_offset = request.llm_cache_size; + new_bc.requestsInfo[i].request_guid = old_batches.at(0).requestsInfo[i].request_guid; + new_bc.requestsInfo[i].max_sequence_length = old_batches.at(0).requestsInfo[i].max_sequence_length; + + new_bc.request_completed[i] = false; + new_bc.requestsInfo[i].num_tokens_in_batch = std::min( + BatchConfig::MAX_NUM_TOKENS - new_bc.num_tokens, + (int)request.initial_len - new_bc.requestsInfo[i].token_start_offset); + + + if (committed_tokens.find(guid) != committed_tokens.end()) 
{ + for (int j = 0; j < committed_tokens.at(guid).size(); j++) { + auto token = committed_tokens.at(guid).at(j); + new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_index = token.second; + new_bc.committed_tokens[new_bc.num_tokens_to_commit].request_index = + i; + new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_depth = token.first; + + new_bc.num_tokens_to_commit++; + request.llm_cache_size++; + } + std::cout << "[Verify] Committed Tokens from last loading batch: " << new_bc.num_tokens_to_commit << std::endl; + } + + if (request.llm_cache_size < request.initial_len) { + // Initialization (prompt) phase + for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) { + new_bc.tokensInfo[new_bc.num_tokens].request_index = i; + new_bc.tokensInfo[new_bc.num_tokens].token_id = request.tokens[request.llm_cache_size + j]; + new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = request.llm_cache_size + j; + + new_bc.num_tokens++; + } + + std::cout << "new_bc.num_tokens: " << new_bc.num_tokens << std::endl; + if (new_bc.num_tokens >= BatchConfig::MAX_NUM_TOKENS) { + assert(false && + "Exceeding the space available in the TreeVerify batch"); + break; + } + } else { + if (BatchConfig::MAX_NUM_TOKENS - new_bc.num_tokens > 0) { + request.status = Request::RUNNING; + new_bc.request_running[i] = true; + + new_bc.tokensInfo[new_bc.num_tokens].request_index = i; + new_bc.tokensInfo[new_bc.num_tokens].token_id = request.tokens.back(); + new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = request.tokens.size() - 1; + + new_bc.num_tokens++; + new_bc.requestsInfo[i].num_tokens_in_batch++; + std::cout << "new_bc.num_tokens: " << new_bc.num_tokens << std::endl; + std::cout << "new_bc.requestsInfo[i].num_tokens_in_batch: " << new_bc.requestsInfo[i].num_tokens_in_batch << std::endl; + + + dfs_tree_inputs[guid] = std::vector>{ std::make_pair(request.tokens.back(), request.tokens.size() - 1) }; + } + } + } else { assert(false && "Request status is not RUNNING or PENDING"); } From e6f24744de140eb46915b2bc8021d2af2647bc47 Mon Sep 17 00:00:00 2001 From: Zeyu Wang Date: Tue, 5 Sep 2023 00:46:14 -0400 Subject: [PATCH 06/30] Format. 
--- include/flexflow/request_manager.h | 1 - src/runtime/request_manager.cc | 153 ++++++++++++++++------------- 2 files changed, 86 insertions(+), 68 deletions(-) diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index 3156801368..d137f9421e 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -62,7 +62,6 @@ struct Request { int ssm_cache_size = 0; int llm_cache_size = 0; - Status status = PENDING; std::vector tokens; diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 2afbab2f87..2b6187ba5b 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -562,10 +562,10 @@ BeamSearchBatchConfig old_bc.tokensInfo[result_index].token_id, tree_outputs.back().second, token_id); - } + } result_index++; } - + if (request.status == Request::RUNNING) { std::vector> verified_tokens = traverse_verify_tree(guid, dfs_tree_inputs.at(guid), tree_outputs); @@ -618,10 +618,10 @@ BeamSearchBatchConfig std::ofstream outputFile(output_filepath); if (outputFile.is_open()) { outputFile << "end-to-end latency: " << std::fixed - << std::setprecision(3) << total_request_run_time - << std::endl; + << std::setprecision(3) << total_request_run_time + << std::endl; outputFile << "num decoding steps: " << profile_info.decoding_steps - << std::endl; + << std::endl; outputFile << "token IDs: "; for (int i = 0; i < request.tokens.size(); i++) { outputFile << request.tokens[i]; @@ -650,7 +650,8 @@ BeamSearchBatchConfig // Normal Request Info new_bc.requestsInfo[i].token_start_offset = verified_tokens.front().second; - new_bc.requestsInfo[i].request_guid = old_bc.requestsInfo[i].request_guid; + new_bc.requestsInfo[i].request_guid = + old_bc.requestsInfo[i].request_guid; new_bc.requestsInfo[i].max_sequence_length = old_bc.requestsInfo[i].max_sequence_length; new_bc.requestsInfo[i].num_tokens_in_batch = verified_tokens.size(); @@ -727,7 +728,7 @@ BeamSearchBatchConfig assert(false); } } - + // Step 2: Initialize new request for (int i = 0; i < BeamSearchBatchConfig::MAX_NUM_REQUESTS; i++) { if (new_bc.request_completed[i]) { @@ -779,7 +780,8 @@ BeamSearchBatchConfig new_bc.num_tokens++; } - // if (new_bc.requestsInfo[i].num_tokens_in_batch < new_request.initial_len) { + // if (new_bc.requestsInfo[i].num_tokens_in_batch < + // new_request.initial_len) { // all_requests[new_request.guid].status = Request::PENDING; // new_bc.request_running[i] = false; // std::cout << "Request " << new_request.guid << " is pending" @@ -791,18 +793,19 @@ BeamSearchBatchConfig // << std::endl; // } all_requests[new_request.guid].status = Request::PENDING; - all_requests[new_request.guid].ssm_cache_size = new_bc.requestsInfo[i].num_tokens_in_batch; + all_requests[new_request.guid].ssm_cache_size = + new_bc.requestsInfo[i].num_tokens_in_batch; new_bc.request_running[i] = false; - std::cout << "SSM KV Cache Size: " << all_requests[new_request.guid].ssm_cache_size << std::endl; - std::cout << "LLM KV Cache Size: " << all_requests[new_request.guid].llm_cache_size << std::endl; + std::cout << "SSM KV Cache Size: " + << all_requests[new_request.guid].ssm_cache_size << std::endl; + std::cout << "LLM KV Cache Size: " + << all_requests[new_request.guid].llm_cache_size << std::endl; std::cout << "load " << new_bc.requestsInfo[i].num_tokens_in_batch << " tokens for request " << new_request.guid << std::endl; std::cout << "total prompt in request: " << new_request.initial_len << std::endl; - - if (new_bc.num_tokens == 
BatchConfig::MAX_NUM_TOKENS) { break; } @@ -912,15 +915,15 @@ BeamSearchBatchConfig // how many sub request in current request // why is sub_requests has MAX_NUM_REQUESTS * MAX_BEAM_WIDTH entries? new_bc.sub_requests[i] = old_bc.beamRequestsInfo[i].beam_size; - + // update the parentid, accumalated_probs, depth, and token_ids new_bc.beamRequestsInfo[i].beam_size = - old_bc.beamRequestsInfo[i].beam_size; + old_bc.beamRequestsInfo[i].beam_size; new_bc.beamRequestsInfo[i].max_depth = - old_bc.beamRequestsInfo[i].max_depth; + old_bc.beamRequestsInfo[i].max_depth; if (request.status == Request::RUNNING) { new_bc.beamRequestsInfo[i].current_depth = - old_bc.beamRequestsInfo[i].current_depth + 1; + old_bc.beamRequestsInfo[i].current_depth + 1; new_bc.request_running[i] = true; // do the slot exchange to minimize the cache exchange in kernel. update_beam_metadata(new_bc, request.beam_trees.at(old_bc.model_id), i); @@ -928,14 +931,13 @@ BeamSearchBatchConfig // if the request is pending, we need to update the beam search // metadata based on the initial length new_bc.beamRequestsInfo[i].current_depth = - old_bc.beamRequestsInfo[i].current_depth; + old_bc.beamRequestsInfo[i].current_depth; new_bc.request_running[i] = false; } - - // do the slot exchange to minimize the cache exchange in kernel. - // update_beam_metadata(new_bc, request.beam_trees.at(old_bc.model_id), i); + // update_beam_metadata(new_bc, request.beam_trees.at(old_bc.model_id), + // i); if (new_bc.requestsInfo[i].token_start_offset + 1 >= request.tokens.size()) { @@ -1036,9 +1038,10 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( // Profiling profiling_requests[new_bc.requestsInfo[i].request_guid].decoding_steps += 1; - if(request.status == Request::RUNNING) { + if (request.status == Request::RUNNING) { new_bc.request_running[i] = true; - std::cout << "[Verify] Request " << request.guid << " is running" << std::endl; + std::cout << "[Verify] Request " << request.guid << " is running" + << std::endl; // Get the dfs tree std::vector>> @@ -1062,9 +1065,12 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( } // Normal Request Info - new_bc.requestsInfo[i].token_start_offset = dfs_tree_inputs.front().second; - new_bc.requestsInfo[i].request_guid = old_batches.at(0).requestsInfo[i].request_guid; - new_bc.requestsInfo[i].max_sequence_length = old_batches.at(0).requestsInfo[i].max_sequence_length; + new_bc.requestsInfo[i].token_start_offset = + dfs_tree_inputs.front().second; + new_bc.requestsInfo[i].request_guid = + old_batches.at(0).requestsInfo[i].request_guid; + new_bc.requestsInfo[i].max_sequence_length = + old_batches.at(0).requestsInfo[i].max_sequence_length; // TODO: Check this new_bc.requestsInfo[i].num_tokens_in_batch = 0; new_bc.request_completed[i] = false; @@ -1101,25 +1107,26 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( if (new_bc.num_tokens == BatchConfig::MAX_NUM_TOKENS) { assert(false && - "Exceeding the space available in the TreeVerify batch"); + "Exceeding the space available in the TreeVerify batch"); break; } new_bc.requestsInfo[i].token_start_offset = request.tokens.size() - 1; } - // Committed Tokens if (committed_tokens.find(guid) != committed_tokens.end()) { - if (dfs_tree_inputs.at(0).second == request.initial_len + - committed_tokens.at(guid).size() - - 1) { // commit prompt + if (dfs_tree_inputs.at(0).second == + request.initial_len + committed_tokens.at(guid).size() - + 1) { // commit prompt // for (int j = 0; j < request.initial_len; j++) { - // 
new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_index = j; - // new_bc.committed_tokens[new_bc.num_tokens_to_commit].request_index = + // new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_index + // = j; + // new_bc.committed_tokens[new_bc.num_tokens_to_commit].request_index + // = // i; - // new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_depth = j; - // if (verbose) { + // new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_depth + // = j; if (verbose) { // std::cout << new_bc.num_tokens_to_commit // << "- committed_token.token_depth: " << j // << ", token_index: " << j << std::endl; @@ -1131,14 +1138,16 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( auto committed_token = committed_tokens.at(guid).at(0); new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_index = committed_token.second; - new_bc.committed_tokens[new_bc.num_tokens_to_commit].request_index = i; + new_bc.committed_tokens[new_bc.num_tokens_to_commit].request_index = + i; new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_depth = committed_token.first; if (verbose) { std::cout << new_bc.num_tokens_to_commit << "- committed_token.token_depth: " << committed_token.first - << ", token_index: " << committed_token.second << std::endl; + << ", token_index: " << committed_token.second + << std::endl; } new_bc.num_tokens_to_commit++; } @@ -1172,18 +1181,19 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( << new_bc.num_tokens_to_commit << std::endl; } - // add prompt to the dfs tree // if (committed_tokens.find(guid) != committed_tokens.end()) { // if (dfs_tree_inputs.at(0).second == request.initial_len + - // committed_tokens.at(guid).size() - - // 1) { // commit prompt + // committed_tokens.at(guid).size() + // - 1) { // commit prompt // for (int j = 0; j < request.initial_len; j++) { - // new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_index = j; - // new_bc.committed_tokens[new_bc.num_tokens_to_commit].request_index = + // new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_index + // = j; + // new_bc.committed_tokens[new_bc.num_tokens_to_commit].request_index + // = // i; - // new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_depth = j; - // if (verbose) { + // new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_depth + // = j; if (verbose) { // std::cout << new_bc.num_tokens_to_commit // << "- committed_token.token_depth: " << j // << ", token_index: " << j << std::endl; @@ -1195,14 +1205,16 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( // auto committed_token = committed_tokens.at(guid).at(0); // new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_index = // committed_token.second; - // new_bc.committed_tokens[new_bc.num_tokens_to_commit].request_index = i; + // new_bc.committed_tokens[new_bc.num_tokens_to_commit].request_index + // = i; // new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_depth = // committed_token.first; // if (verbose) { // std::cout << new_bc.num_tokens_to_commit // << "- committed_token.token_depth: " // << committed_token.first - // << ", token_index: " << committed_token.second << std::endl; + // << ", token_index: " << committed_token.second << + // std::endl; // } // new_bc.num_tokens_to_commit++; // } @@ -1222,12 +1234,13 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( // Normal Token Info new_bc.tokensInfo[new_bc.num_tokens].request_index = i; new_bc.tokensInfo[new_bc.num_tokens].token_id = token.first; - 
new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = token.second; + new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = + token.second; // TODO: Add committed token info if (verbose) { - std::cout << "committed_tokens.size(): " << new_bc.num_tokens_to_commit - << std::endl; + std::cout << "committed_tokens.size(): " + << new_bc.num_tokens_to_commit << std::endl; } new_bc.num_tokens++; @@ -1241,42 +1254,48 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( std::cout << "new_bc.num_tokens: " << new_bc.num_tokens << std::endl; } else if (request.status == Request::PENDING) { new_bc.request_running[i] = false; - std::cout << "[Verify] Request " << request.guid << " is pending" << std::endl; + std::cout << "[Verify] Request " << request.guid << " is pending" + << std::endl; std::cout << "SSM KV Cache Size: " << request.ssm_cache_size << std::endl; std::cout << "LLM KV Cache Size: " << request.llm_cache_size << std::endl; - // Normal Request Info new_bc.requestsInfo[i].token_start_offset = request.llm_cache_size; - new_bc.requestsInfo[i].request_guid = old_batches.at(0).requestsInfo[i].request_guid; - new_bc.requestsInfo[i].max_sequence_length = old_batches.at(0).requestsInfo[i].max_sequence_length; + new_bc.requestsInfo[i].request_guid = + old_batches.at(0).requestsInfo[i].request_guid; + new_bc.requestsInfo[i].max_sequence_length = + old_batches.at(0).requestsInfo[i].max_sequence_length; new_bc.request_completed[i] = false; new_bc.requestsInfo[i].num_tokens_in_batch = std::min( BatchConfig::MAX_NUM_TOKENS - new_bc.num_tokens, (int)request.initial_len - new_bc.requestsInfo[i].token_start_offset); - if (committed_tokens.find(guid) != committed_tokens.end()) { for (int j = 0; j < committed_tokens.at(guid).size(); j++) { auto token = committed_tokens.at(guid).at(j); - new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_index = token.second; + new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_index = + token.second; new_bc.committed_tokens[new_bc.num_tokens_to_commit].request_index = i; - new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_depth = token.first; + new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_depth = + token.first; new_bc.num_tokens_to_commit++; request.llm_cache_size++; } - std::cout << "[Verify] Committed Tokens from last loading batch: " << new_bc.num_tokens_to_commit << std::endl; + std::cout << "[Verify] Committed Tokens from last loading batch: " + << new_bc.num_tokens_to_commit << std::endl; } if (request.llm_cache_size < request.initial_len) { // Initialization (prompt) phase for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) { new_bc.tokensInfo[new_bc.num_tokens].request_index = i; - new_bc.tokensInfo[new_bc.num_tokens].token_id = request.tokens[request.llm_cache_size + j]; - new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = request.llm_cache_size + j; + new_bc.tokensInfo[new_bc.num_tokens].token_id = + request.tokens[request.llm_cache_size + j]; + new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = + request.llm_cache_size + j; new_bc.num_tokens++; } @@ -1284,7 +1303,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( std::cout << "new_bc.num_tokens: " << new_bc.num_tokens << std::endl; if (new_bc.num_tokens >= BatchConfig::MAX_NUM_TOKENS) { assert(false && - "Exceeding the space available in the TreeVerify batch"); + "Exceeding the space available in the TreeVerify batch"); break; } } else { @@ -1294,24 +1313,24 @@ TreeVerifyBatchConfig 
RequestManager::prepare_next_batch_verify( new_bc.tokensInfo[new_bc.num_tokens].request_index = i; new_bc.tokensInfo[new_bc.num_tokens].token_id = request.tokens.back(); - new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = request.tokens.size() - 1; + new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = + request.tokens.size() - 1; new_bc.num_tokens++; new_bc.requestsInfo[i].num_tokens_in_batch++; std::cout << "new_bc.num_tokens: " << new_bc.num_tokens << std::endl; - std::cout << "new_bc.requestsInfo[i].num_tokens_in_batch: " << new_bc.requestsInfo[i].num_tokens_in_batch << std::endl; + std::cout << "new_bc.requestsInfo[i].num_tokens_in_batch: " + << new_bc.requestsInfo[i].num_tokens_in_batch << std::endl; - - dfs_tree_inputs[guid] = std::vector>{ std::make_pair(request.tokens.back(), request.tokens.size() - 1) }; + dfs_tree_inputs[guid] = + std::vector>{std::make_pair( + request.tokens.back(), request.tokens.size() - 1)}; } } } else { assert(false && "Request status is not RUNNING or PENDING"); } - - - } if (verbose) { From c758c9ffa16016490ee74456a9e441a7917f022d Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Thu, 7 Sep 2023 14:39:49 -0400 Subject: [PATCH 07/30] fix --- src/runtime/request_manager.cc | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 2b6187ba5b..dfdbc1e822 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -1259,18 +1259,6 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( std::cout << "SSM KV Cache Size: " << request.ssm_cache_size << std::endl; std::cout << "LLM KV Cache Size: " << request.llm_cache_size << std::endl; - // Normal Request Info - new_bc.requestsInfo[i].token_start_offset = request.llm_cache_size; - new_bc.requestsInfo[i].request_guid = - old_batches.at(0).requestsInfo[i].request_guid; - new_bc.requestsInfo[i].max_sequence_length = - old_batches.at(0).requestsInfo[i].max_sequence_length; - - new_bc.request_completed[i] = false; - new_bc.requestsInfo[i].num_tokens_in_batch = std::min( - BatchConfig::MAX_NUM_TOKENS - new_bc.num_tokens, - (int)request.initial_len - new_bc.requestsInfo[i].token_start_offset); - if (committed_tokens.find(guid) != committed_tokens.end()) { for (int j = 0; j < committed_tokens.at(guid).size(); j++) { auto token = committed_tokens.at(guid).at(j); @@ -1287,6 +1275,19 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( std::cout << "[Verify] Committed Tokens from last loading batch: " << new_bc.num_tokens_to_commit << std::endl; } + + // Normal Request Info + new_bc.requestsInfo[i].token_start_offset = request.llm_cache_size; + new_bc.requestsInfo[i].request_guid = + old_batches.at(0).requestsInfo[i].request_guid; + new_bc.requestsInfo[i].max_sequence_length = + old_batches.at(0).requestsInfo[i].max_sequence_length; + + new_bc.request_completed[i] = false; + new_bc.requestsInfo[i].num_tokens_in_batch = std::min( + BatchConfig::MAX_NUM_TOKENS - new_bc.num_tokens, + (int)request.initial_len - new_bc.requestsInfo[i].token_start_offset); + if (request.llm_cache_size < request.initial_len) { // Initialization (prompt) phase From 0b6b14650c2b2a5f06858e47d663878580cbd348 Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Thu, 7 Sep 2023 14:44:30 -0400 Subject: [PATCH 08/30] . 
--- src/runtime/request_manager.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index dfdbc1e822..5f3d9a42d3 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -1275,7 +1275,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( std::cout << "[Verify] Committed Tokens from last loading batch: " << new_bc.num_tokens_to_commit << std::endl; } - + // Normal Request Info new_bc.requestsInfo[i].token_start_offset = request.llm_cache_size; new_bc.requestsInfo[i].request_guid = @@ -1288,7 +1288,6 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( BatchConfig::MAX_NUM_TOKENS - new_bc.num_tokens, (int)request.initial_len - new_bc.requestsInfo[i].token_start_offset); - if (request.llm_cache_size < request.initial_len) { // Initialization (prompt) phase for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) { From 683c2833e22329af2278c3b3042d0791ad1b5548 Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Sun, 10 Sep 2023 21:27:19 -0400 Subject: [PATCH 09/30] fix --- include/flexflow/batch_config.h | 2 +- include/flexflow/model.h | 3 ++- include/flexflow/request_manager.h | 4 ++-- inference/incr_decoding/incr_decoding.cc | 6 ++++-- inference/spec_infer/spec_infer.cc | 6 +++++- src/c/flexflow_c.cc | 4 +++- 6 files changed, 17 insertions(+), 8 deletions(-) diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index ee3d5ac183..8aa69a3cad 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -46,7 +46,7 @@ class BatchConfig { void print() const; virtual InferenceMode get_mode() const; static BatchConfig const *from_future(BatchConfigFuture const &future); - static int const MAX_NUM_REQUESTS = 1; + static int const MAX_NUM_REQUESTS = 4; static int const MAX_NUM_TOKENS = 64; static int const MAX_PROMPT_LENGTH = 62; static int const MAX_SEQ_LENGTH = 256; diff --git a/include/flexflow/model.h b/include/flexflow/model.h index 747c639933..3dc9e79116 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -751,7 +751,8 @@ class FFModel { // ======================================== // Inference APIs // ======================================== - GenerationResult generate(std::string const &text, int max_seq_length); + GenerationResult generate(std::vector &prompts, + int max_seq_length); Tensor create_tensor_legion_ordering(int num_dim, int const dims[], diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index d137f9421e..8b0e24dee1 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -105,10 +105,10 @@ class RequestManager { FFModel *get_model(int model_id); GenerationResult generate_incr_decoding(FFModel *model, - std::string const &text, + std::vector &prompts, int max_seq_length); GenerationResult generate_spec_infer(FFModel *model, - std::string const &text, + std::vector &prompts, int max_seq_length); GenerationResult get_generation_result(RequestGuid const &guid); RequestGuid register_new_request(std::string const &prompt, diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index 19cd8726e2..3f913e4573 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -242,13 +242,15 @@ void FlexFlow::top_level_task(Task const *task, /*parser_callback_t */ nullptr, /*allow_exceptions */ true, /*ignore_comments */ true); + 
std::vector prompts; for (auto &prompt : prompt_json) { std::string text = prompt.get(); printf("Prompt[%d]: %s\n", total_num_requests, text.c_str()); total_num_requests++; - GenerationResult result = - model.generate(text, 128 /*max_sequence_length*/); + prompts.push_back(text); } + GenerationResult result = + model.generate(prompts, 128 /*max_sequence_length*/); } // Execution fence diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc index 9d139997f7..2b1fb6e817 100644 --- a/inference/spec_infer/spec_infer.cc +++ b/inference/spec_infer/spec_infer.cc @@ -384,12 +384,16 @@ void FlexFlow::top_level_task(Task const *task, /*parser_callback_t */ nullptr, /*allow_exceptions */ true, /*ignore_comments */ true); + + std::vector prompts; for (auto &prompt : prompt_json) { std::string text = prompt.get(); printf("Prompt[%d]: %s\n", total_num_requests, text.c_str()); total_num_requests++; - tree_model.generate(text, 128 /*max_sequence_length*/); + prompts.push_back(text); + // tree_model.generate(text, 128 /*max_sequence_length*/); } + tree_model.generate(prompts, 128 /*max_sequence_length*/); } // Execution fence diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc index 0c1fad17df..81a41a3adc 100644 --- a/src/c/flexflow_c.cc +++ b/src/c/flexflow_c.cc @@ -1442,8 +1442,10 @@ flexflow_generation_result_t int max_seq_length, int *output_length_and_tokens) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); + std::vector prompts; std::string const text_str(input_text); - GenerationResult result = handle->generate(text_str, max_seq_length); + prompts.push_back(input_text); + GenerationResult result = handle->generate(prompts, max_seq_length); DEBUG_PRINT("[Model] generate %p %s %i", handle, text_str, max_seq_length); assert(result.output_tokens.size() <= max_seq_length); output_length_and_tokens[0] = result.output_tokens.size(); From d44c1a1df1a9a6bb9bc1dfa2e0c1af953fa54b06 Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Sun, 10 Sep 2023 21:27:51 -0400 Subject: [PATCH 10/30] fix --- src/runtime/request_manager.cc | 99 +++++++++++++++++++--------------- 1 file changed, 57 insertions(+), 42 deletions(-) diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 75d25b0cd7..5ed0045cf9 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -620,7 +620,8 @@ BeamSearchBatchConfig std::ofstream outputFile(output_filepath); if (outputFile.is_open()) { outputFile << "end-to-end latency: " << std::fixed - << std::setprecision(3) << total_request_run_time + << std::setprecision(3) + << profile_info.finish_time - profile_info.start_time << std::endl; outputFile << "num decoding steps: " << profile_info.decoding_steps << std::endl; @@ -798,9 +799,9 @@ BeamSearchBatchConfig all_requests[new_request.guid].ssm_cache_size = new_bc.requestsInfo[i].num_tokens_in_batch; new_bc.request_running[i] = false; - std::cout << "SSM KV Cache Size: " + std::cout << "SSM KV Cache Size init: " << all_requests[new_request.guid].ssm_cache_size << std::endl; - std::cout << "LLM KV Cache Size: " + std::cout << "LLM KV Cache Size init: " << all_requests[new_request.guid].llm_cache_size << std::endl; std::cout << "load " << new_bc.requestsInfo[i].num_tokens_in_batch @@ -869,7 +870,6 @@ BeamSearchBatchConfig std::cout << "Current Beam Depth: " << old_bc.beamRequestsInfo[0].current_depth << "\n"; } - // Step 1: Store result to the beam tree struct store_beam_metadata(old_bc, result); @@ -891,20 +891,29 @@ BeamSearchBatchConfig // assert(processed_tokens < 
request.tokens.size()); log_req_mgr.debug() << "processed_tokens: " << processed_tokens << "\n"; - if (processed_tokens > - old_bc.beamRequestsInfo[i].max_depth + request.tokens.size() - // || ir.results[t] == 0 TODO: replace this with - ) { - log_req_mgr.print("[Done] guid(%zu) with spec_tree_depth(%d)", - old_bc.requestsInfo[i].request_guid, - old_bc.beamRequestsInfo[i].max_depth); - // new_bc.request_completed[i] = true; - new_bc.request_completed[i] = false; - new_bc.requestsInfo[i].token_start_offset = processed_tokens; - new_bc.requestsInfo[i].request_guid = old_bc.requestsInfo[i].request_guid; - new_bc.requestsInfo[i].max_sequence_length = - old_bc.requestsInfo[i].max_sequence_length; - } else { + // if (processed_tokens > + // old_bc.beamRequestsInfo[i].max_depth + request.tokens.size() && + // request.status == Request::RUNNING + // // || ir.results[t] == 0 TODO: replace this with + // ) { + // // log_req_mgr.print("[Done] guid(%zu) with spec_tree_depth(%d)", + // // old_bc.requestsInfo[i].request_guid, + // // old_bc.beamRequestsInfo[i].max_depth); + // // // new_bc.request_completed[i] = true; + // // new_bc.request_completed[i] = false; + // // new_bc.requestsInfo[i].token_start_offset = processed_tokens; + // // new_bc.requestsInfo[i].request_guid = + // // old_bc.requestsInfo[i].request_guid; + // // new_bc.requestsInfo[i].max_sequence_length = + // // old_bc.requestsInfo[i].max_sequence_length; + // // new_bc.beamRequestsInfo[i].current_depth = + // // old_bc.beamRequestsInfo[i].current_depth; + // // new_bc.request_running[i] = false; + // std::cout << "beam search end:" << request.status << i << ", " + // << new_bc.requestsInfo[i].num_tokens_in_batch << "\n"; + // } + // else + { log_req_mgr.debug() << "num tokens: " << old_bc.num_tokens << ", " << new_bc.num_tokens; new_bc.request_completed[i] = false; @@ -953,8 +962,10 @@ BeamSearchBatchConfig new_bc.requestsInfo[i].token_start_offset); request.ssm_cache_size += new_bc.requestsInfo[i].num_tokens_in_batch; } - std::cout << "SSM KV Cache Size: " << request.ssm_cache_size << std::endl; - std::cout << "LLM KV Cache Size: " << request.llm_cache_size << std::endl; + std::cout << "SSM KV Cache Size beam: " << request.ssm_cache_size + << std::endl; + std::cout << "LLM KV Cache Size beam: " << request.llm_cache_size + << std::endl; // register more tokens due to the beam width for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) { @@ -1258,8 +1269,10 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( new_bc.request_running[i] = false; std::cout << "[Verify] Request " << request.guid << " is pending" << std::endl; - std::cout << "SSM KV Cache Size: " << request.ssm_cache_size << std::endl; - std::cout << "LLM KV Cache Size: " << request.llm_cache_size << std::endl; + std::cout << "SSM KV Cache Size verify: " << request.ssm_cache_size + << std::endl; + std::cout << "LLM KV Cache Size verify: " << request.llm_cache_size + << std::endl; if (committed_tokens.find(guid) != committed_tokens.end()) { for (int j = 0; j < committed_tokens.at(guid).size(); j++) { @@ -1361,28 +1374,25 @@ void RequestManager::store_beam_metadata(BeamSearchBatchConfig const &old_bc, << " tokens in the current batch.\n"; } - for (int i = 0; i <= old_bc.num_tokens; i++) { + for (int i = 0; i < old_bc.num_tokens; i++) { int request_index = old_bc.tokensInfo[i].request_index; - - // End of the request - if (i == old_bc.num_tokens || - old_bc.requestsInfo[request_index].request_guid != guid) { + if (i == 0 || 
old_bc.requestsInfo[request_index].request_guid != guid) { // Each token yields (beam_width) results int beam_width = old_bc.beamRequestsInfo[request_index].beam_size; // Count tokens sent to model in this request to find the final token's // index - result_index += - (old_bc.tokensInfo[i - 1].abs_depth_in_request - start_depth) * - beam_width; + // result_index += + // (old_bc.tokensInfo[i - 1].abs_depth_in_request - start_depth) * + // beam_width; if (verbose) { std::cout << "i = " << i << ", result index = " << result_index << ", value: " << result.token_ids[result_index] << "\n"; } - int index = old_bc.tokensInfo[i - 1].request_index; + int index = old_bc.tokensInfo[i].request_index; int beam_size = old_bc.beamRequestsInfo[index].beam_size; int depth = old_bc.beamRequestsInfo[index].current_depth; @@ -1802,24 +1812,27 @@ std::vector> return merged_tree; } -GenerationResult FFModel::generate(std::string const &text, +GenerationResult FFModel::generate(std::vector &prompts, int max_seq_length) { RequestManager *rm = RequestManager::get_request_manager(); if (rm->get_num_ssms() == 0) { // No SSMs: perform incremental decoding - return rm->generate_incr_decoding(this, text, max_seq_length); + return rm->generate_incr_decoding(this, prompts, max_seq_length); } else { // Registered SSMs: perform speculative inference - return rm->generate_spec_infer(this, text, max_seq_length); + return rm->generate_spec_infer(this, prompts, max_seq_length); } } /*static*/ -GenerationResult RequestManager::generate_incr_decoding(FFModel *llm, - std::string const &text, - int max_seq_length) { +GenerationResult RequestManager::generate_incr_decoding( + FFModel *llm, std::vector &prompts, int max_seq_length) { InferenceManager *im = InferenceManager::get_inference_manager(); - RequestGuid guid = register_new_request(text, max_seq_length); + RequestGuid guid; + for (int i = 0; i < prompts.size(); i++) { + guid = register_new_request(prompts.at(i), max_seq_length); + } + if (guid == 0) { std::cout << "=========== Discard request exceed prompt maximum... ===========" @@ -1869,11 +1882,13 @@ GenerationResult RequestManager::generate_incr_decoding(FFModel *llm, } /*static*/ -GenerationResult RequestManager::generate_spec_infer(FFModel *llm, - std::string const &text, - int max_seq_length) { +GenerationResult RequestManager::generate_spec_infer( + FFModel *llm, std::vector &prompts, int max_seq_length) { InferenceManager *im = InferenceManager::get_inference_manager(); - RequestGuid guid = register_new_request(text, max_seq_length); + RequestGuid guid; + for (int i = 0; i < prompts.size(); i++) { + guid = register_new_request(prompts.at(i), max_seq_length); + } if (guid == 0) { std::cout << "=========== Discard request exceed prompt maximum... ===========" From 35a33e5a48c6ee422d199bea27506bb088c0af1e Mon Sep 17 00:00:00 2001 From: Zeyu Wang Date: Sun, 10 Sep 2023 23:23:36 -0400 Subject: [PATCH 11/30] fix. 
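Skip batch slots that are not actively running inside the speculative attention kernel, relax the TreeVerify capacity asserts from "== MAX_NUM_TOKENS" to "> MAX_NUM_TOKENS" so an exactly full batch is still accepted, and leave one spare slot when sizing a pending request's share of the verify batch. A minimal sketch of the relaxed capacity check, using an illustrative standalone helper rather than the real RequestManager code:

    #include <cassert>

    constexpr int kMaxNumTokens = 64; // stand-in for BatchConfig::MAX_NUM_TOKENS

    // The previous check fired once the batch became exactly full, which is
    // still a legal batch; only exceeding the capacity should trip the assert.
    void check_tree_verify_capacity(int num_tokens) {
      assert(num_tokens <= kMaxNumTokens &&
             "Exceeding the space available in the TreeVerify batch");
    }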
--- src/ops/spec_inc_multihead_self_attention.cu | 7 +++++-- src/runtime/request_manager.cc | 4 ++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu index af70a07e83..5aa5266f15 100644 --- a/src/ops/spec_inc_multihead_self_attention.cu +++ b/src/ops/spec_inc_multihead_self_attention.cu @@ -248,10 +248,13 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, assert(m->qProjSize == m->kProjSize); for (int i = 0; i < bc->MAX_NUM_REQUESTS; i++) { - if (bc->request_completed[i]) { + if (bc->request_completed[i] || !bc->request_running[i]) { continue; } + std::cout << "[Verifying batch]: " << i << std::endl; + for (int sub_req_id = 0; sub_req_id < bc->sub_requests[i]; sub_req_id++) { + std::cout << "[Verifying sub request]: " << sub_req_id << std::endl; // int num_new_tokens = bc->num_processing_tokens[i]; // int total_tokens = bc->token_last_available_idx[i] + 1; @@ -543,7 +546,7 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, output_ptr, bias_ptr, num_tokens, qkv_weight_size, m->oProjSize); } - assert(tokens_previous_requests == num_tokens); + // assert(tokens_previous_requests == num_tokens); } template diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 5ed0045cf9..7c41ef1a80 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -1118,7 +1118,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( new_bc.num_tokens++; new_bc.requestsInfo[i].num_tokens_in_batch++; - if (new_bc.num_tokens == BatchConfig::MAX_NUM_TOKENS) { + if (new_bc.num_tokens > BatchConfig::MAX_NUM_TOKENS) { assert(false && "Exceeding the space available in the TreeVerify batch"); break; @@ -1300,7 +1300,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( new_bc.request_completed[i] = false; new_bc.requestsInfo[i].num_tokens_in_batch = std::min( - BatchConfig::MAX_NUM_TOKENS - new_bc.num_tokens, + BatchConfig::MAX_NUM_TOKENS - 1 - new_bc.num_tokens, (int)request.initial_len - new_bc.requestsInfo[i].token_start_offset); if (request.llm_cache_size < request.initial_len) { From 0d7524a200dd29b8e313bf0083494bec2451d3e2 Mon Sep 17 00:00:00 2001 From: Zeyu Wang Date: Mon, 11 Sep 2023 00:00:54 -0400 Subject: [PATCH 12/30] Fix reloading new request with long prompts. 
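A pending request whose prompt does not fit in one batch previously tried to load it all at once and overflowed the TreeVerify batch. Introduce a max_prompt_load_size budget: reserve room for the speculative tokens of every running request first, then let each pending request load at most the remaining budget per step and carry the rest of its prompt over to later batches. Roughly, the budgeting behaves like the sketch below (a standalone illustration with simplified types; kMaxBeamDepth is an assumed value standing in for BeamSearchBatchConfig::MAX_BEAM_DEPTH):

    #include <algorithm>
    #include <vector>

    constexpr int kMaxNumTokens = 64; // BatchConfig::MAX_NUM_TOKENS
    constexpr int kMaxBeamDepth = 8;  // assumed illustrative value

    struct Slot {
      bool completed;
      bool running;
      int prompt_remaining; // initial_len - token_start_offset
    };

    // How many prompt tokens each pending request may load this step.
    std::vector<int> plan_prompt_loads(std::vector<Slot> const &slots) {
      int budget = kMaxNumTokens;
      for (Slot const &s : slots) {
        if (!s.completed && s.running) {
          budget -= kMaxBeamDepth; // keep room for the running request's tree
        }
      }
      std::vector<int> loads(slots.size(), 0);
      for (size_t i = 0; i < slots.size(); i++) {
        if (slots[i].completed || slots[i].running) {
          continue;
        }
        loads[i] = std::max(0, std::min(budget, slots[i].prompt_remaining));
        budget -= loads[i];
      }
      return loads;
    }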
--- src/ops/spec_inc_multihead_self_attention.cu | 24 ++++++++++---------- src/runtime/request_manager.cc | 18 +++++++++++++-- 2 files changed, 28 insertions(+), 14 deletions(-) diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu index 5aa5266f15..d3eb88bd43 100644 --- a/src/ops/spec_inc_multihead_self_attention.cu +++ b/src/ops/spec_inc_multihead_self_attention.cu @@ -353,18 +353,18 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, } // add alibi position bias to qk production // add alibi position bias to qk production - if (*m->position_bias) { - size_t parallelism = m->num_q_heads * total_tokens * num_new_tokens; - apply_position_bias_qkprd<<>>(C, - num_new_tokens, - total_tokens, - m->num_q_heads, - m->global_num_q_heads, - shard_id); - } + if (*m->position_bias) { + size_t parallelism = m->num_q_heads * total_tokens * num_new_tokens; + apply_position_bias_qkprd<<>>(C, + num_new_tokens, + total_tokens, + m->num_q_heads, + m->global_num_q_heads, + shard_id); + } // Fill all elements above diagonal in qk prods with -inf to force // causal attention. assert(num_new_tokens <= total_tokens); diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 7c41ef1a80..9d8c7f6c2b 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -1041,6 +1041,15 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( new_bc.num_tokens_to_commit = 0; new_bc.num_tokens = 0; + int max_prompt_load_size = BatchConfig::MAX_NUM_TOKENS; + for (int i = 0; i < TreeVerifyBatchConfig::MAX_NUM_REQUESTS; i++) { + if (old_batches.at(0).request_completed[i]) { + continue; + } else if (old_batches.at(0).request_running[i]) { + max_prompt_load_size -= BeamSearchBatchConfig::MAX_BEAM_DEPTH; + } + } + for (int i = 0; i < TreeVerifyBatchConfig::MAX_NUM_REQUESTS; i++) { if (old_batches.at(0).request_completed[i]) { continue; @@ -1299,9 +1308,14 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( old_batches.at(0).requestsInfo[i].max_sequence_length; new_bc.request_completed[i] = false; + // new_bc.requestsInfo[i].num_tokens_in_batch = std::min( + // BatchConfig::MAX_NUM_TOKENS - 1 - new_bc.num_tokens, + // (int)request.initial_len - + // new_bc.requestsInfo[i].token_start_offset); new_bc.requestsInfo[i].num_tokens_in_batch = std::min( - BatchConfig::MAX_NUM_TOKENS - 1 - new_bc.num_tokens, + max_prompt_load_size, (int)request.initial_len - new_bc.requestsInfo[i].token_start_offset); + max_prompt_load_size -= new_bc.requestsInfo[i].num_tokens_in_batch; if (request.llm_cache_size < request.initial_len) { // Initialization (prompt) phase @@ -1316,7 +1330,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( } std::cout << "new_bc.num_tokens: " << new_bc.num_tokens << std::endl; - if (new_bc.num_tokens >= BatchConfig::MAX_NUM_TOKENS) { + if (new_bc.num_tokens > BatchConfig::MAX_NUM_TOKENS) { assert(false && "Exceeding the space available in the TreeVerify batch"); break; From 7c8227d0b0ecf16fc4372b8dc2be654137ce8511 Mon Sep 17 00:00:00 2001 From: Zeyu Wang Date: Mon, 11 Sep 2023 00:45:37 -0400 Subject: [PATCH 13/30] Fix edge cases. 
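A running request can occupy one slot beyond its speculated tree in the verify batch (presumably the committed root token), so the prompt-load budget now reserves MAX_BEAM_DEPTH + 1 tokens per running request, and the beam-search incremental-phase test becomes token_start_offset >= request.tokens.size(). A worked example of the reservation, with MAX_NUM_TOKENS = 64 (the current default) and MAX_BEAM_DEPTH = 8 assumed purely for illustration:

    two running requests reserve 2 * (8 + 1) = 18 tokens,
    leaving 64 - 18 = 46 slots of prompt that pending requests may
    load in this verify batch; the remainder of a longer prompt is
    deferred to the next batch.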
--- src/ops/tree_inc_multihead_self_attention.cu | 3 --- src/runtime/request_manager.cc | 23 ++++++++++++++++---- 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu index c41312cc3c..f916bdb925 100644 --- a/src/ops/tree_inc_multihead_self_attention.cu +++ b/src/ops/tree_inc_multihead_self_attention.cu @@ -515,9 +515,6 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, m->oProjSize); } - std::cout << "processed_tokens_in_batch: " << processed_tokens_in_batch - << std::endl; - std::cout << "num_active_tokens: " << bc->num_active_tokens() << std::endl; assert(processed_tokens_in_batch == bc->num_active_tokens()); } diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 9d8c7f6c2b..0505e8bfab 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -238,6 +238,7 @@ RequestManager::RequestGuid all_requests[request.guid] = request; { std::string output = "New request tokens:"; + output = "[" + std::to_string(request.guid) + "]" + output; for (int i = 0; i < request.tokens.size(); i++) { output = output + " " + std::to_string(request.tokens[i]); } @@ -523,6 +524,8 @@ BeamSearchBatchConfig size_t guid = old_bc.requestsInfo[i].request_guid; Request &request = all_requests[guid]; + std::cout << "[ " << guid << " ]" << std::endl; + // Verify this: get verified tokens from result std::vector> tree_outputs = std::vector>(); @@ -703,6 +706,8 @@ BeamSearchBatchConfig new_bc.request_completed[i] = false; new_bc.request_running[i] = false; + std::cout << "ssm_cache_size: " << request.ssm_cache_size << ", " + << "initial_len: " << request.initial_len << std::endl; assert(request.ssm_cache_size == request.initial_len); // Normal Request Info @@ -949,11 +954,12 @@ BeamSearchBatchConfig // do the slot exchange to minimize the cache exchange in kernel. 
// update_beam_metadata(new_bc, request.beam_trees.at(old_bc.model_id), // i); - - if (new_bc.requestsInfo[i].token_start_offset + 1 >= - request.tokens.size()) { + std::cout << request.guid << std::endl; + if (new_bc.requestsInfo[i].token_start_offset >= request.tokens.size()) { // Incremental phase new_bc.requestsInfo[i].num_tokens_in_batch = 1; + std::cout << "Incremental phase: " << request.tokens.size() + << std::endl; } else { // Prompt phase new_bc.requestsInfo[i].num_tokens_in_batch = @@ -961,7 +967,11 @@ BeamSearchBatchConfig (int)request.tokens.size() - new_bc.requestsInfo[i].token_start_offset); request.ssm_cache_size += new_bc.requestsInfo[i].num_tokens_in_batch; + std::cout << "Prompt phase: " << request.tokens.size() << std::endl; + std::cout << "Update ssm cache size: " << request.ssm_cache_size + << std::endl; } + std::cout << "SSM KV Cache Size beam: " << request.ssm_cache_size << std::endl; std::cout << "LLM KV Cache Size beam: " << request.llm_cache_size @@ -1046,7 +1056,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( if (old_batches.at(0).request_completed[i]) { continue; } else if (old_batches.at(0).request_running[i]) { - max_prompt_load_size -= BeamSearchBatchConfig::MAX_BEAM_DEPTH; + max_prompt_load_size -= (BeamSearchBatchConfig::MAX_BEAM_DEPTH + 1); } } @@ -1317,6 +1327,11 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( (int)request.initial_len - new_bc.requestsInfo[i].token_start_offset); max_prompt_load_size -= new_bc.requestsInfo[i].num_tokens_in_batch; + std::cout << "max_prompt_load_size: " << max_prompt_load_size + << std::endl; + std::cout << "new_bc.requestsInfo[i].num_tokens_in_batch: " << i << ", " + << new_bc.requestsInfo[i].num_tokens_in_batch << std::endl; + if (request.llm_cache_size < request.initial_len) { // Initialization (prompt) phase for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) { From 230e0bc64f90279f6c34ebe4f192ea4bca30fc6d Mon Sep 17 00:00:00 2001 From: Zeyu Wang Date: Mon, 11 Sep 2023 01:05:25 -0400 Subject: [PATCH 14/30] Fix edge case --- src/runtime/request_manager.cc | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 0505e8bfab..a928468026 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -143,17 +143,12 @@ RequestManager::RequestGuid request.guid = next_available_guid++; request.max_sequence_length = max_sequence_length; - if (prompt.size() > BatchConfig::MAX_PROMPT_LENGTH) { + if (prompt.size() >= BatchConfig::MAX_SEQ_LENGTH) { std::cout << "Warning: too many tokens in prompt, only load up to " - << BatchConfig::MAX_PROMPT_LENGTH << " tokens, but got " + << BatchConfig::MAX_SEQ_LENGTH << " tokens, but got " << prompt.size() << ".\n"; - // Truncate the prompt to MAX_NUM_TOKENS - // request.tokens.insert(request.tokens.end(), - // prompt.begin(), - // prompt.begin() + BatchConfig::MAX_PROMPT_LENGTH); - // request.initial_len = BatchConfig::MAX_PROMPT_LENGTH; + printf("tokens size: %zu\n", request.tokens.size()); - // assert(false); return 0; } else { request.initial_len = prompt.size(); @@ -206,14 +201,12 @@ RequestManager::RequestGuid request.tokens.push_back(bos_token_id); } std::vector tokens = this->tokenizer_->Encode(prompt); - if (tokens.size() > BatchConfig::MAX_PROMPT_LENGTH) { + if (tokens.size() >= BatchConfig::MAX_SEQ_LENGTH) { std::cout << "Warning: too many tokens in prompt, only load up to " - << 
BatchConfig::MAX_PROMPT_LENGTH << " tokens, but got " + << BatchConfig::MAX_SEQ_LENGTH << " tokens, but got " << tokens.size() << ".\n"; - // Truncate the prompt to MAX_NUM_TOKENS - // tokens.resize(BatchConfig::MAX_PROMPT_LENGTH); + printf("tokens size: %zu\n", tokens.size()); - // assert(false); return 0; } for (int i = 0; i < tokens.size(); i++) { @@ -959,15 +952,20 @@ BeamSearchBatchConfig // Incremental phase new_bc.requestsInfo[i].num_tokens_in_batch = 1; std::cout << "Incremental phase: " << request.tokens.size() - << std::endl; + << ", num_tokens_in_batch: " + << new_bc.requestsInfo[i].num_tokens_in_batch << std::endl; } else { // Prompt phase new_bc.requestsInfo[i].num_tokens_in_batch = - std::min(BatchConfig::MAX_NUM_TOKENS - new_bc.num_tokens, + // std::min(BatchConfig::MAX_NUM_TOKENS - new_bc.num_tokens, + std::min(BatchConfig::MAX_NUM_TOKENS - new_bc.num_tokens - + BatchConfig::MAX_NUM_REQUESTS + i, (int)request.tokens.size() - new_bc.requestsInfo[i].token_start_offset); request.ssm_cache_size += new_bc.requestsInfo[i].num_tokens_in_batch; - std::cout << "Prompt phase: " << request.tokens.size() << std::endl; + std::cout << "Prompt phase: " << request.tokens.size() + << ", num_tokens_in_batch:" + << new_bc.requestsInfo[i].num_tokens_in_batch << std::endl; std::cout << "Update ssm cache size: " << request.ssm_cache_size << std::endl; } From 9ed2684ca5539b778c7c42a35c0cf753f25cb16e Mon Sep 17 00:00:00 2001 From: Zeyu Wang Date: Mon, 11 Sep 2023 01:14:58 -0400 Subject: [PATCH 15/30] fix --- src/runtime/request_manager.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index a928468026..08357ae06b 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -1055,6 +1055,8 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( continue; } else if (old_batches.at(0).request_running[i]) { max_prompt_load_size -= (BeamSearchBatchConfig::MAX_BEAM_DEPTH + 1); + } else { + max_prompt_load_size -= 1; } } From 87ef9cb20a85d603c8237f9b585825a3ebe32e27 Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Mon, 11 Sep 2023 23:59:14 -0400 Subject: [PATCH 16/30] try a fix to CI --- include/flexflow/ops/kernels/softmax_kernels.h | 2 ++ src/ops/kernels/softmax.cpp | 5 ++++- src/ops/kernels/softmax.cu | 12 ++++++++++-- 3 files changed, 16 insertions(+), 3 deletions(-) diff --git a/include/flexflow/ops/kernels/softmax_kernels.h b/include/flexflow/ops/kernels/softmax_kernels.h index 14c07414e9..987a546459 100644 --- a/include/flexflow/ops/kernels/softmax_kernels.h +++ b/include/flexflow/ops/kernels/softmax_kernels.h @@ -15,8 +15,10 @@ class SoftmaxMeta : public OpMeta { Legion::Domain const &input_domain); #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) cudnnTensorDescriptor_t inputTensor; + cudnnTensorDescriptor_t outputTensor; #else miopenTensorDescriptor_t inputTensor; + miopenTensorDescriptor_t outputTensor; #endif bool profiling; int dim; diff --git a/src/ops/kernels/softmax.cpp b/src/ops/kernels/softmax.cpp index bd8b46116d..d2b35cb48a 100644 --- a/src/ops/kernels/softmax.cpp +++ b/src/ops/kernels/softmax.cpp @@ -29,6 +29,9 @@ SoftmaxMeta::SoftmaxMeta(FFHandler handler, checkCUDNN(miopenCreateTensorDescriptor(&inputTensor)); checkCUDNN( cudnnSetTensorDescriptorFromDomain4SoftMax(inputTensor, input_domain)); + checkCUDNN(cudnnCreateTensorDescriptor(&outputTensor)); + checkCUDNN(cudnnSetTensorDescriptorFromDomain4SoftMax( + outputTensor, input_domain, softmax->data_type)); dim = softmax->dim; 
profiling = softmax->profiling; std::strcpy(op_name, softmax->name); @@ -127,7 +130,7 @@ void forward_kernel(SoftmaxMeta const *m, m->inputTensor, input_ptr, &beta, - m->inputTensor, + m->outputTensor, output_ptr, MIOPEN_SOFTMAX_ACCURATE, MIOPEN_SOFTMAX_MODE_CHANNEL)); diff --git a/src/ops/kernels/softmax.cu b/src/ops/kernels/softmax.cu index 15130c19a7..f399f43ac1 100644 --- a/src/ops/kernels/softmax.cu +++ b/src/ops/kernels/softmax.cu @@ -28,6 +28,9 @@ SoftmaxMeta::SoftmaxMeta(FFHandler handler, checkCUDNN(cudnnCreateTensorDescriptor(&inputTensor)); checkCUDNN(cudnnSetTensorDescriptorFromDomain4SoftMax( inputTensor, input_domain, softmax->data_type)); + checkCUDNN(cudnnCreateTensorDescriptor(&outputTensor)); + checkCUDNN(cudnnSetTensorDescriptorFromDomain4SoftMax( + outputTensor, input_domain, softmax->data_type)); dim = softmax->dim; profiling = softmax->profiling; std::strcpy(op_name, softmax->name); @@ -42,8 +45,9 @@ void forward_kernel_wrapper(SoftmaxMeta const *m, DT *output_ptr) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); - cudaEvent_t t_start, t_end; + std::cout << "softmax kernel: " + << "\n"; if (m->profiling) { cudaEventCreate(&t_start); cudaEventCreate(&t_end); @@ -63,6 +67,10 @@ void forward_kernel_wrapper(SoftmaxMeta const *m, log_measure.debug( "%s [Softmax] forward time = %.2fms\n", m->op_name, elapsed); } + std::cout << "softmax kernel end: " + << "\n"; + + print_tensor((float *)output_ptr, 32, "softmax output"); } template @@ -127,7 +135,7 @@ void forward_kernel(SoftmaxMeta const *m, m->inputTensor, input_ptr, &beta, - m->inputTensor, + m->outputTensor, output_ptr)); } From 8898493cb227903c0c981b766a205dad6519c58f Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Tue, 12 Sep 2023 01:32:15 -0400 Subject: [PATCH 17/30] . 
--- src/ops/kernels/softmax.cu | 6 --- .../python_test_configs/generate_configs.py | 48 +++++++++---------- 2 files changed, 24 insertions(+), 30 deletions(-) diff --git a/src/ops/kernels/softmax.cu b/src/ops/kernels/softmax.cu index f399f43ac1..67a9c21038 100644 --- a/src/ops/kernels/softmax.cu +++ b/src/ops/kernels/softmax.cu @@ -46,8 +46,6 @@ void forward_kernel_wrapper(SoftmaxMeta const *m, cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); cudaEvent_t t_start, t_end; - std::cout << "softmax kernel: " - << "\n"; if (m->profiling) { cudaEventCreate(&t_start); cudaEventCreate(&t_end); @@ -67,10 +65,6 @@ void forward_kernel_wrapper(SoftmaxMeta const *m, log_measure.debug( "%s [Softmax] forward time = %.2fms\n", m->op_name, elapsed); } - std::cout << "softmax kernel end: " - << "\n"; - - print_tensor((float *)output_ptr, 32, "softmax output"); } template diff --git a/tests/inference/python_test_configs/generate_configs.py b/tests/inference/python_test_configs/generate_configs.py index e780bc17b0..0d30a3edad 100644 --- a/tests/inference/python_test_configs/generate_configs.py +++ b/tests/inference/python_test_configs/generate_configs.py @@ -63,35 +63,35 @@ os.chdir(dname) -# Generate incremental decoding configs -all_models = llama_models + opt_models + falcon_models + mpt_models -for model_name in all_models: - for full_precision in (True, False): - for parallelism_degrees in parallelism_settings: +# # Generate incremental decoding configs +# all_models = llama_models + opt_models + falcon_models + mpt_models +# for model_name in all_models: +# for full_precision in (True, False): +# for parallelism_degrees in parallelism_settings: - tp, pp = parallelism_degrees +# tp, pp = parallelism_degrees - # Tensor parallelism not supported by small Falcon model atm - if tp > 1 and ("falcon" in model_name or "starcoder" in model_name): - continue - # skip tp=4 for big models - if tp > 2 and ("7b" in model_name or "6.7b" in model_name): - continue +# # Tensor parallelism not supported by small Falcon model atm +# if tp > 1 and ("falcon" in model_name or "starcoder" in model_name): +# continue +# # skip tp=4 for big models +# if tp > 2 and ("7b" in model_name or "6.7b" in model_name): +# continue - _, after_slash = model_name.rsplit("/", maxsplit=1) - filename = "incr_dec-" + "python-" + after_slash + ("-full_prec-" if full_precision else "-half_prec-") + f"{tp}_tp_{pp}_pp" - test_configs_file = "./" + filename + ".json" - output_file = os.path.join(output_folder, filename+".txt") +# _, after_slash = model_name.rsplit("/", maxsplit=1) +# filename = "incr_dec-" + "python-" + after_slash + ("-full_prec-" if full_precision else "-half_prec-") + f"{tp}_tp_{pp}_pp" +# test_configs_file = "./" + filename + ".json" +# output_file = os.path.join(output_folder, filename+".txt") - ff_init_configs["tensor_parallelism_degree"] = tp - ff_init_configs["pipeline_parallelism_degree"] = pp - ff_init_configs["llm_model"] = model_name - ff_init_configs["full_precision"] = full_precision - ff_init_configs["output_file"] = output_file - ff_init_configs["prompt"] = prompt_file +# ff_init_configs["tensor_parallelism_degree"] = tp +# ff_init_configs["pipeline_parallelism_degree"] = pp +# ff_init_configs["llm_model"] = model_name +# ff_init_configs["full_precision"] = full_precision +# ff_init_configs["output_file"] = output_file +# ff_init_configs["prompt"] = prompt_file - with open(test_configs_file, "w+") as outfile: - json.dump(ff_init_configs, outfile, indent=4) +# with open(test_configs_file, "w+") as outfile: 
+# json.dump(ff_init_configs, outfile, indent=4) # Generate speculative inference configs model_pairs = [llama_models, opt_models] From e328e2dc89576f4621f110abd37224ec337b7747 Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Tue, 12 Sep 2023 09:26:28 -0400 Subject: [PATCH 18/30] fix --- src/ops/argmax.cpp | 2 +- src/ops/argmax.cu | 6 ++++-- src/ops/kernels/softmax.cpp | 4 ++-- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/src/ops/argmax.cpp b/src/ops/argmax.cpp index ec5ea6c36a..8a1cf0b3b0 100644 --- a/src/ops/argmax.cpp +++ b/src/ops/argmax.cpp @@ -393,7 +393,7 @@ void ArgMax::forward_kernel(ArgMaxMeta const *m, if (m->beam_search) { // set all parents id zero in arg top1 case. - checkCUDA(hipMemset(parent, 0, batch_size * sizeof(int))); + checkCUDA(hipMemsetAsync(parent, 0, batch_size * sizeof(int), stream)); } int num_shards = 0; int k = 1; diff --git a/src/ops/argmax.cu b/src/ops/argmax.cu index 37e067006c..b951dbda86 100644 --- a/src/ops/argmax.cu +++ b/src/ops/argmax.cu @@ -59,7 +59,7 @@ void ArgMax::forward_kernel(ArgMaxMeta const *m, DT alpha = 1.0f, beta = 0.0f; if (m->beam_search) { // set all parents id zero in arg top1 case. - checkCUDA(cudaMemset(parent, 0, batch_size * sizeof(int))); + checkCUDA(cudaMemsetAsync(parent, 0, batch_size * sizeof(int), stream)); } size_t temp_storage_bytes = m->temp_storage_bytes; // use cub @@ -91,9 +91,10 @@ void ArgMax::forward_kernel_wrapper(ArgMaxMeta const *m, GenericTensorAccessorW const &indices, GenericTensorAccessorW const &parent, int batch_size) { + std::cout << "argmax kernel start" << "\n"; cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); - + std::cout << "argmax kernel" << "\n"; cudaEvent_t t_start, t_end; if (m->profiling) { cudaEventCreate(&t_start); @@ -136,6 +137,7 @@ void ArgMax::forward_kernel_wrapper(ArgMaxMeta const *m, cudaEventDestroy(t_end); printf("[ArgMax] forward time = %.2lfms\n", elapsed); } + std::cout << "argmax kernel end" << "\n"; } ArgMaxMeta::ArgMaxMeta(FFHandler handler, diff --git a/src/ops/kernels/softmax.cpp b/src/ops/kernels/softmax.cpp index d2b35cb48a..edfc5bb544 100644 --- a/src/ops/kernels/softmax.cpp +++ b/src/ops/kernels/softmax.cpp @@ -30,8 +30,8 @@ SoftmaxMeta::SoftmaxMeta(FFHandler handler, checkCUDNN( cudnnSetTensorDescriptorFromDomain4SoftMax(inputTensor, input_domain)); checkCUDNN(cudnnCreateTensorDescriptor(&outputTensor)); - checkCUDNN(cudnnSetTensorDescriptorFromDomain4SoftMax( - outputTensor, input_domain, softmax->data_type)); + checkCUDNN( + cudnnSetTensorDescriptorFromDomain4SoftMax(outputTensor, input_domain)); dim = softmax->dim; profiling = softmax->profiling; std::strcpy(op_name, softmax->name); From 3a25189541b11eceb56b71ccb0b2671d4027bfef Mon Sep 17 00:00:00 2001 From: Zeyu Wang Date: Wed, 13 Sep 2023 15:28:58 -0400 Subject: [PATCH 19/30] Fix: clean up code and fix decoding_steps. 
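Request::Status gains a FINISHING state so a request that is wrapping up but not yet verified can be told apart from one that is fully COMPLETED, verbose-only logging is gated again, and the per-request decoding_steps counter is incremented through request.guid so it is attributed to the right request. The state set, with a small assumed helper for illustration (the helper is a sketch, not code from this patch):

    // From the enum comments added here:
    //   PENDING   - loading prompt
    //   RUNNING   - running inference
    //   COMPLETED - finished and verified
    //   FINISHING - finishing request, but not yet verified
    enum Status {
      PENDING = 101,
      RUNNING = 102,
      COMPLETED = 103,
      FINISHING = 104,
    };

    // Assumed helper: a request keeps its batch slot until fully verified.
    inline bool still_in_flight(Status s) {
      return s != COMPLETED;
    }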
--- include/flexflow/request_manager.h | 7 +- src/ops/spec_inc_multihead_self_attention.cu | 2 - src/runtime/request_manager.cc | 210 +++++-------------- 3 files changed, 54 insertions(+), 165 deletions(-) diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index 8b0e24dee1..8515d8a04b 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -52,9 +52,10 @@ class InferenceManager { struct Request { enum Status { - PENDING = 101, - RUNNING = 102, - COMPLETED = 103, + PENDING = 101, // loading prompt + RUNNING = 102, // running inference + COMPLETED = 103, // finished and verified + FINISHING = 104, // finishing request, but not yet verified }; BatchConfig::RequestGuid guid; int max_sequence_length; diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu index d3eb88bd43..080565c00a 100644 --- a/src/ops/spec_inc_multihead_self_attention.cu +++ b/src/ops/spec_inc_multihead_self_attention.cu @@ -251,10 +251,8 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, if (bc->request_completed[i] || !bc->request_running[i]) { continue; } - std::cout << "[Verifying batch]: " << i << std::endl; for (int sub_req_id = 0; sub_req_id < bc->sub_requests[i]; sub_req_id++) { - std::cout << "[Verifying sub request]: " << sub_req_id << std::endl; // int num_new_tokens = bc->num_processing_tokens[i]; // int total_tokens = bc->token_last_available_idx[i] + 1; diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 08357ae06b..6a08df4efa 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -947,13 +947,16 @@ BeamSearchBatchConfig // do the slot exchange to minimize the cache exchange in kernel. 
// update_beam_metadata(new_bc, request.beam_trees.at(old_bc.model_id), // i); - std::cout << request.guid << std::endl; if (new_bc.requestsInfo[i].token_start_offset >= request.tokens.size()) { // Incremental phase new_bc.requestsInfo[i].num_tokens_in_batch = 1; - std::cout << "Incremental phase: " << request.tokens.size() - << ", num_tokens_in_batch: " - << new_bc.requestsInfo[i].num_tokens_in_batch << std::endl; + + if (verbose) { + std::cout << "[ Beam Spec] " << request.guid << std::endl; + std::cout << "Incremental phase: " << request.tokens.size() + << ", num_tokens_in_batch: " + << new_bc.requestsInfo[i].num_tokens_in_batch << std::endl; + } } else { // Prompt phase new_bc.requestsInfo[i].num_tokens_in_batch = @@ -963,17 +966,22 @@ BeamSearchBatchConfig (int)request.tokens.size() - new_bc.requestsInfo[i].token_start_offset); request.ssm_cache_size += new_bc.requestsInfo[i].num_tokens_in_batch; - std::cout << "Prompt phase: " << request.tokens.size() - << ", num_tokens_in_batch:" - << new_bc.requestsInfo[i].num_tokens_in_batch << std::endl; - std::cout << "Update ssm cache size: " << request.ssm_cache_size - << std::endl; + if (verbose) { + std::cout << "[ Beam Spec] " << request.guid << std::endl; + std::cout << "Prompt phase: " << request.tokens.size() + << ", num_tokens_in_batch:" + << new_bc.requestsInfo[i].num_tokens_in_batch << std::endl; + std::cout << "Update ssm cache size: " << request.ssm_cache_size + << std::endl; + } } - std::cout << "SSM KV Cache Size beam: " << request.ssm_cache_size - << std::endl; - std::cout << "LLM KV Cache Size beam: " << request.llm_cache_size - << std::endl; + if (verbose) { + std::cout << "SSM KV Cache Size beam: " << request.ssm_cache_size + << std::endl; + std::cout << "LLM KV Cache Size beam: " << request.llm_cache_size + << std::endl; + } // register more tokens due to the beam width for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) { @@ -1068,7 +1076,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( Request &request = all_requests[guid]; // Profiling - profiling_requests[new_bc.requestsInfo[i].request_guid].decoding_steps += 1; + profiling_requests[request.guid].decoding_steps += 1; if (request.status == Request::RUNNING) { new_bc.request_running[i] = true; @@ -1107,88 +1115,9 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( new_bc.requestsInfo[i].num_tokens_in_batch = 0; new_bc.request_completed[i] = false; - // Add prompt token first in first verify iteration - if (request.tokens.size() == request.initial_len) { - // Initialization (prompt) phase - // for (int j = 0; j < request.initial_len; j++) { - // new_bc.tokensInfo[new_bc.num_tokens].request_index = i; - // new_bc.tokensInfo[new_bc.num_tokens].token_id = request.tokens[j]; - // new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = j; - - // new_bc.num_tokens++; - // new_bc.requestsInfo[i].num_tokens_in_batch++; - // } - - // std::cout << "new_bc.num_tokens: " << new_bc.num_tokens << std::endl; - // if (new_bc.num_tokens >= BatchConfig::MAX_NUM_TOKENS) { - // assert(false && - // "Exceeding the space available in the TreeVerify batch"); - // break; - // } - - // new_bc.requestsInfo[i].token_start_offset = 0; - } else { - // Incremental phase: only add the last committed token - new_bc.tokensInfo[new_bc.num_tokens].request_index = i; - new_bc.tokensInfo[new_bc.num_tokens].token_id = request.tokens.back(); - new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = - request.tokens.size() - 1; - - new_bc.num_tokens++; 
- new_bc.requestsInfo[i].num_tokens_in_batch++; - - if (new_bc.num_tokens > BatchConfig::MAX_NUM_TOKENS) { - assert(false && - "Exceeding the space available in the TreeVerify batch"); - break; - } - - new_bc.requestsInfo[i].token_start_offset = request.tokens.size() - 1; - } - // Committed Tokens if (committed_tokens.find(guid) != committed_tokens.end()) { - if (dfs_tree_inputs.at(0).second == - request.initial_len + committed_tokens.at(guid).size() - - 1) { // commit prompt - // for (int j = 0; j < request.initial_len; j++) { - // new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_index - // = j; - // new_bc.committed_tokens[new_bc.num_tokens_to_commit].request_index - // = - // i; - // new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_depth - // = j; if (verbose) { - // std::cout << new_bc.num_tokens_to_commit - // << "- committed_token.token_depth: " << j - // << ", token_index: " << j << std::endl; - // } - // new_bc.num_tokens_to_commit++; - // } - } else { // commit the last token - // only add the root token - auto committed_token = committed_tokens.at(guid).at(0); - new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_index = - committed_token.second; - new_bc.committed_tokens[new_bc.num_tokens_to_commit].request_index = - i; - new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_depth = - committed_token.first; - if (verbose) { - std::cout << new_bc.num_tokens_to_commit - << "- committed_token.token_depth: " - << committed_token.first - << ", token_index: " << committed_token.second - << std::endl; - } - new_bc.num_tokens_to_commit++; - } - if (verbose) { - std::cout << "new_bc.num_tokens_to_commit: " - << new_bc.num_tokens_to_commit << std::endl; - } - - for (int j = 1; j < dfs_tree_inputs.size(); j++) { + for (int j = 0; j < dfs_tree_inputs.size(); j++) { if (j < committed_tokens.at(guid).size()) { auto committed_token = committed_tokens.at(guid).at(j); new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_index = @@ -1213,48 +1142,22 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( << new_bc.num_tokens_to_commit << std::endl; } - // add prompt to the dfs tree - // if (committed_tokens.find(guid) != committed_tokens.end()) { - // if (dfs_tree_inputs.at(0).second == request.initial_len + - // committed_tokens.at(guid).size() - // - 1) { // commit prompt - // for (int j = 0; j < request.initial_len; j++) { - // new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_index - // = j; - // new_bc.committed_tokens[new_bc.num_tokens_to_commit].request_index - // = - // i; - // new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_depth - // = j; if (verbose) { - // std::cout << new_bc.num_tokens_to_commit - // << "- committed_token.token_depth: " << j - // << ", token_index: " << j << std::endl; - // } - // new_bc.num_tokens_to_commit++; - // } - // } else { // commit the last token - // // only add the root token - // auto committed_token = committed_tokens.at(guid).at(0); - // new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_index = - // committed_token.second; - // new_bc.committed_tokens[new_bc.num_tokens_to_commit].request_index - // = i; - // new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_depth = - // committed_token.first; - // if (verbose) { - // std::cout << new_bc.num_tokens_to_commit - // << "- committed_token.token_depth: " - // << committed_token.first - // << ", token_index: " << committed_token.second << - // std::endl; - // } - // new_bc.num_tokens_to_commit++; - // } - // if (verbose) { 
- // std::cout << "new_bc.num_tokens_to_commit: " - // << new_bc.num_tokens_to_commit << std::endl; - // } - // } + // Incremental phase: only add the last committed token + new_bc.tokensInfo[new_bc.num_tokens].request_index = i; + new_bc.tokensInfo[new_bc.num_tokens].token_id = request.tokens.back(); + new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = + request.tokens.size() - 1; + + new_bc.num_tokens++; + new_bc.requestsInfo[i].num_tokens_in_batch++; + + if (new_bc.num_tokens > BatchConfig::MAX_NUM_TOKENS) { + assert(false && + "Exceeding the space available in the TreeVerify batch"); + break; + } + + new_bc.requestsInfo[i].token_start_offset = request.tokens.size() - 1; // Add Tokens from the DFS Tree to the next batch for (int j = 1; j < dfs_tree_inputs.size(); j++) { @@ -1269,12 +1172,6 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = token.second; - // TODO: Add committed token info - if (verbose) { - std::cout << "committed_tokens.size(): " - << new_bc.num_tokens_to_commit << std::endl; - } - new_bc.num_tokens++; new_bc.requestsInfo[i].num_tokens_in_batch++; @@ -1286,13 +1183,16 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( std::cout << "new_bc.num_tokens: " << new_bc.num_tokens << std::endl; } else if (request.status == Request::PENDING) { new_bc.request_running[i] = false; - std::cout << "[Verify] Request " << request.guid << " is pending" - << std::endl; - std::cout << "SSM KV Cache Size verify: " << request.ssm_cache_size - << std::endl; - std::cout << "LLM KV Cache Size verify: " << request.llm_cache_size - << std::endl; + if (verbose) { + std::cout << "[Verify] Request " << request.guid + << " is pending in loading prompt phase" << std::endl; + std::cout << "SSM KV Cache Size verify: " << request.ssm_cache_size + << std::endl; + std::cout << "LLM KV Cache Size verify: " << request.llm_cache_size + << std::endl; + } + // Commit all tokens from the last loading batch if (committed_tokens.find(guid) != committed_tokens.end()) { for (int j = 0; j < committed_tokens.at(guid).size(); j++) { auto token = committed_tokens.at(guid).at(j); @@ -1318,10 +1218,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( old_batches.at(0).requestsInfo[i].max_sequence_length; new_bc.request_completed[i] = false; - // new_bc.requestsInfo[i].num_tokens_in_batch = std::min( - // BatchConfig::MAX_NUM_TOKENS - 1 - new_bc.num_tokens, - // (int)request.initial_len - - // new_bc.requestsInfo[i].token_start_offset); + new_bc.requestsInfo[i].num_tokens_in_batch = std::min( max_prompt_load_size, (int)request.initial_len - new_bc.requestsInfo[i].token_start_offset); @@ -1350,7 +1247,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( "Exceeding the space available in the TreeVerify batch"); break; } - } else { + } else { // launch the request into running phase after loading all prompt if (BatchConfig::MAX_NUM_TOKENS - new_bc.num_tokens > 0) { request.status = Request::RUNNING; new_bc.request_running[i] = true; @@ -1377,13 +1274,6 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( } } - if (verbose) { - std::cout << "prepare_next_batch_verify OLD vs NEW batchconfigs below:" - << std::endl; - // old_batches.print(); - // new_bc.print(); - } - return new_bc; } From c7f1b9e18f2366d22e3d2e520baeb0c42b6091d5 Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Thu, 14 Sep 2023 19:31:02 -0400 Subject: [PATCH 20/30] try 1 try --- src/ops/argmax.cu | 5 ++--- 
src/runtime/request_manager.cc | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/src/ops/argmax.cu b/src/ops/argmax.cu index b951dbda86..5a31269606 100644 --- a/src/ops/argmax.cu +++ b/src/ops/argmax.cu @@ -83,6 +83,8 @@ void ArgMax::forward_kernel(ArgMaxMeta const *m, prob_ptr, batch_size, m->beam_search); + print_tensor(indices_ptr, 32, "argmax op"); + } /*static*/ @@ -91,10 +93,8 @@ void ArgMax::forward_kernel_wrapper(ArgMaxMeta const *m, GenericTensorAccessorW const &indices, GenericTensorAccessorW const &parent, int batch_size) { - std::cout << "argmax kernel start" << "\n"; cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); - std::cout << "argmax kernel" << "\n"; cudaEvent_t t_start, t_end; if (m->profiling) { cudaEventCreate(&t_start); @@ -137,7 +137,6 @@ void ArgMax::forward_kernel_wrapper(ArgMaxMeta const *m, cudaEventDestroy(t_end); printf("[ArgMax] forward time = %.2lfms\n", elapsed); } - std::cout << "argmax kernel end" << "\n"; } ArgMaxMeta::ArgMaxMeta(FFHandler handler, diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 6a08df4efa..a4ada0e34c 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -996,7 +996,7 @@ BeamSearchBatchConfig new_bc.beamRequestsInfo[i].tokens[k]; } else { new_bc.tokensInfo[new_bc.num_tokens].token_id = - request.tokens[depth]; + request.tokens[request.tokens.size() - 1]; } new_bc.beamTokenInfo[new_bc.num_tokens].sub_request_index = k; From 55eb9139371d39e522e2a6139501dc291ba8b445 Mon Sep 17 00:00:00 2001 From: Zeyu Wang Date: Sat, 16 Sep 2023 15:50:16 -0400 Subject: [PATCH 21/30] fix: allow parse 0 tokens for pending request. --- src/runtime/request_manager.cc | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index a4ada0e34c..4fa52535ae 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -949,7 +949,11 @@ BeamSearchBatchConfig // i); if (new_bc.requestsInfo[i].token_start_offset >= request.tokens.size()) { // Incremental phase - new_bc.requestsInfo[i].num_tokens_in_batch = 1; + if (request.status == Request::RUNNING) { + new_bc.requestsInfo[i].num_tokens_in_batch = 1; + } else { + new_bc.requestsInfo[i].num_tokens_in_batch = 0; + } if (verbose) { std::cout << "[ Beam Spec] " << request.guid << std::endl; @@ -992,9 +996,11 @@ BeamSearchBatchConfig // get value from requestinfo if (request.status == Request::RUNNING) { + std::cout << "[running ]Num of token in batch: " << new_bc.requestsInfo[i].num_tokens_in_batch << std::endl; new_bc.tokensInfo[new_bc.num_tokens].token_id = new_bc.beamRequestsInfo[i].tokens[k]; } else { + std::cout << "[pending ]Num of token in batch: " << new_bc.requestsInfo[i].num_tokens_in_batch << std::endl; new_bc.tokensInfo[new_bc.num_tokens].token_id = request.tokens[request.tokens.size() - 1]; } From b88c4de5f5df10f0fb295544307f80a5c1b02d75 Mon Sep 17 00:00:00 2001 From: Zeyu Wang Date: Sat, 16 Sep 2023 15:52:18 -0400 Subject: [PATCH 22/30] format. 
--- src/ops/argmax.cu | 1 - src/runtime/request_manager.cc | 8 ++++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/src/ops/argmax.cu b/src/ops/argmax.cu index 5a31269606..8fbf84f2a3 100644 --- a/src/ops/argmax.cu +++ b/src/ops/argmax.cu @@ -84,7 +84,6 @@ void ArgMax::forward_kernel(ArgMaxMeta const *m, batch_size, m->beam_search); print_tensor(indices_ptr, 32, "argmax op"); - } /*static*/ diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 4fa52535ae..6aa4782c3c 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -996,11 +996,15 @@ BeamSearchBatchConfig // get value from requestinfo if (request.status == Request::RUNNING) { - std::cout << "[running ]Num of token in batch: " << new_bc.requestsInfo[i].num_tokens_in_batch << std::endl; + std::cout << "[running ]Num of token in batch: " + << new_bc.requestsInfo[i].num_tokens_in_batch + << std::endl; new_bc.tokensInfo[new_bc.num_tokens].token_id = new_bc.beamRequestsInfo[i].tokens[k]; } else { - std::cout << "[pending ]Num of token in batch: " << new_bc.requestsInfo[i].num_tokens_in_batch << std::endl; + std::cout << "[pending ]Num of token in batch: " + << new_bc.requestsInfo[i].num_tokens_in_batch + << std::endl; new_bc.tokensInfo[new_bc.num_tokens].token_id = request.tokens[request.tokens.size() - 1]; } From abcf94f8c18c39bd6242598ff1893091ae673bea Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Sat, 16 Sep 2023 17:55:06 -0400 Subject: [PATCH 23/30] remove comment tests --- .../python_test_configs/generate_configs.py | 48 +++++++++---------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/tests/inference/python_test_configs/generate_configs.py b/tests/inference/python_test_configs/generate_configs.py index 0d30a3edad..e780bc17b0 100644 --- a/tests/inference/python_test_configs/generate_configs.py +++ b/tests/inference/python_test_configs/generate_configs.py @@ -63,35 +63,35 @@ os.chdir(dname) -# # Generate incremental decoding configs -# all_models = llama_models + opt_models + falcon_models + mpt_models -# for model_name in all_models: -# for full_precision in (True, False): -# for parallelism_degrees in parallelism_settings: +# Generate incremental decoding configs +all_models = llama_models + opt_models + falcon_models + mpt_models +for model_name in all_models: + for full_precision in (True, False): + for parallelism_degrees in parallelism_settings: -# tp, pp = parallelism_degrees + tp, pp = parallelism_degrees -# # Tensor parallelism not supported by small Falcon model atm -# if tp > 1 and ("falcon" in model_name or "starcoder" in model_name): -# continue -# # skip tp=4 for big models -# if tp > 2 and ("7b" in model_name or "6.7b" in model_name): -# continue + # Tensor parallelism not supported by small Falcon model atm + if tp > 1 and ("falcon" in model_name or "starcoder" in model_name): + continue + # skip tp=4 for big models + if tp > 2 and ("7b" in model_name or "6.7b" in model_name): + continue -# _, after_slash = model_name.rsplit("/", maxsplit=1) -# filename = "incr_dec-" + "python-" + after_slash + ("-full_prec-" if full_precision else "-half_prec-") + f"{tp}_tp_{pp}_pp" -# test_configs_file = "./" + filename + ".json" -# output_file = os.path.join(output_folder, filename+".txt") + _, after_slash = model_name.rsplit("/", maxsplit=1) + filename = "incr_dec-" + "python-" + after_slash + ("-full_prec-" if full_precision else "-half_prec-") + f"{tp}_tp_{pp}_pp" + test_configs_file = "./" + filename + ".json" + output_file = 
os.path.join(output_folder, filename+".txt") -# ff_init_configs["tensor_parallelism_degree"] = tp -# ff_init_configs["pipeline_parallelism_degree"] = pp -# ff_init_configs["llm_model"] = model_name -# ff_init_configs["full_precision"] = full_precision -# ff_init_configs["output_file"] = output_file -# ff_init_configs["prompt"] = prompt_file + ff_init_configs["tensor_parallelism_degree"] = tp + ff_init_configs["pipeline_parallelism_degree"] = pp + ff_init_configs["llm_model"] = model_name + ff_init_configs["full_precision"] = full_precision + ff_init_configs["output_file"] = output_file + ff_init_configs["prompt"] = prompt_file -# with open(test_configs_file, "w+") as outfile: -# json.dump(ff_init_configs, outfile, indent=4) + with open(test_configs_file, "w+") as outfile: + json.dump(ff_init_configs, outfile, indent=4) # Generate speculative inference configs model_pairs = [llama_models, opt_models] From 66ee3674fd6a7723c0d0855d8d84d71cf66f984a Mon Sep 17 00:00:00 2001 From: Zeyu Wang Date: Tue, 19 Sep 2023 13:22:03 -0400 Subject: [PATCH 24/30] remove print. --- src/ops/argmax.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ops/argmax.cu b/src/ops/argmax.cu index 8fbf84f2a3..05c84719c1 100644 --- a/src/ops/argmax.cu +++ b/src/ops/argmax.cu @@ -83,7 +83,7 @@ void ArgMax::forward_kernel(ArgMaxMeta const *m, prob_ptr, batch_size, m->beam_search); - print_tensor(indices_ptr, 32, "argmax op"); + // print_tensor(indices_ptr, 32, "argmax op"); } /*static*/ From 801c56c7c8ccd262bf61eccf4d34d7e6cbf1dbf1 Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Sun, 24 Sep 2023 18:32:40 -0400 Subject: [PATCH 25/30] fix decoding steps --- src/ops/spec_inc_multihead_self_attention.cu | 2 +- src/runtime/request_manager.cc | 19 ++++++++++--------- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu index 881d517cf6..39c6ff9259 100644 --- a/src/ops/spec_inc_multihead_self_attention.cu +++ b/src/ops/spec_inc_multihead_self_attention.cu @@ -248,7 +248,7 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, assert(m->qProjSize == m->kProjSize); for (int i = 0; i < bc->MAX_NUM_REQUESTS; i++) { - if (bc->request_completed[i] || !bc->request_running[i]) { + if (bc->request_completed[i]) { continue; } diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index eea3cd4d13..bf64a17c1e 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -1190,7 +1190,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( } } - std::cout << "new_bc.num_tokens: " << new_bc.num_tokens << std::endl; + std::cout << "new_bc.num_tokens after dfs: " << new_bc.num_tokens << std::endl; } else if (request.status == Request::PENDING) { new_bc.request_running[i] = false; if (verbose) { @@ -1251,7 +1251,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( new_bc.num_tokens++; } - std::cout << "new_bc.num_tokens: " << new_bc.num_tokens << std::endl; + std::cout << "new_bc.num_tokens init: " << new_bc.num_tokens << std::endl; if (new_bc.num_tokens > BatchConfig::MAX_NUM_TOKENS) { assert(false && "Exceeding the space available in the TreeVerify batch"); @@ -1269,7 +1269,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( new_bc.num_tokens++; new_bc.requestsInfo[i].num_tokens_in_batch++; - std::cout << "new_bc.num_tokens: " << new_bc.num_tokens << std::endl; + std::cout << "new_bc.num_tokens running: " << 
new_bc.num_tokens << std::endl; std::cout << "new_bc.requestsInfo[i].num_tokens_in_batch: " << new_bc.requestsInfo[i].num_tokens_in_batch << std::endl; @@ -1303,25 +1303,26 @@ void RequestManager::store_beam_metadata(BeamSearchBatchConfig const &old_bc, << " tokens in the current batch.\n"; } - for (int i = 0; i < old_bc.num_tokens; i++) { + for (int i = 0; i <= old_bc.num_tokens; i++) { int request_index = old_bc.tokensInfo[i].request_index; - if (i == 0 || old_bc.requestsInfo[request_index].request_guid != guid) { + if (i == old_bc.num_tokens || + old_bc.requestsInfo[request_index].request_guid != guid) { // Each token yields (beam_width) results int beam_width = old_bc.beamRequestsInfo[request_index].beam_size; // Count tokens sent to model in this request to find the final token's // index - // result_index += - // (old_bc.tokensInfo[i - 1].abs_depth_in_request - start_depth) * - // beam_width; + result_index += + (old_bc.tokensInfo[i - 1].abs_depth_in_request - start_depth) * + beam_width; if (verbose) { std::cout << "i = " << i << ", result index = " << result_index << ", value: " << result.token_ids[result_index] << "\n"; } - int index = old_bc.tokensInfo[i].request_index; + int index = old_bc.tokensInfo[i - 1].request_index; int beam_size = old_bc.beamRequestsInfo[index].beam_size; int depth = old_bc.beamRequestsInfo[index].current_depth; From 1d18fce570b4be09dc0ff2ab47134a64751d76a2 Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Sun, 24 Sep 2023 18:37:54 -0400 Subject: [PATCH 26/30] . --- src/runtime/request_manager.cc | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index bf64a17c1e..4ee5f44261 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -1190,7 +1190,8 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( } } - std::cout << "new_bc.num_tokens after dfs: " << new_bc.num_tokens << std::endl; + std::cout << "new_bc.num_tokens after dfs: " << new_bc.num_tokens + << std::endl; } else if (request.status == Request::PENDING) { new_bc.request_running[i] = false; if (verbose) { @@ -1251,7 +1252,8 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( new_bc.num_tokens++; } - std::cout << "new_bc.num_tokens init: " << new_bc.num_tokens << std::endl; + std::cout << "new_bc.num_tokens init: " << new_bc.num_tokens + << std::endl; if (new_bc.num_tokens > BatchConfig::MAX_NUM_TOKENS) { assert(false && "Exceeding the space available in the TreeVerify batch"); @@ -1269,7 +1271,8 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( new_bc.num_tokens++; new_bc.requestsInfo[i].num_tokens_in_batch++; - std::cout << "new_bc.num_tokens running: " << new_bc.num_tokens << std::endl; + std::cout << "new_bc.num_tokens running: " << new_bc.num_tokens + << std::endl; std::cout << "new_bc.requestsInfo[i].num_tokens_in_batch: " << new_bc.requestsInfo[i].num_tokens_in_batch << std::endl; From aed8850b4355b284c7abd683efc690560254eaa9 Mon Sep 17 00:00:00 2001 From: Zeyu Wang Date: Sun, 24 Sep 2023 22:32:50 -0400 Subject: [PATCH 27/30] quick fix. 
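
Guard the speculative attention loop against request slots that contribute no
new tokens this step. Now that pending requests can be parsed with
num_tokens_in_batch == 0 and the loop no longer skips non-running requests, a
slot can legitimately show up empty; bailing out early presumably avoids
launching the per-request attention GEMMs with an empty m dimension.

A stripped-down sketch of the control flow, with toy types standing in for
BatchConfig (illustrative only; the real loop also tracks total_tokens and
issues the cuBLAS calls):

    struct SlotInfo {
      bool completed;
      int num_tokens_in_batch;
    };

    void compute_attention_sketch(SlotInfo const *slots, int max_requests) {
      for (int i = 0; i < max_requests; i++) {
        if (slots[i].completed) {
          continue; // slot not in use
        }
        int num_new_tokens = slots[i].num_tokens_in_batch;
        if (num_new_tokens <= 0) {
          continue; // pending request that added no tokens this iteration
        }
        // ... per-request attention computation would go here ...
      }
    }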
--- src/ops/spec_inc_multihead_self_attention.cu | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu index 39c6ff9259..b4cdc77e2a 100644 --- a/src/ops/spec_inc_multihead_self_attention.cu +++ b/src/ops/spec_inc_multihead_self_attention.cu @@ -260,6 +260,11 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch; int total_tokens = bc->requestsInfo[i].token_start_offset + bc->requestsInfo[i].num_tokens_in_batch; + + if (num_new_tokens <= 0) { + continue; + } + // Compute (QK^T/sqrt(d_k)) int m_ = num_new_tokens; int n = total_tokens; From a39fb5b52e59e53f5b746b64cf69eafbba32e4ef Mon Sep 17 00:00:00 2001 From: Zeyu Wang Date: Sun, 24 Sep 2023 22:38:43 -0400 Subject: [PATCH 28/30] remove debugging prints. --- src/runtime/request_manager.cc | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 4ee5f44261..c30e1a68cf 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -1190,8 +1190,6 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( } } - std::cout << "new_bc.num_tokens after dfs: " << new_bc.num_tokens - << std::endl; } else if (request.status == Request::PENDING) { new_bc.request_running[i] = false; if (verbose) { @@ -1252,8 +1250,6 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( new_bc.num_tokens++; } - std::cout << "new_bc.num_tokens init: " << new_bc.num_tokens - << std::endl; if (new_bc.num_tokens > BatchConfig::MAX_NUM_TOKENS) { assert(false && "Exceeding the space available in the TreeVerify batch"); @@ -1271,8 +1267,6 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( new_bc.num_tokens++; new_bc.requestsInfo[i].num_tokens_in_batch++; - std::cout << "new_bc.num_tokens running: " << new_bc.num_tokens - << std::endl; std::cout << "new_bc.requestsInfo[i].num_tokens_in_batch: " << new_bc.requestsInfo[i].num_tokens_in_batch << std::endl; From 84a6fbad47dc493f31bf26a4831fecd0c59cde15 Mon Sep 17 00:00:00 2001 From: Zeyu Wang Date: Mon, 25 Sep 2023 00:54:01 -0400 Subject: [PATCH 29/30] fix store_beam_metadata. 
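
store_beam_metadata walks the flattened token list of the old batch and
flushes one request's beam results whenever it crosses a request boundary,
i.e. when the guid changes or the end of the list is reached. Read the
per-request metadata (request index, beam size, current depth) from
tokensInfo[i - 1], the last token of the group that just ended, instead of
tokensInfo[i], which at the i == num_tokens sentinel points past the valid
tokens. The noisy per-token prints in prepare_next_batch_beam are commented
out as well.

A small self-contained sketch of the boundary-walk pattern with toy types in
place of BeamSearchBatchConfig (names and the flush action are placeholders,
not the real bookkeeping):

    #include <cstdio>

    struct TokenInfo {
      int request_index;
    };

    int main() {
      // Three tokens for request 0 followed by two tokens for request 1
      // (assumes at least one token overall).
      TokenInfo tokens[] = {{0}, {0}, {0}, {1}, {1}};
      int num_tokens = 5;
      int group_start = 0;
      for (int i = 0; i <= num_tokens; i++) {
        // tokens[i] is only touched when i is a valid index; the final
        // iteration (i == num_tokens) just flushes the last group.
        if (i == num_tokens ||
            tokens[i].request_index != tokens[group_start].request_index) {
          int last = i - 1; // metadata for the finished group: its last token
          std::printf("request %d owned tokens [%d, %d]\n",
                      tokens[last].request_index, group_start, last);
          group_start = i;
        }
      }
      return 0;
    }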
--- src/runtime/request_manager.cc | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index c30e1a68cf..5489c9b06d 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -996,15 +996,15 @@ BeamSearchBatchConfig // get value from requestinfo if (request.status == Request::RUNNING) { - std::cout << "[running ]Num of token in batch: " - << new_bc.requestsInfo[i].num_tokens_in_batch - << std::endl; + // std::cout << "[running ]Num of token in batch: " + // << new_bc.requestsInfo[i].num_tokens_in_batch + // << std::endl; new_bc.tokensInfo[new_bc.num_tokens].token_id = new_bc.beamRequestsInfo[i].tokens[k]; } else { - std::cout << "[pending ]Num of token in batch: " - << new_bc.requestsInfo[i].num_tokens_in_batch - << std::endl; + // std::cout << "[pending ]Num of token in batch: " + // << new_bc.requestsInfo[i].num_tokens_in_batch + // << std::endl; new_bc.tokensInfo[new_bc.num_tokens].token_id = request.tokens[request.tokens.size() - 1]; } @@ -1301,12 +1301,16 @@ void RequestManager::store_beam_metadata(BeamSearchBatchConfig const &old_bc, } for (int i = 0; i <= old_bc.num_tokens; i++) { - int request_index = old_bc.tokensInfo[i].request_index; if (i == old_bc.num_tokens || - old_bc.requestsInfo[request_index].request_guid != guid) { + old_bc.requestsInfo[old_bc.tokensInfo[i].request_index].request_guid != + guid) { + + int index = old_bc.tokensInfo[i - 1].request_index; + int beam_size = old_bc.beamRequestsInfo[index].beam_size; + int depth = old_bc.beamRequestsInfo[index].current_depth; // Each token yields (beam_width) results - int beam_width = old_bc.beamRequestsInfo[request_index].beam_size; + int beam_width = old_bc.beamRequestsInfo[index].beam_size; // Count tokens sent to model in this request to find the final token's // index @@ -1319,10 +1323,6 @@ void RequestManager::store_beam_metadata(BeamSearchBatchConfig const &old_bc, << ", value: " << result.token_ids[result_index] << "\n"; } - int index = old_bc.tokensInfo[i - 1].request_index; - int beam_size = old_bc.beamRequestsInfo[index].beam_size; - int depth = old_bc.beamRequestsInfo[index].current_depth; - Request &request = all_requests[old_bc.requestsInfo[index].request_guid]; if (depth == 1) { @@ -1366,7 +1366,7 @@ void RequestManager::store_beam_metadata(BeamSearchBatchConfig const &old_bc, // update the guid and start_depth for current request if (i < old_bc.num_tokens) { - guid = old_bc.requestsInfo[request_index].request_guid; + guid = old_bc.requestsInfo[index].request_guid; start_depth = old_bc.tokensInfo[i].abs_depth_in_request; } } From 59acaeb0f33c57851d9e3bc14d3eada6f484d8e8 Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Mon, 25 Sep 2023 09:14:51 -0400 Subject: [PATCH 30/30] hip --- src/ops/kernels/softmax.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ops/kernels/softmax.cpp b/src/ops/kernels/softmax.cpp index edfc5bb544..ca4872d51b 100644 --- a/src/ops/kernels/softmax.cpp +++ b/src/ops/kernels/softmax.cpp @@ -29,7 +29,7 @@ SoftmaxMeta::SoftmaxMeta(FFHandler handler, checkCUDNN(miopenCreateTensorDescriptor(&inputTensor)); checkCUDNN( cudnnSetTensorDescriptorFromDomain4SoftMax(inputTensor, input_domain)); - checkCUDNN(cudnnCreateTensorDescriptor(&outputTensor)); + checkCUDNN(miopenCreateTensorDescriptor(&outputTensor)); checkCUDNN( cudnnSetTensorDescriptorFromDomain4SoftMax(outputTensor, input_domain)); dim = softmax->dim;