chore: more measurements
chenzhuofu committed Aug 15, 2024
1 parent 102dd38 · commit 0219b78
Showing 2 changed files with 88 additions and 33 deletions.
8 changes: 7 additions & 1 deletion include/flexflow/request_manager.h
@@ -380,13 +380,16 @@ class RequestManager {
long long llm_step_start = 0, ssm_step_start = 0;
// Times for each LLM verification phase (in ms)
std::vector<double> llm_step_times;
+// Number of requests in batch at each step
+std::vector<int> requests_per_step;
// Times for each SSM speculation phase (in ms)
std::vector<double> ssm_step_times;
// Number of requests getting decoded at each step
std::vector<int> ssm_steps;
-std::vector<int> requests_per_step;
// Number of generated tokens at each step
std::vector<int> generated_tokens_per_step;
+// To calculate the E2E time of serving
+long long server_start_time = 0;
};

ProfileInfo profiling;
@@ -445,6 +448,9 @@ class RequestManager {
int k);
void gumbel_conditioned_on_max(float target_max,
std::vector<std::pair<float, int>> &logits);
+
+// Profiling related functions
+void reset_profiling_statistics();
};

}; // namespace FlexFlow
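The per-step vectors in ProfileInfo are index-aligned: entry i of llm_step_times, requests_per_step, and generated_tokens_per_step all describe decoding step i (terminate_background_server() below asserts that the first two have equal length). A minimal sketch of reading them back, using a simplified stand-in struct rather than the real RequestManager::ProfileInfo:

#include <cstdio>
#include <vector>

// Simplified stand-in for RequestManager::ProfileInfo; the real struct is
// nested inside RequestManager and carries additional fields.
struct ProfileInfoSketch {
  std::vector<double> llm_step_times;         // ms per LLM step
  std::vector<int> requests_per_step;         // batch size at each step
  std::vector<int> generated_tokens_per_step; // tokens emitted at each step
};

// Entry i of each vector describes decoding step i.
void print_per_step(ProfileInfoSketch const &p) {
  for (size_t i = 0; i < p.llm_step_times.size(); ++i) {
    std::printf("step %zu: %.3f ms, %d requests, %d tokens\n", i,
                p.llm_step_times[i], p.requests_per_step[i],
                p.generated_tokens_per_step[i]);
  }
}

int main() {
  ProfileInfoSketch p{{12.5, 11.8}, {4, 4}, {4, 3}};
  print_per_step(p);
  return 0;
}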
113 changes: 81 additions & 32 deletions src/runtime/request_manager.cc
@@ -567,23 +567,23 @@ void RequestManager::request_complete_clean_up(int batch_index) {
output_file.close();
}
}
-RequestProfileInfo profile_info = profiling_requests[guid];
-std::string str =
-"[" + std::to_string(guid) +
-"] Request completed:" + " decoding_time_ms(" +
-std::to_string(
-(profile_info.finish_time - profile_info.start_decoding_time) *
-1e-3) +
-")" + " total_time_ms(" +
-std::to_string((profile_info.finish_time - profile_info.start_time) *
-1e-3) +
-")" + " LLM_decoding_steps(" +
-std::to_string(profile_info.llm_decoding_steps) + ")";
-if (decoding_mode == SPECULATIVE_DECODING) {
-str = str + " SSM_decoding_steps(" +
-std::to_string(profile_info.ssm_decoding_steps) + ")";
-}
-write_to_output_file("", str);
+// RequestProfileInfo profile_info = profiling_requests[guid];
+// std::string str =
+// "[" + std::to_string(guid) +
+// "] Request completed:" + " decoding_time_ms(" +
+// std::to_string(
+// (profile_info.finish_time - profile_info.start_decoding_time) *
+// 1e-3) +
+// ")" + " total_time_ms(" +
+// std::to_string((profile_info.finish_time - profile_info.start_time) *
+// 1e-3) +
+// ")" + " LLM_decoding_steps(" +
+// std::to_string(profile_info.llm_decoding_steps) + ")";
+// if (decoding_mode == SPECULATIVE_DECODING) {
+// str = str + " SSM_decoding_steps(" +
+// std::to_string(profile_info.ssm_decoding_steps) + ")";
+// }
+// write_to_output_file("", str);

trigger_request_completion_future(guid);
}
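The per-request summary is commented out rather than deleted, since the aggregate statistics written by terminate_background_server() (below) now cover serving-level metrics. If the per-request line is ever wanted again, it can be rebuilt from the same fields; a self-contained sketch with a stand-in struct, assuming timestamps are stored in microseconds (consistent with the * 1e-3 → ms conversion in the original code):

#include <string>

// Stand-in for the repo's RequestProfileInfo; field names follow the
// commented-out code above. Timestamps assumed to be in microseconds.
struct RequestProfileInfoSketch {
  long long start_time = 0, start_decoding_time = 0, finish_time = 0;
  int llm_decoding_steps = 0, ssm_decoding_steps = 0;
};

// Hypothetical helper (not part of the codebase) reproducing the
// commented-out summary line.
std::string format_request_summary(long long guid,
                                   RequestProfileInfoSketch const &info,
                                   bool speculative) {
  std::string str =
      "[" + std::to_string(guid) + "] Request completed:" +
      " decoding_time_ms(" +
      std::to_string((info.finish_time - info.start_decoding_time) * 1e-3) +
      ") total_time_ms(" +
      std::to_string((info.finish_time - info.start_time) * 1e-3) +
      ") LLM_decoding_steps(" + std::to_string(info.llm_decoding_steps) +
      ")";
  if (speculative) {
    str += " SSM_decoding_steps(" +
           std::to_string(info.ssm_decoding_steps) + ")";
  }
  return str;
}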
@@ -2022,6 +2022,7 @@ void RequestManager::serve_decoding(FFModel *llm) {
std::queue<InferenceResultFuture> batch_pipeline;
{ batch_pipeline.push(last_irf); }

+reset_profiling_statistics();
while (!is_background_server_terminated()) {

if (batch_pipeline.size() >= 4) {
@@ -2089,6 +2090,7 @@ void RequestManager::serve_spec_infer(FFModel *llm) {
std::queue<InferenceResultFuture> infer_result_future_pipeline;
infer_result_future_pipeline.push(irf_0);

+reset_profiling_statistics();
while (!is_background_server_terminated()) {
if (infer_result_future_pipeline.size() >= 4) {
// Block here to avoid launching too many batches
@@ -2198,47 +2200,81 @@ void RequestManager::terminate_background_server() {
assert(profiling.llm_step_times.size() ==
profiling.requests_per_step.size());
// Write the last profiling statistics to output file
std::string str = "[Profiling Statistics]\n llm_step_times_ms(";
std::string llm_step_times_ms = " ";
for (double time : profiling.llm_step_times) {
llm_step_times_ms += std::to_string(time) + " ";
}
llm_step_times_ms += ")";
str += llm_step_times_ms;
str += "\n requests_per_step(";
std::string req_per_step = " ";
std::string str = "[Profiling Statistics]";

long long total_time = Realm::Clock::current_time_in_microseconds() -
profiling.server_start_time;
int total_requests = profiling_requests.size();
int total_tokens = 0;
for (int num_tokens : profiling.generated_tokens_per_step) {
total_tokens += num_tokens;
}
str += "\n total_time_ms(" + std::to_string(total_time / 1000.0) + ")";
str += "\n total_tokens(" + std::to_string(total_tokens) + ")";
// throughput
str += "\n throughput_requests_per_sec(" +
std::to_string(total_requests / (total_time / 1e6)) + ")";
str += "\n throughput_tokens_per_sec(" +
std::to_string(total_tokens / (total_time / 1e6)) + ")";

double average_latency_per_request = 0;
std::string latency_per_request_ms = "\n latency_per_request_ms( ";
for (auto const &profiling_info : profiling_requests) {
double latency_ms = (profiling_info.second.finish_time -
profiling_info.second.start_time) /
1000.0;
// latency_per_request_ms += "[" + std::to_string(profiling_info.first) +
// ","; latency_per_request_ms += std::to_string(latency_ms) + "] ";
latency_per_request_ms += std::to_string(latency_ms) + " ";
average_latency_per_request += latency_ms;
}
latency_per_request_ms += ")";
str += latency_per_request_ms;
average_latency_per_request /= total_requests;
str += "\n average_latency_per_request_ms(" +
std::to_string(average_latency_per_request) + ")";

std::string req_per_step = "\n requests_per_step( ";
for (int nb : profiling.requests_per_step) {
req_per_step += std::to_string(nb) + " ";
}
req_per_step += ")";
str += req_per_step;
+
if (profiling.ssm_step_times.size() > 0) {
// assert(profiling.ssm_step_times.size() ==
// profiling.llm_step_times.size());
str += "\n ssm_step_times_ms(";
std::string ssm_step_times_ms = " ";
std::string ssm_step_times_ms = "\n ssm_step_times_ms( ";
for (double time : profiling.ssm_step_times) {
ssm_step_times_ms += std::to_string(time) + " ";
}
ssm_step_times_ms += ")";
str += ssm_step_times_ms;
}
+
if (profiling.ssm_steps.size() > 0) {
str += "\n ssm_steps(";
std::string ssm_steps = " ";
std::string ssm_steps = "\n ssm_steps( ";
for (int nb : profiling.ssm_steps) {
ssm_steps += std::to_string(nb) + " ";
}
ssm_steps += ")";
str += ssm_steps;
}
str += "\n generated_tokens_per_step(";
std::string generated_tokens_per_step = " ";

std::string llm_step_times_ms = "\n llm_step_times_ms( ";
for (double time : profiling.llm_step_times) {
llm_step_times_ms += std::to_string(time) + " ";
}
llm_step_times_ms += ")";
str += llm_step_times_ms;

std::string generated_tokens_per_step = "\n generated_tokens_per_step( ";
for (int nb : profiling.generated_tokens_per_step) {
generated_tokens_per_step += std::to_string(nb) + " ";
}
generated_tokens_per_step += ")";
str += generated_tokens_per_step;
+
write_to_output_file("", str);
background_server_status = TERMINATED;
// Wait for the background server to terminate
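Realm::Clock::current_time_in_microseconds() reports wall-clock time in microseconds, so the statistics block above divides by 1000.0 for milliseconds and by 1e6 for seconds. A worked example of the throughput arithmetic, with made-up numbers:

#include <cstdio>

int main() {
  // Illustrative values only: 64 requests producing 8000 tokens in 20 s.
  long long total_time = 20000000; // microseconds, as in the code above
  int total_requests = 64;
  int total_tokens = 8000;
  // Same arithmetic as terminate_background_server():
  double req_per_sec = total_requests / (total_time / 1e6); // 3.2
  double tok_per_sec = total_tokens / (total_time / 1e6);   // 400.0
  std::printf("throughput_requests_per_sec(%.1f)\n", req_per_sec);
  std::printf("throughput_tokens_per_sec(%.1f)\n", tok_per_sec);
  return 0;
}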
@@ -2540,4 +2576,17 @@ std::ostream &operator<<(std::ostream &os, TokenTree const &token_tree) {
}

/* --------- Request Token Tree Related Functions --------- */
+
+/* --------- Profiling Related Functions --------- */
+void RequestManager::reset_profiling_statistics() {
+profiling.llm_step_times.clear();
+profiling.requests_per_step.clear();
+profiling.ssm_step_times.clear();
+profiling.ssm_steps.clear();
+profiling.generated_tokens_per_step.clear();
+profiling.llm_step_start = 0;
+profiling.ssm_step_start = 0;
+profiling.server_start_time = Realm::Clock::current_time_in_microseconds();
+}
+/* --------- Profiling Related Functions --------- */
}; // namespace FlexFlow
