From af81fadcd63d682087303ec1e5c40190aaa89fae Mon Sep 17 00:00:00 2001
From: Brendan Slabe
Date: Tue, 3 Dec 2024 22:00:28 +0000
Subject: [PATCH] Report latency per output token and TPOT (excluding first
 token) as separate metrics

---
 .../profile-generator/container/benchmark_serving.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py
index 8829e5a1e..9ac9866bf 100644
--- a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py
+++ b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py
@@ -34,7 +34,8 @@
 # Prometheus Metrics
 prompt_length_metric = Histogram("LatencyProfileGenerator:prompt_length", "Input prompt length", buckets=[2**i for i in range(1, 16)])
 response_length_metric = Histogram("LatencyProfileGenerator:response_length", "Response length", buckets=[2**i for i in range(1, 16)])
-tpot_metric = Histogram('LatencyProfileGenerator:time_per_output_token', 'Time per output token per request')
+latency_per_output_token_metric = Histogram('LatencyProfileGenerator:latency_per_output_token', 'Time per output token per request (including first token)')
+tpot_metric = Histogram('LatencyProfileGenerator:time_per_output_token', 'Time per output token per request (excluding first token)')
 ttft_metric = Histogram('LatencyProfileGenerator:time_to_first_token', 'Time to first token per request')
 active_requests_metric = Gauge('LatencyProfileGenerator:active_requests', 'How many requests actively being processed')
 
@@ -225,7 +226,12 @@ async def send_stream_request(
   output_token_ids = tokenizer(output).input_ids
   output_len = len(output_token_ids)
   request_latency = (prompt_len, output_len, (request_end_time - request_start_time))
-  tpot_metric.observe((request_end_time - request_start_time) / output_len)
+
+  # Exclude the first token for the TPOT calculation; skip requests where
+  # TTFT is unavailable or only a single token was produced.
+  if ttft is not None and output_len > 1:
+    tpot_metric.observe((request_end_time - request_start_time - ttft) / (output_len - 1))
+  latency_per_output_token_metric.observe((request_end_time - request_start_time) / output_len)
   if ttft is not None:
     ttft_metric.observe(ttft)
   prompt_length_metric.observe(prompt_len)
@@ -380,7 +386,7 @@ async def send_request(
   # (prompt len, output len, latency, success)
   request_latency = (prompt_len, output_len, (request_end_time - request_start_time))
 
-  tpot_metric.observe((request_end_time - request_start_time) / output_len)
+  latency_per_output_token_metric.observe((request_end_time - request_start_time) / output_len)
   prompt_length_metric.observe(prompt_len)
   response_length_metric.observe(output_len)
 
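
Note on the two metrics (not part of the patch): the minimal sketch below
shows how TTFT, TPOT, and latency per output token relate for a single
request. The timing values are made up for illustration; in
benchmark_serving.py they come from the measured request.

  # Illustrative values only.
  request_start_time = 0.0   # seconds
  ttft = 0.5                 # time to first token
  request_end_time = 2.5     # request finished 2.5 s after it started
  output_len = 21            # tokens generated

  total_latency = request_end_time - request_start_time   # 2.5 s

  # Latency per output token averages the whole request over all tokens,
  # so the (typically slower) first token is included.
  latency_per_output_token = total_latency / output_len    # 2.5 / 21 ~= 0.119 s

  # TPOT is the decode-only rate: time spent after the first token,
  # averaged over the remaining tokens.
  tpot = (total_latency - ttft) / (output_len - 1)         # 2.0 / 20 = 0.100 s

Because the first token includes prefill, latency per output token is always
at least as large as TPOT; reporting both lets a dashboard separate prefill
cost from steady-state decode speed.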