Skip to content

Commit

Permalink
Split TPOT into two metrics, one with first token latency and one without (#900)
Browse files Browse the repository at this point in the history

* first commit

* change metric name
  • Loading branch information
Bslabe123 authored Dec 3, 2024
1 parent 378589a commit f61a7bb
Showing 1 changed file with 7 additions and 3 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,8 @@
# Prometheus Metrics
# NOTE(review): Histogram and Gauge are presumably the prometheus_client
# classes; the import is not visible in this excerpt -- confirm.

# Distribution of input prompt lengths. Bucket bounds are the powers of two
# 2**1 .. 2**15.
prompt_length_metric = Histogram("LatencyProfileGenerator:prompt_length", "Input prompt length", buckets=[2**i for i in range(1, 16)])
# Distribution of response lengths, using the same power-of-two buckets.
response_length_metric = Histogram("LatencyProfileGenerator:response_length", "Response length", buckets=[2**i for i in range(1, 16)])
# NOTE(review): this assignment and the later `tpot_metric` assignment below
# use the same variable and the same metric name. This looks like the removed
# side of a diff hunk shown next to its replacement -- confirm that only one
# of the two definitions exists in the real file.
tpot_metric = Histogram('LatencyProfileGenerator:time_per_output_token', 'Time per output token per request')
# Per-request time per output token with the first token included (per the
# help text).
request_latency_per_output_token_metric = Histogram('LatencyProfileGenerator:request_latency_per_output_token', 'Time per output token per request (including first token)')
# Per-request time per output token with the first token excluded (per the
# help text).
tpot_metric = Histogram('LatencyProfileGenerator:time_per_output_token', 'Time per output token per request (excluding first token)')
# Time to first token, observed once per request.
ttft_metric = Histogram('LatencyProfileGenerator:time_to_first_token', 'Time to first token per request')
# Number of requests currently being processed.
active_requests_metric = Gauge('LatencyProfileGenerator:active_requests', 'How many requests actively being processed')

Expand Down Expand Up @@ -225,7 +226,10 @@ async def send_stream_request(
output_token_ids = tokenizer(output).input_ids
output_len = len(output_token_ids)
request_latency = (prompt_len, output_len, (request_end_time - request_start_time))
tpot_metric.observe((request_end_time - request_start_time) / output_len)

# Exclude first token for tpot calculation
tpot_metric.observe((request_end_time - ttft - request_start_time) / (output_len - 1))
request_latency_per_output_token_metric.observe((request_end_time - request_start_time) / output_len)
if ttft is not None:
ttft_metric.observe(ttft)
prompt_length_metric.observe(prompt_len)
Expand Down Expand Up @@ -380,7 +384,7 @@ async def send_request(

# (prompt len, output len, latency, success)
request_latency = (prompt_len, output_len, (request_end_time - request_start_time))
tpot_metric.observe((request_end_time - request_start_time) / output_len)
request_latency_per_output_token_metric.observe((request_end_time - request_start_time) / output_len)
prompt_length_metric.observe(prompt_len)
response_length_metric.observe(output_len)

Expand Down

0 comments on commit f61a7bb

Please sign in to comment.