diff --git a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py
index c49a694cb..c05f3b4e2 100644
--- a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py
+++ b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py
@@ -13,12 +13,13 @@ import requests
 import time
 from typing import AsyncGenerator, List, Optional, Tuple, Dict
 
-from prometheus_client import start_http_server, Histogram, Gauge
+from prometheus_client import start_http_server, Histogram
 
 import google.auth
 import google.auth.transport.requests
 
 import aiohttp
+from aiohttp_prometheus_exporter.trace import PrometheusTraceConfig
 import numpy as np
 from transformers import AutoTokenizer
 from transformers import PreTrainedTokenizerBase
@@ -34,7 +35,6 @@ prompt_length_metric = Histogram("LatencyProfileGenerator:prompt_length", "Input prompt length", buckets=[2**i for i in range(1, 16)])
 response_length_metric = Histogram("LatencyProfileGenerator:response_length", "Response length", buckets=[2**i for i in range(1, 16)])
 tpot_metric = Histogram('LatencyProfileGenerator:time_per_output_token', 'Time per output token per request')
-active_requests_metric = Gauge('LatencyProfileGenerator:active_requests', 'How many requests actively being processed')
 
 def sample_requests(
     dataset_path: str,
@@ -209,14 +209,11 @@ async def send_request(
 
   # Set client timeout to be 3 hrs.
   timeout = aiohttp.ClientTimeout(total=CLIENT_TIMEOUT_SEC)
-  async with aiohttp.ClientSession(timeout=timeout,trust_env=True) as session:
+  async with aiohttp.ClientSession(timeout=timeout,trust_env=True,trace_configs=[PrometheusTraceConfig()]) as session:
     while True:
       try:
-        active_requests_metric.inc()
         async with session.post(api_url, headers=headers, json=pload, ssl=False) as response:
-          output = await response.json()
-          active_requests_metric.dec()
 
         # Re-send the request if it failed.
         if "error" not in output:
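
Context for the change: `PrometheusTraceConfig` subscribes to aiohttp's client trace signals, so per-request metrics are recorded for every request made through the session, replacing the hand-maintained `active_requests` Gauge and its `inc()`/`dec()` calls in the request path. A minimal sketch of the pattern, not taken from this PR (the port and target URL below are placeholders):

```python
# Sketch only: demonstrates wiring aiohttp-prometheus-exporter's trace
# config into a client session. Port 8000 and example.com are placeholders.
import asyncio

import aiohttp
from aiohttp_prometheus_exporter.trace import PrometheusTraceConfig
from prometheus_client import start_http_server


async def main() -> None:
    # Expose the default prometheus_client registry over HTTP for scraping.
    start_http_server(8000)

    # The trace config hooks request start/end signals, so every request
    # made through this session is measured without manual bookkeeping.
    async with aiohttp.ClientSession(trace_configs=[PrometheusTraceConfig()]) as session:
        async with session.get("https://example.com") as response:
            await response.text()


asyncio.run(main())
```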