From 9a0cb17f978a057d8bd2aa49226d7f637d1d9ee3 Mon Sep 17 00:00:00 2001
From: Brendan Slabe
Date: Fri, 25 Oct 2024 15:01:38 -0400
Subject: [PATCH] Added Prometheus Server to LPG (#857)

* first commit

* separated templates

* remove empty file

* fmt
---
 .../container/benchmark_serving.py                  | 13 +++++++++++++
 .../profile-generator/container/requirements.txt    |  3 ++-
 .../modules/latency-profile/main.tf                 |  9 ++++++++-
 ...latency-profile-generator-podmonitoring.yaml.tpl | 12 ++++++++++++
 4 files changed, 35 insertions(+), 2 deletions(-)
 create mode 100644 benchmarks/benchmark/tools/profile-generator/modules/latency-profile/manifest-templates/latency-profile-generator-podmonitoring.yaml.tpl

diff --git a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py
index 829e8c930..37ecdb570 100644
--- a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py
+++ b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py
@@ -13,6 +13,7 @@
 import requests
 import time
 from typing import AsyncGenerator, List, Optional, Tuple, Dict
+from prometheus_client import start_http_server, Histogram
 
 import google.auth
 import google.auth.transport.requests
@@ -27,7 +28,12 @@
 MIN_SEQ_LEN = 4
 CLIENT_TIMEOUT_SEC = 3 * 60 * 60
 NEW_TEXT_KEY = "\nOutput:\n"
+PROMETHEUS_PORT = 9090
 
+# Prometheus Metrics
+prompt_length_metric = Histogram("LatencyProfileGenerator:prompt_length", "Input prompt length", buckets=[2**i for i in range(1, 16)])
+response_length_metric = Histogram("LatencyProfileGenerator:response_length", "Response length", buckets=[2**i for i in range(1, 16)])
+tpot_metric = Histogram('LatencyProfileGenerator:time_per_output_token', 'Time per output token per request')
 
 def sample_requests(
     dataset_path: str,
@@ -264,6 +270,10 @@ async def send_request(
   # (prompt len, output len, latency, success)
   request_latency = (prompt_len, output_len, (request_end_time - request_start_time))
 
+  tpot_metric.observe((request_end_time - request_start_time) / output_len)
+  prompt_length_metric.observe(prompt_len)
+  response_length_metric.observe(output_len)
+
   return request_latency, None
 
 async def benchmark(
@@ -589,6 +599,9 @@ async def main(args: argparse.Namespace):
       else args.endpoint
   )
 
+  print(f"Starting Prometheus Server on port {PROMETHEUS_PORT}")
+  start_http_server(PROMETHEUS_PORT)
+
   api_url = f"http://{args.host}:{args.port}/{endpoint}"
   tokenizer = AutoTokenizer.from_pretrained(
       args.tokenizer, trust_remote_code=args.trust_remote_code
diff --git a/benchmarks/benchmark/tools/profile-generator/container/requirements.txt b/benchmarks/benchmark/tools/profile-generator/container/requirements.txt
index df176eea6..a9f6d99a6 100644
--- a/benchmarks/benchmark/tools/profile-generator/container/requirements.txt
+++ b/benchmarks/benchmark/tools/profile-generator/container/requirements.txt
@@ -34,4 +34,5 @@ aioprometheus[starlette]
 pynvml == 11.5.0
 accelerate
 aiohttp
-google-auth
\ No newline at end of file
+google-auth
+prometheus_client >= 0.21.0
\ No newline at end of file
diff --git a/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/main.tf b/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/main.tf
index 5d9d9baea..9d6591394 100644
--- a/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/main.tf
+++ b/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/main.tf
@@ -23,7 +23,8 @@ locals {
     ? "${path.module}/manifest-templates"
     : pathexpand(var.templates_path)
   )
-  latency-profile-generator-template = "${path.module}/manifest-templates/latency-profile-generator.yaml.tpl"
+  latency-profile-generator-template               = "${path.module}/manifest-templates/latency-profile-generator.yaml.tpl"
+  latency-profile-generator-podmonitoring-template = "${path.module}/manifest-templates/latency-profile-generator-podmonitoring.yaml.tpl"
   hugging_face_token_secret = (
     var.hugging_face_secret == null || var.hugging_face_secret_version == null
     ? null
@@ -68,4 +69,10 @@ resource "kubernetes_manifest" "latency-profile-generator" {
     save_aggregated_result = var.save_aggregated_result
     models                 = var.models
   }))
+}
+
+resource "kubernetes_manifest" "latency-profile-generator-podmonitoring" {
+  manifest = yamldecode(templatefile(local.latency-profile-generator-podmonitoring-template, {
+    namespace = var.namespace
+  }))
 }
\ No newline at end of file
diff --git a/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/manifest-templates/latency-profile-generator-podmonitoring.yaml.tpl b/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/manifest-templates/latency-profile-generator-podmonitoring.yaml.tpl
new file mode 100644
index 000000000..fb46ca27e
--- /dev/null
+++ b/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/manifest-templates/latency-profile-generator-podmonitoring.yaml.tpl
@@ -0,0 +1,12 @@
+apiVersion: monitoring.googleapis.com/v1
+kind: PodMonitoring
+metadata:
+  name: "lpg-driver-podmonitoring"
+  namespace: ${namespace}
+spec:
+  selector:
+    matchLabels:
+      name: latency-profile-generator
+  endpoints:
+  - port: 9090
+    interval: 15s