From 9a0cb17f978a057d8bd2aa49226d7f637d1d9ee3 Mon Sep 17 00:00:00 2001
From: Brendan Slabe
Date: Fri, 25 Oct 2024 15:01:38 -0400
Subject: [PATCH] Added Prometheus Server to LPG (#857)

* first commit

* separated templates

* remove empty file

* fmt
---
 .../container/benchmark_serving.py                  | 13 +++++++++++++
 .../profile-generator/container/requirements.txt    |  3 ++-
 .../modules/latency-profile/main.tf                 |  9 ++++++++-
 ...latency-profile-generator-podmonitoring.yaml.tpl | 12 ++++++++++++
 4 files changed, 35 insertions(+), 2 deletions(-)
 create mode 100644 benchmarks/benchmark/tools/profile-generator/modules/latency-profile/manifest-templates/latency-profile-generator-podmonitoring.yaml.tpl

diff --git a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py
index 829e8c930..37ecdb570 100644
--- a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py
+++ b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py
@@ -13,6 +13,7 @@
 import requests
 import time
 from typing import AsyncGenerator, List, Optional, Tuple, Dict
+from prometheus_client import start_http_server, Histogram
 
 import google.auth
 import google.auth.transport.requests
@@ -27,7 +28,12 @@
 MIN_SEQ_LEN = 4
 CLIENT_TIMEOUT_SEC = 3 * 60 * 60
 NEW_TEXT_KEY = "\nOutput:\n"
+PROMETHEUS_PORT = 9090
 
+# Prometheus Metrics
+prompt_length_metric = Histogram("LatencyProfileGenerator:prompt_length", "Input prompt length", buckets=[2**i for i in range(1, 16)])
+response_length_metric = Histogram("LatencyProfileGenerator:response_length", "Response length", buckets=[2**i for i in range(1, 16)])
+tpot_metric = Histogram('LatencyProfileGenerator:time_per_output_token', 'Time per output token per request')
 
 def sample_requests(
     dataset_path: str,
@@ -264,6 +270,10 @@ async def send_request(
   # (prompt len, output len, latency, success)
   request_latency = (prompt_len, output_len, (request_end_time - request_start_time))
 
+  tpot_metric.observe((request_end_time - request_start_time) / output_len)
+  prompt_length_metric.observe(prompt_len)
+  response_length_metric.observe(output_len)
+
   return request_latency, None
 
 async def benchmark(
@@ -589,6 +599,9 @@ async def main(args: argparse.Namespace):
       else args.endpoint
   )
 
+  print(f"Starting Prometheus Server on port {PROMETHEUS_PORT}")
+  start_http_server(PROMETHEUS_PORT)
+
   api_url = f"http://{args.host}:{args.port}/{endpoint}"
   tokenizer = AutoTokenizer.from_pretrained(
       args.tokenizer, trust_remote_code=args.trust_remote_code
diff --git a/benchmarks/benchmark/tools/profile-generator/container/requirements.txt b/benchmarks/benchmark/tools/profile-generator/container/requirements.txt
index df176eea6..a9f6d99a6 100644
--- a/benchmarks/benchmark/tools/profile-generator/container/requirements.txt
+++ b/benchmarks/benchmark/tools/profile-generator/container/requirements.txt
@@ -34,4 +34,5 @@ aioprometheus[starlette]
 pynvml == 11.5.0
 accelerate
 aiohttp
-google-auth
\ No newline at end of file
+google-auth
+prometheus_client >= 0.21.0
\ No newline at end of file
diff --git a/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/main.tf b/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/main.tf
index 5d9d9baea..9d6591394 100644
--- a/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/main.tf
+++ b/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/main.tf
@@ -23,7 +23,8 @@ locals {
     ? "${path.module}/manifest-templates"
     : pathexpand(var.templates_path)
   )
-  latency-profile-generator-template = "${path.module}/manifest-templates/latency-profile-generator.yaml.tpl"
+  latency-profile-generator-template               = "${path.module}/manifest-templates/latency-profile-generator.yaml.tpl"
+  latency-profile-generator-podmonitoring-template = "${path.module}/manifest-templates/latency-profile-generator-podmonitoring.yaml.tpl"
   hugging_face_token_secret = (
     var.hugging_face_secret == null || var.hugging_face_secret_version == null
     ? null
@@ -68,4 +69,10 @@ resource "kubernetes_manifest" "latency-profile-generator" {
     save_aggregated_result = var.save_aggregated_result
     models                 = var.models
   }))
+}
+
+resource "kubernetes_manifest" "latency-profile-generator-podmonitoring" {
+  manifest = yamldecode(templatefile(local.latency-profile-generator-podmonitoring-template, {
+    namespace = var.namespace
+  }))
 }
\ No newline at end of file
diff --git a/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/manifest-templates/latency-profile-generator-podmonitoring.yaml.tpl b/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/manifest-templates/latency-profile-generator-podmonitoring.yaml.tpl
new file mode 100644
index 000000000..fb46ca27e
--- /dev/null
+++ b/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/manifest-templates/latency-profile-generator-podmonitoring.yaml.tpl
@@ -0,0 +1,12 @@
+apiVersion: monitoring.googleapis.com/v1
+kind: PodMonitoring
+metadata:
+  name: "lpg-driver-podmonitoring"
+  namespace: ${namespace}
+spec:
+  selector:
+    matchLabels:
+      name: latency-profile-generator
+  endpoints:
+  - port: 9090
+    interval: 15s