Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
… into add/example_notebooks/kubernetes_docs
  • Loading branch information
german-grandas committed Oct 28, 2024
2 parents a91fe1d + 9a0cb17 commit 57db05c
Show file tree
Hide file tree
Showing 4 changed files with 35 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import requests
import time
from typing import AsyncGenerator, List, Optional, Tuple, Dict
from prometheus_client import start_http_server, Histogram

import google.auth
import google.auth.transport.requests
Expand All @@ -27,7 +28,12 @@
# Minimum sequence length in tokens; presumably used to filter sampled
# requests — usage is outside this view, confirm against sample_requests.
MIN_SEQ_LEN = 4
# Client-side request timeout: 3 hours, for long-running generation requests.
CLIENT_TIMEOUT_SEC = 3 * 60 * 60
# Marker string; presumably separates the echoed prompt from newly generated
# text in model responses — verify against the response-parsing code.
NEW_TEXT_KEY = "\nOutput:\n"
# Port for the embedded Prometheus metrics HTTP server started in main();
# must stay in sync with the PodMonitoring scrape endpoint (port 9090).
PROMETHEUS_PORT = 9090

# Prometheus metrics exported by the latency profile generator.
# NOTE(review): Prometheus naming conventions reserve ':' for recording
# rules; these instrumented-metric names keep the existing
# "LatencyProfileGenerator:<name>" form so existing dashboards keep working.
_POW2_BUCKETS = [2 ** exp for exp in range(1, 16)]  # 2 .. 32768, powers of two

# Distribution of input prompt lengths (tokens).
prompt_length_metric = Histogram(
    "LatencyProfileGenerator:prompt_length",
    "Input prompt length",
    buckets=_POW2_BUCKETS,
)
# Distribution of generated response lengths (tokens).
response_length_metric = Histogram(
    "LatencyProfileGenerator:response_length",
    "Response length",
    buckets=_POW2_BUCKETS,
)
# Per-request time per output token (seconds); uses default latency buckets.
tpot_metric = Histogram(
    "LatencyProfileGenerator:time_per_output_token",
    "Time per output token per request",
)

def sample_requests(
dataset_path: str,
Expand Down Expand Up @@ -264,6 +270,10 @@ async def send_request(

# (prompt len, output len, latency, success)
request_latency = (prompt_len, output_len, (request_end_time - request_start_time))
tpot_metric.observe((request_end_time - request_start_time) / output_len)
prompt_length_metric.observe(prompt_len)
response_length_metric.observe(output_len)

return request_latency, None

async def benchmark(
Expand Down Expand Up @@ -589,6 +599,9 @@ async def main(args: argparse.Namespace):
else args.endpoint
)

print(f"Starting Prometheus Server on port {PROMETHEUS_PORT}")
start_http_server(PROMETHEUS_PORT)

api_url = f"http://{args.host}:{args.port}/{endpoint}"
tokenizer = AutoTokenizer.from_pretrained(
args.tokenizer, trust_remote_code=args.trust_remote_code
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,4 +34,5 @@ aioprometheus[starlette]
pynvml == 11.5.0
accelerate
aiohttp
google-auth
google-auth
prometheus_client >= 0.21.0
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,8 @@ locals {
? "${path.module}/manifest-templates"
: pathexpand(var.templates_path)
)
latency-profile-generator-template = "${path.module}/manifest-templates/latency-profile-generator.yaml.tpl"
latency-profile-generator-template = "${path.module}/manifest-templates/latency-profile-generator.yaml.tpl"
latency-profile-generator-podmonitoring-template = "${path.module}/manifest-templates/latency-profile-generator-podmonitoring.yaml.tpl"
hugging_face_token_secret = (
var.hugging_face_secret == null || var.hugging_face_secret_version == null
? null
Expand Down Expand Up @@ -68,4 +69,10 @@ resource "kubernetes_manifest" "latency-profile-generator" {
save_aggregated_result = var.save_aggregated_result
models = var.models
}))
}

# Applies the PodMonitoring manifest so Google Managed Prometheus scrapes the
# latency profile generator's metrics endpoint (the template targets port 9090).
resource "kubernetes_manifest" "latency-profile-generator-podmonitoring" {
  manifest = yamldecode(templatefile(local.latency-profile-generator-podmonitoring-template, {
    # Namespace to create the PodMonitoring in; substituted into the template.
    namespace = var.namespace
  }))
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# PodMonitoring resource for Google Managed Prometheus: scrapes the latency
# profile generator pod's metrics endpoint every 15s.
# Restored canonical YAML nesting (indentation was flattened in the source view).
apiVersion: monitoring.googleapis.com/v1
kind: PodMonitoring
metadata:
  name: "lpg-driver-podmonitoring"
  # Rendered by Terraform templatefile(); receives the deployment namespace.
  namespace: ${namespace}
spec:
  selector:
    matchLabels:
      # NOTE(review): must match the pod labels set by the latency-profile-
      # generator deployment template — confirm against that template.
      name: latency-profile-generator
  endpoints:
    # Must match PROMETHEUS_PORT (9090) opened by start_http_server in the
    # benchmark script.
    - port: 9090
      interval: 15s

0 comments on commit 57db05c

Please sign in to comment.