diff --git a/load_tests/common.js b/load_tests/common.js
index e0a105956e1..aecaa0fd070 100644
--- a/load_tests/common.js
+++ b/load_tests/common.js
@@ -1,4 +1,4 @@
-import { check } from 'k6';
+import { check, sleep } from 'k6';
 import { scenario } from 'k6/execution';
 import http from 'k6/http';
 import { Trend, Counter } from 'k6/metrics';
@@ -33,13 +33,13 @@ export function get_options() {
             //     rate: 20,
             //     timeUnit: '1s',
             // },
-            load_test: {
-                executor: 'constant-arrival-rate',
-                duration: '60s',
-                preAllocatedVUs: 100,
-                rate: 1,
-                timeUnit: '1s',
-            },
+            // load_test: {
+            //     executor: 'constant-arrival-rate',
+            //     duration: '60s',
+            //     preAllocatedVUs: 10, // not enough RAM for 100 VUs
+            //     rate: 1,
+            //     timeUnit: '1s',
+            // },
             // breakpoint: {
             //     executor: 'ramping-arrival-rate', //Assure load increase if the system slows
             //     preAllocatedVUs: 300,
@@ -47,19 +47,25 @@ export function get_options() {
             //         { duration: '60s', target: 100 }, // just slowly ramp-up to a HUGE load
             //     ],
             // },
-            // throughput: {
-            //     executor: 'shared-iterations',
-            //     vus: 100,
-            //     iterations: 200,
-            //     maxDuration: '40s',
-            // },
+            throughput: {
+                executor: 'shared-iterations',
+                vus: 16,
+                iterations: 500,
+                maxDuration: '400s',
+            },
         },
     };
 }
 
 function generate_payload(gpt, max_new_tokens) {
-    const input = gpt["conversations"][0]["value"];
-    return { "messages": [{ "role": "user", "content": input }], "temperature": 0, "model": `${model_id}`, "max_tokens": max_new_tokens }
+    const max_input_tokens = 10000 - max_new_tokens - 1;
+    const input = gpt["conversations"][0]["value"].substring(0, max_input_tokens);
+    return {
+        "prompt": `<|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n${input}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n`,
+        "max_tokens": max_new_tokens,
+        "temperature": 0,
+        "model": `${model_id}`,
+    }
 }
 
 export const options = get_options();
@@ -68,14 +74,19 @@ export default function run() {
     const headers = { 'Content-Type': 'application/json' };
     const query = shareGPT[scenario.iterationInTest % shareGPT.length];
     const payload = JSON.stringify(generate_payload(query, max_new_tokens));
-    const res = http.post(`http://${host}/v1/chat/completions`, payload, {
+    const res = http.post(`https://${host}/v1/completions`, payload, {
         headers,
     });
 
-    if (res.status >= 400 && res.status < 500) {
+    // if (res.status >= 400 && res.status < 500) {
+    //     return;
+    // }
+
+    if (res.status !== 200) {
+        console.error(res.body);
+        console.error('status: ' + res.status);
         return;
     }
-
     check(res, {
         'Post status is 200': (res) => res.status === 200,
     });
@@ -83,12 +94,26 @@
 
     if (res.status === 200) {
         const body = res.json();
-        const completion_tokens = body.usage.completion_tokens;
-        const latency_ms_per_token = duration / completion_tokens;
-        timePerToken.add(latency_ms_per_token);
-        const prompt_tokens = body.usage.prompt_tokens;
-        input_tokens.add(prompt_tokens);
-        new_tokens.add(completion_tokens);
-        tokens.add(completion_tokens + prompt_tokens);
+        if (body.usage) {
+            const completion_tokens = body.usage.completion_tokens;
+            const latency_ms_per_token = duration / completion_tokens;
+            timePerToken.add(latency_ms_per_token);
+            const prompt_tokens = body.usage.prompt_tokens;
+            input_tokens.add(prompt_tokens);
+            new_tokens.add(completion_tokens);
+            tokens.add(completion_tokens + prompt_tokens);
+        }
+        if (body.tokens_predicted) {
+            // llama.cpp specific
+            const completion_tokens = body.tokens_predicted;
+            const latency_ms_per_token = duration / completion_tokens;
+            timePerToken.add(latency_ms_per_token);
+            const prompt_tokens = body.tokens_evaluated;
+            input_tokens.add(prompt_tokens);
+            new_tokens.add(completion_tokens);
+            tokens.add(completion_tokens + prompt_tokens);
+        }
     }
+
+    sleep(1);
 }
diff --git a/load_tests/docker-compose.yml b/load_tests/docker-compose.yml
new file mode 100644
index 00000000000..edac91e06ef
--- /dev/null
+++ b/load_tests/docker-compose.yml
@@ -0,0 +1,12 @@
+services:
+
+  # How to run:
+  # HOST=..... docker compose up
+  tgi_load_test:
+    image: grafana/k6
+    command: run /load_tests/common.js
+    environment:
+      - HOST=${HOST}
+      - MODEL_ID=${MODEL_ID}
+    volumes:
+      - ./:/load_tests:Z