diff --git a/load_tests/common.js b/load_tests/common.js
index e0a105956e1..aecaa0fd070 100644
--- a/load_tests/common.js
+++ b/load_tests/common.js
@@ -1,4 +1,4 @@
-import { check } from 'k6';
+import { check, sleep } from 'k6';
 import { scenario } from 'k6/execution';
 import http from 'k6/http';
 import { Trend, Counter } from 'k6/metrics';
@@ -33,13 +33,13 @@ export function get_options() {
             //     rate: 20,
             //     timeUnit: '1s',
             // },
-            load_test: {
-                executor: 'constant-arrival-rate',
-                duration: '60s',
-                preAllocatedVUs: 100,
-                rate: 1,
-                timeUnit: '1s',
-            },
+            // load_test: {
+            //     executor: 'constant-arrival-rate',
+            //     duration: '60s',
+            //     preAllocatedVUs: 10, // not enough RAM for 100 VUs
+            //     rate: 1,
+            //     timeUnit: '1s',
+            // },
             // breakpoint: {
             //     executor: 'ramping-arrival-rate', //Assure load increase if the system slows
             //     preAllocatedVUs: 300,
@@ -47,19 +47,25 @@ export function get_options() {
             //         { duration: '60s', target: 100 }, // just slowly ramp-up to a HUGE load
             //     ],
             // },
-            // throughput: {
-            //     executor: 'shared-iterations',
-            //     vus: 100,
-            //     iterations: 200,
-            //     maxDuration: '40s',
-            // },
+            throughput: {
+                executor: 'shared-iterations',
+                vus: 16,
+                iterations: 500,
+                maxDuration: '400s',
+            },
         },
     };
 }
 
 function generate_payload(gpt, max_new_tokens) {
-    const input = gpt["conversations"][0]["value"];
-    return { "messages": [{ "role": "user", "content": input }], "temperature": 0, "model": `${model_id}`, "max_tokens": max_new_tokens }
+    const max_input_tokens = 10000 - max_new_tokens - 1;
+    const input = gpt["conversations"][0]["value"].substring(0, max_input_tokens);
+    return {
+        "prompt": `<|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n${input}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n`,
+        "max_tokens": max_new_tokens,
+        "temperature": 0,
+        "model": `${model_id}`,
+    }
 }
 
 export const options = get_options();
@@ -68,14 +74,19 @@ export default function run() {
     const headers = { 'Content-Type': 'application/json' };
     const query = shareGPT[scenario.iterationInTest % shareGPT.length];
     const payload = JSON.stringify(generate_payload(query, max_new_tokens));
-    const res = http.post(`http://${host}/v1/chat/completions`, payload, {
+    const res = http.post(`https://${host}/v1/completions`, payload, {
         headers,
     });
 
-    if (res.status >= 400 && res.status < 500) {
+    // if (res.status >= 400 && res.status < 500) {
+    //     return;
+    // }
+
+    if (res.status !== 200) {
+        console.error(res.body);
+        console.error('status: ' + res.status);
         return;
     }
-
     check(res, {
         'Post status is 200': (res) => res.status === 200,
     });
@@ -83,12 +94,26 @@
 
     if (res.status === 200) {
         const body = res.json();
-        const completion_tokens = body.usage.completion_tokens;
-        const latency_ms_per_token = duration / completion_tokens;
-        timePerToken.add(latency_ms_per_token);
-        const prompt_tokens = body.usage.prompt_tokens;
-        input_tokens.add(prompt_tokens);
-        new_tokens.add(completion_tokens);
-        tokens.add(completion_tokens + prompt_tokens);
+        if (body.usage) {
+            const completion_tokens = body.usage.completion_tokens;
+            const latency_ms_per_token = duration / completion_tokens;
+            timePerToken.add(latency_ms_per_token);
+            const prompt_tokens = body.usage.prompt_tokens;
+            input_tokens.add(prompt_tokens);
+            new_tokens.add(completion_tokens);
+            tokens.add(completion_tokens + prompt_tokens);
+        }
+        if (body.tokens_predicted) {
+            // llama.cpp specific
+            const completion_tokens = body.tokens_predicted;
+            const latency_ms_per_token = duration / completion_tokens;
+            timePerToken.add(latency_ms_per_token);
+            const prompt_tokens = body.tokens_evaluated;
+            input_tokens.add(prompt_tokens);
+            new_tokens.add(completion_tokens);
+            tokens.add(completion_tokens + prompt_tokens);
+        }
     }
+
+    sleep(1);
 }
diff --git a/load_tests/docker-compose.yml b/load_tests/docker-compose.yml
new file mode 100644
index 00000000000..edac91e06ef
--- /dev/null
+++ b/load_tests/docker-compose.yml
@@ -0,0 +1,12 @@
+services:
+
+  # How to run:
+  # HOST=..... docker compose up
+  tgi_load_test:
+    image: grafana/k6
+    command: run /load_tests/common.js
+    environment:
+      - HOST=${HOST}
+      - MODEL_ID=${MODEL_ID}
+    volumes:
+      - ./:/load_tests:Z