Commit 6e6cfb7
chore: support ollama report

naaive committed Nov 18, 2024
1 parent bfc2ddd

Showing 3 changed files with 352 additions and 141 deletions.
@@ -4,11 +4,12 @@
 from langchain.chat_models import ChatOpenAI
 from langchain_core.output_parsers import StrOutputParser
 from langchain_core.prompts import ChatPromptTemplate
+from langchain_ollama import ChatOllama

 load_dotenv()


-def measure_model_metrics(model_name: str, num_samples: int = 3) -> tuple[float, float]:
+def measure_proprietary_models_metrics(model_name: str, num_samples: int = 3) -> tuple[float, float]:
     prompt = ChatPromptTemplate.from_messages([
         ("system", "You are a helpful AI assistant."),
         ("user", "Tell me a short story about a cat.")
@@ -52,13 +53,53 @@ def measure_model_metrics(model_name: str, num_samples: int = 3) -> tuple[float, float]:
     return avg_latency, avg_token_rate


-def measure_first_token_latency(model_name: str, num_samples: int = 3) -> float:
-    latency, _ = measure_model_metrics(model_name, num_samples)
-    return latency
+def measure_opensource_models_metrics(model_name: str, num_samples: int = 3) -> tuple[float, float]:
+    prompt = ChatPromptTemplate.from_messages([
+        ("system", "You are a helpful AI assistant."),
+        ("user", "Tell me a short story about a cat.")
+    ])
+
+    model = ChatOllama(
+        model=model_name,
+        streaming=True,
+    )
+
+    chain = prompt | model | StrOutputParser()
+
+    latencies = []
+    token_rates = []
+
+    for _ in range(num_samples):
+        start_time = time.time()
+        first_token_received = False
+        token_count = 0
+
+        for chunk in chain.stream({}):
+            current_time = time.time()
+            if not first_token_received:
+                latency = (current_time - start_time) * 1000
+                latencies.append(latency)
+                first_token_received = True
+                first_token_time = current_time
+
+            token_count += 1
+
+        total_time = time.time() - first_token_time
+        if total_time > 0:
+            tokens_per_second = token_count / total_time
+            token_rates.append(tokens_per_second)
+
+        time.sleep(1)
+
+    avg_latency = sum(latencies) / len(latencies)
+    avg_token_rate = sum(token_rates) / len(token_rates)
+    return avg_latency, avg_token_rate
+
+


 if __name__ == '__main__':
-    model_name = "gpt-4o-mini"
-    avg_latency, avg_token_rate = measure_model_metrics(model_name)
+    model_name = "llama3.2"
+    avg_latency, avg_token_rate = measure_opensource_models_metrics(model_name)
     print(f"Average first token latency for {model_name}: {avg_latency:.2f} ms")
     print(f"Average token output rate for {model_name}: {avg_token_rate:.2f} tokens/sec")
tests/generate_benchmark_report.py: 22 changes (11 additions, 11 deletions)
@@ -9,20 +9,20 @@
 PROPRIETARY_MODELS = [
     {"name": "gpt-4o-mini", "function_call_support": True},
     {"name": "gpt-4o", "function_call_support": True},
-    # {"name": "gemini-1.5-flash", "function_call_support": True},
-    # {"name": "gemini-1.5-pro", "function_call_support": True},
+    {"name": "gemini-1.5-flash", "function_call_support": True},
+    {"name": "gemini-1.5-pro", "function_call_support": True},
 ]

 OPENSOURCE_MODELS = [
-    # {"name": "qwen2", "function_call_support": True},
-    # {"name": "mistral", "function_call_support": True},
-    # {"name": "qwen2.5", "function_call_support": True},
-    # {"name": "llama3.1", "function_call_support": True},
-    # {"name": "llama3.2", "function_call_support": True},
-    # {"name": "mistral-nemo", "function_call_support": True},
+    {"name": "qwen2", "function_call_support": True},
+    {"name": "mistral", "function_call_support": True},
+    {"name": "qwen2.5", "function_call_support": True},
+    {"name": "llama3.1", "function_call_support": True},
+    {"name": "llama3.2", "function_call_support": True},
+    {"name": "mistral-nemo", "function_call_support": True},
 ]

-from test_first_token_latency import measure_model_metrics
+from gen_benchmark_html_report import measure_proprietary_models_metrics, measure_opensource_models_metrics


 class TestStats:
@@ -128,14 +128,14 @@ def main():

     for model in PROPRIETARY_MODELS:
         score = run_model_tests(model['name'])
-        latency, token_rate = measure_model_metrics(model['name'])
+        latency, token_rate = measure_proprietary_models_metrics(model['name'])
         logger.info(f"First token latency for {model['name']}: {latency:.2f}ms")
         logger.info(f"Token output rate for {model['name']}: {token_rate:.1f} tokens/sec")
         proprietary_results[model['name']] = (score, latency, token_rate)

     for model in OPENSOURCE_MODELS:
         score = run_model_tests(model['name'])
-        latency, token_rate = measure_model_metrics(model['name'])
+        latency, token_rate = measure_opensource_models_metrics(model['name'])
         logger.info(f"First token latency for {model['name']}: {latency:.2f}ms")
         logger.info(f"Token output rate for {model['name']}: {token_rate:.1f} tokens/sec")
         opensource_results[model['name']] = (score, latency, token_rate)
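
A hedged sketch of how the (score, latency, token_rate) tuples collected above might be summarized, assuming both result dictionaries map model names to numeric triples; the project's actual report rendering presumably lives in the HTML report module whose diff is not shown here:

# Hypothetical plain-text summary of the dictionaries built in main() above.
def print_summary(title: str, results: dict[str, tuple[float, float, float]]) -> None:
    print(f"\n{title}")
    print(f"{'model':<15}{'score':>8}{'latency(ms)':>14}{'tokens/s':>10}")
    for name, (score, latency, token_rate) in results.items():
        print(f"{name:<15}{score:>8.1f}{latency:>14.2f}{token_rate:>10.1f}")

print_summary("Proprietary models", proprietary_results)
print_summary("Open-source models", opensource_results)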
(The diff of the third changed file did not load and is not shown here.)