Commit 6e6cfb7
chore: support ollama report

naaive committed Nov 18, 2024
1 parent bfc2ddd

Showing 3 changed files with 352 additions and 141 deletions.
@@ -4,11 +4,12 @@
 from langchain.chat_models import ChatOpenAI
 from langchain_core.output_parsers import StrOutputParser
 from langchain_core.prompts import ChatPromptTemplate
+from langchain_ollama import ChatOllama

 load_dotenv()


-def measure_model_metrics(model_name: str, num_samples: int = 3) -> tuple[float, float]:
+def measure_proprietary_models_metrics(model_name: str, num_samples: int = 3) -> tuple[float, float]:
     prompt = ChatPromptTemplate.from_messages([
         ("system", "You are a helpful AI assistant."),
         ("user", "Tell me a short story about a cat.")
@@ -52,13 +53,53 @@ def measure_model_metrics(model_name: str, num_samples: int = 3) -> tuple[float, float]:
     return avg_latency, avg_token_rate


-def measure_first_token_latency(model_name: str, num_samples: int = 3) -> float:
-    latency, _ = measure_model_metrics(model_name, num_samples)
-    return latency
+def measure_opensource_models_metrics(model_name: str, num_samples: int = 3) -> tuple[float, float]:
+    prompt = ChatPromptTemplate.from_messages([
+        ("system", "You are a helpful AI assistant."),
+        ("user", "Tell me a short story about a cat.")
+    ])
+
+    model = ChatOllama(
+        model=model_name,
+        streaming=True,
+    )
+
+    chain = prompt | model | StrOutputParser()
+
+    latencies = []
+    token_rates = []
+
+    for _ in range(num_samples):
+        start_time = time.time()
+        first_token_received = False
+        token_count = 0
+
+        for chunk in chain.stream({}):
+            current_time = time.time()
+            if not first_token_received:
+                latency = (current_time - start_time) * 1000
+                latencies.append(latency)
+                first_token_received = True
+                first_token_time = current_time
+
+            token_count += 1
+
+        total_time = time.time() - first_token_time
+        if total_time > 0:
+            tokens_per_second = token_count / total_time
+            token_rates.append(tokens_per_second)
+
+        time.sleep(1)
+
+    avg_latency = sum(latencies) / len(latencies)
+    avg_token_rate = sum(token_rates) / len(token_rates)
+    return avg_latency, avg_token_rate
+
+


 if __name__ == '__main__':
-    model_name = "gpt-4o-mini"
-    avg_latency, avg_token_rate = measure_model_metrics(model_name)
+    model_name = "llama3.2"
+    avg_latency, avg_token_rate = measure_opensource_models_metrics(model_name)
     print(f"Average first token latency for {model_name}: {avg_latency:.2f} ms")
     print(f"Average token output rate for {model_name}: {avg_token_rate:.2f} tokens/sec")
tests/generate_benchmark_report.py: 22 changes (11 additions, 11 deletions)
@@ -9,20 +9,20 @@
 PROPRIETARY_MODELS = [
     {"name": "gpt-4o-mini", "function_call_support": True},
     {"name": "gpt-4o", "function_call_support": True},
-    # {"name": "gemini-1.5-flash", "function_call_support": True},
-    # {"name": "gemini-1.5-pro", "function_call_support": True},
+    {"name": "gemini-1.5-flash", "function_call_support": True},
+    {"name": "gemini-1.5-pro", "function_call_support": True},
 ]

 OPENSOURCE_MODELS = [
-    # {"name": "qwen2", "function_call_support": True},
-    # {"name": "mistral", "function_call_support": True},
-    # {"name": "qwen2.5", "function_call_support": True},
-    # {"name": "llama3.1", "function_call_support": True},
-    # {"name": "llama3.2", "function_call_support": True},
-    # {"name": "mistral-nemo", "function_call_support": True},
+    {"name": "qwen2", "function_call_support": True},
+    {"name": "mistral", "function_call_support": True},
+    {"name": "qwen2.5", "function_call_support": True},
+    {"name": "llama3.1", "function_call_support": True},
+    {"name": "llama3.2", "function_call_support": True},
+    {"name": "mistral-nemo", "function_call_support": True},
 ]

-from test_first_token_latency import measure_model_metrics
+from gen_benchmark_html_report import measure_proprietary_models_metrics, measure_opensource_models_metrics


 class TestStats:
@@ -128,14 +128,14 @@ def main():

     for model in PROPRIETARY_MODELS:
         score = run_model_tests(model['name'])
-        latency, token_rate = measure_model_metrics(model['name'])
+        latency, token_rate = measure_proprietary_models_metrics(model['name'])
         logger.info(f"First token latency for {model['name']}: {latency:.2f}ms")
         logger.info(f"Token output rate for {model['name']}: {token_rate:.1f} tokens/sec")
         proprietary_results[model['name']] = (score, latency, token_rate)

     for model in OPENSOURCE_MODELS:
         score = run_model_tests(model['name'])
-        latency, token_rate = measure_model_metrics(model['name'])
+        latency, token_rate = measure_opensource_models_metrics(model['name'])
         logger.info(f"First token latency for {model['name']}: {latency:.2f}ms")
         logger.info(f"Token output rate for {model['name']}: {token_rate:.1f} tokens/sec")
         opensource_results[model['name']] = (score, latency, token_rate)
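
A hedged sketch of how the (score, latency, token_rate) tuples collected above might be summarized, assuming both result dictionaries map model names to numeric triples; the project's actual report rendering presumably lives in the HTML report module whose diff is not shown here:

# Hypothetical plain-text summary of the dictionaries built in main() above.
def print_summary(title: str, results: dict[str, tuple[float, float, float]]) -> None:
    print(f"\n{title}")
    print(f"{'model':<15}{'score':>8}{'latency(ms)':>14}{'tokens/s':>10}")
    for name, (score, latency, token_rate) in results.items():
        print(f"{name:<15}{score:>8.1f}{latency:>14.2f}{token_rate:>10.1f}")

print_summary("Proprietary models", proprietary_results)
print_summary("Open-source models", opensource_results)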
(The diff of the third changed file did not load and is not shown here.)