Merge branch 'main' into nanotron_greedyuntil_fix
NathanHB authored Oct 7, 2024
2 parents a65722e + f89ae20 commit e461548
Showing 15 changed files with 1,333 additions and 440 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -30,7 +30,7 @@ where = ["src"]

[project]
name = "lighteval"
version = "0.5.0.dev0"
version = "0.6.0.dev0"
authors = [
{ name="Clémentine Fourrier", email="clementine@huggingface.com" },
{ name="Nathan Habib", email="nathan.habib@huggingface.com" },
344 changes: 197 additions & 147 deletions src/lighteval/metrics/__init__.py

Large diffs are not rendered by default.

293 changes: 125 additions & 168 deletions src/lighteval/metrics/llm_as_judge.py

Large diffs are not rendered by default.

60 changes: 0 additions & 60 deletions src/lighteval/metrics/metrics.py
@@ -20,7 +20,6 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import os

import numpy as np
from aenum import Enum
@@ -43,7 +42,6 @@
Extractiveness,
F1_score,
Faithfulness,
JudgeLLM,
LoglikelihoodAcc,
MajAtK,
Recall,
@@ -233,64 +231,6 @@ class Metrics(Enum):
corpus_level_fn=np.mean,
higher_is_better=True,
)
llm_judge_multi_turn_gpt3p5 = SampleLevelMetricGrouping(
metric_name=["single_turn", "multi_turn"],
higher_is_better={"single_turn": True, "multi_turn": True},
category=MetricCategory.LLM_AS_JUDGE_MULTI_TURN,
use_case=MetricUseCase.SUMMARIZATION,
sample_level_fn=JudgeLLM(
judge_model_name="gpt-3.5-turbo",
template_path=os.path.join(os.path.dirname(__file__), "judge_prompts.jsonl"),
multi_turn=True,
).compute,
corpus_level_fn={
"single_turn": np.mean,
"multi_turn": np.mean,
},
)
llm_judge_multi_turn_llama_3_405b = SampleLevelMetricGrouping(
metric_name=["single_turn", "multi_turn"],
higher_is_better={"single_turn": True, "multi_turn": True},
category=MetricCategory.LLM_AS_JUDGE_MULTI_TURN,
use_case=MetricUseCase.SUMMARIZATION,
sample_level_fn=JudgeLLM(
judge_model_name="meta-llama/Meta-Llama-3.1-405B-Instruct-FP8",
template_path=os.path.join(os.path.dirname(__file__), "judge_prompts.jsonl"),
multi_turn=True,
).compute,
corpus_level_fn={
"single_turn": np.mean,
"multi_turn": np.mean,
},
)
llm_judge_gpt3p5 = SampleLevelMetricGrouping(
metric_name=["judge_score"],
higher_is_better={"judge_score": True},
category=MetricCategory.LLM_AS_JUDGE,
use_case=MetricUseCase.SUMMARIZATION,
sample_level_fn=JudgeLLM(
judge_model_name="gpt-3.5-turbo",
template_path=os.path.join(os.path.dirname(__file__), "judge_prompts.jsonl"),
multi_turn=False,
).compute,
corpus_level_fn={
"judge_score": np.mean,
},
)
llm_judge_llama_3_405b = SampleLevelMetricGrouping(
metric_name=["judge_score"],
higher_is_better={"judge_score": True},
category=MetricCategory.LLM_AS_JUDGE,
use_case=MetricUseCase.SUMMARIZATION,
sample_level_fn=JudgeLLM(
judge_model_name="meta-llama/Meta-Llama-3.1-405B-Instruct-FP8",
template_path=os.path.join(os.path.dirname(__file__), "judge_prompts.jsonl"),
multi_turn=False,
).compute,
corpus_level_fn={
"judge_score": np.mean,
},
)
loglikelihood_acc = SampleLevelMetric(
metric_name="acc",
sample_level_fn=LoglikelihoodAcc(logprob_normalization=None).compute,
116 changes: 79 additions & 37 deletions src/lighteval/metrics/metrics_sample.py
@@ -25,7 +25,7 @@
"""

import os
from typing import Callable
from typing import Callable, Literal

import nltk
import numpy as np
@@ -844,65 +844,107 @@ class JudgeLLM:
available_models_openai = ["gpt-3.5-turbo", "gpt-4o", "gpt-4-turbo", "gpt-4"]

def __init__(
self, judge_model_name: str, template_path: str, multi_turn: bool = False, use_transformers: bool = False
self,
judge_model_name: str,
template: Callable,
process_judge_response: Callable,
judge_backend: Literal["openai", "transformers", "vllm", "tgi"],
short_judge_name: str | None = None,
) -> None:
if judge_model_name in self.available_models_openai:
api_key = os.getenv("OPENAI_API_KEY")
url = None
elif not use_transformers:
api_key = os.getenv("HF_TOKEN")
url = "https://api-inference.huggingface.co/v1/"
else:
api = HfApi()
models = api.list_models(model_name=judge_model_name)
url = None
api_key = None
if not models:
raise ValueError(f"{judge_model_name} not in available models for llm as a judge metric")

self.multi_turn = multi_turn
match judge_backend:
case "openai":
if judge_model_name not in self.available_models_openai:
raise ValueError(f"{judge_model_name} not in available models for llm as a judge metric")
else:
api_key = os.getenv("OPENAI_API_KEY")
url = None
case "tgi":
api_key = os.getenv("HF_TOKEN")
url = "https://api-inference.huggingface.co/v1/"
case "transformers" | "vllm":
api = HfApi()
models = api.list_models(model_name=judge_model_name)
url = None
api_key = None
if not models:
raise ValueError(f"{judge_model_name} not in available models for llm as a judge metric")
case _:
raise ValueError(f"{judge_backend} is not a valid backend for llm as a judge metric")

self.short_judge_name = short_judge_name
self.judge = JudgeLM(
model=judge_model_name,
templates_path=template_path,
multi_turn=multi_turn,
templates=template,
process_judge_response=process_judge_response,
api_key=api_key,
url=url,
judge_backend=judge_backend,
)

def compute(self, predictions: list[str], formatted_doc: Doc, **kwargs) -> dict[str, float]:
raise NotImplementedError("This method should be implemented in the subclass.")


class JudgeLLMMTBench(JudgeLLM):
def compute(self, predictions: list[str], formatted_doc: Doc, **kwargs):
"""
Compute the score of a generative task using an LLM as a judge.
The generative task can be multi-turn with up to two turns; in that case,
we return scores for turn 1 and turn 2. Also returns the user_prompt and
judgement, which are ignored later by the aggregator.
"""
import json

# If we are evaluating a multiturn task, we need to have specific field in the formatted doc
if self.multi_turn:
questions = formatted_doc.specific["multi_turn_queries"]
ref_answers = formatted_doc.specific.get("reference", None) if formatted_doc.specific is not None else None
else:
questions = [formatted_doc.query]
ref_answers = [formatted_doc.choices[formatted_doc.gold_index]]
questions = formatted_doc.specific["multi_turn_queries"]
golds = formatted_doc.specific.get("reference", None)

scores, messages, judgements = self.judge.evaluate_answer(questions, predictions, ref_answers)
query_context_1 = {"query": questions[0], "context": ""}
query_context_2 = {"query": questions[1], "context": predictions[0]}

# Multi turn only has 2 turns
if self.multi_turn:
return {
"single_turn": scores[0],
"multi_turn": scores[1],
"user_prompt": [messages[0], messages[1]],
"judgement": [judgements[0], judgements[1]],
}
score_turn_1, message_turn_1, judgement_turn_1 = self.judge.evaluate_answer(
question=json.dumps(query_context_1, indent=2), answer=predictions[0], gold=golds[0] if golds else None
)
score_turn_2, message_turn_2, judgement_turn_2 = self.judge.evaluate_answer(
question=json.dumps(query_context_2, indent=2), answer=predictions[1], gold=golds[1] if golds else None
)

return {
"judge_score": scores[0],
"user_prompt": messages[0],
"judgement": judgements[0],
"judge_score_turn_1": score_turn_1,
"judge_score_turn_2": score_turn_2,
"user_prompt": [message_turn_1, message_turn_2],
"judgement": [judgement_turn_1, judgement_turn_2],
}


class JudgeLLMMixEval(JudgeLLM):
def compute(self, sample_ids: list[str], responses: list, formatted_docs: list[Doc], **kwargs) -> dict[str, float]:
"""
Compute the scores of a generative task in batch, using an LLM as a judge.
All samples for the task are sent to the judge in a single batched call;
for each sample we return the judge score, along with the user_prompt and
judgement, which are ignored later by the aggregator.
"""
questions = [formatted_doc.specific["question"] for formatted_doc in formatted_docs]
options = [formatted_doc.choices for formatted_doc in formatted_docs]
golds = [formatted_doc.choices[formatted_doc.gold_index[0]] for formatted_doc in formatted_docs]
predictions = [response[0].result[0] for response in responses]

scores, messages, judgements = self.judge.evaluate_answer_batch(questions, predictions, options, golds)

metrics = []
for i in range(len(sample_ids)):
metrics.append(
{
f"judge_score_{self.short_judge_name}": scores[i],
f"user_prompt_{self.short_judge_name}": messages[i],
f"judgement_{self.short_judge_name}": judgements[i],
}
)

return metrics


class MajAtK:
def __init__(
self,
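For orientation, here is a minimal sketch of how the reworked JudgeLLM constructor might be instantiated after this change. The constructor arguments themselves come from the code above; the template builder and response parser are hypothetical placeholders, since their expected signatures live in llm_as_judge.py, whose diff is not rendered on this page.

# Hedged sketch: toy_template and toy_process_judge_response are illustrative stand-ins.
from lighteval.metrics.metrics_sample import JudgeLLM

def toy_template(question, answer, gold=None, **kwargs):
    # Hypothetical chat-style prompt builder handed to the judge as `template`.
    return [{"role": "user", "content": f"Question: {question}\nAnswer: {answer}\nScore the answer from 1 to 10."}]

def toy_process_judge_response(response):
    # Hypothetical parser handed to the judge as `process_judge_response`.
    import re
    match = re.search(r"\d+", str(response))
    return int(match.group()) if match else 0

judge_metric = JudgeLLM(
    judge_model_name="gpt-4o",  # must appear in available_models_openai for the "openai" backend
    template=toy_template,
    process_judge_response=toy_process_judge_response,
    judge_backend="openai",  # one of "openai", "tgi", "transformers", "vllm"
    short_judge_name="gpt4o",
)

Note that JudgeLLM.compute itself raises NotImplementedError, so concrete scoring goes through subclasses such as JudgeLLMMTBench and JudgeLLMMixEval shown above.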
11 changes: 11 additions & 0 deletions src/lighteval/models/vllm_model.py
@@ -20,10 +20,12 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import gc
import itertools
import os
from typing import Optional

import torch
from tqdm import tqdm

from lighteval.data import GenerativeTaskDataset, LoglikelihoodDataset
@@ -47,6 +49,7 @@
import ray
from more_itertools import distribute
from vllm import LLM, SamplingParams
from vllm.distributed.parallel_state import destroy_distributed_environment, destroy_model_parallel
from vllm.transformers_utils.tokenizer import get_tokenizer
else:
LLM = None
@@ -95,6 +98,14 @@ def __init__(
def tokenizer(self):
return self._tokenizer

def cleanup(self):
destroy_model_parallel()
del self.model.llm_engine.model_executor.driver_worker
gc.collect()
ray.shutdown()
destroy_distributed_environment()
torch.cuda.empty_cache()

@property
def add_special_tokens(self):
return self._add_special_tokens
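As a rough sketch of the intended call pattern (the model class name and its construction are not shown in this hunk, so they are elided), the new cleanup() hook is meant to run once generation is finished and before metrics are computed, mirroring the call added in pipeline.py below:

# Hedged sketch; `model` is a vLLM-backed lighteval model instance (construction omitted).
try:
    ...  # run the generation / loglikelihood requests here
finally:
    model.cleanup()  # frees model-parallel state, shuts down ray, and empties the CUDA cache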
52 changes: 38 additions & 14 deletions src/lighteval/pipeline.py
@@ -38,7 +38,7 @@
from lighteval.models.model_output import ModelResponse
from lighteval.tasks.lighteval_task import LightevalTask, create_requests_from_tasks
from lighteval.tasks.registry import Registry, get_custom_tasks, taskinfo_selector
from lighteval.tasks.requests import Doc, SampleUid
from lighteval.tasks.requests import SampleUid
from lighteval.utils.imports import (
NO_ACCELERATE_ERROR_MSG,
NO_NANOTRON_ERROR_MSG,
@@ -251,7 +251,6 @@ def evaluate(self):
hlog(f"Removed {tmp_weights_dir}")
except OSError:
pass
self.model.cleanup()

def _run_model(self):
# Running all requests depending on the model call type (log likelihood, generative, ...)
@@ -269,26 +268,51 @@ def _run_model(self):
sample_id = SampleUid(request.task_name, request.sample_index)
sample_id_to_responses[(sample_id, metric_category)].append(response)

# Cleaning up the model before running metrics
self.model.cleanup()

return sample_id_to_responses

def _compute_metrics(self, sample_id_to_responses):
# 2. Running the metric on each sample on its own.
# Note: some samples are associated with several responses, like the multichoice samples
# and some metrics will parse all samples at once in a second step during aggregation
# To compute the metrics, we first group the samples by task and then by metric category.
# This way we can batch the metrics computation for each task and metric category

# This variable will hold the samples grouped by task and metric category
# example:
# task_metric_category_groups = {
#     "task_name": {
#         "metric_category": {
#             "ids": [sample_id1, sample_id2, ...],
#             "responses": [[response1_1, response1_2, ...], [response2_1, response2_2, ...], ...],
#             "docs": [doc1, doc2, ...]
#         }
#     }
# }
task_metric_category_groups = collections.defaultdict(
lambda: collections.defaultdict(lambda: collections.defaultdict(list))
)

for (sample_id, metric_category), sample_responses in sample_id_to_responses.items():
short_task_name = sample_id.task_name.rsplit("|", 1)[0]
task_metric_category_groups[sample_id.task_name][metric_category]["ids"].append(sample_id.doc_id_seed)
task_metric_category_groups[sample_id.task_name][metric_category]["responses"].append(sample_responses)
task_metric_category_groups[sample_id.task_name][metric_category]["docs"].append(self.docs[sample_id])

for task_name, samples_per_metric in task_metric_category_groups.items():
short_task_name = task_name.rsplit("|", 1)[0]
task: LightevalTask = self.task_dict[short_task_name]
doc: Doc = self.docs[sample_id]

compute_metric = task.get_metric_method_from_category(metric_category=metric_category)
# This is important if two metric categories have non-zero intersection request-wise.
# Some might then only expect to get their requests.
metric_category_metrics = [metric for metric in task.metrics if metric.category == metric_category]
metrics = compute_metric(results=sample_responses, formatted_doc=doc, metrics=metric_category_metrics)
for metric_category, samples in samples_per_metric.items():
sample_ids = samples["ids"]
responses = samples["responses"]
docs = samples["docs"]
metric_function = task.get_metric_method_from_category(metric_category=metric_category)
metric_category_metrics = [metric for metric in task.metrics if metric.category == metric_category]

outputs = metric_function(
sample_ids=sample_ids, responses=responses, formatted_docs=docs, metrics=metric_category_metrics
)

self.evaluation_tracker.metrics_logger.log(sample_id.task_name, metrics)
self.evaluation_tracker.details_logger.log(sample_id.task_name, task, doc, sample_responses, metrics)
for output, doc, response in zip(outputs, docs, responses):
self.evaluation_tracker.metrics_logger.log(task_name, output)
self.evaluation_tracker.details_logger.log(task_name, task, doc, response, output)

def save_and_push_results(self):
if self.is_main_process():
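With this regrouping, a metric callable for a given category now receives every sample of a task in one call instead of being invoked per sample. A hedged sketch of the expected shape follows; only the keyword arguments sample_ids, responses, formatted_docs, and metrics come from the call site above, while the function name and return value are illustrative.

def toy_batched_metric(sample_ids, responses, formatted_docs, metrics, **kwargs):
    # Returns one dict of metric outputs per sample, in the same order as sample_ids,
    # so the zip over (outputs, docs, responses) in _compute_metrics lines up.
    return [{"toy_score": 1.0} for _ in sample_ids]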
3 changes: 2 additions & 1 deletion src/lighteval/tasks/extended/__init__.py
@@ -25,10 +25,11 @@

if can_load_extended_tasks():
import lighteval.tasks.extended.ifeval.main as ifeval
import lighteval.tasks.extended.mix_eval.main as mix_eval
import lighteval.tasks.extended.mt_bench.main as mt_bench
import lighteval.tasks.extended.tiny_benchmarks.main as tiny_benchmarks

AVAILABLE_EXTENDED_TASKS_MODULES = [ifeval, tiny_benchmarks, mt_bench]
AVAILABLE_EXTENDED_TASKS_MODULES = [ifeval, tiny_benchmarks, mt_bench, mix_eval]

else:
AVAILABLE_EXTENDED_TASKS_MODULES = []
