
Commit

fix: eval correctly
xhiroga committed Nov 30, 2024
1 parent f393eb9 commit d3c5f93
Showing 3 changed files with 12 additions and 8 deletions.
@@ -150,7 +150,7 @@
 "name": "python",
 "nbconvert_exporter": "python",
 "pygments_lexer": "ipython3",
-"version": "3.10.12"
+"version": "3.12.7"
 }
 },
 "nbformat": 4,
@@ -21,6 +21,8 @@
 
 load_dotenv()
 
+logging.set_verbosity_info()
+
 HF_TOKEN = os.environ.get("HF_TOKEN")
 assert os.environ.get("WANDB_ENTITY")
 assert os.environ.get("WANDB_PROJECT")
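The call added here mirrors the one already present in the evaluation script below, so `logging` is presumably `transformers.logging` rather than the standard library module (stdlib `logging` has no `set_verbosity_info`). A minimal standalone illustration of the effect, under that assumption:

# Standalone sketch, assuming `logging` is transformers.logging as in the evaluation script below.
from transformers import logging

logging.set_verbosity_info()          # show INFO-level library messages (model loading, downloads, ...)
assert logging.get_verbosity() == 20  # set_verbosity_info() sets the level to INFO (20)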
@@ -4,13 +4,15 @@
 
 import google.generativeai as genai
 import torch
-from datasets import dataset_dict, load_dataset
+from datasets import dataset_dict
 from dotenv import load_dotenv
 from peft import PeftMixedModel, PeftModel
 from tqdm import tqdm
 from transformers import AutoModelForCausalLM, AutoTokenizer, logging
 from typing_extensions import TypedDict
 
+from instruction_datasets import load_elyza_tasks_100_TV
+
 
 class Result(TypedDict):
     task_id: int | None
@@ -134,7 +136,7 @@ def evaluate(results: list[Result], batch_size: int = 10) -> list[Evaluation]:
         batch_results = results[i : i + batch_size]
 
         prompts = [
-            evaluation_prompt(result["input"], result["output"], result["eval_aspect"], result.get("sample_output"))
+            evaluation_prompt(result["input"], result["output"], result.get("eval_aspect"), result.get("target"))
             for result in batch_results
         ]
 
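The change above switches the last two arguments to `dict.get`, so records that lack `eval_aspect` or a reference answer are passed as `None` instead of raising `KeyError`, and the reference answer is now read from `target` rather than `sample_output`. The real `evaluation_prompt` is defined elsewhere in this file and is not part of this diff; a hypothetical sketch of a prompt builder that tolerates the now-optional fields might look like this:

# Hypothetical sketch only; the actual evaluation_prompt is not shown in this commit.
def evaluation_prompt(
    input_text: str,
    output_text: str,
    eval_aspect: str | None = None,
    target: str | None = None,
) -> str:
    """Build a grading prompt, omitting sections whose source field is missing."""
    parts = [f"[Question]\n{input_text}", f"[Answer]\n{output_text}"]
    if eval_aspect is not None:
        parts.append(f"[Grading criteria]\n{eval_aspect}")
    if target is not None:
        parts.append(f"[Reference answer]\n{target}")
    parts.append("Grade the answer on a 1-5 scale and return only the number.")
    return "\n\n".join(parts)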
@@ -171,11 +173,11 @@ def save_results(results: list[Result], jsonl_prefix: str):
     logging.set_verbosity_info()
     model = AutoModelForCausalLM.from_pretrained("models/llm-jp-3-1-8b-finetune", device_map="auto")
     tokenizer = AutoTokenizer.from_pretrained("models/llm-jp-3-1-8b-finetune", trust_remote_code=True)
-    ds = load_dataset("elyza/ELYZA-tasks-100")
-    results = test(model, tokenizer, ds["test"], limit=5, model_half=True)
+    ds = load_elyza_tasks_100_TV()
+    results = test(model, tokenizer, ds, limit=5, model_half=True)
     evaluations = evaluate(results, 10)
-    averagt_score = sum(evaluation["score"] for evaluation in evaluations) / len(
+    average_score = sum(evaluation["score"] for evaluation in evaluations) / len(
        evaluations
    )
-    print(f"{evaluations=}, {averagt_score=}")
-    assert 2 < averagt_score < 5
+    print(f"{evaluations=}, {average_score=}")
+    assert 2 < average_score < 5
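Replacing `load_dataset("elyza/ELYZA-tasks-100")` with `load_elyza_tasks_100_TV()` takes the evaluation set from the project's own `instruction_datasets` module instead of the Hugging Face Hub, which is why `test` now receives the dataset directly rather than a `ds["test"]` split. That module is not included in this commit; a minimal sketch of such a loader, assuming the TV variant ships as a local JSONL file (the path is a placeholder):

# Minimal sketch; the real instruction_datasets.load_elyza_tasks_100_TV is not shown in this commit.
from datasets import Dataset, load_dataset


def load_elyza_tasks_100_TV(path: str = "data/elyza-tasks-100-TV_0.jsonl") -> Dataset:
    """Load the ELYZA-tasks-100-TV records from a local JSONL file (the path is a placeholder)."""
    # load_dataset("json", ...) builds a single "train" split from the given file.
    return load_dataset("json", data_files=path, split="train")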
