From d3c5f934519f5758733376990be00652d53882b1 Mon Sep 17 00:00:00 2001
From: Hiroaki Ogasawara <13391129+xhiroga@users.noreply.github.com>
Date: Sat, 30 Nov 2024 18:26:17 +0900
Subject: [PATCH] fix: eval correctly

---
 .../{notbooks => notebooks}/preprocess.ipynb  |  2 +-
 .../_src/llm-exercises/src/sft.py             |  2 ++
 .../_src/llm-exercises/src/utils.py           | 16 +++++++++-------
 3 files changed, 12 insertions(+), 8 deletions(-)
 rename computer-science/machine-learning/_src/llm-exercises/{notbooks => notebooks}/preprocess.ipynb (99%)

diff --git a/computer-science/machine-learning/_src/llm-exercises/notbooks/preprocess.ipynb b/computer-science/machine-learning/_src/llm-exercises/notebooks/preprocess.ipynb
similarity index 99%
rename from computer-science/machine-learning/_src/llm-exercises/notbooks/preprocess.ipynb
rename to computer-science/machine-learning/_src/llm-exercises/notebooks/preprocess.ipynb
index 0be479b7d..6d632d48f 100644
--- a/computer-science/machine-learning/_src/llm-exercises/notbooks/preprocess.ipynb
+++ b/computer-science/machine-learning/_src/llm-exercises/notebooks/preprocess.ipynb
@@ -150,7 +150,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.12"
+   "version": "3.12.7"
   }
  },
  "nbformat": 4,
diff --git a/computer-science/machine-learning/_src/llm-exercises/src/sft.py b/computer-science/machine-learning/_src/llm-exercises/src/sft.py
index 0d33249f2..efeb3fb62 100644
--- a/computer-science/machine-learning/_src/llm-exercises/src/sft.py
+++ b/computer-science/machine-learning/_src/llm-exercises/src/sft.py
@@ -21,6 +21,8 @@
 
 load_dotenv()
 
+logging.set_verbosity_info()
+
 HF_TOKEN = os.environ.get("HF_TOKEN")
 assert os.environ.get("WANDB_ENTITY")
 assert os.environ.get("WANDB_PROJECT")
diff --git a/computer-science/machine-learning/_src/llm-exercises/src/utils.py b/computer-science/machine-learning/_src/llm-exercises/src/utils.py
index 369eee637..da9795648 100644
--- a/computer-science/machine-learning/_src/llm-exercises/src/utils.py
+++ b/computer-science/machine-learning/_src/llm-exercises/src/utils.py
@@ -4,13 +4,15 @@
 
 import google.generativeai as genai
 import torch
-from datasets import dataset_dict, load_dataset
+from datasets import dataset_dict
 from dotenv import load_dotenv
 from peft import PeftMixedModel, PeftModel
 from tqdm import tqdm
 from transformers import AutoModelForCausalLM, AutoTokenizer, logging
 from typing_extensions import TypedDict
 
+from instruction_datasets import load_elyza_tasks_100_TV
+
 
 class Result(TypedDict):
     task_id: int | None
@@ -134,7 +136,7 @@ def evaluate(results: list[Result], batch_size: int = 10) -> list[Evaluation]:
         batch_results = results[i : i + batch_size]
 
         prompts = [
-            evaluation_prompt(result["input"], result["output"], result["eval_aspect"], result.get("sample_output"))
+            evaluation_prompt(result["input"], result["output"], result.get("eval_aspect"), result.get("target"))
             for result in batch_results
         ]
 
@@ -171,11 +173,11 @@ def save_results(results: list[Result], jsonl_prefix: str):
     logging.set_verbosity_info()
     model = AutoModelForCausalLM.from_pretrained("models/llm-jp-3-1-8b-finetune", device_map="auto")
     tokenizer = AutoTokenizer.from_pretrained("models/llm-jp-3-1-8b-finetune", trust_remote_code=True)
-    ds = load_dataset("elyza/ELYZA-tasks-100")
-    results = test(model, tokenizer, ds["test"], limit=5, model_half=True)
+    ds = load_elyza_tasks_100_TV()
+    results = test(model, tokenizer, ds, limit=5, model_half=True)
     evaluations = evaluate(results, 10)
-    averagt_score = sum(evaluation["score"] for evaluation in evaluations) / len(
+    average_score = sum(evaluation["score"] for evaluation in evaluations) / len(
        evaluations
    )
-    print(f"{evaluations=}, {averagt_score=}")
-    assert 2 < averagt_score < 5
+    print(f"{evaluations=}, {average_score=}")
+    assert 2 < average_score < 5
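
Note: the patch swaps the Hugging Face load_dataset("elyza/ELYZA-tasks-100") call for a project-local helper, instruction_datasets.load_elyza_tasks_100_TV, whose definition is not part of this diff. Below is a minimal sketch of what such a helper could look like, assuming the ELYZA-tasks-100-TV data lives in a local JSONL file; the path, filename, and use of datasets.Dataset.from_list are assumptions, not taken from the repository.

# Hypothetical sketch only; the real instruction_datasets.load_elyza_tasks_100_TV may differ.
import json

from datasets import Dataset


def load_elyza_tasks_100_TV(path: str = "data/elyza-tasks-100-TV_0.jsonl") -> Dataset:
    """Load the ELYZA-tasks-100-TV evaluation set from an assumed local JSONL file."""
    records = []
    with open(path, encoding="utf-8") as f:
        for line in f:
            if line.strip():
                records.append(json.loads(line))  # one task per line, e.g. {"input": ...}
    return Dataset.from_list(records)

Returning a datasets.Dataset keeps the new call site test(model, tokenizer, ds, ...) iterable in the same way as the previous ds["test"] split from load_dataset.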