From d3c5f934519f5758733376990be00652d53882b1 Mon Sep 17 00:00:00 2001
From: Hiroaki Ogasawara <13391129+xhiroga@users.noreply.github.com>
Date: Sat, 30 Nov 2024 18:26:17 +0900
Subject: [PATCH] fix: evaluate correctly

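Rename the misspelled notbooks/ directory to notebooks/, raise logging
verbosity to info in sft.py, and fix the evaluation entrypoint in
src/utils.py: the dataset now comes from the local
instruction_datasets.load_elyza_tasks_100_TV() helper instead of
load_dataset("elyza/ELYZA-tasks-100"), evaluation_prompt reads
eval_aspect with .get() and its fourth argument is now
result.get("target") instead of result.get("sample_output"), and the
averagt_score typo is corrected.

A minimal sketch of the updated evaluation flow, mirroring the changed
block at the end of src/utils.py in the diff below (imports, model path,
and limits as in that file):

    model = AutoModelForCausalLM.from_pretrained(
        "models/llm-jp-3-1-8b-finetune", device_map="auto"
    )
    tokenizer = AutoTokenizer.from_pretrained(
        "models/llm-jp-3-1-8b-finetune", trust_remote_code=True
    )
    # Local helper replaces load_dataset("elyza/ELYZA-tasks-100").
    ds = load_elyza_tasks_100_TV()
    results = test(model, tokenizer, ds, limit=5, model_half=True)
    evaluations = evaluate(results, 10)
    average_score = sum(e["score"] for e in evaluations) / len(evaluations)
    assert 2 < average_score < 5
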
---
 .../{notbooks => notebooks}/preprocess.ipynb     |  2 +-
 .../_src/llm-exercises/src/sft.py                |  2 ++
 .../_src/llm-exercises/src/utils.py              | 16 +++++++++-------
 3 files changed, 12 insertions(+), 8 deletions(-)
 rename computer-science/machine-learning/_src/llm-exercises/{notbooks => notebooks}/preprocess.ipynb (99%)

diff --git a/computer-science/machine-learning/_src/llm-exercises/notbooks/preprocess.ipynb b/computer-science/machine-learning/_src/llm-exercises/notebooks/preprocess.ipynb
similarity index 99%
rename from computer-science/machine-learning/_src/llm-exercises/notbooks/preprocess.ipynb
rename to computer-science/machine-learning/_src/llm-exercises/notebooks/preprocess.ipynb
index 0be479b7d..6d632d48f 100644
--- a/computer-science/machine-learning/_src/llm-exercises/notbooks/preprocess.ipynb
+++ b/computer-science/machine-learning/_src/llm-exercises/notebooks/preprocess.ipynb
@@ -150,7 +150,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.12"
+   "version": "3.12.7"
   }
  },
  "nbformat": 4,
diff --git a/computer-science/machine-learning/_src/llm-exercises/src/sft.py b/computer-science/machine-learning/_src/llm-exercises/src/sft.py
index 0d33249f2..efeb3fb62 100644
--- a/computer-science/machine-learning/_src/llm-exercises/src/sft.py
+++ b/computer-science/machine-learning/_src/llm-exercises/src/sft.py
@@ -21,6 +21,8 @@
 
 load_dotenv()
 
+logging.set_verbosity_info()
+
 HF_TOKEN = os.environ.get("HF_TOKEN")
 assert os.environ.get("WANDB_ENTITY")
 assert os.environ.get("WANDB_PROJECT")
diff --git a/computer-science/machine-learning/_src/llm-exercises/src/utils.py b/computer-science/machine-learning/_src/llm-exercises/src/utils.py
index 369eee637..da9795648 100644
--- a/computer-science/machine-learning/_src/llm-exercises/src/utils.py
+++ b/computer-science/machine-learning/_src/llm-exercises/src/utils.py
@@ -4,13 +4,15 @@
 
 import google.generativeai as genai
 import torch
-from datasets import dataset_dict, load_dataset
+from datasets import dataset_dict
 from dotenv import load_dotenv
 from peft import PeftMixedModel, PeftModel
 from tqdm import tqdm
 from transformers import AutoModelForCausalLM, AutoTokenizer, logging
 from typing_extensions import TypedDict
 
+from instruction_datasets import load_elyza_tasks_100_TV
+
 
 class Result(TypedDict):
     task_id: int | None
@@ -134,7 +136,7 @@ def evaluate(results: list[Result], batch_size: int = 10) -> list[Evaluation]:
         batch_results = results[i : i + batch_size]
 
         prompts = [
-            evaluation_prompt(result["input"], result["output"], result["eval_aspect"], result.get("sample_output"))
+            evaluation_prompt(result["input"], result["output"], result.get("eval_aspect"), result.get("target"))
             for result in batch_results
         ]
 
@@ -171,11 +173,11 @@ def save_results(results: list[Result], jsonl_prefix: str):
     logging.set_verbosity_info()
     model = AutoModelForCausalLM.from_pretrained("models/llm-jp-3-1-8b-finetune", device_map="auto")
     tokenizer = AutoTokenizer.from_pretrained("models/llm-jp-3-1-8b-finetune", trust_remote_code=True)
-    ds = load_dataset("elyza/ELYZA-tasks-100")
-    results = test(model, tokenizer, ds["test"], limit=5, model_half=True)
+    ds = load_elyza_tasks_100_TV()
+    results = test(model, tokenizer, ds, limit=5, model_half=True)
     evaluations = evaluate(results, 10)
-    averagt_score = sum(evaluation["score"] for evaluation in evaluations) / len(
+    average_score = sum(evaluation["score"] for evaluation in evaluations) / len(
         evaluations
     )
-    print(f"{evaluations=}, {averagt_score=}")
-    assert 2 < averagt_score < 5
+    print(f"{evaluations=}, {average_score=}")
+    assert 2 < average_score < 5