feat: evaluate with elyza-tasks-100-TV

xhiroga · Nov 30, 2024 · f393eb9 · f393eb9
1 parent 680d284
commit f393eb9
Show file tree

Hide file tree

Showing 4 changed files with 302 additions and 54 deletions.
diff --git a/computer-science/machine-learning/_src/llm-exercises/notbooks/preprocess.ipynb b/computer-science/machine-learning/_src/llm-exercises/notbooks/preprocess.ipynb
@@ -0,0 +1,158 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import json\n",
+    "\n",
+    "\n",
+    "def validate_json_files(directory):\n",
+    "    json_files = [f for f in os.listdir(directory) if f.endswith(\".json\")]\n",
+    "    for json_file in json_files:\n",
+    "        file_path = os.path.join(directory, json_file)\n",
+    "        try:\n",
+    "            with open(file_path, \"r\", encoding=\"utf-8\") as f:\n",
+    "                json.load(f)\n",
+    "            print(f\"{json_file}: Valid JSON\")\n",
+    "        except json.JSONDecodeError as e:\n",
+    "            print(f\"{json_file}: Invalid JSON - {e}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Processed ../data/Distribution20241221_all/ichikara-instruction-003-001-1.json -> ../data/Distribution20241221_all_preprocessed/ichikara-instruction-003-001-1.json\n",
+      "Processed ../data/Distribution20241221_all/ichikara-instruction-003-001-2.1.json -> ../data/Distribution20241221_all_preprocessed/ichikara-instruction-003-001-2.1.json\n",
+      "Processed ../data/Distribution20241221_all/ichikara-instruction-003-001-2.2.json -> ../data/Distribution20241221_all_preprocessed/ichikara-instruction-003-001-2.2.json\n",
+      "Processed ../data/Distribution20241221_all/ichikara-instruction-003-001-5.1.json -> ../data/Distribution20241221_all_preprocessed/ichikara-instruction-003-001-5.1.json\n",
+      "Processed ../data/Distribution20241221_all/ichikara-instruction-003-001-5.2.json -> ../data/Distribution20241221_all_preprocessed/ichikara-instruction-003-001-5.2.json\n",
+      "Processed ../data/Distribution20241221_all/ichikara-instruction-003-002-1.json -> ../data/Distribution20241221_all_preprocessed/ichikara-instruction-003-002-1.json\n",
+      "Processed ../data/Distribution20241221_all/ichikara-instruction-003-003-1.json -> ../data/Distribution20241221_all_preprocessed/ichikara-instruction-003-003-1.json\n",
+      "ichikara-instruction-003-001-1.json: Valid JSON\n",
+      "ichikara-instruction-003-001-2.1.json: Valid JSON\n",
+      "ichikara-instruction-003-001-2.2.json: Valid JSON\n",
+      "ichikara-instruction-003-001-5.1.json: Valid JSON\n",
+      "ichikara-instruction-003-001-5.2.json: Valid JSON\n",
+      "ichikara-instruction-003-002-1.json: Valid JSON\n",
+      "ichikara-instruction-003-003-1.json: Valid JSON\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "27c61dea8c7b40338481097e3e032852",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating train split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/plain": [
+       "DatasetDict({\n",
+       "    train: Dataset({\n",
+       "        features: ['ID', 'text', 'output'],\n",
+       "        num_rows: 6701\n",
+       "    })\n",
+       "})"
+      ]
+     },
+     "execution_count": 23,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import os\n",
+    "import re\n",
+    "from datasets import load_dataset\n",
+    "\n",
+    "original_dir = \"../data/Distribution20241221_all\"\n",
+    "preprocessed_dir = \"../data/Distribution20241221_all_preprocessed\"\n",
+    "\n",
+    "\n",
+    "def preprocess_ichikara_instruction(original_dir: str, preprocessed_dir: str):\n",
+    "    if not os.path.exists(preprocessed_dir):\n",
+    "        os.makedirs(preprocessed_dir)\n",
+    "\n",
+    "    data_files = [\n",
+    "        os.path.join(original_dir, f)\n",
+    "        for f in os.listdir(original_dir)\n",
+    "        if f.endswith(\".json\")\n",
+    "    ]\n",
+    "\n",
+    "    invalid_escape_pattern = re.compile(r\"\\\\(?![\\\"\\\\/bfnrt]|u[0-9a-fA-F]{4})\")\n",
+    "\n",
+    "    for file_path in data_files:\n",
+    "        try:\n",
+    "            with open(file_path, \"r\", encoding=\"utf-8\") as file:\n",
+    "                content = file.read()\n",
+    "                # Replace invalid escape sequences\n",
+    "                content = invalid_escape_pattern.sub(r\"\\\\\\\\\", content)\n",
+    "                # Replace \\\\\" with \\\"\n",
+    "                content = content.replace('\\\\\\\\\"', '\\\\\"')\n",
+    "\n",
+    "            preprocessed_file_path = os.path.join(\n",
+    "                preprocessed_dir, os.path.basename(file_path)\n",
+    "            )\n",
+    "            with open(\n",
+    "                preprocessed_file_path, \"w\", encoding=\"utf-8\"\n",
+    "            ) as preprocessed_file:\n",
+    "                preprocessed_file.write(content)\n",
+    "\n",
+    "            print(f\"Processed {file_path} -> {preprocessed_file_path}\")\n",
+    "        except Exception as e:\n",
+    "            print(f\"Error processing file {file_path}: {e}\")\n",
+    "\n",
+    "\n",
+    "# Run preprocessing, validate the rewritten files, then load them as a dataset\n",
+    "preprocess_ichikara_instruction(original_dir, preprocessed_dir)\n",
+    "validate_json_files(preprocessed_dir)\n",
+    "load_dataset(\"json\", data_files=f\"{preprocessed_dir}/*.json\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/computer-science/machine-learning/_src/llm-exercises/src/instruction_datasets.py b/computer-science/machine-learning/_src/llm-exercises/src/instruction_datasets.py
@@ -0,0 +1,45 @@
+from datasets import dataset_dict, load_dataset
+
+
+def load_ichikara_instruction_003_001_1() -> dataset_dict.Dataset:
+    dataset = load_dataset(
+        "json",
+        data_files="data/Distribution20241221_all_preprocessed/ichikara-instruction-003-001-1.json",
+    )
+
+    return dataset["train"]
+
+
+def load_ichikara_instruction_all() -> dataset_dict.Dataset:
+    dataset = load_dataset(
+        "json",
+        data_files="data/Distribution20241221_all_preprocessed/*.json",  # avoid README.md
+    )
+    return dataset["train"]
+
+
+def load_elyza_tasks_100() -> dataset_dict.Dataset:
+    dataset = load_dataset("elyza/ELYZA-tasks-100")
+    return dataset["test"]
+
+
+def load_elyza_tasks_100_TV() -> dataset_dict.Dataset:
+    dataset = load_dataset(
+        "json",
+        data_files="data/elyza-tasks-100-TV_0.jsonl",
+    )
+    return dataset["train"]
+
+
+INSTRUCTION_DATASETS = {
+    "ichikara-instruction-003-001-1": load_ichikara_instruction_003_001_1,
+    "ichikara-instruction-all": load_ichikara_instruction_all,
+    "elyza/ELYZA-tasks-100": load_elyza_tasks_100,
+    "elyza-tasks-100-TV_0": load_elyza_tasks_100_TV,
+}
+
+
+if __name__ == "__main__":
+    for dataset_name, load_func in INSTRUCTION_DATASETS.items():
+        dataset = load_func()
+        print(dataset)
diff --git a/computer-science/machine-learning/_src/llm-exercises/src/sft.py b/computer-science/machine-learning/_src/llm-exercises/src/sft.py
@@ -1,9 +1,9 @@
 import os
+import re
 from pathlib import Path
 
 import bitsandbytes as bnb
 import torch
-from datasets import load_dataset
 from dotenv import load_dotenv
 from peft import LoraConfig, get_peft_model
 from transformers import (
@@ -16,18 +16,26 @@
 from trl import SFTTrainer
 
 import wandb
-from utils import evaluate, validate
+from instruction_datasets import INSTRUCTION_DATASETS
+from utils import evaluate, save_results, test
 
 load_dotenv()
 
 HF_TOKEN = os.environ.get("HF_TOKEN")
 assert os.environ.get("WANDB_ENTITY")
 assert os.environ.get("WANDB_PROJECT")
-wandb.init()
+
+config = {
+    "base_model_id": "llm-jp/llm-jp-3-1.8b",
+    "train_datasets": ["ichikara-instruction-all"],
+    "test_datasets": ["elyza/ELYZA-tasks-100", "elyza-tasks-100-TV_0"],
+}
+
+wandb.init(config=config)
 run_name = wandb.run.name
 
 
-base_model_id = Path("models/llm-jp/llm-jp-3-1.8b")
+base_model_id = Path(f"models/{config['base_model_id']}")
 new_model_id = f"{base_model_id.name.replace('.', '-')}-finetune-{run_name}"
 
 """
@@ -118,19 +126,13 @@ def find_all_linear_names(model):
 model = get_peft_model(model, peft_config)
 
 
-dataset = load_dataset(
-    "json",
-    data_files="data/Distribution20241221_all/ichikara-instruction-003-001-1.json",
-)
-prompt = """### 指示
+prompt = """\
+### 指示
 {}
 ### 回答
 {}"""
 
 
-"""
-formatting_prompts_func: 各データをプロンプトに合わせた形式に合わせる
-"""
 EOS_TOKEN = tokenizer.eos_token  # トークナイザーのEOSトークン（文末トークン）
 
 
@@ -144,6 +146,7 @@ def formatting_prompts_func(examples):
 
 
 # 各データにフォーマットを適用
+dataset = INSTRUCTION_DATASETS[config["train_datasets"][0]]()
 dataset = dataset.map(
     formatting_prompts_func,
     num_proc=8,  # 並列処理数を指定
@@ -176,7 +179,7 @@ def formatting_prompts_func(examples):
 
 trainer = SFTTrainer(
     model=model,
-    train_dataset=dataset["train"],
+    train_dataset=dataset,
     # TODO: eval_dataset
     peft_config=peft_config,
     max_seq_length=512,  # TODO: 長くすることを検討
@@ -190,30 +193,25 @@ def formatting_prompts_func(examples):
 tokenizer.pad_token = tokenizer.eos_token
 trainer.train()  # トレーニングを実行
 
-# Validate
-validate_dataset = "elyza/ELYZA-tasks-100"
-ds = load_dataset(validate_dataset)
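+# For each configured test dataset: run `test` and `evaluate`, log the scores to wandb, and save the results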
+for test_dataset_name in config["test_datasets"]:
+    ds = INSTRUCTION_DATASETS[test_dataset_name]()
+    results = test(model, tokenizer, ds, limit=None, model_half=True)
+    evaluations = evaluate(results)
+    scores = [e["score"] for e in evaluations]
+    average_score = sum(scores) / len(evaluations)
+    wandb.log(
+        {
+            "test_dataset": test_dataset_name,
+            "scores": scores,
+            "average_score": average_score,
+        }
+    )
+    jsonl_prefix = f"{new_model_id}-{test_dataset_name}-{run_name}".replace("/", "-")
+    save_results(results, jsonl_prefix)
 
-results = validate(model, tokenizer, ds["test"], limit=10)
-evaluations = evaluate(results)
-average_score = sum(e["score"] for e in evaluations) / len(evaluations)
-wandb.log({"validate_dataset": validate_dataset, "evaluations": evaluations, "average_score": average_score})
 wandb.finish()
 
-# こちらで生成されたjsolを提出してください。
-# 本コードではinputとeval_aspectも含んでいますが、なくても問題ありません。
-# 必須なのはtask_idとoutputとなります。
-import json
-import re
-
-jsonl_id = re.sub(".*/", "", new_model_id)
-with open(f"output/{jsonl_id}-outputs.jsonl", "w", encoding="utf-8") as f:
-    for result in results:
-        json.dump(
-            result, f, ensure_ascii=False
-        )  # ensure_ascii=False for handling non-ASCII characters
-        f.write("\n")
-
 # モデルとトークナイザーをHugging Faceにアップロード
 model.push_to_hub(
     "llm-jp-3-1-8b-finetune", token=HF_TOKEN, private=True