feat: evaluate with elyza-task-100-TV
xhiroga committed Nov 30, 2024
1 parent 680d284 commit f393eb9
Showing 4 changed files with 302 additions and 54 deletions.
@@ -0,0 +1,158 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import json\n",
"\n",
"\n",
"def validate_json_files(directory):\n",
" json_files = [f for f in os.listdir(directory) if f.endswith(\".json\")]\n",
" for json_file in json_files:\n",
" file_path = os.path.join(directory, json_file)\n",
" try:\n",
" with open(file_path, \"r\", encoding=\"utf-8\") as f:\n",
" json.load(f)\n",
" print(f\"{json_file}: Valid JSON\")\n",
" except json.JSONDecodeError as e:\n",
" print(f\"{json_file}: Invalid JSON - {e}\")"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Processed ../data/Distribution20241221_all/ichikara-instruction-003-001-1.json -> ../data/Distribution20241221_all_preprocessed/ichikara-instruction-003-001-1.json\n",
"Processed ../data/Distribution20241221_all/ichikara-instruction-003-001-2.1.json -> ../data/Distribution20241221_all_preprocessed/ichikara-instruction-003-001-2.1.json\n",
"Processed ../data/Distribution20241221_all/ichikara-instruction-003-001-2.2.json -> ../data/Distribution20241221_all_preprocessed/ichikara-instruction-003-001-2.2.json\n",
"Processed ../data/Distribution20241221_all/ichikara-instruction-003-001-5.1.json -> ../data/Distribution20241221_all_preprocessed/ichikara-instruction-003-001-5.1.json\n",
"Processed ../data/Distribution20241221_all/ichikara-instruction-003-001-5.2.json -> ../data/Distribution20241221_all_preprocessed/ichikara-instruction-003-001-5.2.json\n",
"Processed ../data/Distribution20241221_all/ichikara-instruction-003-002-1.json -> ../data/Distribution20241221_all_preprocessed/ichikara-instruction-003-002-1.json\n",
"Processed ../data/Distribution20241221_all/ichikara-instruction-003-003-1.json -> ../data/Distribution20241221_all_preprocessed/ichikara-instruction-003-003-1.json\n",
"ichikara-instruction-003-001-1.json: Valid JSON\n",
"ichikara-instruction-003-001-2.1.json: Valid JSON\n",
"ichikara-instruction-003-001-2.2.json: Valid JSON\n",
"ichikara-instruction-003-001-5.1.json: Valid JSON\n",
"ichikara-instruction-003-001-5.2.json: Valid JSON\n",
"ichikara-instruction-003-002-1.json: Valid JSON\n",
"ichikara-instruction-003-003-1.json: Valid JSON\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "27c61dea8c7b40338481097e3e032852",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Generating train split: 0 examples [00:00, ? examples/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"DatasetDict({\n",
" train: Dataset({\n",
" features: ['ID', 'text', 'output'],\n",
" num_rows: 6701\n",
" })\n",
"})"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import os\n",
"import re\n",
"from datasets import load_dataset\n",
"\n",
"original_dir = \"../data/Distribution20241221_all\"\n",
"preprocessed_dir = \"../data/Distribution20241221_all_preprocessed\"\n",
"\n",
"\n",
"def preprocess_ichikara_instruction(original_dir: str, preprocessed_dir: str):\n",
" if not os.path.exists(preprocessed_dir):\n",
" os.makedirs(preprocessed_dir)\n",
"\n",
" data_files = [\n",
" os.path.join(original_dir, f)\n",
" for f in os.listdir(original_dir)\n",
" if f.endswith(\".json\")\n",
" ]\n",
"\n",
" invalid_escape_pattern = re.compile(r\"\\\\(?![\\\"\\\\/bfnrt]|u[0-9a-fA-F]{4})\")\n",
"\n",
" for file_path in data_files:\n",
" try:\n",
" with open(file_path, \"r\", encoding=\"utf-8\") as file:\n",
" content = file.read()\n",
" # Replace invalid escape sequences\n",
" content = invalid_escape_pattern.sub(r\"\\\\\\\\\", content)\n",
" # Replace \\\\\" with \\\"\n",
" content = content.replace('\\\\\\\\\"', '\\\\\"')\n",
"\n",
" preprocessed_file_path = os.path.join(\n",
" preprocessed_dir, os.path.basename(file_path)\n",
" )\n",
" with open(\n",
" preprocessed_file_path, \"w\", encoding=\"utf-8\"\n",
" ) as preprocessed_file:\n",
" preprocessed_file.write(content)\n",
"\n",
" print(f\"Processed {file_path} -> {preprocessed_file_path}\")\n",
" except Exception as e:\n",
" print(f\"Error processing file {file_path}: {e}\")\n",
"\n",
"\n",
"# Run the preprocessing function\n",
"preprocess_ichikara_instruction(original_dir, preprocessed_dir)\n",
"validate_json_files(preprocessed_dir)\n",
"load_dataset(\"json\", data_files=f\"{preprocessed_dir}/*.json\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
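The crux of the preprocessing cell above is the regular expression that doubles any backslash that does not begin a valid JSON escape. A self-contained demo of the same idea (the broken sample string is made up for illustration):

import json
import re

# Matches a backslash that is NOT followed by a valid JSON escape
# ( \" \\ \/ \b \f \n \r \t \uXXXX ).
invalid_escape_pattern = re.compile(r"\\(?![\"\\/bfnrt]|u[0-9a-fA-F]{4})")

broken = '{"text": "LaTeX writes alpha as \\alpha"}'  # \a is not a valid JSON escape
try:
    json.loads(broken)
except json.JSONDecodeError as e:
    print(f"before: {e}")

fixed = invalid_escape_pattern.sub(r"\\\\", broken)  # double the offending backslash
print(json.loads(fixed))  # {'text': 'LaTeX writes alpha as \\alpha'}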
@@ -0,0 +1,45 @@
from datasets import Dataset, load_dataset


def load_ichikara_instruction_003_001_1() -> Dataset:
dataset = load_dataset(
"json",
data_files="data/Distribution20241221_all_preprocessed/ichikara-instruction-003-001-1.json",
)

return dataset["train"]


def load_ichikara_instruction_all() -> Dataset:
dataset = load_dataset(
"json",
data_files="data/Distribution20241221_all_preprocessed/*.json", # avoid README.md
)
return dataset["train"]


def load_elyza_tasks_100() -> Dataset:
dataset = load_dataset("elyza/ELYZA-tasks-100")
return dataset["test"]


def load_elyza_tasks_100_TV() -> Dataset:
dataset = load_dataset(
"json",
data_files="data/elyza-tasks-100-TV_0.jsonl",
)
return dataset["train"]


INSTRUCTION_DATASETS = {
"ichikara-instruction-003-001-1": load_ichikara_instruction_003_001_1,
"ichikara-instruction-all": load_ichikara_instruction_all,
"elyza/ELYZA-tasks-100": load_elyza_tasks_100,
"elyza-tasks-100-TV_0": load_elyza_tasks_100_TV,
}


if __name__ == "__main__":
for dataset_name, load_func in INSTRUCTION_DATASETS.items():
dataset = load_func()
print(dataset)
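INSTRUCTION_DATASETS is a simple name-to-loader registry, which is how sft.py selects train and test sets from config strings. Registering another dataset is one more entry; a minimal sketch, assuming a hypothetical local JSONL file (both the key and the path are made up):

from datasets import Dataset, load_dataset

from instruction_datasets import INSTRUCTION_DATASETS


def load_my_local_tasks() -> Dataset:
    # Hypothetical extra dataset; this JSONL path is an assumption.
    dataset = load_dataset("json", data_files="data/my-local-tasks.jsonl")
    return dataset["train"]


# Register under a new key so sft.py can select it via config.
INSTRUCTION_DATASETS["my-local-tasks"] = load_my_local_tasks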
66 changes: 32 additions & 34 deletions computer-science/machine-learning/_src/llm-exercises/src/sft.py
@@ -1,9 +1,9 @@
import os
import re
from pathlib import Path

import bitsandbytes as bnb
import torch
from datasets import load_dataset
from dotenv import load_dotenv
from peft import LoraConfig, get_peft_model
from transformers import (
@@ -16,18 +16,26 @@
from trl import SFTTrainer

import wandb
from utils import evaluate, validate
from instruction_datasets import INSTRUCTION_DATASETS
from utils import evaluate, save_results, test

load_dotenv()

HF_TOKEN = os.environ.get("HF_TOKEN")
assert os.environ.get("WANDB_ENTITY")
assert os.environ.get("WANDB_PROJECT")
wandb.init()

config = {
"base_model_id": "llm-jp/llm-jp-3-1.8b",
"train_datasets": ["ichikara-instruction-all"],
"test_datasets": ["elyza/ELYZA-tasks-100", "elyza-tasks-100-TV_0"],
}

wandb.init(config=config)
run_name = wandb.run.name


base_model_id = Path("models/llm-jp/llm-jp-3-1.8b")
base_model_id = Path(f"models/{config['base_model_id']}")
new_model_id = f"{base_model_id.name.replace('.', '-')}-finetune-{run_name}"

"""
@@ -118,19 +126,13 @@ def find_all_linear_names(model):
model = get_peft_model(model, peft_config)


dataset = load_dataset(
"json",
data_files="data/Distribution20241221_all/ichikara-instruction-003-001-1.json",
)
prompt = """### 指示
prompt = """\
### 指示
{}
### 回答
{}"""


"""
formatting_prompts_func: formats each example to match the prompt template
"""
EOS_TOKEN = tokenizer.eos_token # the tokenizer's EOS (end-of-sentence) token


Expand All @@ -144,6 +146,7 @@ def formatting_prompts_func(examples):


# Apply the formatting to each example
dataset = INSTRUCTION_DATASETS[config["train_datasets"][0]]()
dataset = dataset.map(
formatting_prompts_func,
num_proc=8, # number of parallel processes
@@ -176,7 +179,7 @@ def formatting_prompts_func(examples):

trainer = SFTTrainer(
model=model,
train_dataset=dataset["train"],
train_dataset=dataset,
# TODO: eval_dataset
peft_config=peft_config,
max_seq_length=512, # TODO: consider increasing this
@@ -190,30 +193,25 @@ def formatting_prompts_func(examples):
tokenizer.pad_token = tokenizer.eos_token
trainer.train() # run the training

# Validate
validate_dataset = "elyza/ELYZA-tasks-100"
ds = load_dataset(validate_dataset)
# test
for test_dataset_name in config["test_datasets"]:
ds = INSTRUCTION_DATASETS[test_dataset_name]()
results = test(model, tokenizer, ds, limit=None, model_half=True)
evaluations = evaluate(results)
scores = [e["score"] for e in evaluations]
average_score = sum(scores) / len(evaluations)
wandb.log(
{
"test_dataset": test_dataset_name,
"scores": scores,
"average_score": average_score,
}
)
jsonl_prefix = f"{new_model_id}-{test_dataset_name}-{run_name}".replace("/", "-")
save_results(results, jsonl_prefix)

results = validate(model, tokenizer, ds["test"], limit=10)
evaluations = evaluate(results)
average_score = sum(e["score"] for e in evaluations) / len(evaluations)
wandb.log({"validate_dataset": validate_dataset, "evaluations": evaluations, "average_score": average_score})
wandb.finish()

# Submit the JSONL generated here.
# This code also includes input and eval_aspect, but they are not required.
# Only task_id and output are mandatory.
import json
import re

jsonl_id = re.sub(".*/", "", new_model_id)
with open(f"output/{jsonl_id}-outputs.jsonl", "w", encoding="utf-8") as f:
for result in results:
json.dump(
result, f, ensure_ascii=False
) # ensure_ascii=False for handling non-ASCII characters
f.write("\n")

# Upload the model and tokenizer to Hugging Face
model.push_to_hub(
"llm-jp-3-1-8b-finetune", token=HF_TOKEN, private=True
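The test, evaluate, and save_results helpers imported from utils are not shown in this commit view. As orientation only, here is a hypothetical sketch consistent with the call sites above; the signatures, generation settings, and field names are assumptions, not the actual implementation. evaluate(results) presumably scores each entry (the loop above reads e["score"]), so it is omitted rather than guessed at.

# Hypothetical sketch of the utils helpers, inferred only from how sft.py calls them.
import json
from typing import Optional

import torch


def test(model, tokenizer, dataset, limit: Optional[int] = None, model_half: bool = False) -> list[dict]:
    # Generate one answer per task; the "input"/"task_id" field names are assumptions.
    if model_half:
        model = model.half()
    results = []
    for i, example in enumerate(dataset):
        if limit is not None and i >= limit:
            break
        inputs = tokenizer(example["input"], return_tensors="pt").to(model.device)
        with torch.no_grad():
            output_ids = model.generate(**inputs, max_new_tokens=512)
        answer = tokenizer.decode(
            output_ids[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True
        )
        results.append({"task_id": example.get("task_id", i), "input": example["input"], "output": answer})
    return results


def save_results(results: list[dict], jsonl_prefix: str) -> None:
    # Mirrors the inline JSONL writer that this commit removed from sft.py.
    with open(f"output/{jsonl_prefix}-outputs.jsonl", "w", encoding="utf-8") as f:
        for result in results:
            json.dump(result, f, ensure_ascii=False)  # keep non-ASCII characters readable
            f.write("\n")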