-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: evaluate with elyza-task-100-TV
- Loading branch information
Showing
4 changed files
with
302 additions
and
54 deletions.
There are no files selected for viewing
158 changes: 158 additions & 0 deletions
158
computer-science/machine-learning/_src/llm-exercises/notbooks/preprocess.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,158 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 10, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"import os\n", | ||
"import json\n", | ||
"\n", | ||
"\n", | ||
"def validate_json_files(directory):\n", | ||
" json_files = [f for f in os.listdir(directory) if f.endswith(\".json\")]\n", | ||
" for json_file in json_files:\n", | ||
" file_path = os.path.join(directory, json_file)\n", | ||
" try:\n", | ||
" with open(file_path, \"r\", encoding=\"utf-8\") as f:\n", | ||
" json.load(f)\n", | ||
" print(f\"{json_file}: Valid JSON\")\n", | ||
" except json.JSONDecodeError as e:\n", | ||
" print(f\"{json_file}: Invalid JSON - {e}\")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 23, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"Processed ../data/Distribution20241221_all/ichikara-instruction-003-001-1.json -> ../data/Distribution20241221_all_preprocessed/ichikara-instruction-003-001-1.json\n", | ||
"Processed ../data/Distribution20241221_all/ichikara-instruction-003-001-2.1.json -> ../data/Distribution20241221_all_preprocessed/ichikara-instruction-003-001-2.1.json\n", | ||
"Processed ../data/Distribution20241221_all/ichikara-instruction-003-001-2.2.json -> ../data/Distribution20241221_all_preprocessed/ichikara-instruction-003-001-2.2.json\n", | ||
"Processed ../data/Distribution20241221_all/ichikara-instruction-003-001-5.1.json -> ../data/Distribution20241221_all_preprocessed/ichikara-instruction-003-001-5.1.json\n", | ||
"Processed ../data/Distribution20241221_all/ichikara-instruction-003-001-5.2.json -> ../data/Distribution20241221_all_preprocessed/ichikara-instruction-003-001-5.2.json\n", | ||
"Processed ../data/Distribution20241221_all/ichikara-instruction-003-002-1.json -> ../data/Distribution20241221_all_preprocessed/ichikara-instruction-003-002-1.json\n", | ||
"Processed ../data/Distribution20241221_all/ichikara-instruction-003-003-1.json -> ../data/Distribution20241221_all_preprocessed/ichikara-instruction-003-003-1.json\n", | ||
"ichikara-instruction-003-001-1.json: Valid JSON\n", | ||
"ichikara-instruction-003-001-2.1.json: Valid JSON\n", | ||
"ichikara-instruction-003-001-2.2.json: Valid JSON\n", | ||
"ichikara-instruction-003-001-5.1.json: Valid JSON\n", | ||
"ichikara-instruction-003-001-5.2.json: Valid JSON\n", | ||
"ichikara-instruction-003-002-1.json: Valid JSON\n", | ||
"ichikara-instruction-003-003-1.json: Valid JSON\n" | ||
] | ||
}, | ||
{ | ||
"data": { | ||
"application/vnd.jupyter.widget-view+json": { | ||
"model_id": "27c61dea8c7b40338481097e3e032852", | ||
"version_major": 2, | ||
"version_minor": 0 | ||
}, | ||
"text/plain": [ | ||
"Generating train split: 0 examples [00:00, ? examples/s]" | ||
] | ||
}, | ||
"metadata": {}, | ||
"output_type": "display_data" | ||
}, | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"DatasetDict({\n", | ||
" train: Dataset({\n", | ||
" features: ['ID', 'text', 'output'],\n", | ||
" num_rows: 6701\n", | ||
" })\n", | ||
"})" | ||
] | ||
}, | ||
"execution_count": 23, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"import os\n", | ||
"import re\n", | ||
"from datasets import load_dataset\n", | ||
"\n", | ||
"original_dir = \"../data/Distribution20241221_all\"\n", | ||
"preprocessed_dir = \"../data/Distribution20241221_all_preprocessed\"\n", | ||
"\n", | ||
"\n", | ||
"def preprocess_ichikara_instruction(original_dir: str, preprocessed_dir: str):\n", | ||
" if not os.path.exists(preprocessed_dir):\n", | ||
" os.makedirs(preprocessed_dir)\n", | ||
"\n", | ||
" data_files = [\n", | ||
" os.path.join(original_dir, f)\n", | ||
" for f in os.listdir(original_dir)\n", | ||
" if f.endswith(\".json\")\n", | ||
" ]\n", | ||
"\n", | ||
" invalid_escape_pattern = re.compile(r\"\\\\(?![\\\"\\\\/bfnrt]|u[0-9a-fA-F]{4})\")\n", | ||
"\n", | ||
" for file_path in data_files:\n", | ||
" try:\n", | ||
" with open(file_path, \"r\", encoding=\"utf-8\") as file:\n", | ||
" content = file.read()\n", | ||
" # Replace invalid escape sequences\n", | ||
" content = invalid_escape_pattern.sub(r\"\\\\\\\\\", content)\n", | ||
" # Replace \\\\\" with \\\"\n", | ||
" content = content.replace('\\\\\\\\\"', '\\\\\"')\n", | ||
"\n", | ||
" preprocessed_file_path = os.path.join(\n", | ||
" preprocessed_dir, os.path.basename(file_path)\n", | ||
" )\n", | ||
" with open(\n", | ||
" preprocessed_file_path, \"w\", encoding=\"utf-8\"\n", | ||
" ) as preprocessed_file:\n", | ||
" preprocessed_file.write(content)\n", | ||
"\n", | ||
" print(f\"Processed {file_path} -> {preprocessed_file_path}\")\n", | ||
" except Exception as e:\n", | ||
" print(f\"Error processing file {file_path}: {e}\")\n", | ||
"\n", | ||
"\n", | ||
"# Run the preprocessing function\n", | ||
"preprocess_ichikara_instruction(original_dir, preprocessed_dir)\n", | ||
"validate_json_files(preprocessed_dir)\n", | ||
"load_dataset(\"json\", data_files=f\"{preprocessed_dir}/*.json\")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": ".venv", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.10.12" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} |
45 changes: 45 additions & 0 deletions
45
computer-science/machine-learning/_src/llm-exercises/src/instruction_datasets.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
from datasets import dataset_dict, load_dataset | ||
|
||
|
||
def load_ichikara_instruction_003_001_1() -> dataset_dict.Dataset: | ||
dataset = load_dataset( | ||
"json", | ||
data_files="data/Distribution20241221_all_preprocessed/ichikara-instruction-003-001-1.json", | ||
) | ||
|
||
return dataset["train"] | ||
|
||
|
||
def load_ichikara_instruction_all() -> dataset_dict.Dataset: | ||
dataset = load_dataset( | ||
"json", | ||
data_files="data/Distribution20241221_all_preprocessed/*.json", # avoid README.md | ||
) | ||
return dataset["train"] | ||
|
||
|
||
def load_elyza_tasks_100() -> dataset_dict.Dataset: | ||
dataset = load_dataset("elyza/ELYZA-tasks-100") | ||
return dataset["test"] | ||
|
||
|
||
def load_elyza_tasks_100_TV() -> dataset_dict.Dataset: | ||
dataset = load_dataset( | ||
"json", | ||
data_files="data/elyza-tasks-100-TV_0.jsonl", | ||
) | ||
return dataset["train"] | ||
|
||
|
||
INSTRUCTION_DATASETS = { | ||
"ichikara-instruction-003-001-1": load_ichikara_instruction_003_001_1, | ||
"ichikara-instruction-all": load_ichikara_instruction_all, | ||
"elyza/ELYZA-tasks-100": load_elyza_tasks_100, | ||
"elyza-tasks-100-TV_0": load_elyza_tasks_100_TV, | ||
} | ||
|
||
|
||
if __name__ == "__main__": | ||
for dataset_name, load_func in INSTRUCTION_DATASETS.items(): | ||
dataset = load_func() | ||
print(dataset) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.