Skip to content

Commit

Permalink
added a chapter splitter utility function
Browse files Browse the repository at this point in the history
  • Loading branch information
Bikatr7 committed Dec 28, 2024
1 parent 7e0648c commit 2b5a40e
Show file tree
Hide file tree
Showing 2 changed files with 100 additions and 0 deletions.
10 changes: 10 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
- [**Translator Settings**](#translator-settings)
- [**Web GUI**](#web-gui)
- [**Hugging Face**](#hugging-face)
- [**Utility Scripts**](#utility-scripts)
- [**License**](#license)
- [**Contact**](#contact)
- [**Acknowledgements**](#acknowledgements)
Expand Down Expand Up @@ -329,6 +330,15 @@ It's a bit slower than running it locally, but it's a good alternative for those

To see the README for the Hugging Face hosted version of Kudasai, please see [here](https://huggingface.co/spaces/Bikatr7/Kudasai/blob/main/README.md).

---------------------------------------------------------------------------------------------------------------------------------------------------
## **Utility Scripts**<a name="utility-scripts"></a>

There are a few utility scripts included in the util folder. They are not required for Kudasai to run, but they can be useful for certain tasks.

splitter.py : Splits a translated text file into chapters based on 〇 markers and additional rules. (Made mostly for COTE.)

token_counter.py : Counts the number of tokens in a text file, as well as estimating the cost of translation.

---------------------------------------------------------------------------------------------------------------------------------------------------
## **License**<a name="license"></a>

Expand Down
90 changes: 90 additions & 0 deletions util/splitter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
import regex
import os

def sanitize_filename(title, max_length=50):
    """Turn a chapter title into a string that is safe to use in a filename.

    The 〇 marker is removed and surrounding whitespace is stripped. Every
    character that is not alphanumeric (per ``str.isalnum``), a space, an
    underscore, or a hyphen is then dropped, each run of whitespace becomes
    a single underscore, and the result is cut to ``max_length`` characters
    with any trailing underscores removed.

    Args:
        title: The raw chapter title line.
        max_length: Maximum number of characters kept before trailing
            underscores are stripped.

    Returns:
        The sanitized title; may be an empty string.
    """
    allowed_extras = {' ', '_', '-'}

    ## Drop the chapter marker and the surrounding whitespace first
    trimmed = title.replace('〇', '').strip()

    ## Filter out anything that is not a letter/number or allowed punctuation
    kept = [ch for ch in trimmed if ch.isalnum() or ch in allowed_extras]

    ## Collapse each whitespace run into one underscore
    collapsed = regex.sub(r'\s+', '_', "".join(kept))

    ## Truncate, then make sure the cut did not leave a dangling underscore
    shortened = collapsed[:max_length]
    return shortened.rstrip('_')

def is_chapter_marker(line):
    """Tell whether a line opens a chapter.

    A line counts as a chapter marker when it begins with 〇, followed by
    optional whitespace, followed by an ASCII letter or a Han, Hiragana, or
    Katakana character.

    Args:
        line: A single line of text.

    Returns:
        True if the line matches the marker pattern, False otherwise.
    """
    ## 〇, then any amount of whitespace (including none), then English or Japanese text
    marker = regex.compile(r'^〇\s*[A-Za-z\p{Han}\p{Hiragana}\p{Katakana}]')
    return marker.match(line) is not None

def get_raw_chapters(preprocessed_file):
    """Collect the chapter titles listed before the first 〇 line.

    Reads the preprocessed text line by line and stops at the first line
    that (after stripping whitespace) starts with 〇. Every non-empty line
    seen before that point is returned, except lines that equal "toc"
    case-insensitively.

    Args:
        preprocessed_file: Path to the preprocessed text file.

    Returns:
        A list of stripped title lines, in file order. Empty if the file
        has no such lines.
    """
    chapters = []

    ## utf-8-sig reads plain UTF-8 unchanged but drops a leading BOM, which
    ## would otherwise stick to the first line and defeat the "toc" check.
    with open(preprocessed_file, 'r', encoding='utf-8-sig') as f:
        for line in f:
            line = line.strip()

            ## Everything from the first 〇 line onward is chapter content
            if(line.startswith('〇')):
                break

            ## Skip blank lines and the table-of-contents heading itself
            if(not line or line.lower() == 'toc'):
                continue

            chapters.append(line)

    return chapters

def _write_chapter(output_dir, chapter_num, chapter_lines):
    """Write one chapter to its own file and return its chapter-list entry.

    The first element of ``chapter_lines`` is the chapter's marker line and
    is used as the title for both the filename and the returned entry.
    """
    chapter_title = chapter_lines[0]
    filename = f"{chapter_num:02d}_{sanitize_filename(chapter_title)}.txt"

    with open(os.path.join(output_dir, filename), 'w', encoding='utf-8') as chapter_file:
        chapter_file.writelines(chapter_lines)

    return f"{chapter_num:02d}. {chapter_title.strip()}"


def split_chapters(preprocessed_file, translated_file, output_dir='chapters'):
    """Split a translated text file into one file per chapter.

    A new chapter starts at every line accepted by ``is_chapter_marker``.
    Lines before the first marker are discarded. Each chapter is written to
    ``<output_dir>/<NN>_<sanitized title>.txt``, and a ``chapter_list.txt``
    summary is written alongside them, listing the titles found in the
    preprocessed text and the chapters actually split out.

    Args:
        preprocessed_file: Path to the preprocessed text, passed to
            ``get_raw_chapters`` to obtain the raw chapter titles.
        translated_file: Path to the translated text to split.
        output_dir: Directory that receives the output files; created if
            it does not exist.
    """
    os.makedirs(output_dir, exist_ok=True)

    raw_chapters = get_raw_chapters(preprocessed_file)

    current_chapter = []
    chapter_num = 1
    chapter_names = []

    ## utf-8-sig reads plain UTF-8 unchanged but drops a leading BOM; without
    ## it a marker on the very first line would not start with 〇 and that
    ## chapter would be silently lost.
    with open(translated_file, 'r', encoding='utf-8-sig') as translated:
        for line in translated:
            if(is_chapter_marker(line)):
                ## Flush the chapter collected so far before starting the next
                if(current_chapter):
                    chapter_names.append(_write_chapter(output_dir, chapter_num, current_chapter))
                    chapter_num += 1

                ## Start new chapter
                current_chapter = [line]
            elif(current_chapter):
                ## Only collect lines once the first marker has been seen
                current_chapter.append(line)

    ## Save final chapter
    if(current_chapter):
        chapter_names.append(_write_chapter(output_dir, chapter_num, current_chapter))

    ## Write chapter info to a file
    with open(os.path.join(output_dir, 'chapter_list.txt'), 'w', encoding='utf-8') as summary_file:
        summary_file.write("EXPERIMENTAL CHAPTER SPLIT - Please report any perceived issues to your translation head\n\n")
        summary_file.write("Raw chapters detected from preprocessed text:\n")
        summary_file.write("\n".join(raw_chapters))
        summary_file.write("\n\nChapters split from translated text:\n")
        summary_file.write('\n'.join(chapter_names))

if __name__ == "__main__":
    import sys

    ## Expect exactly two positional arguments after the script name
    cli_args = sys.argv[1:]
    if(len(cli_args) != 2):
        print("Usage: python splitter.py <preprocessed_text.txt> <translated_text.txt>")
        sys.exit(1)

    preprocessed_file, translated_file = cli_args

    ## Chapters are written to the default output directory
    split_chapters(preprocessed_file, translated_file)

0 comments on commit 2b5a40e

Please sign in to comment.