forked from christofferaakre/japanese-ebook-analysis
-
Notifications
You must be signed in to change notification settings - Fork 0
/
utils.py
130 lines (116 loc) · 4.03 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
import base64
import subprocess
import re
import json
from pathlib import Path
def get_all_subdirs(path: str) -> list:
"""
Returns a list of all the subdirectories that exist
in the given directory
Arguments:
path: str - Path of directory to search in
"""
p = Path(path)
return [x for x in p.iterdir() if x.is_dir()]
def get_books() -> list:
"""
Returns a list of the data for all the
analysed ebooks
"""
book_dirs = get_all_subdirs('static/books')
books = []
for book_dir in book_dirs:
filename = f'{book_dir}/book_data.json'
with open(filename) as file:
book_data = json.load(file)
books.append(book_data)
return books
def get_book(hash: str) -> object:
"""
Returns the data for the book with the given hash.
Arguments:
hash: str - The sha256 sum hash for the epub file
"""
with open(f'static/books/{hash}/book_data.json') as file:
book_data = json.load(file)
return book_data
def save_base64_image(base64string: str, filename: str) -> str:
"""
Decodes the given base64 string and saves it as an image with the specified
filename. Returns the filename.
Arguments:
base64string: str - The base64 string encoded string
filename: str - The filename you want to save the image to
"""
image_data = base64.b64decode(base64string)
with open(filename, 'wb') as file:
file.write(image_data)
return filename
def remove_ruby_text_from_epub(
filename: str,
new_filename: str = None
) -> str:
"""
Removes all ruby text from the given .epub file.
Requires furigana4epub to be installed and in system path
Returns the path of the new file
(pip install furigana4epub).
Arguments:
filename: str - The filename for the epub file to remove ruby text
from
"""
if not new_filename:
new_filename = filename
name = filename.replace('.epub', '')
subprocess.run(f'furigana4epub -d {filename}', shell=True)
subprocess.run(f'mv {name}_no_furigana.epub {new_filename}', shell=True)
return new_filename
def convert_epub_to_txt(filename: str,
process_text: bool = False
) -> str:
"""
NOTE: requires pandoc to be in system path.
Converts the given epub file to a .txt file with
the same filename (minus extension) in the same directory.
Returns the filename of the .txt file
Arguments:
filename : str - The filename of the .epub file you want to convert
process_text: bool (optional, default = False) - Process
the text to filter out any non-japanese text using a regex
"""
txt_filename = filename.replace('.epub', '.txt')
subprocess.run(f'./convert_epub_to_txt {filename}', shell=True)
if process_text:
with open(txt_filename, 'r', encoding='utf-8') as file:
text = file.read()
with open(txt_filename, 'w', encoding='utf-8') as file:
file.write(process_japanese_text(text))
return txt_filename
def process_japanese_text(text: str) -> str:
"""
Filters out any text that is not japanese using a regex,
and returns the processed text
Arguments:
text: str - The text to process
"""
regex = r"[\u3000-\u303F]|[\u3040-\u309F]|[\u30A0-\u30FF]|[\uFF00-\uFFEF]|[\u4E00-\u9FAF]|[\u2605-\u2606]|[\u2190-\u2195]|\u203B"
processed = ''.join(re.findall(regex, text)).replace('\u3000', '')
return processed
def parse_sentence(sentence: str, mt) -> list:
"""
Parses the given sentence into a list of words using mecab.
Arguments:
sentence: str - The sentence you want to parse
mt - A mecab tagger. Create using MeCab.Tagger
"""
filter_str = r'()./,!:?\uksa0123456789\t\r\s .'
parsed = mt.parseToNode(sentence)
words = []
while parsed:
word = re.sub(r'\s+', '', parsed.surface)
for char in filter_str:
word = word.replace(char, '')
if len(word) >= 1:
words.append(word)
parsed = parsed.next
return words