-
-
Notifications
You must be signed in to change notification settings - Fork 94
/
Copy pathutils.py
56 lines (40 loc) · 1.4 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
import pandas as pd
import unicodedata
from manga_ocr_dev.env import ASSETS_PATH, FONTS_ROOT
def get_background_df(background_dir):
    """Build a table describing every background file found in a directory.

    Each entry's file stem must end with four underscore-separated integers,
    ``..._<ymin>_<ymax>_<xmin>_<xmax>``, from which the height, width and
    width/height ratio are derived.

    Args:
        background_dir: Object exposing ``iterdir()`` that yields path-like
            entries with a ``stem`` attribute (e.g. a ``pathlib.Path``).

    Returns:
        A ``pandas.DataFrame`` with the columns ``path``, ``h``, ``w`` and
        ``ratio``, one row per directory entry. The columns are present even
        when the directory is empty.

    Raises:
        ValueError: If a stem does not end with four integer fields.
        ZeroDivisionError: If an entry has ``ymax == ymin``.
    """
    columns = ["path", "h", "w", "ratio"]
    rows = []
    for path in background_dir.iterdir():
        ymin, ymax, xmin, xmax = (int(v) for v in path.stem.split("_")[-4:])
        h = ymax - ymin
        w = xmax - xmin
        rows.append({"path": str(path), "h": h, "w": w, "ratio": w / h})
    # Passing the columns explicitly keeps the schema stable for an empty
    # directory; without it the frame would have no columns at all.
    return pd.DataFrame(rows, columns=columns)
def is_kanji(ch):
    """Return True if the Unicode name of ``ch`` marks it as a CJK unified ideograph.

    Args:
        ch: A single-character string.

    Returns:
        ``True`` when the character's Unicode name contains
        ``"CJK UNIFIED IDEOGRAPH"``; ``False`` otherwise, including for
        characters that have no Unicode name (e.g. control characters).
    """
    # The "" default avoids the ValueError that unicodedata.name raises for
    # unnamed code points.
    return "CJK UNIFIED IDEOGRAPH" in unicodedata.name(ch, "")
def is_hiragana(ch):
    """Return True if the Unicode name of ``ch`` contains ``"HIRAGANA"``.

    This is a substring test on the character name, so shared marks such as
    U+30FC (``KATAKANA-HIRAGANA PROLONGED SOUND MARK``) also match.

    Args:
        ch: A single-character string.

    Returns:
        ``True`` when the name contains ``"HIRAGANA"``; ``False`` otherwise,
        including for characters that have no Unicode name.
    """
    # The "" default avoids the ValueError that unicodedata.name raises for
    # unnamed code points.
    return "HIRAGANA" in unicodedata.name(ch, "")
def is_katakana(ch):
    """Return True if the Unicode name of ``ch`` contains ``"KATAKANA"``.

    This is a substring test on the character name, so half-width forms and
    shared marks such as U+30FC (``KATAKANA-HIRAGANA PROLONGED SOUND MARK``)
    also match.

    Args:
        ch: A single-character string.

    Returns:
        ``True`` when the name contains ``"KATAKANA"``; ``False`` otherwise,
        including for characters that have no Unicode name.
    """
    # The "" default avoids the ValueError that unicodedata.name raises for
    # unnamed code points.
    return "KATAKANA" in unicodedata.name(ch, "")
def is_ascii(ch):
    """Return True if the code point of ``ch`` is below 128.

    Args:
        ch: A single-character string.
    """
    code_point = ord(ch)
    return code_point < 0x80
def get_charsets(vocab_path=None):
    """Load the vocabulary and derive its hiragana and katakana subsets.

    Args:
        vocab_path: CSV file with a ``char`` column. When ``None``,
            ``ASSETS_PATH / "vocab.csv"`` is read instead.

    Returns:
        A ``(vocab, hiragana, katakana)`` tuple. ``vocab`` holds the values
        of the ``char`` column. ``hiragana`` holds the entries accepted by
        ``is_hiragana`` minus the last six. ``katakana`` holds the entries
        accepted by ``is_katakana`` minus the first three.
    """
    csv_path = ASSETS_PATH / "vocab.csv" if vocab_path is None else vocab_path
    vocab = pd.read_csv(csv_path).char.values

    hiragana_mask = [is_hiragana(c) for c in vocab]
    katakana_mask = [is_katakana(c) for c in vocab]

    # NOTE(review): the slice bounds presumably trim specific entries and so
    # depend on the ordering of the vocab file — confirm against vocab.csv.
    hiragana = vocab[hiragana_mask][:-6]
    katakana = vocab[katakana_mask][3:]
    return vocab, hiragana, katakana
def get_font_meta():
    """Load the font table and map each font path to its supported characters.

    Reads ``ASSETS_PATH / "fonts.csv"`` and rewrites its ``font_path`` column
    so every entry is the string form of ``FONTS_ROOT / <original value>``.

    Returns:
        A ``(df, font_map)`` tuple. ``df`` is the loaded frame with the
        rewritten ``font_path`` column. ``font_map`` maps each ``font_path``
        to the set of elements of that row's ``supported_chars``, built only
        from rows that contain no missing values.
    """
    df = pd.read_csv(ASSETS_PATH / "fonts.csv")
    resolved_paths = df.font_path.apply(lambda rel: str(FONTS_ROOT / rel))
    df["font_path"] = resolved_paths

    font_map = {}
    # dropna() skips rows with any missing value in any column.
    for record in df.dropna().itertuples():
        font_map[record.font_path] = set(record.supported_chars)
    return df, font_map