Skip to content

Commit

Permalink
sorry to do this all in one fucking commit
Browse files Browse the repository at this point in the history
updated gitignore

kudasai now sends the json name to kairyou

updated requirements

added bulk charsets to katakana handler
added is punctuation and adjusted some functions and docstrings

added sudachiHandler

added sudachi based system.dlc
added katakana_words

adjusted how kairyou builds the nlp
added files to preloader

Signed-off-by: Bikatr7 <Tetralon07@gmail.com>
  • Loading branch information
Bikatr7 committed Nov 6, 2023
1 parent b36e100 commit b688adf
Show file tree
Hide file tree
Showing 8 changed files with 13,817 additions and 29 deletions.
6 changes: 5 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
Expand Down Expand Up @@ -105,3 +104,8 @@ KudasaiOutput.zip
.vs/ProjectSettings.json
.vs/VSWorkspaceState.json
.vs/Kudasai/v17/.wsuo
lib/dicts/TEIS Replacements.json_sudachi_dict.dic
lib/dicts/TEIS Replacements.json_sudachi_dict.csv
lib/dicts/Cote Replacements.json_sudachi_dict.dic
lib/dicts/Cote Replacements.json_sudachi_dict.csv
lib/dicts/system.dic
17 changes: 11 additions & 6 deletions Kudasai.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ def __init__(self) -> None:

##-------------------start-of-setup_kairyou_for_cli()---------------------------------------------------------------------------------------------------------------------------------------------------------------------------

def setup_kairyou_for_cli(self, input_file, replacement_json) -> None:
def setup_kairyou_for_cli(self, input_file, replacement_json_path) -> None:

"""
Expand All @@ -86,7 +86,7 @@ def setup_kairyou_for_cli(self, input_file, replacement_json) -> None:

try:

with open(replacement_json, 'r', encoding='utf-8') as file:
with open(replacement_json_path, 'r', encoding='utf-8') as file:
replacement_json = json.load(file)

except:
Expand All @@ -98,11 +98,16 @@ def setup_kairyou_for_cli(self, input_file, replacement_json) -> None:

exit()

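## get the file name of the replacement json (the final component of its path, extension included)
## Example: "jsons/86 Replacements.json" would return "86 Replacements.json"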
name_of_replacement_json = os.path.basename(replacement_json_path)



with open(input_file, 'r', encoding='utf-8') as file:
japanese_text = file.read()

self.kairyou_client = Kairyou(replacement_json, japanese_text, self.preloader)
self.kairyou_client = Kairyou(replacement_json, japanese_text, self.preloader, name_of_replacement_json)

##-------------------start-of-setup_kairyou_for_console()---------------------------------------------------------------------------------------------------------------------------------------------------------------------------

Expand All @@ -126,13 +131,13 @@ def setup_kairyou_for_console(self) -> None:

self.preloader.toolkit.clear_console()

replacement_json = input("Please enter the path to the replacement json file:\n").strip('"')
replacement_json_path = input("Please enter the path to the replacement json file:\n").strip('"')

self.preloader.toolkit.clear_console()

try:

with open(replacement_json, 'r', encoding='utf-8') as file:
with open(replacement_json_path, 'r', encoding='utf-8') as file:
replacement_json = json.load(file)

except:
Expand All @@ -148,7 +153,7 @@ def setup_kairyou_for_console(self) -> None:
with open(input_file, 'r', encoding='utf-8') as file:
japanese_text = file.read()

self.kairyou_client = Kairyou(replacement_json, japanese_text, self.preloader)
self.kairyou_client = Kairyou(replacement_json, japanese_text, self.preloader, os.path.basename(replacement_json_path))

##-------------------start-of-run_kudasai_console()---------------------------------------------------------------------------------------------------------------------------------------------------------------------------

Expand Down
71 changes: 61 additions & 10 deletions handlers/katakanaHandler.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,44 @@
## built-in libraries
from __future__ import annotations ## used for cheating the circular import issue that occurs when i need to type check some things

import string
import typing

## custom modules
if(typing.TYPE_CHECKING): ## used for cheating the circular import issue that occurs when i need to type check some things
from models.Kairyou import Name

## Characters treated as katakana by the membership test in is_katakana_only().
## Lists U+30A0 ('゠') through U+30FE ('ヾ') of the Unicode Katakana block, so besides the kana themselves
## it contains the middle dot '・', the prolonged sound mark 'ー' and the iteration marks 'ヽ' / 'ヾ'.
## NOTE(review): U+30FF ('ヿ'), the last code point of the block, is not listed - confirm whether that is intentional.
## https://en.wikipedia.org/wiki/Katakana_(Unicode_block)
KATAKANA_CHARSET = {
'゠','ァ','ア','ィ','イ','ゥ','ウ','ェ','エ','ォ','オ','カ','ガ','キ','ギ','ク',
'グ','ケ','ゲ','コ','ゴ','サ','ザ','シ','ジ','ス','ズ','セ','ゼ','ソ','ゾ','タ',
'ダ','チ','ヂ','ッ','ツ','ヅ','テ','デ','ト','ド','ナ','ニ','ヌ','ネ','ノ','ハ',
'バ','パ','ヒ','ビ','ピ','フ','ブ','プ','ヘ','ベ','ペ','ホ','ボ','ポ','マ','ミ',
'ム','メ','モ','ャ','ヤ','ュ','ユ','ョ','ヨ','ラ','リ','ル','レ','ロ','ヮ','ワ',
'ヰ','ヱ','ヲ','ン','ヴ','ヵ','ヶ','ヷ','ヸ','ヹ','ヺ','・','ー','ヽ','ヾ'
}

## Punctuation unicode ranges:
## https://kairozu.github.io/updates/cleaning-jp-text
PUNCTUATION_CHARSET = {
' ','、','。','〃','〄','々','〆','〇','〈','〉','《','》','「','」','『','』',
'【','】','〒','〓','〔','〕','〖','〗','〘','〙','〚','〛','〜','〝','〞','〟',
'〠','〡','〢','〣','〤','〥','〦','〧','〨','〩','〪','〫','〬','〭','〮','〯',
'〰','〱','〲','〳','〴','〵','〶','〷','〸','〹','〺','〻','〼','〽','〾','〿',
'!','"','#','$','%','&',''','(',')','*','+',',','-','.','/',':',
';','<','=','>','?','[','\',']','^','_','`','{','|','}','~','⦅',
'⦆','。','「','」','、','・','ー','※',' ',' ',' ',' ',
' ',' ',' ',' ',' ',' ',' ',
'​','‌','‍','‎','‏','‐','‑','‒','–','—',
'―','‖','‗','‘','’','‚','‛','“','”','„','‟','†','‡','•','‣','․','‥','…','‧',
'
','
','‪','‫','‬','‭','‮',
' ','‰','‱','′','″','‴','‵','‶','‷','‸','‹','›','※','‼','‽','‾','‿',
'⁀','⁁','⁂','⁃','⁄','⁅','⁆','⁇','⁈','⁉','⁊','⁋','⁌','⁍','⁎','⁏','⁐','⁑','⁒',
'⁓','⁔','⁕','⁖','⁗','⁘','⁙','⁚','⁛','⁜','⁝','⁞',' ','⁠',
'⁦','⁧','⁨','⁩','«','»','×',"△","▼"
} | set(string.punctuation) ## EN punctuation set

##--------------------start-of-katakanaHandler------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

class katakanaHandler:

Expand Down Expand Up @@ -56,22 +88,22 @@ def load_katakana_words(self, katakana_lib_file:str) -> None:

##--------------------start-of-is_katakana_only()------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

def is_katakana_only(self, word:str) -> bool:
def is_katakana_only(self, string:str) -> bool:

"""
Checks if the word is only katakana.\n
Checks if the string is only katakana.\n
Parameters:\n
self (object - katakanaHandler) : the katakanaHandler object.\n
word (str) : the word to check.\n
string (str) : the string to check.\n
Returns:\n
bool : True if the string is only katakana, False otherwise.\n
"""

return all('ァ' <= char <= 'ヴ' or char == 'ー' for char in word)
return all([char in KATAKANA_CHARSET for char in string])

##--------------------start-of-get_katakana_entities()------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

Expand All @@ -97,19 +129,38 @@ def is_actual_word(self, jap:str) -> bool:

"""
Checks if the given jap is an actual katakana word.\n
Checks if the given jap is an actual katakana word.
Parameters:\n
self (object - katakanaHandler) : the katakanaHandler object
self (object - katakanaHandler) : the katakanaHandler object.
jap (str) : the katakana word to check.
Returns:
bool : True if the word is an actual katakana word, False otherwise.\n
bool : True if the word is an actual katakana word, False otherwise.
"""

if(jap in self.katakana_words):
return True

else:
return False
return False

##--------------------start-of-is_punctuation()------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

def is_punctuation(self, string:str) -> bool:

    """

    Checks if the given string consists solely of punctuation characters.

    Parameters:
    self (object - katakanaHandler) : the katakanaHandler object.
    string (str) : the string to check.

    Returns:
    bool : True if every character of the string is in PUNCTUATION_CHARSET, False otherwise. An empty string yields True, since there is no character that fails the check.

    """

    ## generator expression instead of a list, so all() can stop at the first non-punctuation character
    return all(char in PUNCTUATION_CHARSET for char in string)
Loading

0 comments on commit b688adf

Please sign in to comment.