diff --git a/README.md b/README.md index 7e36452..e44e539 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,7 @@ The pipeline employs a context layer, multiple transformer and n-gram morpho-ort and an autoregressive recurrent neural transformer base. The current implementation offers state-of-the-art accuracy for out-of-vocabulary (OOV) words, as well as contextual analysis for correct inferencing of [English Heteronyms](https://en.wikipedia.org/wiki/Heteronym_(linguistics)). -The package is offered in a pre-trained state that is ready for use as a dependency or in +The package is offered in a pre-trained state that is ready for [usage](#Usage) as a dependency or in notebook environments. There are no additional resources needed, other than the model checkpoint which is automatically downloaded on the first usage. See [Installation](#Installation) more information. @@ -68,6 +68,8 @@ pip install aquila-resolve ## Usage +### 1. Module + ```python from Aquila_Resolve import G2p @@ -77,13 +79,29 @@ g2p.convert('The book costs $5, will you read it?') # >> '{DH AH0} {B UH1 K} {K AA1 S T S} {F AY1 V} {D AA1 L ER0 Z}, {W IH1 L} {Y UW1} {R IY1 D} {IH1 T}?' ``` -> Additional optional parameters are available when defining a `G2p` instance: +> Optional parameters when defining a `G2p` instance: | Parameter | Default | Description | |-------------------|---------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | `device` | `'cpu'` | Device for Pytorch inference model. GPU is supported using `'cuda'` | + +> Optional parameters when calling `convert`: + +| Parameter | Default | Description | +|-------------------|---------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | `process_numbers` | `True` | Toggles conversion of some numbers and symbols to their spoken pronunciation forms. See [numbers.py](src/Aquila_Resolve/text/numbers.py) for details on what is covered. | +### 2. Command Line + +A simple wrapper for text conversion is available through the `aquila-resolve` command +``` +~ +❯ aquila-resolve +✔ Aquila Resolve v0.1.2 +? Text to convert: I read the book, did you read it? +{AY1} {R EH1 D} {DH AH0} {B UH1 K}, {D IH1 D} {Y UW1} {R IY1 D} {IH1 T}? +``` + ## Model Architecture In evaluation[^1], neural G2P models have traditionally been extremely sensitive to orthographical variations diff --git a/requirements-dev.txt b/requirements-dev.txt index 239b8b4..4c36ccd 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -7,5 +7,7 @@ torch~=1.11.0 inflect>=2.1.0 requests>=2.23.0 numpy>=1.18.0 +inquirerpy>=0.3.3 +yaspin>=2.1.0 pytest>=7.1.2 pytest_mock>=3.7.0 \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 5ba0268..6325b69 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,4 +6,6 @@ pywordsegment>=0.2.1 torch~=1.11.0 inflect>=2.1.0 requests>=2.23.0 -numpy>=1.18.0 \ No newline at end of file +numpy>=1.18.0 +inquirerpy>=0.3.3 +yaspin>=2.1.0 \ No newline at end of file diff --git a/setup.cfg b/setup.cfg index 97c127a..45be0bd 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,9 +1,9 @@ [metadata] name = Aquila-Resolve -version = 0.1.2 +version = 0.1.3 author = ionite author_email = dev@ionite.io -description = Augmented Recurrent Neural Grapheme-to-Phoneme conversion with Inflectional Orthography. +description = Augmented Neural English G2p converter with Inflectional Orthography. long_description = file: README.md long_description_content_type = text/markdown url = https://github.com/ionite34/Aquila-Resolve @@ -32,6 +32,8 @@ install_requires = inflect>=2.1.0 requests>=2.23.0 numpy>=1.18.0 + inquirerpy>=0.3.3 + yaspin>=2.1.0 zip_safe = False include_package_data = True @@ -39,4 +41,8 @@ include_package_data = True * = *.json, *.json.gz [options.packages.find] -where = src \ No newline at end of file +where = src + +[options.entry_points] +console_scripts = + aquila-resolve = Aquila_Resolve.cli:main_menu \ No newline at end of file diff --git a/src/Aquila_Resolve/__init__.py b/src/Aquila_Resolve/__init__.py index fb63580..03035f9 100644 --- a/src/Aquila_Resolve/__init__.py +++ b/src/Aquila_Resolve/__init__.py @@ -4,7 +4,7 @@ Grapheme to Phoneme Resolver """ -__version__ = "0.1.2" +__version__ = "0.1.3" from .g2p import G2p from .data.remote import download diff --git a/src/Aquila_Resolve/__main__.py b/src/Aquila_Resolve/__main__.py new file mode 100644 index 0000000..664934b --- /dev/null +++ b/src/Aquila_Resolve/__main__.py @@ -0,0 +1,4 @@ +from .cli import main_menu + +if __name__ == "__main__": # pragma: no cover + main_menu() diff --git a/src/Aquila_Resolve/cli/__init__.py b/src/Aquila_Resolve/cli/__init__.py new file mode 100644 index 0000000..17d57ec --- /dev/null +++ b/src/Aquila_Resolve/cli/__init__.py @@ -0,0 +1 @@ +from .cli import main_menu diff --git a/src/Aquila_Resolve/cli/cli.py b/src/Aquila_Resolve/cli/cli.py new file mode 100644 index 0000000..3a3d719 --- /dev/null +++ b/src/Aquila_Resolve/cli/cli.py @@ -0,0 +1,29 @@ +# CLI Entry Point +import Aquila_Resolve +from InquirerPy import inquirer +from InquirerPy.utils import color_print as cp +from yaspin import yaspin + + +def main_menu(): + """ + Aquila Resolve Entry Point + """ + g2p_convert() + exit(0) + + +def g2p_convert(): # pragma: no cover + """ + G2P Conversion + """ + with yaspin('Initializing Aquila Resolve Backend...', color='yellow') as sp: + g2p = Aquila_Resolve.G2p() + sp.ok(f'✔ Aquila Resolve v{Aquila_Resolve.__version__}') + + while True: + text = inquirer.text("Text to convert:").execute() + if not text: + return + result = g2p.convert(text) + cp([('yellow', f'{result}')]) diff --git a/src/Aquila_Resolve/data/__init__.py b/src/Aquila_Resolve/data/__init__.py index 8bff045..ffd02b1 100644 --- a/src/Aquila_Resolve/data/__init__.py +++ b/src/Aquila_Resolve/data/__init__.py @@ -9,3 +9,6 @@ DATA_PATH = files(__name__) +CMU_FILE = DATA_PATH.joinpath("cmudict.json.gz") +HET_FILE = DATA_PATH.joinpath("heteronyms.json") +PT_FILE = DATA_PATH.joinpath("model.pt") diff --git a/src/Aquila_Resolve/data/remote.py b/src/Aquila_Resolve/data/remote.py index 0732fd2..ee7d790 100644 --- a/src/Aquila_Resolve/data/remote.py +++ b/src/Aquila_Resolve/data/remote.py @@ -1,6 +1,7 @@ # Access and checks for remote data import requests import shutil +import nltk from warnings import warn from tqdm.auto import tqdm from . import DATA_PATH @@ -54,6 +55,20 @@ def ensure_download() -> None: "Aquila_Resolve/data/ folder.") +def ensure_nltk() -> None: # pragma: no cover + """Ensures all required NLTK Data is installed""" + required = { + 'wordnet': 'corpora/wordnet.zip', + 'omw-1.4': 'corpora/omw-1.4.zip', + 'averaged_perceptron_tagger': 'taggers/averaged_perceptron_tagger.zip', + } + for name, url in required.items(): + try: + nltk.data.find(url) + except LookupError: + nltk.download(name, raise_on_error=True) + + def check_updates() -> None: """Checks if the model matches the latest checksum""" if not check_model(): @@ -62,7 +77,13 @@ def check_updates() -> None: def get_checksum(file: str, block_size: int = 65536) -> str: - """Calculates the checksum of a file""" + """ + Calculates the Sha256 checksum of a file + + :param file: Path to file + :param block_size: Block size for reading + :return: Checksum of file + """ import hashlib s = hashlib.sha256() with open(file, 'rb') as f: diff --git a/src/Aquila_Resolve/g2p.py b/src/Aquila_Resolve/g2p.py index 4b99000..f9be4a5 100644 --- a/src/Aquila_Resolve/g2p.py +++ b/src/Aquila_Resolve/g2p.py @@ -4,7 +4,6 @@ from functools import lru_cache import pywordsegment -import nltk from nltk.stem import WordNetLemmatizer from nltk.stem.snowball import SnowballStemmer @@ -17,27 +16,21 @@ from .processors import Processor from .infer import Infer from .symbols import contains_alpha, valid_braces +from .data.remote import ensure_nltk re_digit = re.compile(r"\((\d+)\)") re_bracket_with_digit = re.compile(r"\(.*\)") re_phonemes = re.compile(r'\{.*?}') -# Check that the nltk data is downloaded, if not, download it -try: - nltk.data.find('corpora/wordnet.zip') - nltk.data.find('corpora/omw-1.4.zip') -except LookupError: - nltk.download('wordnet') - nltk.download('omw-1.4') - class G2p: def __init__(self, device: str = 'cpu'): - # noinspection GrazieInspection """ - Grapheme to Phoneme conversion + Initialize the G2p converter. + :param device: Pytorch device. """ + ensure_nltk() # Ensure nltk data is downloaded self.dict = get_cmudict() # CMU Dictionary self.h2p = H2p(preload=True) # H2p parser self.lemmatize = WordNetLemmatizer().lemmatize # WordNet Lemmatizer - used to find singular form @@ -65,7 +58,6 @@ def __init__(self, device: str = 'cpu'): @lru_cache(maxsize=None) def lookup(self, text: str, pos: str = None) -> str | None: - # noinspection GrazieInspection """ Gets the CMU Dictionary entry for a word. @@ -134,7 +126,6 @@ def lookup(self, text: str, pos: str = None) -> str | None: return None def convert(self, text: str, convert_num: bool = True) -> str | None: - # noinspection GrazieInspection """ Replace a grapheme text line with phonemes. diff --git a/src/Aquila_Resolve/h2p.py b/src/Aquila_Resolve/h2p.py index 4be9df6..a43633f 100644 --- a/src/Aquila_Resolve/h2p.py +++ b/src/Aquila_Resolve/h2p.py @@ -1,19 +1,12 @@ from nltk.tokenize import TweetTokenizer from nltk import pos_tag from nltk import pos_tag_sents +from .data.remote import ensure_nltk from .dictionary import Dictionary from .filter import filter_text as ft from .text.replace import replace_first from . import format_ph as ph -# Check required nltk data exists, if not, download it -try: - from nltk.data import find - find('taggers/averaged_perceptron_tagger.zip') -except LookupError: # pragma: no cover - from nltk.downloader import download - download('averaged_perceptron_tagger', raise_on_error=True) - class H2p: def __init__(self, dict_path=None, preload=False, phoneme_format=''): @@ -29,6 +22,8 @@ def __init__(self, dict_path=None, preload=False, phoneme_format=''): :param preload: Preloads the tokenizer and tagger during initialization :type preload: bool """ + # Ensure nltk data is available + ensure_nltk() # Supported phoneme formats self.phoneme_format = phoneme_format @@ -87,9 +82,9 @@ def replace_het_list(self, text_list): # Get pos tags list tags_list = pos_tag_sents(list_sentence_words) # Loop through lines - for index in range(len(tags_list)): + for index, line in enumerate(tags_list): # Loop through words and pos tags in tags_list index - for word, pos in tags_list[index]: + for word, pos in line: # Skip if word not in dictionary if not self.dict.contains(word): continue diff --git a/src/Aquila_Resolve/infer.py b/src/Aquila_Resolve/infer.py index 04d384b..f9e0128 100644 --- a/src/Aquila_Resolve/infer.py +++ b/src/Aquila_Resolve/infer.py @@ -1,7 +1,7 @@ from __future__ import annotations from .models.dp.phonemizer import Phonemizer -from .data import DATA_PATH -from .data.remote import ensure_download, check_updates +from .data import PT_FILE +from .data.remote import ensure_download from .models import MODELS_PATH import sys @@ -11,9 +11,7 @@ class Infer: def __init__(self, device='cpu'): ensure_download() # Download checkpoint if necessary - check_updates() # Check for checkpoint updates - checkpoint_path = DATA_PATH.joinpath('model.pt') - self.model = Phonemizer.from_checkpoint(checkpoint_path, device=device) + self.model = Phonemizer.from_checkpoint(PT_FILE, device=device) self.lang = 'en_us' self.batch_size = 32 diff --git a/src/Aquila_Resolve/processors.py b/src/Aquila_Resolve/processors.py index 324581c..3de935a 100644 --- a/src/Aquila_Resolve/processors.py +++ b/src/Aquila_Resolve/processors.py @@ -1,6 +1,7 @@ # Transformations of text sequences for matching from __future__ import annotations from typing import TYPE_CHECKING +from collections import defaultdict import re @@ -18,25 +19,9 @@ def __init__(self, g2p: G2p): self._tag = g2p.h2p.tag self._stem = g2p.stem # Number of times respective methods were called - self.stat_hits = { - 'possessives': 0, - 'contractions': 0, - 'hyphenated': 0, - 'compound': 0, - 'plural': 0, - 'stem': 0, - 'inference': 0 - } + self.stat_hits = defaultdict(int) # Number of times respective methods returned value (not None) - self.stat_resolves = { - 'possessives': 0, - 'contractions': 0, - 'hyphenated': 0, - 'compound': 0, - 'plural': 0, - 'stem': 0, - 'inference': 0 - } + self.stat_resolves = defaultdict(int) def auto_possessives(self, word: str) -> str | None: """ diff --git a/tests/cli/test_cli.py b/tests/cli/test_cli.py new file mode 100644 index 0000000..cf94900 --- /dev/null +++ b/tests/cli/test_cli.py @@ -0,0 +1,8 @@ +import pytest +from Aquila_Resolve.cli import cli + + +def test_main_menu(mocker): + mocker.patch.object(cli, 'g2p_convert', return_value=None) + with pytest.raises(SystemExit): + cli.main_menu()