
Commit

Merge pull request #13 from ionite34/dev
ionite34 authored May 24, 2022
2 parents ed180c4 + 518b53c commit 2f767d8
Showing 15 changed files with 117 additions and 54 deletions.
22 changes: 20 additions & 2 deletions README.md
@@ -16,7 +16,7 @@ The pipeline employs a context layer, multiple transformer and n-gram morpho-ort
and an autoregressive recurrent neural transformer base. The current implementation offers state-of-the-art accuracy for out-of-vocabulary (OOV) words, as well as contextual
analysis for correct inferencing of [English Heteronyms](https://en.wikipedia.org/wiki/Heteronym_(linguistics)).
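
As a quick illustration of the heteronym handling described above, here is a minimal sketch (assuming the package is installed; the `G2p` class and `convert` call appear in the Usage section below, and the expected output is taken from the command-line example further down):

```python
from Aquila_Resolve import G2p

g2p = G2p()
# 'read' is a heteronym: past tense {R EH1 D}, present tense {R IY1 D}.
# The context layer resolves each occurrence from its part-of-speech context.
print(g2p.convert('I read the book, did you read it?'))
# >> '{AY1} {R EH1 D} {DH AH0} {B UH1 K}, {D IH1 D} {Y UW1} {R IY1 D} {IH1 T}?'
```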

The package is offered in a pre-trained state that is ready for use as a dependency or in
The package is offered in a pre-trained state that is ready for [usage](#Usage) as a dependency or in
notebook environments. There are no additional resources needed, other than the model checkpoint which is
automatically downloaded on the first usage. See [Installation](#Installation) for more information.

@@ -68,6 +68,8 @@ pip install aquila-resolve
## Usage

### 1. Module

```python
from Aquila_Resolve import G2p

@@ -77,13 +79,29 @@ g2p.convert('The book costs $5, will you read it?')
# >> '{DH AH0} {B UH1 K} {K AA1 S T S} {F AY1 V} {D AA1 L ER0 Z}, {W IH1 L} {Y UW1} {R IY1 D} {IH1 T}?'
```

> Additional optional parameters are available when defining a `G2p` instance:
> Optional parameters when defining a `G2p` instance:
| Parameter | Default | Description |
|-------------------|---------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| `device` | `'cpu'` | Device for Pytorch inference model. GPU is supported using `'cuda'` |
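
For example, a minimal sketch of selecting the inference device (this assumes a CUDA-capable PyTorch installation; otherwise keep the default `'cpu'`):

```python
from Aquila_Resolve import G2p

# Run the PyTorch inference model on the GPU instead of the default CPU.
g2p = G2p(device='cuda')
g2p.convert('The book costs $5, will you read it?')
```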

> Optional parameters when calling `convert`:
| Parameter | Default | Description |
|-------------------|---------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| `process_numbers` | `True` | Toggles conversion of some numbers and symbols to their spoken pronunciation forms. See [numbers.py](src/Aquila_Resolve/text/numbers.py) for details on what is covered. |
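
A short sketch of disabling number conversion. The keyword below follows this table (`process_numbers`); note that the `convert` signature shown later in this diff uses `convert_num`, so treat the exact parameter name as an assumption:

```python
# With number processing disabled, '$5' is left as-is rather than expanded.
g2p.convert('The book costs $5, will you read it?', process_numbers=False)
```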

### 2. Command Line

A simple wrapper for text conversion is available through the `aquila-resolve` command
```
~
❯ aquila-resolve
✔ Aquila Resolve v0.1.2
? Text to convert: I read the book, did you read it?
{AY1} {R EH1 D} {DH AH0} {B UH1 K}, {D IH1 D} {Y UW1} {R IY1 D} {IH1 T}?
```

## Model Architecture

In evaluation[^1], neural G2P models have traditionally been extremely sensitive to orthographical variations
2 changes: 2 additions & 0 deletions requirements-dev.txt
@@ -7,5 +7,7 @@ torch~=1.11.0
inflect>=2.1.0
requests>=2.23.0
numpy>=1.18.0
inquirerpy>=0.3.3
yaspin>=2.1.0
pytest>=7.1.2
pytest_mock>=3.7.0
4 changes: 3 additions & 1 deletion requirements.txt
@@ -6,4 +6,6 @@ pywordsegment>=0.2.1
torch~=1.11.0
inflect>=2.1.0
requests>=2.23.0
numpy>=1.18.0
numpy>=1.18.0
inquirerpy>=0.3.3
yaspin>=2.1.0
12 changes: 9 additions & 3 deletions setup.cfg
@@ -1,9 +1,9 @@
[metadata]
name = Aquila-Resolve
version = 0.1.2
version = 0.1.3
author = ionite
author_email = dev@ionite.io
description = Augmented Recurrent Neural Grapheme-to-Phoneme conversion with Inflectional Orthography.
description = Augmented Neural English G2p converter with Inflectional Orthography.
long_description = file: README.md
long_description_content_type = text/markdown
url = https://github.com/ionite34/Aquila-Resolve
@@ -32,11 +32,17 @@ install_requires =
    inflect>=2.1.0
    requests>=2.23.0
    numpy>=1.18.0
    inquirerpy>=0.3.3
    yaspin>=2.1.0
zip_safe = False
include_package_data = True

[options.package_data]
* = *.json, *.json.gz

[options.packages.find]
where = src
where = src

[options.entry_points]
console_scripts =
    aquila-resolve = Aquila_Resolve.cli:main_menu
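
The new `console_scripts` entry point is what exposes the `aquila-resolve` command used in the README above. Roughly, the generated launcher behaves like this sketch:

```python
# Approximately what the installed 'aquila-resolve' script runs.
from Aquila_Resolve.cli import main_menu

if __name__ == '__main__':
    main_menu()
```
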
2 changes: 1 addition & 1 deletion src/Aquila_Resolve/__init__.py
@@ -4,7 +4,7 @@
Grapheme to Phoneme Resolver
"""
__version__ = "0.1.2"
__version__ = "0.1.3"

from .g2p import G2p
from .data.remote import download
4 changes: 4 additions & 0 deletions src/Aquila_Resolve/__main__.py
@@ -0,0 +1,4 @@
from .cli import main_menu

if __name__ == "__main__": # pragma: no cover
    main_menu()
1 change: 1 addition & 0 deletions src/Aquila_Resolve/cli/__init__.py
@@ -0,0 +1 @@
from .cli import main_menu
29 changes: 29 additions & 0 deletions src/Aquila_Resolve/cli/cli.py
@@ -0,0 +1,29 @@
# CLI Entry Point
import Aquila_Resolve
from InquirerPy import inquirer
from InquirerPy.utils import color_print as cp
from yaspin import yaspin


def main_menu():
"""
Aquila Resolve Entry Point
"""
g2p_convert()
exit(0)


def g2p_convert(): # pragma: no cover
"""
G2P Conversion
"""
with yaspin('Initializing Aquila Resolve Backend...', color='yellow') as sp:
g2p = Aquila_Resolve.G2p()
sp.ok(f'✔ Aquila Resolve v{Aquila_Resolve.__version__}')

while True:
text = inquirer.text("Text to convert:").execute()
if not text:
return
result = g2p.convert(text)
cp([('yellow', f'{result}')])
3 changes: 3 additions & 0 deletions src/Aquila_Resolve/data/__init__.py
@@ -9,3 +9,6 @@


DATA_PATH = files(__name__)
CMU_FILE = DATA_PATH.joinpath("cmudict.json.gz")
HET_FILE = DATA_PATH.joinpath("heteronyms.json")
PT_FILE = DATA_PATH.joinpath("model.pt")
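
These constants centralize the packaged resource paths so other modules (such as `infer.py` below) no longer build them ad hoc. A minimal sketch of the underlying `importlib.resources` pattern; the `files` import itself is hidden in the collapsed part of this hunk, so treat the exact import as an assumption:

```python
from importlib.resources import files

# Traversable handle to the package's bundled data directory.
DATA_PATH = files('Aquila_Resolve.data')
PT_FILE = DATA_PATH.joinpath('model.pt')

# Downstream code can check for and open resources without hard-coded paths.
print(PT_FILE.is_file())
```
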
23 changes: 22 additions & 1 deletion src/Aquila_Resolve/data/remote.py
@@ -1,6 +1,7 @@
# Access and checks for remote data
import requests
import shutil
import nltk
from warnings import warn
from tqdm.auto import tqdm
from . import DATA_PATH
@@ -54,6 +55,20 @@ def ensure_download() -> None:
"Aquila_Resolve/data/ folder.")


def ensure_nltk() -> None: # pragma: no cover
"""Ensures all required NLTK Data is installed"""
required = {
'wordnet': 'corpora/wordnet.zip',
'omw-1.4': 'corpora/omw-1.4.zip',
'averaged_perceptron_tagger': 'taggers/averaged_perceptron_tagger.zip',
}
for name, url in required.items():
try:
nltk.data.find(url)
except LookupError:
nltk.download(name, raise_on_error=True)


def check_updates() -> None:
"""Checks if the model matches the latest checksum"""
if not check_model():
Expand All @@ -62,7 +77,13 @@ def check_updates() -> None:


def get_checksum(file: str, block_size: int = 65536) -> str:
"""Calculates the checksum of a file"""
"""
Calculates the Sha256 checksum of a file
:param file: Path to file
:param block_size: Block size for reading
:return: Checksum of file
"""
import hashlib
s = hashlib.sha256()
with open(file, 'rb') as f:
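
The rest of `get_checksum` is collapsed above. Below is a self-contained sketch of the same block-wise SHA-256 pattern; the read loop is an assumption about the hidden lines, and `sha256_of_file` is a hypothetical stand-in name:

```python
import hashlib


def sha256_of_file(path: str, block_size: int = 65536) -> str:
    """Hash a file in fixed-size blocks to avoid loading it into memory at once."""
    s = hashlib.sha256()
    with open(path, 'rb') as f:
        for block in iter(lambda: f.read(block_size), b''):
            s.update(block)
    return s.hexdigest()
```
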
17 changes: 4 additions & 13 deletions src/Aquila_Resolve/g2p.py
@@ -4,7 +4,6 @@
from functools import lru_cache

import pywordsegment
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer

@@ -17,27 +16,21 @@
from .processors import Processor
from .infer import Infer
from .symbols import contains_alpha, valid_braces
from .data.remote import ensure_nltk

re_digit = re.compile(r"\((\d+)\)")
re_bracket_with_digit = re.compile(r"\(.*\)")
re_phonemes = re.compile(r'\{.*?}')

# Check that the nltk data is downloaded, if not, download it
try:
    nltk.data.find('corpora/wordnet.zip')
    nltk.data.find('corpora/omw-1.4.zip')
except LookupError:
    nltk.download('wordnet')
    nltk.download('omw-1.4')


class G2p:
    def __init__(self, device: str = 'cpu'):
        # noinspection GrazieInspection
        """
        Grapheme to Phoneme conversion
        Initialize the G2p converter.
        :param device: Pytorch device.
        """
        ensure_nltk()  # Ensure nltk data is downloaded
        self.dict = get_cmudict()  # CMU Dictionary
        self.h2p = H2p(preload=True)  # H2p parser
        self.lemmatize = WordNetLemmatizer().lemmatize  # WordNet Lemmatizer - used to find singular form
@@ -65,7 +58,6 @@ def __init__(self, device: str = 'cpu'):

    @lru_cache(maxsize=None)
    def lookup(self, text: str, pos: str = None) -> str | None:
        # noinspection GrazieInspection
        """
        Gets the CMU Dictionary entry for a word.
@@ -134,7 +126,6 @@ def lookup(self, text: str, pos: str = None) -> str | None:
        return None

    def convert(self, text: str, convert_num: bool = True) -> str | None:
        # noinspection GrazieInspection
        """
        Replace a grapheme text line with phonemes.
15 changes: 5 additions & 10 deletions src/Aquila_Resolve/h2p.py
@@ -1,19 +1,12 @@
from nltk.tokenize import TweetTokenizer
from nltk import pos_tag
from nltk import pos_tag_sents
from .data.remote import ensure_nltk
from .dictionary import Dictionary
from .filter import filter_text as ft
from .text.replace import replace_first
from . import format_ph as ph

# Check required nltk data exists, if not, download it
try:
    from nltk.data import find
    find('taggers/averaged_perceptron_tagger.zip')
except LookupError:  # pragma: no cover
    from nltk.downloader import download
    download('averaged_perceptron_tagger', raise_on_error=True)


class H2p:
    def __init__(self, dict_path=None, preload=False, phoneme_format=''):
@@ -29,6 +22,8 @@ def __init__(self, dict_path=None, preload=False, phoneme_format=''):
        :param preload: Preloads the tokenizer and tagger during initialization
        :type preload: bool
        """
        # Ensure nltk data is available
        ensure_nltk()

        # Supported phoneme formats
        self.phoneme_format = phoneme_format
@@ -87,9 +82,9 @@ def replace_het_list(self, text_list):
        # Get pos tags list
        tags_list = pos_tag_sents(list_sentence_words)
        # Loop through lines
        for index in range(len(tags_list)):
        for index, line in enumerate(tags_list):
            # Loop through words and pos tags in tags_list index
            for word, pos in tags_list[index]:
            for word, pos in line:
                # Skip if word not in dictionary
                if not self.dict.contains(word):
                    continue
8 changes: 3 additions & 5 deletions src/Aquila_Resolve/infer.py
@@ -1,7 +1,7 @@
from __future__ import annotations
from .models.dp.phonemizer import Phonemizer
from .data import DATA_PATH
from .data.remote import ensure_download, check_updates
from .data import PT_FILE
from .data.remote import ensure_download
from .models import MODELS_PATH
import sys

@@ -11,9 +11,7 @@
class Infer:
    def __init__(self, device='cpu'):
        ensure_download()  # Download checkpoint if necessary
        check_updates()  # Check for checkpoint updates
        checkpoint_path = DATA_PATH.joinpath('model.pt')
        self.model = Phonemizer.from_checkpoint(checkpoint_path, device=device)
        self.model = Phonemizer.from_checkpoint(PT_FILE, device=device)
        self.lang = 'en_us'
        self.batch_size = 32

21 changes: 3 additions & 18 deletions src/Aquila_Resolve/processors.py
@@ -1,6 +1,7 @@
# Transformations of text sequences for matching
from __future__ import annotations
from typing import TYPE_CHECKING
from collections import defaultdict

import re

@@ -18,25 +19,9 @@ def __init__(self, g2p: G2p):
        self._tag = g2p.h2p.tag
        self._stem = g2p.stem
        # Number of times respective methods were called
        self.stat_hits = {
            'possessives': 0,
            'contractions': 0,
            'hyphenated': 0,
            'compound': 0,
            'plural': 0,
            'stem': 0,
            'inference': 0
        }
        self.stat_hits = defaultdict(int)
        # Number of times respective methods returned value (not None)
        self.stat_resolves = {
            'possessives': 0,
            'contractions': 0,
            'hyphenated': 0,
            'compound': 0,
            'plural': 0,
            'stem': 0,
            'inference': 0
        }
        self.stat_resolves = defaultdict(int)

    def auto_possessives(self, word: str) -> str | None:
        """
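
The change above swaps the two hand-written zero-initialized dictionaries for `collections.defaultdict(int)`, which gives the same counting behavior without listing every key up front. A minimal sketch:

```python
from collections import defaultdict

stat_hits = defaultdict(int)

# Missing keys start at 0, so counters can be incremented directly.
stat_hits['possessives'] += 1
stat_hits['plural'] += 1
print(stat_hits['possessives'], stat_hits['contractions'])  # 1 0
```
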
8 changes: 8 additions & 0 deletions tests/cli/test_cli.py
@@ -0,0 +1,8 @@
import pytest
from Aquila_Resolve.cli import cli


def test_main_menu(mocker):
    mocker.patch.object(cli, 'g2p_convert', return_value=None)
    with pytest.raises(SystemExit):
        cli.main_menu()
