Fix frequency usage

lstrobel · Dec 9, 2024 · ce9a09b · ce9a09b
1 parent 70d5880
commit ce9a09b
Show file tree

Hide file tree

Showing 2 changed files with 18 additions and 2 deletions.
diff --git a/src/py_pinyin_split/__init__.py b/src/py_pinyin_split/__init__.py
@@ -575,6 +575,14 @@ def _get_tone_variants(self, syllable: str):
 
         return variants
 
+    def _remove_tone(self, syllable: str):
+        """Return the syllable with toned vowels replaced by their base vowels."""
+        table = self.VOWEL_TONE_VARIANTS
+        pairs = [(mark, plain) for plain, marks in table.items() for mark in marks]
+        for mark, plain in pairs:  # same replacement order as the nested loops
+            syllable = syllable.replace(mark, plain)
+        return syllable
+
     def __init__(self, include_nonstandard=False):
         self.preprocess_tokenizer = WordPunctTokenizer()
 
@@ -656,8 +664,11 @@ def span_tokenize(self, s: str) -> Iterator[Tuple[int, int]]:
                 best_split = None
 
                 for split in shortest:
-                    # Get syllables without tones and sum their frequencies
-                    syllables = [s[start:end].lower() for start, end in split]
+                    # Get syllables and remove tones before frequency lookup
+                    syllables = [
+                        self._remove_tone(s[start:end].lower()) for start, end in split
+                    ]
+
                     total_freq = sum(
                         int(self.SYLLABLE_FREQUENCIES.get(syl, "0"))
                         for syl in syllables

diff --git a/tests/test_pinyinsplit.py b/tests/test_pinyinsplit.py
@@ -29,6 +29,11 @@ def test_tone_splits():
     assert tokenizer.tokenize("màn") == ["màn"]
 
 
+def test_tone_split_where_frequency_matters():
+    """Tone-marked input is split as expected when frequency picks the split."""
+    assert PinyinTokenizer().tokenize("kěnéng") == ["kě", "néng"]
+
+
 def test_invalid_pinyin():
     """Inputs with invalid pinyin throw ValueErrors"""
     tokenizer = PinyinTokenizer()