Skip to content

Commit

Permalink
Fix frequency usage
Browse files Browse the repository at this point in the history
  • Loading branch information
lstrobel committed Dec 9, 2024
1 parent 70d5880 commit ce9a09b
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 2 deletions.
15 changes: 13 additions & 2 deletions src/py_pinyin_split/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -575,6 +575,14 @@ def _get_tone_variants(self, syllable: str):

return variants

def _remove_tone(self, syllable: str):
"""Remove tone marks from a pinyin syllable, returning the base form."""
result = syllable
for base_vowel, toned_vowels in self.VOWEL_TONE_VARIANTS.items():
for toned in toned_vowels:
result = result.replace(toned, base_vowel)
return result

def __init__(self, include_nonstandard=False):
self.preprocess_tokenizer = WordPunctTokenizer()

Expand Down Expand Up @@ -656,8 +664,11 @@ def span_tokenize(self, s: str) -> Iterator[Tuple[int, int]]:
best_split = None

for split in shortest:
# Get syllables without tones and sum their frequencies
syllables = [s[start:end].lower() for start, end in split]
# Get syllables and remove tones before frequency lookup
syllables = [
self._remove_tone(s[start:end].lower()) for start, end in split
]

total_freq = sum(
int(self.SYLLABLE_FREQUENCIES.get(syl, "0"))
for syl in syllables
Expand Down
5 changes: 5 additions & 0 deletions tests/test_pinyinsplit.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,11 @@ def test_tone_splits():
assert tokenizer.tokenize("màn") == ["màn"]


def test_tone_split_where_frequency_matters():
    """Tokenizing the toned input "kěnéng" yields the syllables kě + néng."""
    expected = ["kě", "néng"]
    actual = PinyinTokenizer().tokenize("kěnéng")
    assert actual == expected


def test_invalid_pinyin():
"""Inputs with invalid pinyin throw ValueErrors"""
tokenizer = PinyinTokenizer()
Expand Down

0 comments on commit ce9a09b

Please sign in to comment.