From fea72ac19a52afed739fec731a12bd7e10b8d35e Mon Sep 17 00:00:00 2001 From: drupchen Date: Thu, 20 Jul 2023 10:15:39 +0545 Subject: [PATCH] improve support for news articles --- botok/tokenizers/sentencetokenizer.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/botok/tokenizers/sentencetokenizer.py b/botok/tokenizers/sentencetokenizer.py index 9840ed9..32917ef 100644 --- a/botok/tokenizers/sentencetokenizer.py +++ b/botok/tokenizers/sentencetokenizer.py @@ -25,14 +25,15 @@ "བགྱི་", "བྱ་", "བཞུགས་", - "འདུག", + "འདུག་", + "སོང་", ] te_particles = [ "སྟེ་", "ཏེ་", "དེ་", ] # separated because these seem to cut long sentences -clause_boundaries = te_particles + ["ནས་", "ན་"] +clause_boundaries = te_particles + ["ནས་", "ན་", "ལ་"] dagdra = ["པ་", "བ་", "པོ་", "བོ་"] normalization_patterns = [(' ', ''),