From 90bf27e3bbf7a78d8181886929211bb8b25e2b60 Mon Sep 17 00:00:00 2001
From: drupchen
Date: Thu, 27 Jul 2023 15:16:55 +0545
Subject: [PATCH] =?UTF-8?q?add=20=E0=BD=9E=E0=BD=B2=E0=BD=84=E0=BC=8B=20as?=
 =?UTF-8?q?=20clause=20boundary?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 botok/tokenizers/sentencetokenizer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/botok/tokenizers/sentencetokenizer.py b/botok/tokenizers/sentencetokenizer.py
index 32917ef..e0807db 100644
--- a/botok/tokenizers/sentencetokenizer.py
+++ b/botok/tokenizers/sentencetokenizer.py
@@ -33,7 +33,7 @@
     "ཏེ་",
     "དེ་",
 ]  # separated because these seem to cut long sentences
-clause_boundaries = te_particles + ["ནས་", "ན་", "ལ་"]
+clause_boundaries = te_particles + ["ནས་", "ན་", "ལ་", "ཞིང་"]
 dagdra = ["པ་", "བ་", "པོ་", "བོ་"]
 normalization_patterns = [
     (' ', ''),