From 543c50e7e467da990a27970f4df64ca52720bffe Mon Sep 17 00:00:00 2001 From: "P.J. Finlay" Date: Sun, 11 Aug 2024 15:59:02 -0400 Subject: [PATCH] Revert "Update tokenizer.py" This reverts commit 6f1b5d54a1018e884aefd0396f11d919a17ce3ae. With the reverted commit active, underscores weren't being correctly detokenized as spaces. https://community.libretranslate.com/t/the-problem-of-translating-symbols-oov-out-of-the-vocabulary/1071/4 --- argostranslate/tokenizer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/argostranslate/tokenizer.py b/argostranslate/tokenizer.py index 8ea3fdd8..93bad4f5 100644 --- a/argostranslate/tokenizer.py +++ b/argostranslate/tokenizer.py @@ -24,7 +24,8 @@ def encode(self, sentence: str) -> List[str]: return tokens def decode(self, tokens: List[str]) -> str: - return self.lazy_processor().decode_pieces(tokens) + detokenized = "".join(tokens) + return detokenized.replace("▁", " ") class BPETokenizer(Tokenizer):