From c53ccb74f8b83419c4fd0b9fcf51648cdb79c877 Mon Sep 17 00:00:00 2001
From: Artur Paniukov
Date: Mon, 12 Feb 2024 15:59:13 +0000
Subject: [PATCH] Check Regex For Supported Operations

Rewrite regex pattern from QWEN
Filter Normalization Patterns
Log Regex Filtering and Rewriting
Add truncation to HF tokenizers in tests
---
 README.md                                     | 44 +++++++++----------
 python/openvino_tokenizers/hf_parser.py       | 20 ++++++---
 .../openvino_tokenizers/tokenizer_pipeline.py | 29 ++++++++++++
 python/openvino_tokenizers/utils.py           | 22 +++++++++-
 tests/conftest.py                             |  8 ++--
 tests/pass_rates.json                         |  2 +-
 tests/tokenizers_test.py                      | 10 ++---
 7 files changed, 94 insertions(+), 41 deletions(-)

diff --git a/README.md b/README.md
index aae1dfc88..4e77fe686 100644
--- a/README.md
+++ b/README.md
@@ -264,12 +264,12 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
     <tr>
       <td>BPE</td>
-      <td>95.84</td>
+      <td>96.74</td>
       <td>3439</td>
     </tr>
     <tr>
       <td>SentencePiece</td>
-      <td>86.36</td>
+      <td>86.08</td>
       <td>2896</td>
     </tr>
@@ -279,7 +279,7 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
     <tr>
       <td>WordPiece</td>
-      <td>82.55</td>
+      <td>90.43</td>
       <td>533</td>
     </tr>
@@ -300,13 +300,13 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
     <tr>
       <td>BPE</td>
       <td>EleutherAI/gpt-j-6b</td>
-      <td>98.34</td>
+      <td>98.90</td>
       <td>181</td>
     </tr>
     <tr>
       <td>BPE</td>
       <td>EleutherAI/gpt-neo-125m</td>
-      <td>98.34</td>
+      <td>98.90</td>
       <td>181</td>
     </tr>
@@ -330,7 +330,7 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
     <tr>
       <td>BPE</td>
       <td>Salesforce/codegen-16B-multi</td>
-      <td>97.24</td>
+      <td>97.79</td>
       <td>181</td>
     </tr>
@@ -354,7 +354,7 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
     <tr>
       <td>BPE</td>
       <td>facebook/bart-large-mnli</td>
-      <td>97.24</td>
+      <td>98.90</td>
       <td>181</td>
     </tr>
@@ -372,31 +372,31 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
     <tr>
       <td>BPE</td>
       <td>gpt2</td>
-      <td>97.24</td>
+      <td>98.90</td>
       <td>181</td>
     </tr>
     <tr>
       <td>BPE</td>
       <td>laion/CLIP-ViT-bigG-14-laion2B-39B-b160k</td>
-      <td>61.33</td>
+      <td>65.19</td>
       <td>181</td>
     </tr>
     <tr>
       <td>BPE</td>
       <td>microsoft/deberta-base</td>
-      <td>96.13</td>
+      <td>98.90</td>
       <td>181</td>
     </tr>
     <tr>
       <td>BPE</td>
       <td>roberta-base</td>
-      <td>96.13</td>
+      <td>98.90</td>
       <td>181</td>
     </tr>
     <tr>
       <td>BPE</td>
       <td>sentence-transformers/all-roberta-large-v1</td>
-      <td>96.13</td>
+      <td>98.90</td>
       <td>181</td>
     </tr>
@@ -456,7 +456,7 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
     <tr>
       <td>SentencePiece</td>
       <td>camembert-base_slow</td>
-      <td>75.14</td>
+      <td>74.03</td>
       <td>181</td>
     </tr>
@@ -486,13 +486,13 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
     <tr>
      <td>SentencePiece</td>
       <td>xlm-roberta-base</td>
-      <td>98.90</td>
+      <td>97.24</td>
       <td>181</td>
     </tr>
     <tr>
       <td>SentencePiece</td>
       <td>xlm-roberta-base_slow</td>
-      <td>98.90</td>
+      <td>97.24</td>
       <td>181</td>
     </tr>
@@ -528,19 +528,19 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
     <tr>
       <td>WordPiece</td>
       <td>ProsusAI/finbert</td>
-      <td>80.49</td>
+      <td>95.12</td>
       <td>41</td>
     </tr>
     <tr>
       <td>WordPiece</td>
       <td>bert-base-multilingual-cased</td>
-      <td>80.49</td>
+      <td>95.12</td>
       <td>41</td>
     </tr>
     <tr>
       <td>WordPiece</td>
       <td>bert-large-cased</td>
-      <td>80.49</td>
+      <td>95.12</td>
       <td>41</td>
     </tr>
@@ -552,13 +552,13 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
     <tr>
       <td>WordPiece</td>
       <td>distilbert-base-uncased-finetuned-sst-2-english</td>
-      <td>80.49</td>
+      <td>95.12</td>
       <td>41</td>
     </tr>
     <tr>
       <td>WordPiece</td>
       <td>google/electra-base-discriminator</td>
-      <td>80.49</td>
+      <td>95.12</td>
       <td>41</td>
     </tr>
@@ -588,7 +588,7 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
     <tr>
       <td>WordPiece</td>
       <td>rasa/LaBSE</td>
-      <td>73.17</td>
+      <td>87.80</td>
       <td>41</td>
     </tr>
@@ -600,7 +600,7 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
     <tr>
       <td>WordPiece</td>
       <td>squeezebert/squeezebert-uncased</td>
-      <td>80.49</td>
+      <td>95.12</td>
       <td>41</td>
     </tr>
diff --git a/python/openvino_tokenizers/hf_parser.py b/python/openvino_tokenizers/hf_parser.py
index 9a0396a05..fcc20fd41 100644
--- a/python/openvino_tokenizers/hf_parser.py
+++ b/python/openvino_tokenizers/hf_parser.py
@@ -51,14 +51,21 @@
     WhitespaceSplitStep,
     WordPieceTokenizationStep,
 )
+from .utils import filter_re2_incompatible


-def parse_replace_normalizer(normalizer_dict: Dict[str, Any]) -> RegexNormalizationStep:
+def parse_replace_normalizer(normalizer_dict: Dict[str, Any]) -> List[RegexNormalizationStep]:
     regex_search_pattern = normalizer_dict["pattern"].get("String") or normalizer_dict["pattern"]["Regex"]
-    return RegexNormalizationStep(
-        regex_search_pattern=regex_search_pattern,
-        replace_term=normalizer_dict["content"],
-    )
+    filtered_pattern = filter_re2_incompatible(regex_search_pattern)
+    if filtered_pattern == "":
+        return []
+
+    return [
+        RegexNormalizationStep(
+            regex_search_pattern=regex_search_pattern,
+            replace_term=normalizer_dict["content"],
+        )
+    ]


 def parse_bert_normalizer(normalizer_dict: Dict[str, Any]) -> List[NormalizationStep]:
@@ -368,7 +375,6 @@ def modify_sentencepiece_model(
     sp_model_path: Path,
     add_tokens: Dict[int, str],
     skip_special_tokens: bool = False,
-    reference_vocab: Optional[List[str]] = None,
 ) -> None:
     model_pb = import_protobuf()
     model = model_pb.ModelProto()
@@ -573,7 +579,7 @@ def convert_tiktoken_model_tokenizer(
     pipeline.add_steps(
         [
             NormalizeUnicode("NFC"),
-            RegexSplitStep(split_pattern),
+            RegexSplitStep(split_pattern, behaviour="contiguous"),
             BytesToCharsStep(),
             BPETokenizationStep.from_tiktoken_encoding(encoding),
             TruncationStep.from_hf_object(hf_tokenizer),
diff --git a/python/openvino_tokenizers/tokenizer_pipeline.py b/python/openvino_tokenizers/tokenizer_pipeline.py
index 3cab78b08..51fe4cc43 100644
--- a/python/openvino_tokenizers/tokenizer_pipeline.py
+++ b/python/openvino_tokenizers/tokenizer_pipeline.py
@@ -2,6 +2,7 @@
 # Copyright (C) 2018-2024 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0

+import logging
 import weakref
 from dataclasses import dataclass, field
 from functools import singledispatchmethod
@@ -25,6 +26,10 @@
     TOKENIZER_NAME,
 )
 from .str_pack import pack_string, pack_strings
+from .utils import has_incompatible_re2_op
+
+
+logger = logging.getLogger(__name__)


 @dataclass
@@ -98,6 +103,15 @@ class RegexNormalizationStep(NormalizationStep):
     regex_search_pattern: str
     replace_term: str

+    def __post_init__(self):
+        self.vet_search_pattern()
+
+    def vet_search_pattern(self) -> None:
+        if has_incompatible_re2_op(self.regex_search_pattern):
+            logger.warning(
+                "RegexNormalization pattern is not supported, operation output might differ from the original tokenizer."
+            )
+
     @classmethod
     def strip_accents_regex(cls) -> "RegexNormalizationStep":
         return cls(regex_search_pattern=r"\p{Mn}", replace_term="")
@@ -168,6 +182,20 @@ class RegexSplitStep(PreTokenizatinStep):
     invert: bool = False
     behaviour: str = "remove"

+    def __post_init__(self):
+        self.vet_split_pattern()
+
+    def vet_split_pattern(self) -> None:
+        if r"(?!\S)" in self.split_pattern:
+            # rewrite regex pattern to get results closer to qwen.cpp results
+            logger.warning(r"Replace `(?!\S)` pattern to `(?:$|[^\S])` in RegexSplit operation")
+            self.split_pattern = self.split_pattern.replace(r"(?!\S)", r"(?:$|[^\S])")
+
+        if has_incompatible_re2_op(self.split_pattern):
+            logger.warning(
+                "RegexSplit pattern is not supported, operation output might differ from the original tokenizer."
+            )
+
     @classmethod
     def bert_whitespace_splitter(cls) -> "RegexSplitStep":
         return cls(split_pattern=r"\s+", invert=False)
@@ -481,6 +509,7 @@ def set_token_id(self, vocab: Optional[List[str]]) -> None:
     def token_id(self) -> Optional[int]:
         return self._token_id

+
 @dataclass
 class TokenWithTypeId:
     token_type_id: Optional[int] = None
diff --git a/python/openvino_tokenizers/utils.py b/python/openvino_tokenizers/utils.py
index c31f8ed08..55f2a6792 100644
--- a/python/openvino_tokenizers/utils.py
+++ b/python/openvino_tokenizers/utils.py
@@ -3,6 +3,7 @@
 # SPDX-License-Identifier: Apache-2.0

 import logging
+import re
 from typing import Dict, Optional, Sequence, Tuple, Union

 from openvino import Model, Type
@@ -87,7 +88,7 @@ def greedy_decoder(input) -> Model:


 def add_greedy_decoding(
-    text_generation_model: Model, logits_output: str = LOGITS_OUTPUT_NAME, output_type: Type = Type.i64
+    text_generation_model: Model, logits_output: str = LOGITS_OUTPUT_NAME, output_type: Type = Type.i64
 ) -> Model:
     ppp = PrePostProcessor(text_generation_model)
     ppp.output(logits_output).postprocess().custom(greedy_decoder)
@@ -109,3 +110,22 @@ def change_outputs_type(model: Model, output_type: Type) -> Model:
     for idx, _ in enumerate(model.outputs):
         ppp.output(idx).tensor().set_element_type(output_type)
     return ppp.build()
+
+
+def has_incompatible_re2_op(pattern: str) -> bool:
+    return "(?=" in pattern or "(?!" in pattern or "(?<=" in pattern or "(?<!" in pattern
+
+
+_subpattern_regex = re.compile(...)
+
+
+def filter_re2_incompatible(pattern: str) -> str:
+    not_filtered = []
+
+    for subpattern in (match.group() for match in _subpattern_regex.finditer(pattern)):
+        if has_incompatible_re2_op(subpattern):
+            logging.warning(f"Subpattern `{subpattern}` is not supported by re2 and filtered out.")
+            continue
+        not_filtered.append(subpattern)
+
+    return "|".join(not_filtered)
diff --git a/tests/conftest.py b/tests/conftest.py
index 353d73ae6..5cd1ed82d 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -56,11 +56,9 @@ def add_tokenizer_type(row):
     new_readme.write(old_readme)
     new_readme.write(
         "## Test Results\n\n"
-        "This report is autogenerated and includes tokenizers and detokenizers tests. "
-        "The `Output Matched, %` column shows the percent of test strings "
-        "for which the results of OpenVINO and Hugingface Tokenizers are the same. "
-        "To update the report run `pytest tokenizers_test.py --update_readme` in "
-        "`modules/custom_operations/user_ie_extensions/tokenizer/python/tests` directory.\n\n"
+        "This report is autogenerated and includes tokenizers and detokenizers tests. The `Output Matched, %` column "
+        "shows the percent of test strings for which the results of OpenVINO and Hugingface Tokenizers are the same. "
+        "To update the report run `pytest --update_readme tokenizers_test.py` in `tests` directory.\n\n"
         "### Output Match by Tokenizer Type\n\n"
     )
     is_pandas_2 = tuple(map(int, version("pandas").split("."))) >= (2, 0, 0)
diff --git a/tests/pass_rates.json b/tests/pass_rates.json
index 04ed42352..2e58bdf36 100644
--- a/tests/pass_rates.json
+++ b/tests/pass_rates.json
@@ -1,3 +1,3 @@
 {
-    "tokenizers_test.py::test_": 0.9110740586355426
+    "tokenizers_test.py::test_": 0.9201055995553703
 }
\ No newline at end of file
diff --git a/tests/tokenizers_test.py b/tests/tokenizers_test.py
index 7a8acd78d..8a3d046d9 100644
--- a/tests/tokenizers_test.py
+++ b/tests/tokenizers_test.py
@@ -278,7 +278,7 @@ def test_hf_wordpiece_tokenizers(wordpiece_tokenizers, test_string):
     hf_tokenizer, ov_tokenizer = wordpiece_tokenizers
     packed_strings = pack_strings([test_string])

-    hf_tokenized = hf_tokenizer([test_string], return_tensors="np")
+    hf_tokenized = hf_tokenizer([test_string], return_tensors="np", truncation=True)
     ov_tokenized = ov_tokenizer(packed_strings)

     for output_name, hf_result in hf_tokenized.items():
@@ -298,7 +298,7 @@ def test_hf_wordpiece_tokenizers_multiple_strings(wordpiece_tokenizers, test_str
     hf_tokenizer, ov_tokenizer = wordpiece_tokenizers
     packed_strings = pack_strings(test_string)

-    hf_tokenized = hf_tokenizer(test_string, return_tensors="np", padding=True)
+    hf_tokenized = hf_tokenizer(test_string, return_tensors="np", padding=True, truncation=True)
     ov_tokenized = ov_tokenizer(packed_strings)

     for output_name, hf_result in hf_tokenized.items():
@@ -317,7 +317,7 @@ def test_hf_wordpiece_tokenizers_multiple_strings(wordpiece_tokenizers, test_str
 def test_sentencepiece_model_tokenizer(sentencepice_tokenizers, test_string):
     hf_tokenizer, ov_tokenizer = sentencepice_tokenizers

-    hf_tokenized = hf_tokenizer(test_string, return_tensors="np")
+    hf_tokenized = hf_tokenizer(test_string, return_tensors="np", truncation=True)
     ov_tokenized = ov_tokenizer(pack_strings([test_string]))

     for output_name, hf_result in hf_tokenized.items():
@@ -364,7 +364,7 @@ def test_hf_bpe_tokenizers_outputs(bpe_tokenizers, test_string):
     hf_tokenizer, ov_tokenizer = bpe_tokenizers
     packed_strings = pack_strings([test_string])

-    hf_tokenized = hf_tokenizer([test_string], return_tensors="np")
+    hf_tokenized = hf_tokenizer([test_string], return_tensors="np", truncation=True)
     ov_tokenized = ov_tokenizer(packed_strings)

     for output_name, hf_result in hf_tokenized.items():
@@ -410,7 +410,7 @@ def test_bpe_detokenizer(
 def test_tiktoken_tokenizers(tiktoken_tokenizers, test_string):
     hf_tokenizer, ov_tokenizer = tiktoken_tokenizers

-    hf_tokenized = hf_tokenizer(test_string, return_tensors="np")
+    hf_tokenized = hf_tokenizer(test_string, return_tensors="np", truncation=True)
     ov_tokenized = ov_tokenizer(pack_strings([test_string]))

     for output_name, hf_result in hf_tokenized.items():