From c53ccb74f8b83419c4fd0b9fcf51648cdb79c877 Mon Sep 17 00:00:00 2001
From: Artur Paniukov
Date: Mon, 12 Feb 2024 15:59:13 +0000
Subject: [PATCH] Check Regex For Supported Operations

Rewrite regex pattern from QWEN
Filter Normalization Patterns
Log Regex Filtering and Rewriting
Add truncation to HF tokenizers in tests
---
 README.md                                     | 44 +++++++++----------
 python/openvino_tokenizers/hf_parser.py       | 20 ++++++---
 .../openvino_tokenizers/tokenizer_pipeline.py | 29 ++++++++++++
 python/openvino_tokenizers/utils.py           | 22 +++++++++-
 tests/conftest.py                             |  8 ++--
 tests/pass_rates.json                         |  2 +-
 tests/tokenizers_test.py                      | 10 ++---
 7 files changed, 94 insertions(+), 41 deletions(-)

diff --git a/README.md b/README.md
index aae1dfc88..4e77fe686 100644
--- a/README.md
+++ b/README.md
@@ -264,12 +264,12 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
     <tr>
       <td>BPE</td>
-      <td>95.84</td>
+      <td>96.74</td>
       <td>3439</td>
     </tr>
     <tr>
       <td>SentencePiece</td>
-      <td>86.36</td>
+      <td>86.08</td>
       <td>2896</td>
     </tr>
@@ -279,7 +279,7 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
     <tr>
       <td>WordPiece</td>
-      <td>82.55</td>
+      <td>90.43</td>
       <td>533</td>
     </tr>
@@ -300,13 +300,13 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
     <tr>
       <td>BPE</td>
       <td>EleutherAI/gpt-j-6b</td>
-      <td>98.34</td>
+      <td>98.90</td>
       <td>181</td>
     </tr>
     <tr>
       <td>BPE</td>
       <td>EleutherAI/gpt-neo-125m</td>
-      <td>98.34</td>
+      <td>98.90</td>
       <td>181</td>
     </tr>
@@ -330,7 +330,7 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
     <tr>
       <td>BPE</td>
       <td>Salesforce/codegen-16B-multi</td>
-      <td>97.24</td>
+      <td>97.79</td>
       <td>181</td>
     </tr>
@@ -354,7 +354,7 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
     <tr>
       <td>BPE</td>
       <td>facebook/bart-large-mnli</td>
-      <td>97.24</td>
+      <td>98.90</td>
       <td>181</td>
     </tr>
@@ -372,31 +372,31 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
     <tr>
       <td>BPE</td>
       <td>gpt2</td>
-      <td>97.24</td>
+      <td>98.90</td>
       <td>181</td>
     </tr>
     <tr>
       <td>BPE</td>
       <td>laion/CLIP-ViT-bigG-14-laion2B-39B-b160k</td>
-      <td>61.33</td>
+      <td>65.19</td>
       <td>181</td>
     </tr>
     <tr>
       <td>BPE</td>
       <td>microsoft/deberta-base</td>
-      <td>96.13</td>
+      <td>98.90</td>
       <td>181</td>
     </tr>
     <tr>
       <td>BPE</td>
       <td>roberta-base</td>
-      <td>96.13</td>
+      <td>98.90</td>
       <td>181</td>
     </tr>
     <tr>
       <td>BPE</td>
       <td>sentence-transformers/all-roberta-large-v1</td>
-      <td>96.13</td>
+      <td>98.90</td>
       <td>181</td>
     </tr>
@@ -456,7 +456,7 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
     <tr>
       <td>SentencePiece</td>
       <td>camembert-base_slow</td>
-      <td>75.14</td>
+      <td>74.03</td>
       <td>181</td>
     </tr>
@@ -486,13 +486,13 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
     <tr>
      <td>SentencePiece</td>
       <td>xlm-roberta-base</td>
-      <td>98.90</td>
+      <td>97.24</td>
       <td>181</td>
     </tr>
     <tr>
       <td>SentencePiece</td>
       <td>xlm-roberta-base_slow</td>
-      <td>98.90</td>
+      <td>97.24</td>
       <td>181</td>
     </tr>
@@ -528,19 +528,19 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
     <tr>
       <td>WordPiece</td>
       <td>ProsusAI/finbert</td>
-      <td>80.49</td>
+      <td>95.12</td>
       <td>41</td>
     </tr>
     <tr>
       <td>WordPiece</td>
       <td>bert-base-multilingual-cased</td>
-      <td>80.49</td>
+      <td>95.12</td>
       <td>41</td>
     </tr>
     <tr>
       <td>WordPiece</td>
       <td>bert-large-cased</td>
-      <td>80.49</td>
+      <td>95.12</td>
       <td>41</td>
     </tr>
@@ -552,13 +552,13 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
     <tr>
       <td>WordPiece</td>
       <td>distilbert-base-uncased-finetuned-sst-2-english</td>
-      <td>80.49</td>
+      <td>95.12</td>
       <td>41</td>
     </tr>
     <tr>
       <td>WordPiece</td>
       <td>google/electra-base-discriminator</td>
-      <td>80.49</td>
+      <td>95.12</td>
       <td>41</td>
     </tr>
@@ -588,7 +588,7 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
     <tr>
       <td>WordPiece</td>
       <td>rasa/LaBSE</td>
-      <td>73.17</td>
+      <td>87.80</td>
       <td>41</td>
     </tr>
@@ -600,7 +600,7 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
     <tr>
       <td>WordPiece</td>
       <td>squeezebert/squeezebert-uncased</td>
-      <td>80.49</td>
+      <td>95.12</td>
       <td>41</td>
     </tr>
diff --git a/python/openvino_tokenizers/hf_parser.py b/python/openvino_tokenizers/hf_parser.py
index 9a0396a05..fcc20fd41 100644
--- a/python/openvino_tokenizers/hf_parser.py
+++ b/python/openvino_tokenizers/hf_parser.py
@@ -51,14 +51,21 @@
     WhitespaceSplitStep,
     WordPieceTokenizationStep,
 )
+from .utils import filter_re2_incompatible


-def parse_replace_normalizer(normalizer_dict: Dict[str, Any]) -> RegexNormalizationStep:
+def parse_replace_normalizer(normalizer_dict: Dict[str, Any]) -> List[RegexNormalizationStep]:
     regex_search_pattern = normalizer_dict["pattern"].get("String") or normalizer_dict["pattern"]["Regex"]
-    return RegexNormalizationStep(
-        regex_search_pattern=regex_search_pattern,
-        replace_term=normalizer_dict["content"],
-    )
+    filtered_pattern = filter_re2_incompatible(regex_search_pattern)
+    if filtered_pattern == "":
+        return []
+
+    return [
+        RegexNormalizationStep(
+            regex_search_pattern=regex_search_pattern,
+            replace_term=normalizer_dict["content"],
+        )
+    ]


 def parse_bert_normalizer(normalizer_dict: Dict[str, Any]) -> List[NormalizationStep]:
@@ -368,7 +375,6 @@ def modify_sentencepiece_model(
     sp_model_path: Path,
     add_tokens: Dict[int, str],
     skip_special_tokens: bool = False,
-    reference_vocab: Optional[List[str]] = None,
 ) -> None:
     model_pb = import_protobuf()
     model = model_pb.ModelProto()
@@ -573,7 +579,7 @@ def convert_tiktoken_model_tokenizer(
     pipeline.add_steps(
         [
             NormalizeUnicode("NFC"),
-            RegexSplitStep(split_pattern),
+            RegexSplitStep(split_pattern, behaviour="contiguous"),
             BytesToCharsStep(),
             BPETokenizationStep.from_tiktoken_encoding(encoding),
             TruncationStep.from_hf_object(hf_tokenizer),
diff --git a/python/openvino_tokenizers/tokenizer_pipeline.py b/python/openvino_tokenizers/tokenizer_pipeline.py
index 3cab78b08..51fe4cc43 100644
--- a/python/openvino_tokenizers/tokenizer_pipeline.py
+++ b/python/openvino_tokenizers/tokenizer_pipeline.py
@@ -2,6 +2,7 @@
 # Copyright (C) 2018-2024 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0

+import logging
 import weakref
 from dataclasses import dataclass, field
 from functools import singledispatchmethod
@@ -25,6 +26,10 @@
     TOKENIZER_NAME,
 )
 from .str_pack import pack_string, pack_strings
+from .utils import has_incompatible_re2_op
+
+
+logger = logging.getLogger(__name__)


 @dataclass
@@ -98,6 +103,15 @@ class RegexNormalizationStep(NormalizationStep):
     regex_search_pattern: str
     replace_term: str

+    def __post_init__(self):
+        self.vet_search_pattern()
+
+    def vet_search_pattern(self) -> None:
+        if has_incompatible_re2_op(self.regex_search_pattern):
+            logger.warning(
+                "RegexNormalization pattern is not supported, operation output might differ from the original tokenizer."
+            )
+
     @classmethod
     def strip_accents_regex(cls) -> "RegexNormalizationStep":
         return cls(regex_search_pattern=r"\p{Mn}", replace_term="")
@@ -168,6 +182,20 @@ class RegexSplitStep(PreTokenizatinStep):
     invert: bool = False
     behaviour: str = "remove"

+    def __post_init__(self):
+        self.vet_split_pattern()
+
+    def vet_split_pattern(self) -> None:
+        if r"(?!\S)" in self.split_pattern:
+            # rewrite regex pattern to get results closer to qwen.cpp results
+            logger.warning(r"Replace `(?!\S)` pattern to `(?:$|[^\S])` in RegexSplit operation")
+            self.split_pattern = self.split_pattern.replace(r"(?!\S)", r"(?:$|[^\S])")
+
+        if has_incompatible_re2_op(self.split_pattern):
+            logger.warning(
+                "RegexSplit pattern is not supported, operation output might differ from the original tokenizer."
+            )
+
     @classmethod
     def bert_whitespace_splitter(cls) -> "RegexSplitStep":
         return cls(split_pattern=r"\s+", invert=False)
@@ -481,6 +509,7 @@ def set_token_id(self, vocab: Optional[List[str]]) -> None:
     def token_id(self) -> Optional[int]:
         return self._token_id

+
 @dataclass
 class TokenWithTypeId:
     token_type_id: Optional[int] = None
diff --git a/python/openvino_tokenizers/utils.py b/python/openvino_tokenizers/utils.py
index c31f8ed08..55f2a6792 100644
--- a/python/openvino_tokenizers/utils.py
+++ b/python/openvino_tokenizers/utils.py
@@ -3,6 +3,7 @@
 # SPDX-License-Identifier: Apache-2.0

 import logging
+import re
 from typing import Dict, Optional, Sequence, Tuple, Union

 from openvino import Model, Type
@@ -87,7 +88,7 @@ def greedy_decoder(input) -> Model:


 def add_greedy_decoding(
-    text_generation_model: Model, logits_output: str = LOGITS_OUTPUT_NAME, output_type: Type = Type.i64
+    text_generation_model: Model, logits_output: str = LOGITS_OUTPUT_NAME, output_type: Type = Type.i64
 ) -> Model:
     ppp = PrePostProcessor(text_generation_model)
     ppp.output(logits_output).postprocess().custom(greedy_decoder)
@@ -109,3 +110,22 @@ def change_outputs_type(model: Model, output_type: Type) -> Model:
     for idx, _ in enumerate(model.outputs):
         ppp.output(idx).tensor().set_element_type(output_type)
     return ppp.build()
+
+
+def has_incompatible_re2_op(pattern: str) -> bool:
+    return "(?=" in pattern or "(?!" in pattern or "(?<=" in pattern or "(?<!" in pattern
+
+
+_subpattern_regex = re.compile(...)
+
+
+def filter_re2_incompatible(pattern: str) -> str:
+    not_filtered = []
+
+    for subpattern in (match.group() for match in _subpattern_regex.finditer(pattern)):
+        if has_incompatible_re2_op(subpattern):
+            logging.warning(f"Subpattern `{subpattern}` is not supported by re2 and filtered out.")
+            continue
+        not_filtered.append(subpattern)
+
+    return "|".join(not_filtered)
diff --git a/tests/conftest.py b/tests/conftest.py
index 353d73ae6..5cd1ed82d 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -56,11 +56,9 @@ def add_tokenizer_type(row):
     new_readme.write(old_readme)
     new_readme.write(
         "## Test Results\n\n"
-        "This report is autogenerated and includes tokenizers and detokenizers tests. "
-        "The `Output Matched, %` column shows the percent of test strings "
-        "for which the results of OpenVINO and Hugingface Tokenizers are the same. "
-        "To update the report run `pytest tokenizers_test.py --update_readme` in "
-        "`modules/custom_operations/user_ie_extensions/tokenizer/python/tests` directory.\n\n"
+        "This report is autogenerated and includes tokenizers and detokenizers tests. The `Output Matched, %` column "
+        "shows the percent of test strings for which the results of OpenVINO and Hugingface Tokenizers are the same. "
+        "To update the report run `pytest --update_readme tokenizers_test.py` in `tests` directory.\n\n"
         "### Output Match by Tokenizer Type\n\n"
     )
     is_pandas_2 = tuple(map(int, version("pandas").split("."))) >= (2, 0, 0)
diff --git a/tests/pass_rates.json b/tests/pass_rates.json
index 04ed42352..2e58bdf36 100644
--- a/tests/pass_rates.json
+++ b/tests/pass_rates.json
@@ -1,3 +1,3 @@
 {
-    "tokenizers_test.py::test_": 0.9110740586355426
+    "tokenizers_test.py::test_": 0.9201055995553703
 }
\ No newline at end of file
diff --git a/tests/tokenizers_test.py b/tests/tokenizers_test.py
index 7a8acd78d..8a3d046d9 100644
--- a/tests/tokenizers_test.py
+++ b/tests/tokenizers_test.py
@@ -278,7 +278,7 @@ def test_hf_wordpiece_tokenizers(wordpiece_tokenizers, test_string):
     hf_tokenizer, ov_tokenizer = wordpiece_tokenizers
     packed_strings = pack_strings([test_string])

-    hf_tokenized = hf_tokenizer([test_string], return_tensors="np")
+    hf_tokenized = hf_tokenizer([test_string], return_tensors="np", truncation=True)
     ov_tokenized = ov_tokenizer(packed_strings)

     for output_name, hf_result in hf_tokenized.items():
@@ -298,7 +298,7 @@ def test_hf_wordpiece_tokenizers_multiple_strings(wordpiece_tokenizers, test_str
     hf_tokenizer, ov_tokenizer = wordpiece_tokenizers
     packed_strings = pack_strings(test_string)

-    hf_tokenized = hf_tokenizer(test_string, return_tensors="np", padding=True)
+    hf_tokenized = hf_tokenizer(test_string, return_tensors="np", padding=True, truncation=True)
     ov_tokenized = ov_tokenizer(packed_strings)

     for output_name, hf_result in hf_tokenized.items():
@@ -317,7 +317,7 @@ def test_hf_wordpiece_tokenizers_multiple_strings(wordpiece_tokenizers, test_str
 def test_sentencepiece_model_tokenizer(sentencepice_tokenizers, test_string):
     hf_tokenizer, ov_tokenizer = sentencepice_tokenizers

-    hf_tokenized = hf_tokenizer(test_string, return_tensors="np")
+    hf_tokenized = hf_tokenizer(test_string, return_tensors="np", truncation=True)
     ov_tokenized = ov_tokenizer(pack_strings([test_string]))

     for output_name, hf_result in hf_tokenized.items():
@@ -364,7 +364,7 @@ def test_hf_bpe_tokenizers_outputs(bpe_tokenizers, test_string):
     hf_tokenizer, ov_tokenizer = bpe_tokenizers
     packed_strings = pack_strings([test_string])

-    hf_tokenized = hf_tokenizer([test_string], return_tensors="np")
+    hf_tokenized = hf_tokenizer([test_string], return_tensors="np", truncation=True)
     ov_tokenized = ov_tokenizer(packed_strings)

     for output_name, hf_result in hf_tokenized.items():
@@ -410,7 +410,7 @@ def test_bpe_detokenizer(
 def test_tiktoken_tokenizers(tiktoken_tokenizers, test_string):
     hf_tokenizer, ov_tokenizer = tiktoken_tokenizers

-    hf_tokenized = hf_tokenizer(test_string, return_tensors="np")
+    hf_tokenized = hf_tokenizer(test_string, return_tensors="np", truncation=True)
     ov_tokenized = ov_tokenizer(pack_strings([test_string]))

     for output_name, hf_result in hf_tokenized.items():