
Commit

Check Regex For Supported Operations
apaniukov authored Feb 13, 2024
2 parents 73e3592 + c53ccb7 commit 4a332b4
Showing 7 changed files with 94 additions and 41 deletions.
44 changes: 22 additions & 22 deletions README.md
@@ -264,12 +264,12 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
<tbody>
<tr>
<td >BPE</td>
<td >95.84</td>
<td >96.74</td>
<td >3439</td>
</tr>
<tr>
<td >SentencePiece</td>
<td >86.36</td>
<td >86.08</td>
<td >2896</td>
</tr>
<tr>
@@ -279,7 +279,7 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
</tr>
<tr>
<td >WordPiece</td>
<td >82.55</td>
<td >90.43</td>
<td >533</td>
</tr>
</tbody>
@@ -300,13 +300,13 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
<tr>
<td >BPE</td>
<td >EleutherAI/gpt-j-6b</td>
<td >98.34</td>
<td >98.90</td>
<td >181</td>
</tr>
<tr>
<td >BPE</td>
<td >EleutherAI/gpt-neo-125m</td>
<td >98.34</td>
<td >98.90</td>
<td >181</td>
</tr>
<tr>
@@ -330,7 +330,7 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
<tr>
<td >BPE</td>
<td >Salesforce/codegen-16B-multi</td>
<td >97.24</td>
<td >97.79</td>
<td >181</td>
</tr>
<tr>
@@ -354,7 +354,7 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
<tr>
<td >BPE</td>
<td >facebook/bart-large-mnli</td>
<td >97.24</td>
<td >98.90</td>
<td >181</td>
</tr>
<tr>
@@ -372,31 +372,31 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
<tr>
<td >BPE</td>
<td >gpt2</td>
<td >97.24</td>
<td >98.90</td>
<td >181</td>
</tr>
<tr>
<td >BPE</td>
<td >laion/CLIP-ViT-bigG-14-laion2B-39B-b160k</td>
<td >61.33</td>
<td >65.19</td>
<td >181</td>
</tr>
<tr>
<td >BPE</td>
<td >microsoft/deberta-base</td>
<td >96.13</td>
<td >98.90</td>
<td >181</td>
</tr>
<tr>
<td >BPE</td>
<td >roberta-base</td>
<td >96.13</td>
<td >98.90</td>
<td >181</td>
</tr>
<tr>
<td >BPE</td>
<td >sentence-transformers/all-roberta-large-v1</td>
<td >96.13</td>
<td >98.90</td>
<td >181</td>
</tr>
<tr>
@@ -456,7 +456,7 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
<tr>
<td >SentencePiece</td>
<td >camembert-base_slow</td>
<td >75.14</td>
<td >74.03</td>
<td >181</td>
</tr>
<tr>
@@ -486,13 +486,13 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
<tr>
<td >SentencePiece</td>
<td >xlm-roberta-base</td>
<td >98.90</td>
<td >97.24</td>
<td >181</td>
</tr>
<tr>
<td >SentencePiece</td>
<td >xlm-roberta-base_slow</td>
<td >98.90</td>
<td >97.24</td>
<td >181</td>
</tr>
<tr>
@@ -528,19 +528,19 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
<tr>
<td >WordPiece</td>
<td >ProsusAI/finbert</td>
<td >80.49</td>
<td >95.12</td>
<td >41</td>
</tr>
<tr>
<td >WordPiece</td>
<td >bert-base-multilingual-cased</td>
<td >80.49</td>
<td >95.12</td>
<td >41</td>
</tr>
<tr>
<td >WordPiece</td>
<td >bert-large-cased</td>
<td >80.49</td>
<td >95.12</td>
<td >41</td>
</tr>
<tr>
@@ -552,13 +552,13 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
<tr>
<td >WordPiece</td>
<td >distilbert-base-uncased-finetuned-sst-2-english</td>
<td >80.49</td>
<td >95.12</td>
<td >41</td>
</tr>
<tr>
<td >WordPiece</td>
<td >google/electra-base-discriminator</td>
<td >80.49</td>
<td >95.12</td>
<td >41</td>
</tr>
<tr>
@@ -588,7 +588,7 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
<tr>
<td >WordPiece</td>
<td >rasa/LaBSE</td>
<td >73.17</td>
<td >87.80</td>
<td >41</td>
</tr>
<tr>
@@ -600,7 +600,7 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
<tr>
<td >WordPiece</td>
<td >squeezebert/squeezebert-uncased</td>
<td >80.49</td>
<td >95.12</td>
<td >41</td>
</tr>
</tbody>
20 changes: 13 additions & 7 deletions python/openvino_tokenizers/hf_parser.py
@@ -51,14 +51,21 @@
WhitespaceSplitStep,
WordPieceTokenizationStep,
)
from .utils import filter_re2_incompatible


def parse_replace_normalizer(normalizer_dict: Dict[str, Any]) -> RegexNormalizationStep:
def parse_replace_normalizer(normalizer_dict: Dict[str, Any]) -> List[RegexNormalizationStep]:
regex_search_pattern = normalizer_dict["pattern"].get("String") or normalizer_dict["pattern"]["Regex"]
return RegexNormalizationStep(
regex_search_pattern=regex_search_pattern,
replace_term=normalizer_dict["content"],
)
filtered_pattern = filter_re2_incompatible(regex_search_pattern)
if filtered_pattern == "":
return []

return [
RegexNormalizationStep(
regex_search_pattern=regex_search_pattern,
replace_term=normalizer_dict["content"],
)
]


def parse_bert_normalizer(normalizer_dict: Dict[str, Any]) -> List[NormalizationStep]:
@@ -368,7 +375,6 @@ def modify_sentencepiece_model(
sp_model_path: Path,
add_tokens: Dict[int, str],
skip_special_tokens: bool = False,
reference_vocab: Optional[List[str]] = None,
) -> None:
model_pb = import_protobuf()
model = model_pb.ModelProto()
@@ -573,7 +579,7 @@ def convert_tiktoken_model_tokenizer(
pipeline.add_steps(
[
NormalizeUnicode("NFC"),
RegexSplitStep(split_pattern),
RegexSplitStep(split_pattern, behaviour="contiguous"),
BytesToCharsStep(),
BPETokenizationStep.from_tiktoken_encoding(encoding),
TruncationStep.from_hf_object(hf_tokenizer),
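
For orientation (not part of the diff): a minimal sketch of how the reworked parse_replace_normalizer behaves after this change, assuming the module is importable as openvino_tokenizers.hf_parser; the normalizer dicts below are illustrative stand-ins for entries from a Hugging Face tokenizer.json.

# Illustrative only: parse_replace_normalizer now returns a list of steps,
# which is empty when the whole pattern is filtered out as re2-incompatible.
from openvino_tokenizers.hf_parser import parse_replace_normalizer

# A plain pattern survives filtering and yields one RegexNormalizationStep.
steps = parse_replace_normalizer({"pattern": {"Regex": r"\s+"}, "content": " "})
print(len(steps))  # 1

# A pure lookbehind pattern is filtered down to an empty string, so no step is added.
steps = parse_replace_normalizer({"pattern": {"Regex": r"(?<=\d)"}, "content": ""})
print(len(steps))  # 0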
29 changes: 29 additions & 0 deletions python/openvino_tokenizers/tokenizer_pipeline.py
@@ -2,6 +2,7 @@
# Copyright (C) 2018-2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

import logging
import weakref
from dataclasses import dataclass, field
from functools import singledispatchmethod
@@ -25,6 +26,10 @@
TOKENIZER_NAME,
)
from .str_pack import pack_string, pack_strings
from .utils import has_incompatible_re2_op


logger = logging.getLogger(__name__)


@dataclass
@@ -98,6 +103,15 @@ class RegexNormalizationStep(NormalizationStep):
regex_search_pattern: str
replace_term: str

def __post_init__(self):
self.vet_search_pattern()

def vet_search_pattern(self) -> None:
if has_incompatible_re2_op(self.regex_search_pattern):
logger.warning(
"RegexNormalization pattern is not supported, operation output might differ from the original tokenizer."
)

@classmethod
def strip_accents_regex(cls) -> "RegexNormalizationStep":
return cls(regex_search_pattern=r"\p{Mn}", replace_term="")
@@ -168,6 +182,20 @@ class RegexSplitStep(PreTokenizatinStep):
invert: bool = False
behaviour: str = "remove"

def __post_init__(self):
self.vet_split_pattern()

def vet_split_pattern(self) -> None:
if r"(?!\S)" in self.split_pattern:
# rewrite regex pattern to get results closer to qwen.cpp results
logger.warning(r"Replace `(?!\S)` pattern to `(?:$|[^\S])` in RegexSplit operation")
self.split_pattern = self.split_pattern.replace(r"(?!\S)", r"(?:$|[^\S])")

if has_incompatible_re2_op(self.split_pattern):
logger.warning(
"RegexSplit pattern is not supported, operation output might differ from the original tokenizer."
)

@classmethod
def bert_whitespace_splitter(cls) -> "RegexSplitStep":
return cls(split_pattern=r"\s+", invert=False)
@@ -481,6 +509,7 @@ def set_token_id(self, vocab: Optional[List[str]]) -> None:
def token_id(self) -> Optional[int]:
return self._token_id


@dataclass
class TokenWithTypeId:
token_type_id: Optional[int] = None
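
For orientation (not part of the diff): a short sketch of the new __post_init__ vetting added to RegexNormalizationStep and RegexSplitStep, assuming the steps can be constructed directly the way this file already does elsewhere; the patterns are illustrative.

# Illustrative only: the new vetting hooks rewrite or warn about patterns
# that re2 cannot handle, instead of failing later at graph construction time.
import logging

from openvino_tokenizers.tokenizer_pipeline import RegexNormalizationStep, RegexSplitStep

logging.basicConfig(level=logging.WARNING)

# `(?!\S)` is rewritten to `(?:$|[^\S])` and a warning is logged.
split = RegexSplitStep(split_pattern=r"'s|\s+(?!\S)")
print(split.split_pattern)  # 's|\s+(?:$|[^\S])

# An unsupported look-around only triggers a warning; the step keeps the original pattern.
norm = RegexNormalizationStep(regex_search_pattern=r"(?<=\d)foo", replace_term="bar")
print(norm.regex_search_pattern)  # (?<=\d)foo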
22 changes: 21 additions & 1 deletion python/openvino_tokenizers/utils.py
@@ -3,6 +3,7 @@
# SPDX-License-Identifier: Apache-2.0

import logging
import re
from typing import Dict, Optional, Sequence, Tuple, Union

from openvino import Model, Type
@@ -87,7 +88,7 @@ def greedy_decoder(input) -> Model:


def add_greedy_decoding(
text_generation_model: Model, logits_output: str = LOGITS_OUTPUT_NAME, output_type: Type = Type.i64
text_generation_model: Model, logits_output: str = LOGITS_OUTPUT_NAME, output_type: Type = Type.i64
) -> Model:
ppp = PrePostProcessor(text_generation_model)
ppp.output(logits_output).postprocess().custom(greedy_decoder)
@@ -109,3 +110,22 @@ def change_outputs_type(model: Model, output_type: Type) -> Model:
for idx, _ in enumerate(model.outputs):
ppp.output(idx).tensor().set_element_type(output_type)
return ppp.build()


def has_incompatible_re2_op(pattern: str) -> bool:
return "(?=" in pattern or "(?!" in pattern or "(?<=" in pattern or "(?<!" in pattern


_subpattern_regex = re.compile(r"(?:[^()|]+|\([^)]*\))+")


def filter_re2_incompatible(pattern: str) -> str:
not_filtered = []

for subpattern in (match.group() for match in _subpattern_regex.finditer(pattern)):
if has_incompatible_re2_op(subpattern):
logging.warning(f"Subpattern `{subpattern}` is not supported by re2 and filtered out.")
continue
not_filtered.append(subpattern)

return "|".join(not_filtered)
8 changes: 3 additions & 5 deletions tests/conftest.py
@@ -56,11 +56,9 @@ def add_tokenizer_type(row):
new_readme.write(old_readme)
new_readme.write(
"## Test Results\n\n"
"This report is autogenerated and includes tokenizers and detokenizers tests. "
"The `Output Matched, %` column shows the percent of test strings "
"for which the results of OpenVINO and Hugingface Tokenizers are the same. "
"To update the report run `pytest tokenizers_test.py --update_readme` in "
"`modules/custom_operations/user_ie_extensions/tokenizer/python/tests` directory.\n\n"
"This report is autogenerated and includes tokenizers and detokenizers tests. The `Output Matched, %` column "
"shows the percent of test strings for which the results of OpenVINO and Hugingface Tokenizers are the same. "
"To update the report run `pytest --update_readme tokenizers_test.py` in `tests` directory.\n\n"
"### Output Match by Tokenizer Type\n\n"
)
is_pandas_2 = tuple(map(int, version("pandas").split("."))) >= (2, 0, 0)
2 changes: 1 addition & 1 deletion tests/pass_rates.json
@@ -1,3 +1,3 @@
{
"tokenizers_test.py::test_": 0.9110740586355426
"tokenizers_test.py::test_": 0.9201055995553703
}
10 changes: 5 additions & 5 deletions tests/tokenizers_test.py
@@ -278,7 +278,7 @@ def test_hf_wordpiece_tokenizers(wordpiece_tokenizers, test_string):
hf_tokenizer, ov_tokenizer = wordpiece_tokenizers
packed_strings = pack_strings([test_string])

hf_tokenized = hf_tokenizer([test_string], return_tensors="np")
hf_tokenized = hf_tokenizer([test_string], return_tensors="np", truncation=True)
ov_tokenized = ov_tokenizer(packed_strings)

for output_name, hf_result in hf_tokenized.items():
@@ -298,7 +298,7 @@ def test_hf_wordpiece_tokenizers_multiple_strings(wordpiece_tokenizers, test_str
hf_tokenizer, ov_tokenizer = wordpiece_tokenizers
packed_strings = pack_strings(test_string)

hf_tokenized = hf_tokenizer(test_string, return_tensors="np", padding=True)
hf_tokenized = hf_tokenizer(test_string, return_tensors="np", padding=True, truncation=True)
ov_tokenized = ov_tokenizer(packed_strings)

for output_name, hf_result in hf_tokenized.items():
@@ -317,7 +317,7 @@ def test_hf_wordpiece_tokenizers_multiple_strings(wordpiece_tokenizers, test_str
def test_sentencepiece_model_tokenizer(sentencepice_tokenizers, test_string):
hf_tokenizer, ov_tokenizer = sentencepice_tokenizers

hf_tokenized = hf_tokenizer(test_string, return_tensors="np")
hf_tokenized = hf_tokenizer(test_string, return_tensors="np", truncation=True)
ov_tokenized = ov_tokenizer(pack_strings([test_string]))

for output_name, hf_result in hf_tokenized.items():
@@ -364,7 +364,7 @@ def test_hf_bpe_tokenizers_outputs(bpe_tokenizers, test_string):
hf_tokenizer, ov_tokenizer = bpe_tokenizers
packed_strings = pack_strings([test_string])

hf_tokenized = hf_tokenizer([test_string], return_tensors="np")
hf_tokenized = hf_tokenizer([test_string], return_tensors="np", truncation=True)
ov_tokenized = ov_tokenizer(packed_strings)

for output_name, hf_result in hf_tokenized.items():
@@ -410,7 +410,7 @@ def test_bpe_detokenizer(
def test_tiktoken_tokenizers(tiktoken_tokenizers, test_string):
hf_tokenizer, ov_tokenizer = tiktoken_tokenizers

hf_tokenized = hf_tokenizer(test_string, return_tensors="np")
hf_tokenized = hf_tokenizer(test_string, return_tensors="np", truncation=True)
ov_tokenized = ov_tokenizer(pack_strings([test_string]))

for output_name, hf_result in hf_tokenized.items():
