[FEATURE] Update tokenizers #158

Merged (21 commits, Mar 14, 2024)

Commits
dbe8936
modified: EduNLP/SIF/tokenization/text/tokenization.py
KINGNEWBLUSH Mar 10, 2024
8fd96b2
Update tokenization.py
KINGNEWBLUSH Mar 10, 2024
025fa86
modified: AUTHORS.md
KINGNEWBLUSH Mar 11, 2024
aea99a2
modified: EduNLP/SIF/tokenization/text/tokenization.py
KINGNEWBLUSH Mar 11, 2024
970c1b9
modified: EduNLP/SIF/tokenization/text/tokenization.py
KINGNEWBLUSH Mar 11, 2024
a289a7a
modified: EduNLP/SIF/tokenization/text/tokenization.py
KINGNEWBLUSH Mar 11, 2024
ad7df8b
modified: EduNLP/SIF/tokenization/text/tokenization.py
KINGNEWBLUSH Mar 11, 2024
9423b31
modified: EduNLP/SIF/tokenization/text/tokenization.py
KINGNEWBLUSH Mar 11, 2024
5792e48
modified: EduNLP/SIF/tokenization/text/tokenization.py
KINGNEWBLUSH Mar 11, 2024
edc266f
modified: EduNLP/SIF/tokenization/text/tokenization.py
KINGNEWBLUSH Mar 11, 2024
c526016
modified: EduNLP/SIF/tokenization/text/tokenization.py
KINGNEWBLUSH Mar 11, 2024
64c6cda
modified: EduNLP/SIF/tokenization/text/tokenization.py
KINGNEWBLUSH Mar 12, 2024
3a53b51
modified: EduNLP/SIF/tokenization/text/tokenization.py
KINGNEWBLUSH Mar 12, 2024
569bb9f
modified: EduNLP/SIF/tokenization/text/tokenization.py
KINGNEWBLUSH Mar 12, 2024
4542258
modified: EduNLP/SIF/tokenization/text/tokenization.py
KINGNEWBLUSH Mar 12, 2024
721bc0a
modified: EduNLP/SIF/tokenization/text/tokenization.py
KINGNEWBLUSH Mar 12, 2024
1476f8a
modified: tests/test_tokenizer/test_tokenizer.py
KINGNEWBLUSH Mar 12, 2024
f02ccce
modified: EduNLP/SIF/tokenization/text/tokenization.py
KINGNEWBLUSH Mar 12, 2024
05172b4
modified: tests/test_tokenizer/test_tokenizer.py
KINGNEWBLUSH Mar 12, 2024
767778f
modified: EduNLP/SIF/tokenization/text/tokenization.py
KINGNEWBLUSH Mar 12, 2024
e86a5e6
modified: EduNLP/SIF/tokenization/text/tokenization.py
KINGNEWBLUSH Mar 12, 2024
1 change: 1 addition & 0 deletions AUTHORS.md
@@ -24,4 +24,5 @@

[Heng Yu](https://github.com/GNEHUY)

[Tianyun Ji](https://github.com/KINGNEWBLUSH)
The starred contributors are the corresponding authors.
92 changes: 78 additions & 14 deletions EduNLP/SIF/tokenization/text/tokenization.py
@@ -2,7 +2,14 @@
# 2021/5/18 @ tongshiwei
import logging
import jieba
from nltk.tokenize import word_tokenize
import nltk
import spacy
import tokenizers as huggingface_tokenizer
from tokenizers.trainers import BpeTrainer
from .stopwords import DEFAULT_STOPWORDS
from tokenizers import Tokenizer as HGTokenizer


jieba.setLogLevel(logging.INFO)

@@ -15,7 +22,13 @@ def is_chinese(word):
return True


def tokenize(text, granularity="word", stopwords="default"):
def tokenize(text,
granularity="word",
stopwords="default",
tokenizer="jieba",
tok_model="en_core_web_sm",
bpe_json='bpe.tokenizer.json',
bpe_trainfile=None):
"""
Tokenize an item by word or char, using the jieba library by default or one of the nltk, spacy, or bpe backends.

@@ -37,17 +50,68 @@ def tokenize(text, granularity="word", stopwords="default"):
"""
stopwords = DEFAULT_STOPWORDS if stopwords == "default" else stopwords
stopwords = stopwords if stopwords is not None else {}
if granularity == "word":
return [token for token in jieba.cut(text) if token not in stopwords and token.strip()]
elif granularity == "char":
jieba_tokens = [token for token in jieba.cut(text) if token not in stopwords and token.strip()]
# Use jieba_tokens to handle sentences with mixed Chinese and English.
split_tokens = []
for token in jieba_tokens:
if is_chinese(token):
split_tokens.extend(list(token))
else:
split_tokens.append(token)
return split_tokens

if (tokenizer == 'jieba'):
if granularity == "word":
return [
token for token in jieba.cut(text)
if token not in stopwords and token.strip()
]
elif granularity == "char":
jieba_tokens = [
token for token in jieba.cut(text)
if token not in stopwords and token.strip()
]
# Use jieba_tokens to handle sentences with mixed Chinese and English.
split_tokens = []
for token in jieba_tokens:
if is_chinese(token):
split_tokens.extend(list(token))
else:
split_tokens.append(token)
return split_tokens
else:
raise TypeError("Unknown granularity %s" % granularity)

elif (tokenizer == 'nltk'):
try:
return [
token for token in word_tokenize(text)
if token not in stopwords and token.strip()
]
except LookupError:
nltk.download('punkt')
return [
token for token in word_tokenize(text)
if token not in stopwords and token.strip()
]

elif (tokenizer == 'spacy'):
try:
spacy_tokenizer = spacy.load(tok_model)
except OSError:
spacy.cli.download(tok_model)
spacy_tokenizer = spacy.load(tok_model)
output = spacy_tokenizer(str(text))
return [
token.text for token in output
if token.text not in stopwords
]

elif (tokenizer == 'bpe'):
try:
tokenizer = HGTokenizer.from_file(bpe_json)
except Exception:
tokenizer = huggingface_tokenizer.Tokenizer(
huggingface_tokenizer.models.BPE())
if (bpe_trainfile is None):
raise LookupError("bpe train file not found, using %s." % bpe_trainfile)
trainer = BpeTrainer(
special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
tokenizer.train(files=[bpe_trainfile], trainer=trainer)
tokenizer.save(bpe_json, pretty=True)
output = tokenizer.encode(text)
output = output.tokens
return output[0]
else:
raise TypeError("Unknown granularity %s" % granularity)
raise TypeError("Invalid Spliter: %s" % tokenizer)
3 changes: 3 additions & 0 deletions setup.py
@@ -61,6 +61,9 @@
'networkx',
'numpy>=1.17.0',
'jieba',
'nltk',
'spacy',
'tokenizers',
'js2py',
'EduData>=0.0.16',
'PyBaize>=0.0.3'
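
A side note on the new dependencies: the nltk and spacy branches fetch their resources lazily at runtime. A small sketch of pre-fetching them up front (standard nltk/spacy calls, not part of this PR):

import nltk
import spacy

nltk.download("punkt")                # data used by nltk.word_tokenize
spacy.cli.download("en_core_web_sm")  # default tok_model in tokenize()
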
44 changes: 44 additions & 0 deletions tests/test_tokenizer/test_tokenizer.py
@@ -4,6 +4,7 @@
import pytest
from EduNLP.Tokenizer import get_tokenizer
from EduNLP.Pretrain import DisenQTokenizer
from EduNLP.utils import abs_current_dir, path_append


def test_tokenizer():
@@ -50,6 +51,49 @@ def test_CharTokenizer():
assert ret == ans


def test_TokenizerNLTK():
items = ["The stationery store has 600 exercise books, and after selling\
some, there are still 4 packs left, 25 each, how many are sold?"]
ans = [
'The', 'stationery', 'store', 'has', '600', 'exercise',
'books', 'and', 'after', 'selling', 'some', 'there', 'are', 'still',
'4', 'packs', 'left', '25', 'each', 'how', 'many', 'are', 'sold'
]
tokenizer = get_tokenizer("pure_text",
text_params={"tokenizer": 'nltk', "stopwords": set(",?")})
tokens = tokenizer(items)
ret = next(tokens)
assert ret == ans


def test_TokenizerSpacy():
items = ["The stationery store has 600 exercise books, and after selling\
some, there are still 4 packs left, 25 each, how many are sold?"]
ans = [
'The', 'stationery', 'store', 'has', '600', 'exercise',
'books', 'and', 'after', 'selling', ' ', 'some', 'there', 'are', 'still',
'4', 'packs', 'left', '25', 'each', 'how', 'many', 'are', 'sold'
]
tokenizer = get_tokenizer("pure_text",
text_params={"tokenizer": 'spacy', "stopwords": set(",?")})
tokens = tokenizer(items)
ret = next(tokens)
assert ret == ans


def test_TokenizerBPE():
items = ['The stationery store has $600$ exercise books, and after selling some,\
there are still $4$ packs left, $25$ each, how many are sold?']
ans = ['h', '600', ' ', '4', ' ', '25', ' ']
data_path = path_append(abs_current_dir(__file__),
"../../static/test_data/standard_luna_data.json", to_str=True)
tokenizer = get_tokenizer("pure_text", text_params={"tokenizer": 'bpe', "stopwords": set(",?"),
"bpe_trainfile": data_path})
tokens = tokenizer(items)
ret = next(tokens)
Collaborator (review comment): Does it support Chinese?

assert ret == ans


def test_SpaceTokenizer():
items = ['文具店有 $600$ 本练习本,卖出一些后,还剩 $4$ 包,每包 $25$ 本,卖出多少本?']
tokenizer = get_tokenizer("space", stop_words=[])
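
A closing note on the BPE branch: the first call trains a tokenizer from bpe_trainfile and saves it to bpe_json, and later calls reload that file instead of retraining. A minimal sketch of that round trip with the tokenizers API (corpus and file names are placeholders):

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer

# Train once from a plain-text corpus and persist the result.
tokenizer = Tokenizer(BPE())
trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
tokenizer.train(files=["corpus.txt"], trainer=trainer)
tokenizer.save("bpe.tokenizer.json", pretty=True)

# Subsequent runs can reload the cached tokenizer directly.
cached = Tokenizer.from_file("bpe.tokenizer.json")
print(cached.encode("600 exercise books").tokens)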