From dbe89362cb57fb367db13ff93eb46f239810f333 Mon Sep 17 00:00:00 2001 From: KINGNEWBLUSH <102594899+KINGNEWBLUSH@users.noreply.github.com> Date: Sun, 10 Mar 2024 08:59:17 +0000 Subject: [PATCH 01/21] modified: EduNLP/SIF/tokenization/text/tokenization.py modified: setup.py modified: tests/test_tokenizer/test_tokenizer.py --- EduNLP/SIF/tokenization/text/tokenization.py | 91 +++++++++++++++++--- setup.py | 3 + tests/test_tokenizer/test_tokenizer.py | 21 +++++ 3 files changed, 101 insertions(+), 14 deletions(-) diff --git a/EduNLP/SIF/tokenization/text/tokenization.py b/EduNLP/SIF/tokenization/text/tokenization.py index 45bd96a6..cc11b97f 100644 --- a/EduNLP/SIF/tokenization/text/tokenization.py +++ b/EduNLP/SIF/tokenization/text/tokenization.py @@ -2,6 +2,11 @@ # 2021/5/18 @ tongshiwei import logging import jieba +from nltk.tokenize import word_tokenize +import nltk +import spacy +import tokenizers as huggingface_tokenizer +from tokenizers.trainers import BpeTrainer from .stopwords import DEFAULT_STOPWORDS jieba.setLogLevel(logging.INFO) @@ -15,7 +20,13 @@ def is_chinese(word): return True -def tokenize(text, granularity="word", stopwords="default"): +def tokenize(text, + granularity="word", + stopwords="default", + tokenizer="jieba", + tok_model="en_core_web_sm", + bpe_json='bpe.tokenizer.json', + bpe_trainfile=None): """ Using jieba library to tokenize item by word or char. @@ -37,17 +48,69 @@ def tokenize(text, granularity="word", stopwords="default"): """ stopwords = DEFAULT_STOPWORDS if stopwords == "default" else stopwords stopwords = stopwords if stopwords is not None else {} - if granularity == "word": - return [token for token in jieba.cut(text) if token not in stopwords and token.strip()] - elif granularity == "char": - jieba_tokens = [token for token in jieba.cut(text) if token not in stopwords and token.strip()] - # Use jieba_tokens to hangle sentence with mixed chinese and english. - split_tokens = [] - for token in jieba_tokens: - if is_chinese(token): - split_tokens.extend(list(token)) - else: - split_tokens.append(token) - return split_tokens + + if (tokenizer == 'jieba'): + if granularity == "word": + return [ + token for token in jieba.cut(text) + if token not in stopwords and token.strip() + ] + elif granularity == "char": + jieba_tokens = [ + token for token in jieba.cut(text) + if token not in stopwords and token.strip() + ] + # Use jieba_tokens to hangle sentence with mixed chinese and english. + split_tokens = [] + for token in jieba_tokens: + if is_chinese(token): + split_tokens.extend(list(token)) + else: + split_tokens.append(token) + return split_tokens + else: + raise TypeError("Unknown granularity %s" % granularity) + + elif (tokenizer == 'nltk'): + try: + return [ + token for token in word_tokenize(text) + if token not in stopwords and token.strip() + ] + except: + nltk.download('punkt') + return [ + token for token in word_tokenize(text) + if token not in stopwords and token.strip() + ] + + elif (tokenizer == 'spacy'): + try: + spacy_tokenizer = spacy.load(tok_model) + except OSError: + spacy.cli.download(tok_model) + spacy_tokenizer = spacy.load(tok_model) + + return [ + token.text for token in spacy_tokenizer(text) + if token.text not in stopwords and token.text.strip() + ] + + elif (tokenizer == 'bpe'): + tokenizer = huggingface_tokenizer.Tokenizer( + huggingface_tokenizer.models.BPE()) + try: + tokenizer.load(bpe_json, pretty=True) + except: + if (bpe_trainfile is None): + raise OSError("bpe train file not found, using %s." 
% + bpe_trainfile) + trainer = BpeTrainer( + special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]) + tokenizer.train(files=[bpe_trainfile], trainer=trainer) + tokenizer.save(bpe_json, pretty=True) + return [ + token for token in tokenizer.encode(text) if token not in stopwords + ] else: - raise TypeError("Unknown granularity %s" % granularity) + raise TypeError("Invalid Spliter: %s" % tokenizer) diff --git a/setup.py b/setup.py index c5cc9e21..d6c02460 100644 --- a/setup.py +++ b/setup.py @@ -61,6 +61,9 @@ 'networkx', 'numpy>=1.17.0', 'jieba', + 'nltk', + 'spacy', + 'tokenizers', 'js2py', 'EduData>=0.0.16', 'PyBaize>=0.0.3' diff --git a/tests/test_tokenizer/test_tokenizer.py b/tests/test_tokenizer/test_tokenizer.py index e9471e39..7e2f7e49 100644 --- a/tests/test_tokenizer/test_tokenizer.py +++ b/tests/test_tokenizer/test_tokenizer.py @@ -50,6 +50,27 @@ def test_CharTokenizer(): assert ret == ans +def test_Tokenizer(): + items = [{ + "stem": + "The stationery store has $600$ exercise books, and after selling some,\ + there are still $4$ packs left, $25$ each, how many are sold?", + }] + ans = [ + 'The', 'stationery', 'store', 'has', '$', '600', '$', 'exercise', + 'books', 'and', 'after', 'selling', 'some', 'there', 'are', 'still', + '$', '4', '$', 'packs', 'left', '$', '25', '$', 'each', 'how', 'many', + 'are', 'sold' + ] + for tok in ['nltk', 'spacy']: + tokenizer = get_tokenizer("char", + stop_words=set(",?"), + text_params={"tokenizer": tok}) + tokens = tokenizer(items, key=lambda x: x['stem']) + ret = next(tokens) + assert ret == ans + + def test_SpaceTokenizer(): items = ['文具店有 $600$ 本练习本,卖出一些后,还剩 $4$ 包,每包 $25$ 本,卖出多少本?'] tokenizer = get_tokenizer("space", stop_words=[]) From 8fd96b2ba8da4e83a349d25b4aa9a8e48004a283 Mon Sep 17 00:00:00 2001 From: KINGNEWBLUSH <102594899+KINGNEWBLUSH@users.noreply.github.com> Date: Sun, 10 Mar 2024 17:10:41 +0800 Subject: [PATCH 02/21] Update tokenization.py --- EduNLP/SIF/tokenization/text/tokenization.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/EduNLP/SIF/tokenization/text/tokenization.py b/EduNLP/SIF/tokenization/text/tokenization.py index cc11b97f..edc8fe21 100644 --- a/EduNLP/SIF/tokenization/text/tokenization.py +++ b/EduNLP/SIF/tokenization/text/tokenization.py @@ -77,7 +77,7 @@ def tokenize(text, token for token in word_tokenize(text) if token not in stopwords and token.strip() ] - except: + except OSError: nltk.download('punkt') return [ token for token in word_tokenize(text) @@ -101,7 +101,7 @@ def tokenize(text, huggingface_tokenizer.models.BPE()) try: tokenizer.load(bpe_json, pretty=True) - except: + except OSError: if (bpe_trainfile is None): raise OSError("bpe train file not found, using %s." % bpe_trainfile) From 025fa862450dd468d177078ca008aee9f77241fd Mon Sep 17 00:00:00 2001 From: KINGNEWBLUSH <102594899+KINGNEWBLUSH@users.noreply.github.com> Date: Mon, 11 Mar 2024 06:36:20 +0000 Subject: [PATCH 03/21] modified: AUTHORS.md modified: tests/test_tokenizer/test_tokenizer.py --- AUTHORS.md | 1 + tests/test_tokenizer/test_tokenizer.py | 11 ++++------- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/AUTHORS.md b/AUTHORS.md index bcafe6e0..c8ea3051 100644 --- a/AUTHORS.md +++ b/AUTHORS.md @@ -24,4 +24,5 @@ [Heng Yu](https://github.com/GNEHUY) +[Tianyun Ji](https://github.com/KINGNEWBLUSH) The stared contributors are the corresponding authors. 
diff --git a/tests/test_tokenizer/test_tokenizer.py b/tests/test_tokenizer/test_tokenizer.py index 7e2f7e49..40b63688 100644 --- a/tests/test_tokenizer/test_tokenizer.py +++ b/tests/test_tokenizer/test_tokenizer.py @@ -51,11 +51,8 @@ def test_CharTokenizer(): def test_Tokenizer(): - items = [{ - "stem": - "The stationery store has $600$ exercise books, and after selling some,\ - there are still $4$ packs left, $25$ each, how many are sold?", - }] + items = ['The stationery store has $600$ exercise books, and after selling some,\ + there are still $4$ packs left, $25$ each, how many are sold?'] ans = [ 'The', 'stationery', 'store', 'has', '$', '600', '$', 'exercise', 'books', 'and', 'after', 'selling', 'some', 'there', 'are', 'still', @@ -63,10 +60,10 @@ def test_Tokenizer(): 'are', 'sold' ] for tok in ['nltk', 'spacy']: - tokenizer = get_tokenizer("char", + tokenizer = get_tokenizer("pure_text", stop_words=set(",?"), text_params={"tokenizer": tok}) - tokens = tokenizer(items, key=lambda x: x['stem']) + tokens = tokenizer(items) ret = next(tokens) assert ret == ans From aea99a2d3eb2f0808cb7810446b38d6639f3e878 Mon Sep 17 00:00:00 2001 From: KINGNEWBLUSH <102594899+KINGNEWBLUSH@users.noreply.github.com> Date: Mon, 11 Mar 2024 11:25:34 +0000 Subject: [PATCH 04/21] modified: EduNLP/SIF/tokenization/text/tokenization.py modified: tests/test_tokenizer/test_tokenizer.py --- EduNLP/SIF/tokenization/text/tokenization.py | 8 ++++---- tests/test_tokenizer/test_tokenizer.py | 20 ++++++++++++++++++-- 2 files changed, 22 insertions(+), 6 deletions(-) diff --git a/EduNLP/SIF/tokenization/text/tokenization.py b/EduNLP/SIF/tokenization/text/tokenization.py index edc8fe21..8ca52a1e 100644 --- a/EduNLP/SIF/tokenization/text/tokenization.py +++ b/EduNLP/SIF/tokenization/text/tokenization.py @@ -77,7 +77,7 @@ def tokenize(text, token for token in word_tokenize(text) if token not in stopwords and token.strip() ] - except OSError: + except LookupError: nltk.download('punkt') return [ token for token in word_tokenize(text) @@ -87,7 +87,7 @@ def tokenize(text, elif (tokenizer == 'spacy'): try: spacy_tokenizer = spacy.load(tok_model) - except OSError: + except LookupError: spacy.cli.download(tok_model) spacy_tokenizer = spacy.load(tok_model) @@ -101,9 +101,9 @@ def tokenize(text, huggingface_tokenizer.models.BPE()) try: tokenizer.load(bpe_json, pretty=True) - except OSError: + except LookupError: if (bpe_trainfile is None): - raise OSError("bpe train file not found, using %s." % + raise LookupError("bpe train file not found, using %s." 
% bpe_trainfile) trainer = BpeTrainer( special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]) diff --git a/tests/test_tokenizer/test_tokenizer.py b/tests/test_tokenizer/test_tokenizer.py index 40b63688..c1f8c698 100644 --- a/tests/test_tokenizer/test_tokenizer.py +++ b/tests/test_tokenizer/test_tokenizer.py @@ -61,12 +61,28 @@ def test_Tokenizer(): ] for tok in ['nltk', 'spacy']: tokenizer = get_tokenizer("pure_text", - stop_words=set(",?"), - text_params={"tokenizer": tok}) + text_params={"tokenizer": tok, "stop_words":set(",?")}) tokens = tokenizer(items) ret = next(tokens) assert ret == ans +def test_TokenizerBPE(): + items = ['The stationery store has $600$ exercise books, and after selling some,\ + there are still $4$ packs left, $25$ each, how many are sold?'] + ans = [ + ['h', 'e', ' ', 'st', 'at', 'io', 'n', 'er', 'y', ' ', 'st', 'o', 're', ' ', + 'h', 'as', ' $', '6', '00', '$ ', 'e', 'x', 'er', 'ci', 's', 'e', ' b', 'o', + 'o', 'k', 's', ', ', 'an', 'd', ' a', 'ft', 'er', ' ', 's', 'e', 'l', 'l', + 'in', 'g', ' ', 's', 'ome', ', ', 't', 'h', 'e', 're', ' ', 'are', ' ', + 'st', 'i', 'l', 'l', ' $', '4', '$ ', 'p', 'a', 'c', 'k', 's', ' ', 'left', + ', ', '$', '25', '$ ', 'e', 'a', 'c', 'h', ', ', 'h', 'ow', ' m', 'an', 'y', + ' ', 'are', ' ', 's', 'o', 'l', 'd'] + ] + tokenizer = get_tokenizer("pure_text", + text_params={"tokenizer": 'bpe', "bpe_trainfile":"../../static/test_data/standard_luna_data.json", "stop_words":set(",?")}) + tokens = tokenizer(items) + ret = next(tokens) + assert ret == ans def test_SpaceTokenizer(): items = ['文具店有 $600$ 本练习本,卖出一些后,还剩 $4$ 包,每包 $25$ 本,卖出多少本?'] From 970c1b9724f0faecca26bfe66eba47127e1bbd8a Mon Sep 17 00:00:00 2001 From: KINGNEWBLUSH <102594899+KINGNEWBLUSH@users.noreply.github.com> Date: Mon, 11 Mar 2024 12:17:07 +0000 Subject: [PATCH 05/21] modified: EduNLP/SIF/tokenization/text/tokenization.py modified: tests/test_tokenizer/test_tokenizer.py --- EduNLP/SIF/tokenization/text/tokenization.py | 3 +-- tests/test_tokenizer/test_tokenizer.py | 8 ++++++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/EduNLP/SIF/tokenization/text/tokenization.py b/EduNLP/SIF/tokenization/text/tokenization.py index 8ca52a1e..1ab9356e 100644 --- a/EduNLP/SIF/tokenization/text/tokenization.py +++ b/EduNLP/SIF/tokenization/text/tokenization.py @@ -103,8 +103,7 @@ def tokenize(text, tokenizer.load(bpe_json, pretty=True) except LookupError: if (bpe_trainfile is None): - raise LookupError("bpe train file not found, using %s." % - bpe_trainfile) + raise LookupError("bpe train file not found, using %s." 
%bpe_trainfile) trainer = BpeTrainer( special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]) tokenizer.train(files=[bpe_trainfile], trainer=trainer) diff --git a/tests/test_tokenizer/test_tokenizer.py b/tests/test_tokenizer/test_tokenizer.py index c1f8c698..7d254db7 100644 --- a/tests/test_tokenizer/test_tokenizer.py +++ b/tests/test_tokenizer/test_tokenizer.py @@ -61,11 +61,12 @@ def test_Tokenizer(): ] for tok in ['nltk', 'spacy']: tokenizer = get_tokenizer("pure_text", - text_params={"tokenizer": tok, "stop_words":set(",?")}) + text_params={"tokenizer": tok, "stopwords": set(",?")}) tokens = tokenizer(items) ret = next(tokens) assert ret == ans + def test_TokenizerBPE(): items = ['The stationery store has $600$ exercise books, and after selling some,\ there are still $4$ packs left, $25$ each, how many are sold?'] @@ -79,11 +80,14 @@ def test_TokenizerBPE(): ' ', 'are', ' ', 's', 'o', 'l', 'd'] ] tokenizer = get_tokenizer("pure_text", - text_params={"tokenizer": 'bpe', "bpe_trainfile":"../../static/test_data/standard_luna_data.json", "stop_words":set(",?")}) + text_params={"tokenizer": 'bpe', + "bpe_trainfile": "../../static/test_data/standard_luna_data.json", + "stopwords": set(",?")}) tokens = tokenizer(items) ret = next(tokens) assert ret == ans + def test_SpaceTokenizer(): items = ['文具店有 $600$ 本练习本,卖出一些后,还剩 $4$ 包,每包 $25$ 本,卖出多少本?'] tokenizer = get_tokenizer("space", stop_words=[]) From a289a7a1847918bf792b7fe4473bdb4afa3fb1dd Mon Sep 17 00:00:00 2001 From: KINGNEWBLUSH <102594899+KINGNEWBLUSH@users.noreply.github.com> Date: Mon, 11 Mar 2024 12:32:10 +0000 Subject: [PATCH 06/21] modified: EduNLP/SIF/tokenization/text/tokenization.py modified: tests/test_tokenizer/test_tokenizer.py --- EduNLP/SIF/tokenization/text/tokenization.py | 14 +++++--------- tests/test_tokenizer/test_tokenizer.py | 9 ++++----- 2 files changed, 9 insertions(+), 14 deletions(-) diff --git a/EduNLP/SIF/tokenization/text/tokenization.py b/EduNLP/SIF/tokenization/text/tokenization.py index 1ab9356e..c1941720 100644 --- a/EduNLP/SIF/tokenization/text/tokenization.py +++ b/EduNLP/SIF/tokenization/text/tokenization.py @@ -99,15 +99,11 @@ def tokenize(text, elif (tokenizer == 'bpe'): tokenizer = huggingface_tokenizer.Tokenizer( huggingface_tokenizer.models.BPE()) - try: - tokenizer.load(bpe_json, pretty=True) - except LookupError: - if (bpe_trainfile is None): - raise LookupError("bpe train file not found, using %s." %bpe_trainfile) - trainer = BpeTrainer( - special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]) - tokenizer.train(files=[bpe_trainfile], trainer=trainer) - tokenizer.save(bpe_json, pretty=True) + if (bpe_trainfile is None): + raise LookupError("bpe train file not found, using %s." 
% bpe_trainfile) + trainer = BpeTrainer( + special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]) + tokenizer.train(files=[bpe_trainfile], trainer=trainer) return [ token for token in tokenizer.encode(text) if token not in stopwords ] diff --git a/tests/test_tokenizer/test_tokenizer.py b/tests/test_tokenizer/test_tokenizer.py index 7d254db7..b5a254cd 100644 --- a/tests/test_tokenizer/test_tokenizer.py +++ b/tests/test_tokenizer/test_tokenizer.py @@ -51,13 +51,12 @@ def test_CharTokenizer(): def test_Tokenizer(): - items = ['The stationery store has $600$ exercise books, and after selling some,\ - there are still $4$ packs left, $25$ each, how many are sold?'] + items = ['The stationery store has 600 exercise books, and after selling some,\ + there are still 4 packs left, 25 each, how many are sold?'] ans = [ - 'The', 'stationery', 'store', 'has', '$', '600', '$', 'exercise', + 'The', 'stationery', 'store', 'has', '600', 'exercise', 'books', 'and', 'after', 'selling', 'some', 'there', 'are', 'still', - '$', '4', '$', 'packs', 'left', '$', '25', '$', 'each', 'how', 'many', - 'are', 'sold' + '4', 'packs', 'left', '25', 'each', 'how', 'many', 'are', 'sold' ] for tok in ['nltk', 'spacy']: tokenizer = get_tokenizer("pure_text", From ad7df8b2edc9c44b7e8a5a00ab74882d94cd5519 Mon Sep 17 00:00:00 2001 From: KINGNEWBLUSH <102594899+KINGNEWBLUSH@users.noreply.github.com> Date: Mon, 11 Mar 2024 12:56:27 +0000 Subject: [PATCH 07/21] modified: EduNLP/SIF/tokenization/text/tokenization.py modified: tests/test_tokenizer/test_tokenizer.py --- EduNLP/SIF/tokenization/text/tokenization.py | 2 +- tests/test_tokenizer/test_tokenizer.py | 6 ++---- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/EduNLP/SIF/tokenization/text/tokenization.py b/EduNLP/SIF/tokenization/text/tokenization.py index c1941720..954145ac 100644 --- a/EduNLP/SIF/tokenization/text/tokenization.py +++ b/EduNLP/SIF/tokenization/text/tokenization.py @@ -87,7 +87,7 @@ def tokenize(text, elif (tokenizer == 'spacy'): try: spacy_tokenizer = spacy.load(tok_model) - except LookupError: + except OSError: spacy.cli.download(tok_model) spacy_tokenizer = spacy.load(tok_model) diff --git a/tests/test_tokenizer/test_tokenizer.py b/tests/test_tokenizer/test_tokenizer.py index b5a254cd..fd450a1c 100644 --- a/tests/test_tokenizer/test_tokenizer.py +++ b/tests/test_tokenizer/test_tokenizer.py @@ -78,10 +78,8 @@ def test_TokenizerBPE(): ', ', '$', '25', '$ ', 'e', 'a', 'c', 'h', ', ', 'h', 'ow', ' m', 'an', 'y', ' ', 'are', ' ', 's', 'o', 'l', 'd'] ] - tokenizer = get_tokenizer("pure_text", - text_params={"tokenizer": 'bpe', - "bpe_trainfile": "../../static/test_data/standard_luna_data.json", - "stopwords": set(",?")}) + tokenizer = get_tokenizer("pure_text", text_params={"tokenizer": 'bpe', "stopwords": set(",?"), + "bpe_trainfile": "../../../../static/test_data/standard_luna_data.json"}) tokens = tokenizer(items) ret = next(tokens) assert ret == ans From 9423b31911d0606ad66b0ba0f5940d4993292800 Mon Sep 17 00:00:00 2001 From: KINGNEWBLUSH <102594899+KINGNEWBLUSH@users.noreply.github.com> Date: Mon, 11 Mar 2024 13:26:11 +0000 Subject: [PATCH 08/21] modified: EduNLP/SIF/tokenization/text/tokenization.py modified: tests/test_tokenizer/test_tokenizer.py --- EduNLP/SIF/tokenization/text/tokenization.py | 4 ++-- tests/test_tokenizer/test_tokenizer.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/EduNLP/SIF/tokenization/text/tokenization.py b/EduNLP/SIF/tokenization/text/tokenization.py index 954145ac..0d149da9 100644 --- 
a/EduNLP/SIF/tokenization/text/tokenization.py +++ b/EduNLP/SIF/tokenization/text/tokenization.py @@ -92,8 +92,8 @@ def tokenize(text, spacy_tokenizer = spacy.load(tok_model) return [ - token.text for token in spacy_tokenizer(text) - if token.text not in stopwords and token.text.strip() + str(token.text) for token in spacy_tokenizer(text) + if str(token.text) not in stopwords ] elif (tokenizer == 'bpe'): diff --git a/tests/test_tokenizer/test_tokenizer.py b/tests/test_tokenizer/test_tokenizer.py index fd450a1c..66f5e0ee 100644 --- a/tests/test_tokenizer/test_tokenizer.py +++ b/tests/test_tokenizer/test_tokenizer.py @@ -79,7 +79,7 @@ def test_TokenizerBPE(): ' ', 'are', ' ', 's', 'o', 'l', 'd'] ] tokenizer = get_tokenizer("pure_text", text_params={"tokenizer": 'bpe', "stopwords": set(",?"), - "bpe_trainfile": "../../../../static/test_data/standard_luna_data.json"}) + "bpe_trainfile": "./EduNLP/static/test_data/standard_luna_data.json"}) tokens = tokenizer(items) ret = next(tokens) assert ret == ans From 5792e48b52081a0653b6fbf03e123b25ff218cae Mon Sep 17 00:00:00 2001 From: KINGNEWBLUSH <102594899+KINGNEWBLUSH@users.noreply.github.com> Date: Mon, 11 Mar 2024 14:06:14 +0000 Subject: [PATCH 09/21] modified: EduNLP/SIF/tokenization/text/tokenization.py modified: tests/test_tokenizer/test_tokenizer.py --- EduNLP/SIF/tokenization/text/tokenization.py | 4 ++-- tests/test_tokenizer/test_tokenizer.py | 5 +++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/EduNLP/SIF/tokenization/text/tokenization.py b/EduNLP/SIF/tokenization/text/tokenization.py index 0d149da9..452ee0a8 100644 --- a/EduNLP/SIF/tokenization/text/tokenization.py +++ b/EduNLP/SIF/tokenization/text/tokenization.py @@ -92,8 +92,8 @@ def tokenize(text, spacy_tokenizer = spacy.load(tok_model) return [ - str(token.text) for token in spacy_tokenizer(text) - if str(token.text) not in stopwords + token.text for token in spacy_tokenizer(text) + if token.text not in stopwords ] elif (tokenizer == 'bpe'): diff --git a/tests/test_tokenizer/test_tokenizer.py b/tests/test_tokenizer/test_tokenizer.py index 66f5e0ee..0b653222 100644 --- a/tests/test_tokenizer/test_tokenizer.py +++ b/tests/test_tokenizer/test_tokenizer.py @@ -4,7 +4,7 @@ import pytest from EduNLP.Tokenizer import get_tokenizer from EduNLP.Pretrain import DisenQTokenizer - +from EduNLP.utils import abs_current_dir, path_append def test_tokenizer(): with pytest.raises(KeyError): @@ -78,8 +78,9 @@ def test_TokenizerBPE(): ', ', '$', '25', '$ ', 'e', 'a', 'c', 'h', ', ', 'h', 'ow', ' m', 'an', 'y', ' ', 'are', ' ', 's', 'o', 'l', 'd'] ] + data_path = path_append(abs_current_dir(__file__), "../../static/test_data/standard_luna_data.json", to_str=True) tokenizer = get_tokenizer("pure_text", text_params={"tokenizer": 'bpe', "stopwords": set(",?"), - "bpe_trainfile": "./EduNLP/static/test_data/standard_luna_data.json"}) + "bpe_trainfile": data_path}) tokens = tokenizer(items) ret = next(tokens) assert ret == ans From edc266fffa8533e9c02b8e0b0dd2886437c1b8e0 Mon Sep 17 00:00:00 2001 From: KINGNEWBLUSH <102594899+KINGNEWBLUSH@users.noreply.github.com> Date: Mon, 11 Mar 2024 14:22:41 +0000 Subject: [PATCH 10/21] modified: EduNLP/SIF/tokenization/text/tokenization.py modified: tests/test_tokenizer/test_tokenizer.py --- EduNLP/SIF/tokenization/text/tokenization.py | 3 ++- tests/test_tokenizer/test_tokenizer.py | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/EduNLP/SIF/tokenization/text/tokenization.py b/EduNLP/SIF/tokenization/text/tokenization.py index 
452ee0a8..9449cfce 100644 --- a/EduNLP/SIF/tokenization/text/tokenization.py +++ b/EduNLP/SIF/tokenization/text/tokenization.py @@ -104,8 +104,9 @@ def tokenize(text, trainer = BpeTrainer( special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]) tokenizer.train(files=[bpe_trainfile], trainer=trainer) + output = tokenizer.encode(text) return [ - token for token in tokenizer.encode(text) if token not in stopwords + token for token in output.tokens if token not in stopwords ] else: raise TypeError("Invalid Spliter: %s" % tokenizer) diff --git a/tests/test_tokenizer/test_tokenizer.py b/tests/test_tokenizer/test_tokenizer.py index 0b653222..a512ab55 100644 --- a/tests/test_tokenizer/test_tokenizer.py +++ b/tests/test_tokenizer/test_tokenizer.py @@ -6,6 +6,7 @@ from EduNLP.Pretrain import DisenQTokenizer from EduNLP.utils import abs_current_dir, path_append + def test_tokenizer(): with pytest.raises(KeyError): get_tokenizer("error") @@ -78,7 +79,8 @@ def test_TokenizerBPE(): ', ', '$', '25', '$ ', 'e', 'a', 'c', 'h', ', ', 'h', 'ow', ' m', 'an', 'y', ' ', 'are', ' ', 's', 'o', 'l', 'd'] ] - data_path = path_append(abs_current_dir(__file__), "../../static/test_data/standard_luna_data.json", to_str=True) + data_path = path_append(abs_current_dir(__file__), + "../../static/test_data/standard_luna_data.json", to_str=True) tokenizer = get_tokenizer("pure_text", text_params={"tokenizer": 'bpe', "stopwords": set(",?"), "bpe_trainfile": data_path}) tokens = tokenizer(items) From c526016c20a32c7afa071bb11eba8a7f54fc49d1 Mon Sep 17 00:00:00 2001 From: KINGNEWBLUSH <102594899+KINGNEWBLUSH@users.noreply.github.com> Date: Mon, 11 Mar 2024 14:57:59 +0000 Subject: [PATCH 11/21] modified: EduNLP/SIF/tokenization/text/tokenization.py --- EduNLP/SIF/tokenization/text/tokenization.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/EduNLP/SIF/tokenization/text/tokenization.py b/EduNLP/SIF/tokenization/text/tokenization.py index 9449cfce..17617a4f 100644 --- a/EduNLP/SIF/tokenization/text/tokenization.py +++ b/EduNLP/SIF/tokenization/text/tokenization.py @@ -90,9 +90,11 @@ def tokenize(text, except OSError: spacy.cli.download(tok_model) spacy_tokenizer = spacy.load(tok_model) - + output = spacy_tokenizer(text) + output = output.text + print("spacy out", output) return [ - token.text for token in spacy_tokenizer(text) + token.text for token in output if token.text not in stopwords ] @@ -105,8 +107,9 @@ def tokenize(text, special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]) tokenizer.train(files=[bpe_trainfile], trainer=trainer) output = tokenizer.encode(text) + output = output.tokens return [ - token for token in output.tokens if token not in stopwords + token for token in output[0] if token not in stopwords ] else: raise TypeError("Invalid Spliter: %s" % tokenizer) From 64c6cda74f00c144ee8e7ae28e6604b43bf0f8ae Mon Sep 17 00:00:00 2001 From: KINGNEWBLUSH <102594899+KINGNEWBLUSH@users.noreply.github.com> Date: Tue, 12 Mar 2024 01:28:33 +0000 Subject: [PATCH 12/21] modified: EduNLP/SIF/tokenization/text/tokenization.py --- EduNLP/SIF/tokenization/text/tokenization.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/EduNLP/SIF/tokenization/text/tokenization.py b/EduNLP/SIF/tokenization/text/tokenization.py index 17617a4f..cd69bc54 100644 --- a/EduNLP/SIF/tokenization/text/tokenization.py +++ b/EduNLP/SIF/tokenization/text/tokenization.py @@ -108,8 +108,9 @@ def tokenize(text, tokenizer.train(files=[bpe_trainfile], trainer=trainer) output = 
tokenizer.encode(text) output = output.tokens + output = output[0] return [ - token for token in output[0] if token not in stopwords + token for token in output if token not in stopwords ] else: raise TypeError("Invalid Spliter: %s" % tokenizer) From 3a53b5186637a423cc4729adc166d824833f0d7f Mon Sep 17 00:00:00 2001 From: KINGNEWBLUSH <102594899+KINGNEWBLUSH@users.noreply.github.com> Date: Tue, 12 Mar 2024 01:55:18 +0000 Subject: [PATCH 13/21] modified: EduNLP/SIF/tokenization/text/tokenization.py --- EduNLP/SIF/tokenization/text/tokenization.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/EduNLP/SIF/tokenization/text/tokenization.py b/EduNLP/SIF/tokenization/text/tokenization.py index cd69bc54..bfa5e044 100644 --- a/EduNLP/SIF/tokenization/text/tokenization.py +++ b/EduNLP/SIF/tokenization/text/tokenization.py @@ -109,8 +109,6 @@ def tokenize(text, output = tokenizer.encode(text) output = output.tokens output = output[0] - return [ - token for token in output if token not in stopwords - ] + return output else: raise TypeError("Invalid Spliter: %s" % tokenizer) From 569bb9fac753859a044e616f31b503034dc72b13 Mon Sep 17 00:00:00 2001 From: KINGNEWBLUSH <102594899+KINGNEWBLUSH@users.noreply.github.com> Date: Tue, 12 Mar 2024 02:07:18 +0000 Subject: [PATCH 14/21] modified: EduNLP/SIF/tokenization/text/tokenization.py --- EduNLP/SIF/tokenization/text/tokenization.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/EduNLP/SIF/tokenization/text/tokenization.py b/EduNLP/SIF/tokenization/text/tokenization.py index bfa5e044..5c7f6403 100644 --- a/EduNLP/SIF/tokenization/text/tokenization.py +++ b/EduNLP/SIF/tokenization/text/tokenization.py @@ -109,6 +109,6 @@ def tokenize(text, output = tokenizer.encode(text) output = output.tokens output = output[0] - return output + return output[0] else: raise TypeError("Invalid Spliter: %s" % tokenizer) From 45422582135d18d4dac9079c78fed619564a1c53 Mon Sep 17 00:00:00 2001 From: KINGNEWBLUSH <102594899+KINGNEWBLUSH@users.noreply.github.com> Date: Tue, 12 Mar 2024 02:22:21 +0000 Subject: [PATCH 15/21] modified: EduNLP/SIF/tokenization/text/tokenization.py modified: tests/test_tokenizer/test_tokenizer.py --- EduNLP/SIF/tokenization/text/tokenization.py | 27 +++++++++++--------- tests/test_tokenizer/test_tokenizer.py | 4 +-- 2 files changed, 17 insertions(+), 14 deletions(-) diff --git a/EduNLP/SIF/tokenization/text/tokenization.py b/EduNLP/SIF/tokenization/text/tokenization.py index 5c7f6403..b9a4a173 100644 --- a/EduNLP/SIF/tokenization/text/tokenization.py +++ b/EduNLP/SIF/tokenization/text/tokenization.py @@ -7,7 +7,9 @@ import spacy import tokenizers as huggingface_tokenizer from tokenizers.trainers import BpeTrainer -from .stopwords import DEFAULT_STOPWORDS +from .stopwords import DEFAULT_STOPWORDSfrom tokenizers import Tokenizer +from tokenizers import Tokenizer + jieba.setLogLevel(logging.INFO) @@ -90,25 +92,26 @@ def tokenize(text, except OSError: spacy.cli.download(tok_model) spacy_tokenizer = spacy.load(tok_model) - output = spacy_tokenizer(text) - output = output.text - print("spacy out", output) + output = spacy_tokenizer(str(text)) return [ token.text for token in output if token.text not in stopwords ] elif (tokenizer == 'bpe'): - tokenizer = huggingface_tokenizer.Tokenizer( - huggingface_tokenizer.models.BPE()) - if (bpe_trainfile is None): - raise LookupError("bpe train file not found, using %s." 
% bpe_trainfile) - trainer = BpeTrainer( - special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]) - tokenizer.train(files=[bpe_trainfile], trainer=trainer) + try: + tokenizer = Tokenizer.from_file('bpeTokenizer.json') + except OSError: + tokenizer = huggingface_tokenizer.Tokenizer( + huggingface_tokenizer.models.BPE()) + if (bpe_trainfile is None): + raise LookupError("bpe train file not found, using %s." % bpe_trainfile) + trainer = BpeTrainer( + special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]) + tokenizer.train(files=[bpe_trainfile], trainer=trainer) + tokenizer.save('bpeTokenizer.json', pretty=True) output = tokenizer.encode(text) output = output.tokens - output = output[0] return output[0] else: raise TypeError("Invalid Spliter: %s" % tokenizer) diff --git a/tests/test_tokenizer/test_tokenizer.py b/tests/test_tokenizer/test_tokenizer.py index a512ab55..333fc0f9 100644 --- a/tests/test_tokenizer/test_tokenizer.py +++ b/tests/test_tokenizer/test_tokenizer.py @@ -71,13 +71,13 @@ def test_TokenizerBPE(): items = ['The stationery store has $600$ exercise books, and after selling some,\ there are still $4$ packs left, $25$ each, how many are sold?'] ans = [ - ['h', 'e', ' ', 'st', 'at', 'io', 'n', 'er', 'y', ' ', 'st', 'o', 're', ' ', + 'h', 'e', ' ', 'st', 'at', 'io', 'n', 'er', 'y', ' ', 'st', 'o', 're', ' ', 'h', 'as', ' $', '6', '00', '$ ', 'e', 'x', 'er', 'ci', 's', 'e', ' b', 'o', 'o', 'k', 's', ', ', 'an', 'd', ' a', 'ft', 'er', ' ', 's', 'e', 'l', 'l', 'in', 'g', ' ', 's', 'ome', ', ', 't', 'h', 'e', 're', ' ', 'are', ' ', 'st', 'i', 'l', 'l', ' $', '4', '$ ', 'p', 'a', 'c', 'k', 's', ' ', 'left', ', ', '$', '25', '$ ', 'e', 'a', 'c', 'h', ', ', 'h', 'ow', ' m', 'an', 'y', - ' ', 'are', ' ', 's', 'o', 'l', 'd'] + ' ', 'are', ' ', 's', 'o', 'l', 'd' ] data_path = path_append(abs_current_dir(__file__), "../../static/test_data/standard_luna_data.json", to_str=True) From 721bc0a722dafb38154f3984e049d54993de7842 Mon Sep 17 00:00:00 2001 From: KINGNEWBLUSH <102594899+KINGNEWBLUSH@users.noreply.github.com> Date: Tue, 12 Mar 2024 02:31:31 +0000 Subject: [PATCH 16/21] modified: EduNLP/SIF/tokenization/text/tokenization.py --- EduNLP/SIF/tokenization/text/tokenization.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/EduNLP/SIF/tokenization/text/tokenization.py b/EduNLP/SIF/tokenization/text/tokenization.py index b9a4a173..34dfcf24 100644 --- a/EduNLP/SIF/tokenization/text/tokenization.py +++ b/EduNLP/SIF/tokenization/text/tokenization.py @@ -7,8 +7,8 @@ import spacy import tokenizers as huggingface_tokenizer from tokenizers.trainers import BpeTrainer -from .stopwords import DEFAULT_STOPWORDSfrom tokenizers import Tokenizer -from tokenizers import Tokenizer +from .stopwords import DEFAULT_STOPWORDS +from tokenizers import Tokenizer as HGTokenizer jieba.setLogLevel(logging.INFO) @@ -100,8 +100,8 @@ def tokenize(text, elif (tokenizer == 'bpe'): try: - tokenizer = Tokenizer.from_file('bpeTokenizer.json') - except OSError: + tokenizer = HGTokenizer.from_file('bpeTokenizer.json') + except OSError: tokenizer = huggingface_tokenizer.Tokenizer( huggingface_tokenizer.models.BPE()) if (bpe_trainfile is None): From 1476f8a5e5ebf392f8dbd2c140924cd336344f82 Mon Sep 17 00:00:00 2001 From: KINGNEWBLUSH <102594899+KINGNEWBLUSH@users.noreply.github.com> Date: Tue, 12 Mar 2024 02:51:55 +0000 Subject: [PATCH 17/21] modified: tests/test_tokenizer/test_tokenizer.py --- tests/test_tokenizer/test_tokenizer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) 
diff --git a/tests/test_tokenizer/test_tokenizer.py b/tests/test_tokenizer/test_tokenizer.py index 333fc0f9..f3e2dca3 100644 --- a/tests/test_tokenizer/test_tokenizer.py +++ b/tests/test_tokenizer/test_tokenizer.py @@ -52,8 +52,8 @@ def test_CharTokenizer(): def test_Tokenizer(): - items = ['The stationery store has 600 exercise books, and after selling some,\ - there are still 4 packs left, 25 each, how many are sold?'] + items = ["""The stationery store has 600 exercise books, and after selling + some, there are still 4 packs left, 25 each, how many are sold?"""] ans = [ 'The', 'stationery', 'store', 'has', '600', 'exercise', 'books', 'and', 'after', 'selling', 'some', 'there', 'are', 'still', From f02ccce028a005a12ebeb26f26940cd3615f289e Mon Sep 17 00:00:00 2001 From: KINGNEWBLUSH <102594899+KINGNEWBLUSH@users.noreply.github.com> Date: Tue, 12 Mar 2024 03:12:56 +0000 Subject: [PATCH 18/21] modified: EduNLP/SIF/tokenization/text/tokenization.py modified: tests/test_tokenizer/test_tokenizer.py --- EduNLP/SIF/tokenization/text/tokenization.py | 2 +- tests/test_tokenizer/test_tokenizer.py | 32 ++++++++++++++------ 2 files changed, 24 insertions(+), 10 deletions(-) diff --git a/EduNLP/SIF/tokenization/text/tokenization.py b/EduNLP/SIF/tokenization/text/tokenization.py index 34dfcf24..8563fe49 100644 --- a/EduNLP/SIF/tokenization/text/tokenization.py +++ b/EduNLP/SIF/tokenization/text/tokenization.py @@ -101,7 +101,7 @@ def tokenize(text, elif (tokenizer == 'bpe'): try: tokenizer = HGTokenizer.from_file('bpeTokenizer.json') - except OSError: + except : tokenizer = huggingface_tokenizer.Tokenizer( huggingface_tokenizer.models.BPE()) if (bpe_trainfile is None): diff --git a/tests/test_tokenizer/test_tokenizer.py b/tests/test_tokenizer/test_tokenizer.py index f3e2dca3..5202fa96 100644 --- a/tests/test_tokenizer/test_tokenizer.py +++ b/tests/test_tokenizer/test_tokenizer.py @@ -51,20 +51,34 @@ def test_CharTokenizer(): assert ret == ans -def test_Tokenizer(): - items = ["""The stationery store has 600 exercise books, and after selling - some, there are still 4 packs left, 25 each, how many are sold?"""] +def test_TokenizerNLTK(): + items = ["The stationery store has 600 exercise books, and after selling\ + some, there are still 4 packs left, 25 each, how many are sold?"] ans = [ 'The', 'stationery', 'store', 'has', '600', 'exercise', 'books', 'and', 'after', 'selling', 'some', 'there', 'are', 'still', '4', 'packs', 'left', '25', 'each', 'how', 'many', 'are', 'sold' ] - for tok in ['nltk', 'spacy']: - tokenizer = get_tokenizer("pure_text", - text_params={"tokenizer": tok, "stopwords": set(",?")}) - tokens = tokenizer(items) - ret = next(tokens) - assert ret == ans + tokenizer = get_tokenizer("pure_text", + text_params={"tokenizer": 'nltk', "stopwords": set(",?")}) + tokens = tokenizer(items) + ret = next(tokens) + assert ret == ans + + +def test_TokenizerSpacy(): + items = ["The stationery store has 600 exercise books, and after selling\ + some, there are still 4 packs left, 25 each, how many are sold?"] + ans = [ + 'The', 'stationery', 'store', 'has', '600', 'exercise', + 'books', 'and', 'after', 'selling', ' ', 'some', 'there', 'are', 'still', + '4', 'packs', 'left', '25', 'each', 'how', 'many', 'are', 'sold' + ] + tokenizer = get_tokenizer("pure_text", + text_params={"tokenizer": 'spacy', "stopwords": set(",?")}) + tokens = tokenizer(items) + ret = next(tokens) + assert ret == ans def test_TokenizerBPE(): From 05172b4af81748fef97a134e840ad6847866ff1c Mon Sep 17 00:00:00 2001 From: KINGNEWBLUSH 
<102594899+KINGNEWBLUSH@users.noreply.github.com> Date: Tue, 12 Mar 2024 05:43:52 +0000 Subject: [PATCH 19/21] modified: tests/test_tokenizer/test_tokenizer.py --- tests/test_tokenizer/test_tokenizer.py | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/tests/test_tokenizer/test_tokenizer.py b/tests/test_tokenizer/test_tokenizer.py index 5202fa96..d1e01c15 100644 --- a/tests/test_tokenizer/test_tokenizer.py +++ b/tests/test_tokenizer/test_tokenizer.py @@ -71,7 +71,7 @@ def test_TokenizerSpacy(): some, there are still 4 packs left, 25 each, how many are sold?"] ans = [ 'The', 'stationery', 'store', 'has', '600', 'exercise', - 'books', 'and', 'after', 'selling', ' ', 'some', 'there', 'are', 'still', + 'books', 'and', 'after', 'selling', ' ', 'some', 'there', 'are', 'still', '4', 'packs', 'left', '25', 'each', 'how', 'many', 'are', 'sold' ] tokenizer = get_tokenizer("pure_text", @@ -84,15 +84,7 @@ def test_TokenizerSpacy(): def test_TokenizerBPE(): items = ['The stationery store has $600$ exercise books, and after selling some,\ there are still $4$ packs left, $25$ each, how many are sold?'] - ans = [ - 'h', 'e', ' ', 'st', 'at', 'io', 'n', 'er', 'y', ' ', 'st', 'o', 're', ' ', - 'h', 'as', ' $', '6', '00', '$ ', 'e', 'x', 'er', 'ci', 's', 'e', ' b', 'o', - 'o', 'k', 's', ', ', 'an', 'd', ' a', 'ft', 'er', ' ', 's', 'e', 'l', 'l', - 'in', 'g', ' ', 's', 'ome', ', ', 't', 'h', 'e', 're', ' ', 'are', ' ', - 'st', 'i', 'l', 'l', ' $', '4', '$ ', 'p', 'a', 'c', 'k', 's', ' ', 'left', - ', ', '$', '25', '$ ', 'e', 'a', 'c', 'h', ', ', 'h', 'ow', ' m', 'an', 'y', - ' ', 'are', ' ', 's', 'o', 'l', 'd' - ] + ans = ['h', '600', ' ', '^', '4', '^', ' ', '25', ' '] data_path = path_append(abs_current_dir(__file__), "../../static/test_data/standard_luna_data.json", to_str=True) tokenizer = get_tokenizer("pure_text", text_params={"tokenizer": 'bpe', "stopwords": set(",?"), From 767778f1aec64cda8630b9b1083789f2a01da53e Mon Sep 17 00:00:00 2001 From: KINGNEWBLUSH <102594899+KINGNEWBLUSH@users.noreply.github.com> Date: Tue, 12 Mar 2024 05:57:25 +0000 Subject: [PATCH 20/21] modified: EduNLP/SIF/tokenization/text/tokenization.py modified: tests/test_tokenizer/test_tokenizer.py --- EduNLP/SIF/tokenization/text/tokenization.py | 2 +- tests/test_tokenizer/test_tokenizer.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/EduNLP/SIF/tokenization/text/tokenization.py b/EduNLP/SIF/tokenization/text/tokenization.py index 8563fe49..668bdd68 100644 --- a/EduNLP/SIF/tokenization/text/tokenization.py +++ b/EduNLP/SIF/tokenization/text/tokenization.py @@ -101,7 +101,7 @@ def tokenize(text, elif (tokenizer == 'bpe'): try: tokenizer = HGTokenizer.from_file('bpeTokenizer.json') - except : + except Exception: tokenizer = huggingface_tokenizer.Tokenizer( huggingface_tokenizer.models.BPE()) if (bpe_trainfile is None): diff --git a/tests/test_tokenizer/test_tokenizer.py b/tests/test_tokenizer/test_tokenizer.py index d1e01c15..44b4b58a 100644 --- a/tests/test_tokenizer/test_tokenizer.py +++ b/tests/test_tokenizer/test_tokenizer.py @@ -71,7 +71,7 @@ def test_TokenizerSpacy(): some, there are still 4 packs left, 25 each, how many are sold?"] ans = [ 'The', 'stationery', 'store', 'has', '600', 'exercise', - 'books', 'and', 'after', 'selling', ' ', 'some', 'there', 'are', 'still', + 'books', 'and', 'after', 'selling', ' ', 'some', 'there', 'are', 'still', '4', 'packs', 'left', '25', 'each', 'how', 'many', 'are', 'sold' ] tokenizer = get_tokenizer("pure_text", @@ -84,7 +84,7 @@ def 
test_TokenizerSpacy(): def test_TokenizerBPE(): items = ['The stationery store has $600$ exercise books, and after selling some,\ there are still $4$ packs left, $25$ each, how many are sold?'] - ans = ['h', '600', ' ', '^', '4', '^', ' ', '25', ' '] + ans = ['h', '600', ' ', '4', ' ', '25', ' '] data_path = path_append(abs_current_dir(__file__), "../../static/test_data/standard_luna_data.json", to_str=True) tokenizer = get_tokenizer("pure_text", text_params={"tokenizer": 'bpe', "stopwords": set(",?"), From e86a5e65d6d24fcac74271f9bf5c648d48a58107 Mon Sep 17 00:00:00 2001 From: KINGNEWBLUSH <102594899+KINGNEWBLUSH@users.noreply.github.com> Date: Tue, 12 Mar 2024 07:24:30 +0000 Subject: [PATCH 21/21] modified: EduNLP/SIF/tokenization/text/tokenization.py --- EduNLP/SIF/tokenization/text/tokenization.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/EduNLP/SIF/tokenization/text/tokenization.py b/EduNLP/SIF/tokenization/text/tokenization.py index 668bdd68..b5559b60 100644 --- a/EduNLP/SIF/tokenization/text/tokenization.py +++ b/EduNLP/SIF/tokenization/text/tokenization.py @@ -100,7 +100,7 @@ def tokenize(text, elif (tokenizer == 'bpe'): try: - tokenizer = HGTokenizer.from_file('bpeTokenizer.json') + tokenizer = HGTokenizer.from_file(bpe_json) except Exception: tokenizer = huggingface_tokenizer.Tokenizer( huggingface_tokenizer.models.BPE()) @@ -109,7 +109,7 @@ def tokenize(text, trainer = BpeTrainer( special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]) tokenizer.train(files=[bpe_trainfile], trainer=trainer) - tokenizer.save('bpeTokenizer.json', pretty=True) + tokenizer.save(bpe_json, pretty=True) output = tokenizer.encode(text) output = output.tokens return output[0]
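
Editor's note: after patch 21, tokenize() supports jieba, nltk, spacy, and BPE backends behind the existing get_tokenizer interface. Below is a minimal usage sketch mirroring the calls exercised in the updated tests/test_tokenizer/test_tokenizer.py; the sample sentence and the BPE training-file path are illustrative assumptions and should be replaced with your own item text and corpus.

    # Minimal sketch of the new tokenizer backends (assumptions noted in comments).
    from EduNLP.Tokenizer import get_tokenizer

    items = ["The stationery store has 600 exercise books, and after selling "
             "some, there are still 4 packs left, 25 each, how many are sold?"]

    # Word-level tokenization via the nltk or spacy backends; on first use,
    # tokenize() downloads 'punkt' or the 'en_core_web_sm' model itself.
    for backend in ["nltk", "spacy"]:
        tokenizer = get_tokenizer(
            "pure_text",
            text_params={"tokenizer": backend, "stopwords": set(",?")})
        print(next(tokenizer(items)))

    # BPE backend: loads a saved tokenizers model from bpe_json if present,
    # otherwise trains one from bpe_trainfile (the path below is an assumed
    # example; point it at a local training corpus).
    bpe_tokenizer = get_tokenizer(
        "pure_text",
        text_params={
            "tokenizer": "bpe",
            "stopwords": set(",?"),
            "bpe_trainfile": "static/test_data/standard_luna_data.json"})
    print(next(bpe_tokenizer(items)))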