modified: EduNLP/SIF/tokenization/text/tokenization.py

modified: tests/test_tokenizer/test_tokenizer.py
bigdata-ustc · Mar 12, 2024 · 767778f
1 parent 05172b4
commit 767778f
Show file tree

Hide file tree

Showing 2 changed files with 3 additions and 3 deletions.
diff --git a/EduNLP/SIF/tokenization/text/tokenization.py b/EduNLP/SIF/tokenization/text/tokenization.py
@@ -101,7 +101,7 @@ def tokenize(text,
     elif (tokenizer == 'bpe'):
         try:
             tokenizer = HGTokenizer.from_file('bpeTokenizer.json')
-        except :
+        except Exception:
             tokenizer = huggingface_tokenizer.Tokenizer(
                 huggingface_tokenizer.models.BPE())
             if (bpe_trainfile is None):

diff --git a/tests/test_tokenizer/test_tokenizer.py b/tests/test_tokenizer/test_tokenizer.py
@@ -71,7 +71,7 @@ def test_TokenizerSpacy():
               some, there are still 4 packs left, 25 each, how many are sold?"]
     ans = [
         'The', 'stationery', 'store', 'has', '600', 'exercise',
-        'books', 'and', 'after', 'selling', '               ', 'some', 'there', 'are', 'still',
+        'books', 'and', 'after', 'selling', '             ', 'some', 'there', 'are', 'still',
         '4', 'packs', 'left', '25', 'each', 'how', 'many', 'are', 'sold'
     ]
     tokenizer = get_tokenizer("pure_text",
@@ -84,7 +84,7 @@ def test_TokenizerSpacy():
 def test_TokenizerBPE():
     items = ['The stationery store has $600$ exercise books, and after selling some,\
         there are still $4$ packs left, $25$ each, how many are sold?']
-    ans = ['h', '600', ' ', '^', '4', '^', ' ', '25', ' ']
+    ans = ['h', '600', ' ', '4', ' ', '25', ' ']
     data_path = path_append(abs_current_dir(__file__),
                             "../../static/test_data/standard_luna_data.json", to_str=True)
     tokenizer = get_tokenizer("pure_text", text_params={"tokenizer": 'bpe', "stopwords": set(",?"),