-
Notifications
You must be signed in to change notification settings - Fork 0
/
utils.py
28 lines (25 loc) · 1.13 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
import regex as re
import torch
import pickle
def preProcessText(text: str) -> str:
    """Clean raw text and split punctuation and numbers into separate words.

    The steps run in this order:

    1. The Devanagari danda (U+0964), the question mark and the comma are
       each surrounded by exactly one space so they become separate words.
       Any whitespace next to the mark, newlines included, is absorbed into
       that padding.
    2. Whitespace on either side of a newline is removed.
    3. Every character that is not in the Devanagari block (U+0900-U+097F),
       a comma, a question mark or whitespace is deleted.
    4. Each run of Devanagari digits (U+0966-U+096F), together with the
       whitespace around it, is replaced by `` <num> ``.

    Args:
        text: The raw input string.

    Returns:
        The cleaned string. It is not stripped, so it can start or end with
        the padding space added around a mark or a ``<num>`` token.
    """
    # Replacements are ordinary strings without backslashes, so they mean
    # the same thing under both the stdlib `re` and the `regex` package.
    # Put spaces around the danda to make it a separate word.
    text = re.sub(r'\s*\u0964\s*', ' \u0964 ', text)
    # Put spaces around the question mark to make it a separate word.
    text = re.sub(r'\s*\?\s*', ' ? ', text)
    # Put spaces around the comma to make it a separate word.
    text = re.sub(r'\s*,\s*', ' , ', text)
    # Remove the whitespace around newline characters.
    text = re.sub(r'\s*\n\s*', '\n', text)
    # Drop everything that is not Devanagari, a comma, a question mark or
    # whitespace. No '+' in the class: there it would be a literal plus sign
    # and would let '+' characters through.
    text = re.sub(r'[^\u0900-\u097F,?\s]', '', text)
    # Replace digit runs with the <num> token. The range stops at U+096F
    # because the code points after it are signs and letters, not digits.
    text = re.sub(r'\s*[\u0966-\u096F]+\s*', ' <num> ', text)
    return text
def getTokenizer(tokenizer_dir="tokenizer"):
    """Load the saved tokenizer and vocabulary from disk.

    Args:
        tokenizer_dir: Directory holding ``tokenizer.pth`` and ``vocab.pkl``.
            Defaults to ``"tokenizer"``, resolved against the current
            working directory.

    Returns:
        A ``(tokenizer, vocab)`` tuple: the object returned by ``torch.load``
        for ``tokenizer.pth`` and the object unpickled from ``vocab.pkl``.

    Raises:
        FileNotFoundError: If either file is missing.
    """
    import os  # local import keeps the block self-contained

    tokenizer_path = os.path.join(tokenizer_dir, "tokenizer.pth")
    vocab_path = os.path.join(tokenizer_dir, "vocab.pkl")

    # NOTE(review): newer PyTorch releases default torch.load to
    # weights_only=True, which rejects arbitrary pickled objects. Confirm
    # the saved tokenizer still loads under the deployed version.
    loaded_tokenizer = torch.load(tokenizer_path)

    # pickle can run arbitrary code while loading, so only point this at
    # artifacts produced by a trusted source.
    with open(vocab_path, 'rb') as file:
        loaded_vocab = pickle.load(file)

    return loaded_tokenizer, loaded_vocab