-
Notifications
You must be signed in to change notification settings - Fork 5
/
data.py
executable file
·130 lines (110 loc) · 3.1 KB
/
data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
'''
Contains various utility functions to manipulate training and query data.
'''
from model import *
import random
import spacy
import unicodedata
import string
import time
import math
import copy
from tqdm import tqdm
def custom_pipeline(nlp):
    '''
    Select the pipeline components to run for a loaded language object.

    Returns a 2-tuple ``(nlp.tagger, nlp.parser)`` read from the object
    that is passed in.
    '''
    tagger = nlp.tagger
    parser = nlp.parser
    return tagger, parser
# Shared spaCy language object, loaded once when this module is imported.
# Its pipeline is built by custom_pipeline (tagger and parser).
# NOTE(review): `create_pipeline=` looks like the spaCy 1.x loading API -
# confirm against the pinned spaCy version before upgrading.
nlp = spacy.load('en', create_pipeline=custom_pipeline)
# Characters that unicode_to_ascii keeps: ASCII letters, digits and - . , ; '
# The set contains no whitespace, so spaces and newlines are dropped as well.
all_letters = string.ascii_letters + "1234567890-.,;'"
# Number of characters in all_letters.
n_letters = len(all_letters)
def unicode_to_ascii(s):
    '''
    Reduce a unicode string to the characters listed in ``all_letters``.

    The input is first decomposed with NFD normalisation so that accents
    become separate combining marks (category ``Mn``); those marks are
    discarded, and so is every remaining character that is not present in
    the module-level ``all_letters`` whitelist.
    '''
    kept = []
    for ch in unicodedata.normalize('NFD', s):
        # Combining marks are what is left of an accent after NFD.
        if unicodedata.category(ch) == 'Mn':
            continue
        if ch not in all_letters:
            continue
        kept.append(ch)
    return ''.join(kept)
# Build the vocabulary lookup tables from "training/dictionary.txt".
#
# dictionary_dim: value of `counter` at which reading of the file stops.
# Returns a tuple (word_to_ix, ix_to_word): a dict mapping word -> index and
# a list mapping index -> word. Both start with the reserved entries
# 'PAD' (0), '$' (1, the placeholder that mask_words inserts) and 'UNK' (2).
#
# NOTE(review): the indentation of this file has been lost, so it is unclear
# whether `counter += 1` runs for every line read or only for newly added
# words - confirm against the original before relying on the vocabulary size.
def init_dictionary(dictionary_dim):
# Reserved entries occupy indices 0-2 in both structures.
word_to_ix = {'PAD' : 0, '$' : 1, 'UNK' : 2}
ix_to_word = ["PAD", "$", "UNK"]
# NOTE(review): no explicit encoding is passed, so the platform default is used.
with open("training/dictionary.txt") as f_in:
counter = 0
for line in f_in:
# Stop as soon as `counter` has reached the requested limit.
if counter == dictionary_dim:
break
# Normalise the line; characters outside all_letters (which includes the
# trailing newline) are removed by unicode_to_ascii.
w = unicode_to_ascii(line)
if w not in word_to_ix:
# New word: its index is the current size of the vocabulary.
word_to_ix[w] = len(word_to_ix)
ix_to_word.append(w)
counter += 1
return (word_to_ix, ix_to_word)
def split_to_sentences(lines):
    '''
    Turn an iterable of raw text lines into a flat list of sentences.

    Lines starting with ``TITLE:`` or ``==`` are skipped. Every other line
    is parsed with the module-level ``nlp`` object; for each sentence it
    yields, the token texts are passed through ``unicode_to_ascii``, joined
    with single spaces and stripped of surrounding whitespace.
    '''
    skipped_prefixes = ("TITLE:", "==")
    collected = []
    for raw_line in lines:
        if raw_line.startswith(skipped_prefixes):
            continue
        doc = nlp(raw_line)
        collected.extend(
            ' '.join([unicode_to_ascii(token.text) for token in span]).strip()
            for span in doc.sents
        )
    return collected
def mask_words(sent):
    '''
    Produce one masked copy of a sentence per word.

    The sentence is split on whitespace. For every position ``i`` a new
    token list is built in which token ``i`` is replaced by the mask
    symbol ``"$"``; the replaced word becomes the matching target.

    Returns ``(sents, targets)`` where ``sents`` is a list of token lists
    and ``targets`` is the list of masked-out words, in order. Returns
    ``(None, None)`` when the sentence contains no tokens.
    '''
    tokens = sent.split()
    if not tokens:
        return None, None
    # Tokens are immutable strings, so slicing gives fully independent
    # lists without the cost of a deep copy per position.
    sents = [tokens[:i] + ["$"] + tokens[i + 1:] for i in range(len(tokens))]
    targets = list(tokens)
    return sents, targets
def prepare_sequence(sent_list, word, to_ix):
    '''
    Encode a token list and its target word as vocabulary indices.

    ``sent_list`` is encoded with ``prepare_tensor``. If that returns
    ``None`` the result is ``(None, None)``. Otherwise the result is
    ``(ids, target)`` where ``target`` is ``to_ix[word]``, or
    ``to_ix['UNK']`` when ``word`` is not in the vocabulary.
    '''
    encoded = prepare_tensor(sent_list, to_ix)
    if encoded is None:
        return None, None
    if word in to_ix:
        label = to_ix[word]
    else:
        label = to_ix['UNK']
    return encoded, label
def prepare_tensor(seq, to_ix):
    '''
    Map a sequence of words to their vocabulary indices.

    Words missing from ``to_ix`` are mapped to ``to_ix['UNK']``.

    Returns the list of indices, or ``None`` when the input cannot be
    encoded: ``seq`` is not iterable, a token is unhashable, or an
    unknown word is met while ``to_ix`` has no ``'UNK'`` entry.
    '''
    try:
        return [to_ix[w] if w in to_ix else to_ix['UNK'] for w in seq]
    except (KeyError, TypeError):
        # KeyError: 'UNK' is missing; TypeError: bad seq / unhashable token.
        # Anything else (e.g. KeyboardInterrupt) is allowed to propagate
        # instead of being silently turned into None.
        return None
def _prepare_sequence(seq, to_ix):
try:
ids = [to_ix[w] if w in to_ix else to_ix['UNK'] for w in seq]
except:
ids = None
finally:
return ids
def elapsed(start):
    '''
    Format the wall-clock time elapsed since ``start``.

    ``start`` is a timestamp in seconds as returned by ``time.time()``.

    Returns a string such as ``'Time elapsed: 1m 15.500s'``. Seconds are
    always shown zero-padded to two integer digits with exactly three
    decimals.
    '''
    # Round to whole milliseconds *before* splitting into minutes and
    # seconds, so 59.9996 s becomes "1m 00.000s" rather than "0m 60.000s".
    total_ms = int(round((time.time() - start) * 1000))
    minutes, rem_ms = divmod(total_ms, 60000)
    return 'Time elapsed: %dm %06.3fs' % (minutes, rem_ms / 1000.0)
def format_date(t):
    '''
    Format a timestamp as ``YYYY/MM/DD HH:MM:SS`` in local time.

    ``t`` is a number of seconds since the epoch; it is converted with
    ``time.localtime``. Month, day, hour, minute and second are
    zero-padded to two digits; the year is printed as-is.
    '''
    date = time.localtime(t)
    # %02d does the zero padding that was previously done by hand for each
    # field, and keeps every field an int until it is formatted.
    return '%s/%02d/%02d %02d:%02d:%02d' % (
        date.tm_year,
        date.tm_mon,
        date.tm_mday,
        date.tm_hour,
        date.tm_min,
        date.tm_sec,
    )