# model.py — BiLSTM classifier that encodes each EDU with a bidirectional LSTM,
# averages the EDU encodings and maps them to a document label.
# Written against the old autograd.Variable-style PyTorch API.
import logging
import random
import time

import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pack_padded_sequence

from cuda import ftype, itype  # project-local tensor-type helpers (not referenced in this file)


def prepare_sequence(edus):
    # Sort EDUs by length (longest first, as pack_padded_sequence requires),
    # zero-pad to the longest EDU, then transpose to (max_len, batch).
    # Note: the sort is in place, so the caller's list is reordered.
    edus.sort(key=len, reverse=True)
    edu_lengths = [len(edu) for edu in edus]
    seq_tensor = torch.zeros((len(edus), edu_lengths[0])).long()
    for idx, (seq, seqlen) in enumerate(zip(edus, edu_lengths)):
        seq_tensor[idx, :seqlen] = torch.LongTensor(seq)
    return seq_tensor.transpose(0, 1), edu_lengths
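
# Worked example (illustrative only, not part of the original pipeline):
#   prepare_sequence([[1, 2], [3, 4, 5]])
#   sorts the input to [[3, 4, 5], [1, 2]] with lengths [3, 2], zero-pads it to
#   [[3, 4, 5], [1, 2, 0]], and returns the transpose of shape (max_len=3, batch=2):
#   [[3, 1],
#    [4, 2],
#    [5, 0]], together with the length list [3, 2].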


class BiLSTM(nn.Module):
    """Bidirectional LSTM over the words of each EDU; the per-EDU encodings are
    averaged and mapped to class scores for the whole document."""

    def __init__(self, vocab_size, nclasses, embedding_dim, hidden_dim, nlayers, droprate):
        super(BiLSTM, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.nlayers = nlayers
        self.vocab_size = vocab_size
        self.nclasses = nclasses
        self.word_embeds = nn.Embedding(vocab_size, embedding_dim)
        self.dropout = nn.Dropout(droprate)
        # Each direction gets hidden_dim // 2 units so the concatenated state is hidden_dim wide.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2,
                            num_layers=self.nlayers, bidirectional=True)
        # Maps the concatenated LSTM hidden state to the label scores.
        self.hidden2label = nn.Linear(hidden_dim, self.nclasses)
    def init_hidden(self, batch_size):
        # Fresh zero states of shape (num_layers * num_directions, batch, hidden_dim // 2).
        h0 = autograd.Variable(torch.zeros(self.nlayers * 2, batch_size, self.hidden_dim // 2))
        c0 = autograd.Variable(torch.zeros(self.nlayers * 2, batch_size, self.hidden_dim // 2))
        return (h0.cuda(), c0.cuda())
    def _get_edu_reps(self, doc):
        edus = [edu.indices for edu in doc.edus]
        padded_edus, edu_lengths = prepare_sequence(edus)
        # padded_edus is (max_len, batch), so the batch size is the last dimension.
        self.hidden = self.init_hidden(padded_edus.size(-1))
        embeds = self.word_embeds(autograd.Variable(padded_edus.cuda()))
        embeds_dropout = self.dropout(embeds)
        embeds_packed = pack_padded_sequence(embeds_dropout, edu_lengths)
        lstm_out, (ht, ct) = self.lstm(embeds_packed, self.hidden)
        # Concatenate the final forward and backward hidden states (ht[0]/ht[1] are the
        # two directions of the first layer, i.e. the only layer when nlayers == 1).
        edu_reps = torch.cat((ht[0], ht[1]), dim=1)
        return edu_reps
    def forward(self, doc):
        edu_reps = self._get_edu_reps(doc)
        edu_average = torch.mean(edu_reps, dim=0)
        y = self.hidden2label(edu_average.view(1, -1))
        return y
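
# Minimal usage sketch for the class above (assumes a CUDA device, since init_hidden
# moves the states to the GPU; the SimpleNamespace objects are hypothetical stand-ins
# for the project's corpus classes, which only need `.edus`, `.indices` and `.label`):
#
#   from types import SimpleNamespace
#   doc = SimpleNamespace(edus=[SimpleNamespace(indices=[4, 7, 2]),
#                               SimpleNamespace(indices=[9, 1])],
#                         label=0)
#   model = BiLSTM(vocab_size=50, nclasses=2, embedding_dim=8,
#                  hidden_dim=16, nlayers=1, droprate=0.5).cuda()
#   scores = model(doc)   # shape (1, nclasses)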


def train(trncorpus, devcorpus, vocab_size, nclasses, embedding_dim, hidden_dim, nlayers,
          trainer, lr, droprate, niter, report_freq, verbose, model_fname, model=None):
    if not model:
        model = BiLSTM(vocab_size, nclasses, embedding_dim, hidden_dim, nlayers, droprate)
        model.cuda()
    criterion = nn.CrossEntropyLoss()
    if trainer == "sgd":
        optimizer = optim.SGD(model.parameters(), lr=lr)
    elif trainer == "adagrad":
        optimizer = optim.Adagrad(model.parameters(), lr=lr)
    elif trainer == "adam":
        optimizer = optim.Adam(model.parameters(), lr=lr)
    # Create a fixed dev sample for periodic reporting.
    dev_sample = random.sample(devcorpus.docs, report_freq // 2)
    logging.info("Start training")
    order = list(range(trncorpus.size()))
    report = 0
    epoch_counter = 0
    best_dev_accuracy = 0.0
    sample_counter = trncorpus.size()
    # Convert niter from epochs to reporting cycles of report_freq documents each,
    # e.g. 10 epochs over 5,000 documents with report_freq=1000 gives 50 reports.
    niter = niter * trncorpus.size() / report_freq
    while report < niter:
        start_time = time.time()
        complete_loss = 0.0
        for i in range(report_freq):
            # Shuffle on the first pass and every time the whole corpus has been seen.
            if sample_counter == trncorpus.size():
                sample_counter = 0
                epoch_counter += 1
                logging.info("*** Starting new epoch %s", epoch_counter)
                random.shuffle(order)
            # Build the graph for this instance.
            doc = trncorpus.docs[order[sample_counter]]
            sample_counter += 1
            # Step 1. PyTorch accumulates gradients, so clear them before each instance.
            model.zero_grad()
            # Step 2. Wrap the target label as a Variable on the GPU.
            target = autograd.Variable(torch.LongTensor([doc.label])).cuda()
            # Step 3. Run the forward pass.
            pred_target = model(doc)
            # Step 4. Compute the loss and gradients, then update the parameters.
            loss = criterion(pred_target, target)
            loss.backward()
            torch.nn.utils.clip_grad_norm(model.parameters(), 5)
            complete_loss += loss.data[0]
            optimizer.step()
        end_time = time.time()
        report += 1
        if verbose:
            logging.info("Loss: %s", complete_loss)
            logging.info("Finished report %s (%s) in %s seconds.",
                         report, report / float(niter), end_time - start_time)
        # Evaluate on the dev sample and keep the best model seen so far.
        dev_accuracy = evaluate(model, dev_sample)
        logging.info("Accuracy on dev: %s (best so far: %s)", dev_accuracy, best_dev_accuracy)
        if dev_accuracy > best_dev_accuracy:
            best_dev_accuracy = dev_accuracy
            save_model(model, model_fname)
            logging.info("Saved model to %s.", model_fname)
    logging.info("Done training")
    return model


def evaluate(model, docs):
    num_correct = 0
    preds = []
    labels = []
    for doc in docs:
        output = model(doc)
        _, predicted = torch.max(output.data, 1)
        if doc.label == predicted[0]:
            num_correct += 1
        preds.append(predicted[0])
        labels.append(doc.label)
    # logging.info("Preds vs. labels:\n%s \n%s", preds[:50], labels[:50])
    return num_correct / len(docs)


def save_model(model, fname):
    torch.save(model.state_dict(), fname)


def load_model(fname, vocab_size, nclasses, embedding_dim, hidden_dim, nlayers, droprate):
    model = BiLSTM(vocab_size, nclasses, embedding_dim, hidden_dim, nlayers, droprate)
    model.load_state_dict(torch.load(fname))
    model.cuda()
    return model
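
# Typical usage sketch (hypothetical: corpus loading and argument parsing live outside
# this file, so load_corpora and the hyperparameter values below are assumptions,
# not the project's actual entry point):
#
#   trncorpus, devcorpus, vocab = load_corpora(...)          # hypothetical helper
#   model = train(trncorpus, devcorpus, vocab_size=len(vocab), nclasses=2,
#                 embedding_dim=100, hidden_dim=200, nlayers=1, trainer="adam",
#                 lr=0.001, droprate=0.5, niter=10, report_freq=1000,
#                 verbose=True, model_fname="bilstm.pt")
#   model = load_model("bilstm.pt", len(vocab), 2, 100, 200, 1, 0.5)
#   accuracy = evaluate(model, devcorpus.docs)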