train.py (forked from ilivans/tf-rnn-attention)
#!/usr/bin/python
"""
Toy example of attention layer use.
Trains an RNN (GRU) on the IMDB dataset (binary sentiment classification).
Training setup and hyper-parameters were not tuned; the script only serves as an example.
"""
from __future__ import print_function, division
import numpy as np
import tensorflow as tf
from keras.datasets import imdb
from tensorflow.contrib.rnn import GRUCell
from tensorflow.python.ops.rnn import bidirectional_dynamic_rnn as bi_rnn
from tqdm import tqdm
from attention import attention
from utils import get_vocabulary_size, fit_in_vocabulary, zero_pad, batch_generator
NUM_WORDS = 10000
INDEX_FROM = 3
SEQUENCE_LENGTH = 250
EMBEDDING_DIM = 100
HIDDEN_SIZE = 150
ATTENTION_SIZE = 50
KEEP_PROB = 0.8
BATCH_SIZE = 256
NUM_EPOCHS = 3  # The model easily overfits without pre-trained word embeddings, so train for only a few epochs
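# Smoothing factor for the exponential moving average of the training loss (see the training loop below)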
DELTA = 0.5
MODEL_PATH = './model'
# Load the data set
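# Reviews come back as lists of word indices; labels are 0 (negative) / 1 (positive).
# With index_from=3, indices 0-2 are reserved for padding/start/unknown tokens.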
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=NUM_WORDS, index_from=INDEX_FROM)
# Sequences pre-processing
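# fit_in_vocabulary clips test-set indices to the vocabulary seen in training;
# zero_pad pads/truncates every sequence to SEQUENCE_LENGTH (both helpers live in utils.py)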
vocabulary_size = get_vocabulary_size(X_train)
X_test = fit_in_vocabulary(X_test, vocabulary_size)
X_train = zero_pad(X_train, SEQUENCE_LENGTH)
X_test = zero_pad(X_test, SEQUENCE_LENGTH)
# Different placeholders
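# batch_ph: padded word-index sequences, target_ph: binary labels,
# seq_len_ph: true (unpadded) sequence lengths, keep_prob_ph: dropout keep probability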
with tf.name_scope('Inputs'):
    batch_ph = tf.placeholder(tf.int32, [None, SEQUENCE_LENGTH], name='batch_ph')
    target_ph = tf.placeholder(tf.float32, [None], name='target_ph')
    seq_len_ph = tf.placeholder(tf.int32, [None], name='seq_len_ph')
    keep_prob_ph = tf.placeholder(tf.float32, name='keep_prob_ph')
# Embedding layer
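# Embeddings are trained from scratch (uniform init in [-1, 1]);
# batch_embedded has shape (batch_size, SEQUENCE_LENGTH, EMBEDDING_DIM)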
with tf.name_scope('Embedding_layer'):
    embeddings_var = tf.Variable(tf.random_uniform([vocabulary_size, EMBEDDING_DIM], -1.0, 1.0), trainable=True)
    tf.summary.histogram('embeddings_var', embeddings_var)
    batch_embedded = tf.nn.embedding_lookup(embeddings_var, batch_ph)
# (Bi-)RNN layer(-s)
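# Bidirectional GRU; bi_rnn returns a (forward, backward) tuple of outputs,
# each of shape (batch_size, SEQUENCE_LENGTH, HIDDEN_SIZE)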
rnn_outputs, _ = bi_rnn(GRUCell(HIDDEN_SIZE), GRUCell(HIDDEN_SIZE),
                        inputs=batch_embedded, sequence_length=seq_len_ph, dtype=tf.float32)
tf.summary.histogram('RNN_outputs', rnn_outputs)
# Attention layer
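# attention() (see attention.py) collapses the per-timestep RNN outputs into a single
# vector via a learned weighted sum; alphas are the attention weights, logged for inspection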
with tf.name_scope('Attention_layer'):
    attention_output, alphas = attention(rnn_outputs, ATTENTION_SIZE, return_alphas=True)
    tf.summary.histogram('alphas', alphas)
# Dropout
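# keep_prob_ph is fed KEEP_PROB during training and 1.0 at test time (see the feed_dicts below)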
drop = tf.nn.dropout(attention_output, keep_prob_ph)
# Fully connected layer
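# Single logit for binary classification; the input width is 2 * HIDDEN_SIZE because
# forward and backward GRU outputs are concatenated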
with tf.name_scope('Fully_connected_layer'):
    W = tf.Variable(tf.truncated_normal([HIDDEN_SIZE * 2, 1], stddev=0.1))  # Hidden size is multiplied by 2 for Bi-RNN
    b = tf.Variable(tf.constant(0., shape=[1]))
    y_hat = tf.nn.xw_plus_b(drop, W, b)
    y_hat = tf.squeeze(y_hat)
    tf.summary.histogram('W', W)
with tf.name_scope('Metrics'):
    # Cross-entropy loss and optimizer initialization
    loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=y_hat, labels=target_ph))
    tf.summary.scalar('loss', loss)
    optimizer = tf.train.AdamOptimizer(learning_rate=1e-3).minimize(loss)

    # Accuracy metric
    accuracy = tf.reduce_mean(tf.cast(tf.equal(tf.round(tf.sigmoid(y_hat)), target_ph), tf.float32))
    tf.summary.scalar('accuracy', accuracy)
merged = tf.summary.merge_all()
# Batch generators
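# batch_generator (see utils.py) yields batches indefinitely; the same generator
# object is reused across epochs via next() in the loops below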
train_batch_generator = batch_generator(X_train, y_train, BATCH_SIZE)
test_batch_generator = batch_generator(X_test, y_test, BATCH_SIZE)
train_writer = tf.summary.FileWriter('./logdir/train', accuracy.graph)
test_writer = tf.summary.FileWriter('./logdir/test', accuracy.graph)
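# allow_growth lets TensorFlow claim GPU memory on demand instead of reserving it all upfront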
session_conf = tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True))
saver = tf.train.Saver()
if __name__ == "__main__":
    with tf.Session(config=session_conf) as sess:
        sess.run(tf.global_variables_initializer())
        print("Start learning...")
        for epoch in range(NUM_EPOCHS):
            loss_train = 0
            loss_test = 0
            accuracy_train = 0
            accuracy_test = 0
            print("epoch: {}\t".format(epoch), end="")

            # Training
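            # One pass over the training set; loss, accuracy, the optimizer step and
            # the TensorBoard summaries are all evaluated in a single sess.run call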
            num_batches = X_train.shape[0] // BATCH_SIZE
            for b in tqdm(range(num_batches)):
                x_batch, y_batch = next(train_batch_generator)
                seq_len = np.array([list(x).index(0) + 1 for x in x_batch])  # actual lengths of sequences
                loss_tr, acc, _, summary = sess.run([loss, accuracy, optimizer, merged],
                                                    feed_dict={batch_ph: x_batch,
                                                               target_ph: y_batch,
                                                               seq_len_ph: seq_len,
                                                               keep_prob_ph: KEEP_PROB})
                accuracy_train += acc
                loss_train = loss_tr * DELTA + loss_train * (1 - DELTA)
                train_writer.add_summary(summary, b + num_batches * epoch)
            accuracy_train /= num_batches

            # Testing
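            # Evaluation pass: dropout is disabled (keep_prob = 1.0) and the optimizer op
            # is not run, so the weights are not updated on the test set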
            num_batches = X_test.shape[0] // BATCH_SIZE
            for b in tqdm(range(num_batches)):
                x_batch, y_batch = next(test_batch_generator)
                seq_len = np.array([list(x).index(0) + 1 for x in x_batch])  # actual lengths of sequences
                loss_test_batch, acc, summary = sess.run([loss, accuracy, merged],
                                                         feed_dict={batch_ph: x_batch,
                                                                    target_ph: y_batch,
                                                                    seq_len_ph: seq_len,
                                                                    keep_prob_ph: 1.0})
                accuracy_test += acc
                loss_test += loss_test_batch
                test_writer.add_summary(summary, b + num_batches * epoch)
            accuracy_test /= num_batches
            loss_test /= num_batches

            print("loss: {:.3f}, val_loss: {:.3f}, acc: {:.3f}, val_acc: {:.3f}".format(
                loss_train, loss_test, accuracy_train, accuracy_test
            ))
        train_writer.close()
        test_writer.close()
        saver.save(sess, MODEL_PATH)
print("Run 'tensorboard --logdir=./logdir' to checkout tensorboard logs.")