nn_tc.py

from __future__ import print_function

import os

import numpy as np
from keras.callbacks import ModelCheckpoint
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils.np_utils import to_categorical

from model.model import model_selector
from reader.filereader import read_glove_vectors, read_input_data
from utils import argumentparser

# Fix the RNG seed so the train/validation shuffle below is reproducible.
np.random.seed(42)
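
# Example invocation (a sketch only; the actual flag names are defined in
# utils/argumentparser.py, so the ones shown here are assumptions inferred
# from the args attributes used below):
#
#   python nn_tc.py --data_dir data/ --embedding_file_path glove.6B.100d.txt \
#       --model_dir checkpoints/ --nb_words 20000 --max_sequence_len 1000 \
#       --embedding_dim 100 --validation_split 0.2 \
#       --num_epochs 10 --batch_size 128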


def main():
    args = argumentparser.ArgumentParser()
    train(args)


def train(args):
    print('Reading word vectors.')
    embeddings_index = read_glove_vectors(args.embedding_file_path)
    print('Found {} word vectors.'.format(len(embeddings_index)))

    print('Processing input data.')
    texts, labels_index, labels = read_input_data(args.data_dir)
    # texts - list of text samples
    # labels_index - dictionary mapping each label name to a numeric id
    # labels - list of label ids, one per text sample
    print('Found {} texts.'.format(len(texts)))

    # Vectorize the text samples into a 2D integer tensor.
    tokenizer = Tokenizer(nb_words=args.nb_words)
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)
    word_index = tokenizer.word_index
    print('Found {} unique tokens.'.format(len(word_index)))
    data = pad_sequences(sequences, maxlen=args.max_sequence_len)
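    # Note: pad_sequences pads (or truncates) every sequence to exactly
    # max_sequence_len entries, using index 0 as the padding value, so all
    # samples share one fixed input shape.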

    # Transform the integer label ids into one-hot categorical vectors.
    labels = to_categorical(np.asarray(labels))
    print('Shape of data tensor:', data.shape)
    print('Shape of label tensor:', labels.shape)

    # Shuffle, then split the input data into training and validation sets.
    indices = np.arange(data.shape[0])
    np.random.shuffle(indices)
    data = data[indices]
    labels = labels[indices]
    nb_validation_samples = int(args.validation_split * data.shape[0])
    x_train = data[:-nb_validation_samples]
    y_train = labels[:-nb_validation_samples]
    x_val = data[-nb_validation_samples:]
    y_val = labels[-nb_validation_samples:]
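    # validation_split is assumed to be a fraction in (0, 1); e.g. 0.2 holds
    # out the last 20% of the shuffled samples for validation.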

    print('Preparing embedding matrix.')
    # Initialize the embedding matrix with zero vectors; words without a
    # pretrained GloVe vector keep an all-zero row. Row 0 is never assigned
    # (Tokenizer indices start at 1) and corresponds to the padding index.
    nb_words = min(args.nb_words, len(word_index))
    embedding_matrix = np.zeros((nb_words + 1, args.embedding_dim))
    for word, i in word_index.items():
        if i > nb_words:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    args.nb_words = nb_words
    args.len_labels_index = len(labels_index)
    model = model_selector(args, embedding_matrix)

    # Checkpoint the weights whenever the validation loss improves.
    checkpoint_filepath = os.path.join(args.model_dir, "weights.best.hdf5")
    checkpoint = ModelCheckpoint(checkpoint_filepath, monitor='val_loss',
                                 verbose=1, save_best_only=True)
    callbacks_list = [checkpoint]

    # Save the model architecture alongside the weights so it can be
    # reconstructed later without rerunning this script.
    model_json = model.to_json()
    with open(os.path.join(args.model_dir, "model.json"), "w") as json_file:
        json_file.write(model_json)

    model.fit(x_train, y_train, validation_data=(x_val, y_val),
              nb_epoch=args.num_epochs, batch_size=args.batch_size,
              callbacks=callbacks_list)
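
    # For reference, a minimal sketch of reloading the saved model for
    # inference (assumes the same Keras version; model_from_json rebuilds the
    # architecture from model.json, load_weights restores the checkpoint):
    #
    #   from keras.models import model_from_json
    #   with open(os.path.join(args.model_dir, "model.json")) as f:
    #       restored = model_from_json(f.read())
    #   restored.load_weights(checkpoint_filepath)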


if __name__ == '__main__':
    main()