-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy path main_multiTaskLearning.py
91 lines (65 loc) · 3.37 KB
/
main_multiTaskLearning.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
"""Train and evaluate a multi-task BiLSTM-CRF sequence labeller: one shared
encoder, three parallel label sets (tasks), each with its own CRF head."""
import random
import numpy as np
import tensorflow

# Pin every RNG (Python, NumPy, TF graph-level) so runs are repeatable.
random.seed(42)
np.random.seed(42)
tensorflow.set_random_seed(42)

# Project-local model builders and data helpers.
from code.models import *
from code.utils import *

# Each split yields token sequences plus three parallel label sequences.
X_train_w, y_train1_w, y_train2_w, y_train3_w = load_data("dataset/clean_train.txt")  # Training data
X_test_w, y_test1_w, y_test2_w, y_test3_w = load_data("dataset/clean_test.txt")  # Testing data
X_valid_w, y_valid1_w, y_valid2_w, y_valid3_w = load_data("dataset/clean_valid.txt")  # Validation data

# Collapse every digit token into one shared placeholder word.
digits_word = "$NUM$"
X_train_w, X_test_w, X_valid_w = mergeDigits([X_train_w, X_test_w, X_valid_w], digits_word)

# Build the word vocabulary and the per-task label vocabularies from the
# training split only (test/valid must not leak into the indexes).
ukn_words = "out-of-vocabulary"  # Out-of-vocabulary words entry in the "words to index" dictionary
word2ind, ind2word = indexData_x(X_train_w, ukn_words)
(label2ind1, ind2label1), (label2ind2, ind2label2), (label2ind3, ind2label3) = (
    indexData_y(labels) for labels in (y_train1_w, y_train2_w, y_train3_w)
)
for index_to_label in (ind2label1, ind2label2, ind2label3):
    print(index_to_label)

# Encode tokens and labels as integer ids, padded to a common length.
maxlen = max(len(sentence) for sentence in X_train_w)
padding_style = 'pre'  # 'pre' or 'post': side on which the padding is added
X_train, X_test, X_valid = (
    encodePadData_x(split, word2ind, maxlen, ukn_words, padding_style)
    for split in (X_train_w, X_test_w, X_valid_w)
)
y_train1, y_test1, y_valid1 = (
    encodePadData_y(labels, label2ind1, maxlen, padding_style)
    for labels in (y_train1_w, y_test1_w, y_valid1_w)
)
y_train2, y_test2, y_valid2 = (
    encodePadData_y(labels, label2ind2, maxlen, padding_style)
    for labels in (y_train2_w, y_test2_w, y_valid2_w)
)
y_train3, y_test3, y_valid3 = (
    encodePadData_y(labels, label2ind3, maxlen, padding_style)
    for labels in (y_train3_w, y_test3_w, y_valid3_w)
)

# Character-level encoding of the same splits (feeds the char sub-model).
char2ind, maxWords, maxChar = characterLevelIndex(X_train_w, digits_word)
X_train_char, X_test_char, X_valid_char = (
    characterLevelData(split, char2ind, maxWords, maxChar, digits_word, padding_style)
    for split in (X_train_w, X_test_w, X_valid_w)
)

# Training hyper-parameters.
epoch = 25
batch = 100
dropout = 0.5
lstm_size = 200

# Group the three tasks' targets and decoders for the multi-task model.
y_train = [y_train1, y_train2, y_train3]
y_test = [y_test1, y_test2, y_test3]
y_valid = [y_valid1, y_valid2, y_valid3]
ind2label = [ind2label1, ind2label2, ind2label3]

# Build, train and evaluate the shared BiLSTM with a CRF output per task.
model_name = "multi_task"
BiLSTM_model(
    model_name, True, "crf",
    [X_train, X_train_char], [X_test, X_test_char], word2ind, maxWords,
    y_train, y_test, ind2label,
    validation=True, X_valid=[X_valid, X_valid_char], y_valid=y_valid,
    pretrained_embedding=True, word_embedding_size=300,
    maxChar=maxChar, char_embedding_type="BILSTM", char2ind=char2ind, char_embedding_size=100,
    lstm_hidden=lstm_size, nbr_epochs=epoch, batch_size=batch, dropout=dropout,
    gen_confusion_matrix=True, early_stopping_patience=5
)
print("FINITO")