PaddingSample.py
import os
import pickle

from keras.preprocessing.sequence import pad_sequences
# Read the preprocessed data. The pickle holds:
# pickled_data = {"total_words": total_words_set, "total_tokens": total_tokens_set, "total_X": total_X,
#                 "total_Y": total_Y, "train_X": train_X, "train_Y": train_Y, "validation_X": validation_X,
#                 "validation_Y": validation_Y, "test_X": test_X, "test_Y": test_Y}
#
# total_words: set of all words in the data
# total_tokens: set of all tokens (POS tags) in the data
# total_X: all sentences in the data (lists of words)
# total_Y: the token sequence of each sentence; total_Y[i] pairs with total_X[i]
#
# train_X / train_Y: training sentences and their token sequences, paired by index
# validation_X / validation_Y: validation sentences and their token sequences, paired by index
# test_X / test_Y: test sentences and their token sequences, paired by index
with open('PickledData/pickled_data.pkl', 'rb') as f:
    pickled_data = pickle.load(f)
total_words = pickled_data["total_words"]
total_tokens = pickled_data["total_tokens"]
total_X = pickled_data["total_X"]
total_Y = pickled_data["total_Y"]
train_X = pickled_data["train_X"]
train_Y = pickled_data["train_Y"]
validation_X = pickled_data["validation_X"]
validation_Y = pickled_data["validation_Y"]
test_X = pickled_data["test_X"]
test_Y = pickled_data["test_Y"]
# Vectorise X and Y: encode words and tokens as integer ids.
word2int = {}
int2word = {}
token2int = {}
int2token = {}
# Build lookup dicts in both directions: word <-> index and token <-> index.
# Indices start at 1 so that 0 stays free as the padding value used below.
for i, word in enumerate(total_words):
    word2int[word] = i + 1
    int2word[i + 1] = word
for i, token in enumerate(total_tokens):
    token2int[token] = i + 1
    int2token[i + 1] = token
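# Illustrative sketch (hypothetical vocabulary): if total_words were
# {"the", "dog"}, one possible result is word2int == {"the": 1, "dog": 2}
# and int2word == {1: "the", 2: "dog"}. Note that set iteration order is
# arbitrary, so the exact ids can differ between runs.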
def tokenizer(sequences, mapping):
    """Replace every item in every sub-sequence with its integer id from mapping."""
    encoded_sequences = []
    for sub_sequence in sequences:
        encoded_sequences.append([mapping[item] for item in sub_sequence])
    return encoded_sequences

encoded_train_X = tokenizer(train_X, word2int)
encoded_train_Y = tokenizer(train_Y, token2int)
encoded_validation_X = tokenizer(validation_X, word2int)
encoded_validation_Y = tokenizer(validation_Y, token2int)
encoded_test_X = tokenizer(test_X, word2int)
encoded_test_Y = tokenizer(test_Y, token2int)
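# Caveat: tokenizer() raises KeyError for any out-of-vocabulary item, so all
# three splits are assumed to draw only from total_words / total_tokens.
# A hedged fallback could reserve an extra id (hypothetical UNK handling):
# UNK_ID = len(word2int) + 1
# encoded = [[word2int.get(w, UNK_ID) for w in seq] for seq in sequences]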
print("Train sample \n", "-" * 50, "\n")
print('X: ', train_X[0], '\n')
print('Y: ', train_Y[0], '\n')
print()
print("Encoded train sample \n", "-" * 50, "\n")
print('X: ', encoded_train_X[0], '\n')
print('Y: ', encoded_train_Y[0], '\n')
print("-" * 50, "\n")
print("Validation sample \n", "-" * 50, "\n")
print('X: ', validation_X[0], '\n')
print('Y: ', validation_Y[0], '\n')
print()
print("Encoded validation sample \n", "-" * 50, "\n")
print('X: ', encoded_validation_X[0], '\n')
print('Y: ', encoded_validation_Y[0], '\n')
print("-" * 50, "\n")
print("Test sample \n", "-" * 50, "\n")
print('X: ', test_X[0], '\n')
print('Y: ', test_Y[0], '\n')
print()
print("Encoded test sample \n", "-" * 50, "\n")
print('X: ', encoded_test_X[0], '\n')
print('Y: ', encoded_test_Y[0], '\n')
print("-" * 50, "\n")
# Length check: each input sequence must pair one-to-one with its output sequence.
def count_length_mismatches(inputs, outputs):
    return sum(1 for seq_in, seq_out in zip(inputs, outputs) if len(seq_in) != len(seq_out))

print("{} sentences have disparate input-output lengths in train samples."
      .format(count_length_mismatches(encoded_train_X, encoded_train_Y)))
print("{} sentences have disparate input-output lengths in validation samples."
      .format(count_length_mismatches(encoded_validation_X, encoded_validation_Y)))
print("{} sentences have disparate input-output lengths in test samples."
      .format(count_length_mismatches(encoded_test_X, encoded_test_Y)))
# Pad sequences.
# The sentences in the data have varying lengths, so short sentences must be
# padded and long ones truncated to a fixed length. This fixed length is a
# hyperparameter.
# Pad each sequence to MAX_SEQ_LENGTH with Keras' pad_sequences() function:
# sentences longer than MAX_SEQ_LENGTH are truncated, shorter ones are padded
# with zeroes. Both truncation and padding can be 'pre' or 'post'.
# Here padding is 'pre' (zeroes are added on the left) and truncating is 'post'
# (the sentence is cut from the right, keeping its first MAX_SEQ_LENGTH items).
MAX_SEQ_LENGTH = 100
padded_train_X = pad_sequences(encoded_train_X, maxlen=MAX_SEQ_LENGTH, padding="pre", truncating="post")
padded_train_Y = pad_sequences(encoded_train_Y, maxlen=MAX_SEQ_LENGTH, padding="pre", truncating="post")
padded_validation_X = pad_sequences(encoded_validation_X, maxlen=MAX_SEQ_LENGTH, padding="pre", truncating="post")
padded_validation_Y = pad_sequences(encoded_validation_Y, maxlen=MAX_SEQ_LENGTH, padding="pre", truncating="post")
padded_test_X = pad_sequences(encoded_test_X, maxlen=MAX_SEQ_LENGTH, padding="pre", truncating="post")
padded_test_Y = pad_sequences(encoded_test_Y, maxlen=MAX_SEQ_LENGTH, padding="pre", truncating="post")
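# A minimal sketch of what these settings do (the values are hypothetical):
# pad_sequences([[5, 6, 7]], maxlen=5, padding="pre", truncating="post")
# -> array([[0, 0, 5, 6, 7]]); a sequence with more than 5 ids would keep
# only its first 5.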
padded_samples = {"train_X": padded_train_X, "train_Y": padded_train_Y,
                  "validation_X": padded_validation_X, "validation_Y": padded_validation_Y,
                  "test_X": padded_test_X, "test_Y": padded_test_Y,
                  "MAX_SEQ_LENGTH": MAX_SEQ_LENGTH,
                  "int2token": int2token, "int2word": int2word}
if not os.path.exists('PaddedData/'):
    print('MAKING DIRECTORY PaddedData/ to save pickled padded samples')
    os.makedirs('PaddedData/')
with open('PaddedData/padded_samples.pkl', 'wb') as f:
    pickle.dump(padded_samples, f)
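# A downstream training script could reload everything like this (sketch,
# assuming it runs from the same working directory):
# with open('PaddedData/padded_samples.pkl', 'rb') as f:
#     padded_samples = pickle.load(f)
# train_X, train_Y = padded_samples["train_X"], padded_samples["train_Y"]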