data.py
import json
import os
import random
from collections import Counter
from pathlib import Path

import gutenbergpy.textget
import nltk
import numpy as np
import torch

def dictToJson(dictionary, path):
    ## Stores a python dict to a .json file
    with open(path, 'w') as f:
        json.dump(dictionary, f)


def jsonToDict(path):
    ## Returns a python dict from a .json file
    with open(path) as f:
        dictionary = json.load(f)
    return dictionary

def text_to_sequences(text):
    ## Tokenizing string based on blank lines (paragraphs)
    sequences = nltk.blankline_tokenize(text)
    ## If sentence-level data is wanted, uncomment (requires nltk's 'punkt' models):
    # sequences = list(map(nltk.sent_tokenize, sequences))
    # sequences = sum(sequences, [])
    ## Tokenizing each item in sequences into words & punctuation
    sequences = list(map(nltk.wordpunct_tokenize, sequences))
    return sequences
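
## For example, two paragraphs separated by a blank line tokenize as:
##   text_to_sequences("good day.\n\nhello, world!")
##   -> [['good', 'day', '.'], ['hello', ',', 'world', '!']]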

def write_sequences(list_of_strings, path):
    ## Writes each tokenized sequence as a space-joined line, with a blank
    ## line between sequences so read_sequences can split them back apart
    with open(path, 'w', encoding='utf8') as f:
        for line in list_of_strings:
            f.write(' '.join(line) + '\n\n')


def read_sequences(path):
    ## Reading file to string
    with open(path, 'r', encoding='utf8') as f:
        text = f.read()
    sequences = text_to_sequences(text)
    return sequences

def pad_sequences(num_predict_words, sequences):
    ## Apply padding to each sequence on both ends
    start = ['<sos>'] * num_predict_words
    end = ['<eos>']
    pad = lambda x: start + x + end
    sequences = list(map(pad, sequences))
    return sequences
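
## Padding with num_predict_words '<sos>' tokens means even the first real
## word of a sequence has a full-length context window. For example:
##   pad_sequences(2, [['hello', 'world']])
##   -> [['<sos>', '<sos>', 'hello', 'world', '<eos>']]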

def encode_sequences(encoder, sequences):
    ## Turns sequences containing words into sequences of integers
    return list(map(lambda seq: [encoder[word] for word in seq], sequences))


def decode_sequences(decoder, sequences):
    ## Inverse of encode_sequences; the decoder is keyed by str(num) because
    ## JSON object keys become strings once the vocab is round-tripped to disk
    return list(map(lambda seq: [decoder[str(num)] for num in seq], sequences))

def chunk_sequence(num_predict_words, seq):
    ## Slides a window of num_predict_words + 1 tokens over the sequence;
    ## the last token of each window is the prediction target
    examples = []
    for i in range(len(seq) - num_predict_words):
        examples.append(seq[i : i + num_predict_words + 1])
    return examples
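
## For example, with num_predict_words=2 and seq = [5, 6, 7, 8]:
##   chunk_sequence(2, [5, 6, 7, 8]) -> [[5, 6, 7], [6, 7, 8]]
## i.e. each example is num_predict_words context tokens plus one target.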

def generate_examples(num_predict_words, sequences):
    data = []
    for seq in sequences:
        data += chunk_sequence(num_predict_words, seq)
    return data

class Book:
    def __init__(self, gutenberg_id):
        ## Get, decode, and lowercase a book from Project Gutenberg
        self.text = gutenbergpy.textget.strip_headers(
            gutenbergpy.textget.get_text_by_id(gutenberg_id)
        ).decode('utf-8').lower()

    def clean(self, min_seq_len=4):
        ## Remove unwanted punctuation
        ## ['“', '”', '"', '(', ')'] <-- keep for later
        remove_chars = ['-', '—', '_', '*', '“', '”', '"', '‘', '’', '\'']
        for char in remove_chars:
            self.text = self.text.replace(char, ' ')
        ## Generate sequences from text
        self.sequences = text_to_sequences(self.text)
        ## Drop sequences of min_seq_len tokens or fewer
        self.sequences = list(filter(lambda x: len(x) > min_seq_len, self.sequences))

def extract_raw_data(ids):
    ## List to hold all sequences
    sequences = []
    ## Extracting sequences from books
    for ID in ids:
        book = Book(gutenberg_id=ID)
        ## Cleaning / tokenizing
        book.clean()
        sequences += book.sequences
    return sequences

def build_vocabulary(sequences, special_tokens=['<sos>', '<eos>']):
    ## Creating vocabulary
    vocab = Counter()
    for seq in sequences:
        vocab.update(seq)
    ## Adding start and end of sequence tokens
    vocab.update(special_tokens)
    ## Building a reversible vocabulary: decoder maps id -> word, encoder word -> id
    decoder = dict(enumerate(vocab.keys()))
    encoder = {v: k for k, v in decoder.items()}
    return (decoder, encoder)
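
## For example, for sequences = [['the', 'cat'], ['the', 'dog']]:
##   decoder -> {0: 'the', 1: 'cat', 2: 'dog', 3: '<sos>', 4: '<eos>'}
##   encoder -> {'the': 0, 'cat': 1, 'dog': 2, '<sos>': 3, '<eos>': 4}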

def save_data(path, sequences, vocab, split={'test': 0.10, 'valid': 0.05}):
    ## Creating location to store dataset
    os.makedirs(path, exist_ok=True)
    ## Shuffling, then reserving sequences for validation and testing
    random.shuffle(sequences)
    valid_split = int(len(sequences) * split['valid'])
    test_split = int(len(sequences) * split['test'])
    mid = valid_split + test_split
    valid = sequences[0:valid_split]
    test = sequences[valid_split:mid]
    train = sequences[mid:]
    ## Validation
    write_sequences(valid, path / 'valid.txt')
    ## Testing
    write_sequences(test, path / 'test.txt')
    ## Training
    write_sequences(train, path / 'train.txt')
    ## Saving vocab
    decoder, encoder = vocab
    dictToJson(decoder, path / 'decoder.json')
    dictToJson(encoder, path / 'encoder.json')
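
## With the default split, a corpus of 1000 sequences yields
## 50 validation, 100 test, and 850 training sequences.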

def sequences_pipeline(path, num_predict_words, encoder):
    data = read_sequences(path)
    data = pad_sequences(num_predict_words, data)
    data = encode_sequences(encoder, data)
    data = generate_examples(num_predict_words, data)
    random.shuffle(data)
    return np.array(data)
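
## The returned array has shape (num_examples, num_predict_words + 1):
## each row is num_predict_words context tokens followed by one target token.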

def create_dataset(num_predict_words):
    ## Directory that houses all data, raw and processed
    data_path = Path('data/').resolve()
    ## Subdirectory for the unprocessed data
    main_path = 'main/'
    path = data_path / main_path
    ## If the main dataset doesn't exist: build it
    if not os.path.exists(path):
        ## These ids correspond to 4 of Tolstoy's novels on gutenberg.org
        ids = [2600, 1399, 243, 6157]
        sequences = extract_raw_data(ids)
        ## vocab is a tuple: (decoder, encoder)
        vocab = build_vocabulary(sequences)
        save_data(path, sequences, vocab)
    ## Creating location to store this processed data;
    ## exist_ok=False means rerunning with the same window size raises
    os.makedirs(data_path / str(num_predict_words), exist_ok=False)
    encoder = jsonToDict(path / 'encoder.json')
    file_names = ['train', 'test', 'valid']
    ## Passing data through pipeline
    for file_name in file_names:
        data = sequences_pipeline(path / (file_name + '.txt'), num_predict_words, encoder)
        np.save(data_path / str(num_predict_words) / (file_name + '.npy'), data)

class TextData(torch.utils.data.Dataset):
    def __init__(self, data):
        super().__init__()
        ## Casting incoming data as a LongTensor
        data = torch.LongTensor(data)
        ## Splitting data into columns
        self.prediction_words = data[:, :-1]
        self.target_word = data[:, -1]

    def __len__(self):
        return len(self.target_word)

    def __getitem__(self, idx):
        return self.prediction_words[idx], self.target_word[idx]
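

## Minimal usage sketch: build the dataset for a given context-window size,
## then wrap the saved training split in a DataLoader. The window size (5)
## and batch size (64) below are illustrative choices, not values fixed by
## this module; note create_dataset raises if data/5/ already exists.
if __name__ == '__main__':
    num_predict_words = 5
    create_dataset(num_predict_words)
    train = np.load(Path('data/').resolve() / str(num_predict_words) / 'train.npy')
    dataset = TextData(train)
    loader = torch.utils.data.DataLoader(dataset, batch_size=64, shuffle=True)
    context, target = next(iter(loader))
    print(context.shape, target.shape)  ## torch.Size([64, 5]) torch.Size([64])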