-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdatasets.py
79 lines (58 loc) · 2.62 KB
/
datasets.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
import os
import nltk
from preprocessing import Paragraph, Sentence
class NltkDataset:
    """Train/validation/test split over the fragments of an NLTK corpus.

    The corpus is looked up by attribute name on ``nltk.corpus``, read either
    paragraph-wise or sentence-wise, optionally truncated, and then cut into
    three consecutive slices (no shuffling is performed).
    """

    def __init__(self, corpus_name, size=None, categories=None, train_fraction=0.9, as_paragraphs=True):
        """Load the corpus and split it.

        Args:
            corpus_name: Attribute name of the corpus on ``nltk.corpus``.
            size: Maximum number of fragments to keep before splitting. A
                falsy value (``None`` or ``0``) keeps every fragment.
            categories: Optional category filter forwarded to the corpus
                reader. When ``None`` the argument is not forwarded at all.
            train_fraction: Share of the fragments used for training. The
                validation split gets half of the remaining share and the
                test split gets whatever is left.
            as_paragraphs: If true, wrap ``corpus.paras()`` items in
                ``Paragraph``; otherwise wrap ``corpus.sents()`` items in
                ``Sentence``.

        Raises:
            AttributeError: If ``nltk.corpus`` has no attribute ``corpus_name``.
        """
        corpus = getattr(nltk.corpus, corpus_name)

        # Only forward ``categories`` when the caller actually supplied one.
        # Readers for non-categorized corpora (e.g. NLTK's plain-text reader)
        # do not accept that keyword, so passing ``categories=None``
        # unconditionally made them fail with a TypeError.
        reader_kwargs = {} if categories is None else {'categories': categories}

        if as_paragraphs:
            all_fragments = [Paragraph(para) for para in corpus.paras(**reader_kwargs)]
        else:
            all_fragments = [Sentence(sent) for sent in corpus.sents(**reader_kwargs)]

        if size:
            all_fragments = all_fragments[:size]

        # The share not used for training is divided evenly between validation
        # and test; rounding leftovers end up in the test slice.
        val_fraction = (1 - train_fraction) / 2.
        train_size = int(len(all_fragments) * train_fraction)
        val_size = int(len(all_fragments) * val_fraction)
        val_end = train_size + val_size

        self.train_fragments = all_fragments[:train_size]
        self.val_fragments = all_fragments[train_size:val_end]
        self.test_fragments = all_fragments[val_end:]

    def get_training_fragments(self):
        """Return the training slice computed at construction time."""
        return self.train_fragments

    def get_validation_fragments(self):
        """Return the validation slice computed at construction time."""
        return self.val_fragments

    def get_test_fragments(self):
        """Return the test slice computed at construction time."""
        return self.test_fragments
class Wiki2:
    """Access to the three token files of a dataset directory.

    The directory is expected to hold ``wiki.train.tokens``,
    ``wiki.valid.tokens`` and ``wiki.test.tokens`` (presumably the WikiText-2
    layout -- confirm against the data actually used). Files are parsed on
    every getter call; nothing is cached on the instance.
    """

    def __init__(self, ds_dir, size=None, as_paragraphs=True):
        """Remember the file locations and pick the fragment loader.

        Args:
            ds_dir: Directory containing the three ``wiki.*.tokens`` files.
            size: Maximum number of training fragments to return. A falsy
                value (``None`` or ``0``) disables truncation. Validation and
                test fragments are never truncated.
            as_paragraphs: If true, fragments are loaded with
                ``get_paragraphs``; otherwise with ``get_sentences``.

        Raises:
            NotADirectoryError: If ``ds_dir`` is not an existing directory.
        """
        if as_paragraphs:
            self.get_fragments = get_paragraphs
        else:
            self.get_fragments = get_sentences

        if not os.path.isdir(ds_dir):
            raise NotADirectoryError(f'{ds_dir} is not a directory')

        self.size = size
        join = os.path.join
        self.training_path = join(ds_dir, 'wiki.train.tokens')
        self.validation_path = join(ds_dir, 'wiki.valid.tokens')
        self.test_path = join(ds_dir, 'wiki.test.tokens')

    def get_training_fragments(self):
        """Load the training file and return at most ``size`` fragments."""
        loaded = self.get_fragments(self.training_path)
        return loaded[:self.size] if self.size else loaded

    def get_validation_fragments(self):
        """Load and return every fragment of the validation file."""
        return self.get_fragments(self.validation_path)

    def get_test_fragments(self):
        """Load and return every fragment of the test file."""
        return self.get_fragments(self.test_path)
def get_paragraphs(path):
    """Parse a space-tokenized text file into ``Paragraph`` objects.

    Every line that is non-blank after stripping and does not start with
    ``=`` becomes one paragraph (lines starting with ``=`` are skipped;
    presumably they are section headings). A paragraph's tokens are obtained
    by splitting the stripped line on single spaces, and a sentence ends at
    each token that is exactly ``.``, ``?`` or ``!``.

    NOTE: tokens that follow the last sentence terminator of a line are not
    part of any sentence and are dropped; a line without any terminator
    yields a paragraph built from an empty sentence list.

    Args:
        path: Path of the UTF-8 encoded token file to read.

    Returns:
        A list with one ``Paragraph`` per kept line, each constructed from a
        list of sentences, where a sentence is a list of token strings that
        includes its terminating punctuation token.
    """
    # Exact-match set: the previous ``token in '.?!'`` was a substring test,
    # so an empty token (from consecutive spaces) or a token such as '?!'
    # was wrongly treated as a sentence terminator.
    sentence_enders = {'.', '?', '!'}

    paragraphs_of_sentences = []
    # Explicit encoding so decoding does not depend on the platform locale.
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith('='):
                continue

            sentences = []
            current = []
            for token in line.split(' '):
                current.append(token)
                if token in sentence_enders:
                    sentences.append(current)
                    current = []
            paragraphs_of_sentences.append(Paragraph(sentences))
    return paragraphs_of_sentences
def get_sentences(path):
    """Return every sentence of the file at ``path`` as a flat list.

    The file is parsed with ``get_paragraphs``; each entry of every
    paragraph's ``sentences`` attribute is wrapped in ``Sentence``, keeping
    the order in which the sentences occur.
    """
    flattened = []
    for paragraph in get_paragraphs(path):
        flattened.extend(Sentence(sent) for sent in paragraph.sentences)
    return flattened