-
Notifications
You must be signed in to change notification settings - Fork 0
/
prepare_batches.py
183 lines (105 loc) · 3.47 KB
/
prepare_batches.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
## Prepares batches of equal batch size from sentences of variable length.
## Finally a dictionary is created mapping batch_no -> list of sentence ids.
import json
import random
from collections import defaultdict
import cPickle as cp
import numpy as np
def load_data(path_to_data="data/"):
    """Load and return the sentence-embedding dictionary.

    Reads embedding_dict.json from *path_to_data*. The path is concatenated
    directly with the file name, so it must end with a path separator.
    """
    embedding_file = path_to_data + "embedding_dict.json"
    with open(embedding_file) as source:
        return json.load(source)
def create_partitions():
    """Bucket sentence ids by sentence length.

    Loads data/embedding_dict.json (sentence id -> per-sentence sequence;
    presumably one entry per token — confirm against the embedding step) and
    writes data/length_id_dict.json mapping sentence length -> list of ids.
    Sentences of length <= 4 are dropped.
    """
    embedding_dict = load_data("data/")
    length_wise_dict = defaultdict(list)
    count = 0
    # .items() iterates identically to the original iteritems(); len() is
    # already an int, so the old int() wrapper was redundant.
    for sentence_id, sequence in embedding_dict.items():
        if len(sequence) > 4:
            length_wise_dict[len(sequence)].append(sentence_id)
            count += 1
    print(count)  # number of sentences kept
    # JSON is text: the original "wb+" binary mode only worked on Python 2.
    with open("data/length_id_dict.json", "w") as f:
        json.dump(length_wise_dict, f)
def break_data_set():
    """Split sentence ids into train/validation/test sets, length-stratified.

    Reads data/length_id_dict.json (sentence length -> list of ids), shuffles
    the ids inside each length bucket, then takes 75% for training, 15% for
    validation and the remainder (~10%) for testing from every bucket, so all
    three splits cover the full range of sentence lengths.

    Side effects: pickles the three id lists under data/.
    Returns (training_ids, testing_ids, validation_ids).
    """
    # NOTE(review): the code takes 75/15/10; the old message claimed 70/15/15.
    print("Breaking data set into train: 75, validation: 15, test: 10")
    with open("data/length_id_dict.json") as f:
        length_id_dict = json.load(f)
    # JSON keys are strings; sort numerically so buckets go short -> long.
    key_list = sorted(int(key) for key in length_id_dict.keys())
    training_ids = []
    validation_ids = []
    testing_ids = []
    for key in key_list:
        id_list = length_id_dict[str(key)]
        np.random.shuffle(id_list)  # randomize before slicing off the front
        bucket_size = len(id_list)
        training_size = int(0.75 * bucket_size)
        validation_size = int(0.15 * bucket_size)
        # Extending with an empty slice is a no-op, so no size guards are
        # needed. (The originals compared sizes with `is not 0`, which is an
        # identity test and only worked via CPython small-int caching.)
        training_ids += id_list[:training_size]
        validation_ids += id_list[training_size:training_size + validation_size]
        testing_ids += id_list[training_size + validation_size:]
    with open("data/training_ids.pkl", "wb") as f:
        cp.dump(training_ids, f)
    # TODO(review): "testing_ids_ids.pkl" looks like a typo, but the name is
    # kept so whatever downstream code loads it keeps working.
    with open("data/testing_ids_ids.pkl", "wb") as f:
        cp.dump(testing_ids, f)
    with open("data/validation_ids.pkl", "wb") as f:
        cp.dump(validation_ids, f)
    print("At the end")
    return training_ids, testing_ids, validation_ids
def _chunk_into_batches(id_list, batch_size):
    """Group ids into consecutive batches: {1: first batch_size ids, 2: ...}."""
    batches = defaultdict(list)
    for index, sentence_id in enumerate(id_list):
        batches[index // batch_size + 1].append(sentence_id)
    return batches

def prepare_batches(training_ids, testing_ids, validation_ids, batch_size=64):
    """Chunk each id list into fixed-size batches and dump them as JSON.

    Each batch holds sentences of similar length: the id lists arrive as
    concatenations of the per-length buckets built by break_data_set().

    BUG FIX: the original loop's `else` branch advanced the batch counter
    without appending the current id, silently dropping one id at every
    batch boundary. Batching by index keeps every id.

    Writes data/{train,test,validation}_batch_dict_new.json, each mapping
    batch number -> list of sentence ids. Returns None.

    batch_size defaults to the original hard-coded 64.
    """
    print("Here")
    train_batch_dict_new = _chunk_into_batches(training_ids, batch_size)
    test_batch_dict_new = _chunk_into_batches(testing_ids, batch_size)
    validation_batch_dict_new = _chunk_into_batches(validation_ids, batch_size)
    # JSON is text: the original "wb+" binary mode only worked on Python 2.
    with open("data/train_batch_dict_new.json", "w") as f:
        json.dump(train_batch_dict_new, f)
    with open("data/test_batch_dict_new.json", "w") as f:
        json.dump(test_batch_dict_new, f)
    with open("data/validation_batch_dict_new.json", "w") as f:
        json.dump(validation_batch_dict_new, f)
if __name__ == "__main__":
    # Pipeline: bucket ids by sentence length, split into train/val/test,
    # then chunk each split into fixed-size batches.
    create_partitions()
    train_ids, test_ids, val_ids = break_data_set()
    prepare_batches(train_ids, test_ids, val_ids)