'''\
Dataset loading module. See maxout -h for a list of the supported datasets.
The same reinitializable iterator is used for all splits.
'''

import struct

import numpy as np
import tensorflow as tf

from tools import with_persistent_vars

def dataset(name, group, batch=None, seed=None):
    '''\
    Returns the requested dataset. The 'train', 'val' and 'test' splits
    contain the same type of elements, but 'train' is also shuffled and
    repeated indefinitely. 'val' and 'test' are always returned whole, while
    'train' can be divided into batches.

    Args:
        name: name of the dataset to use.
        group: 'train', 'val' or 'test'.
        batch: how many samples per batch. If None, the entire dataset is
            returned as a single batch.
        seed: seed for a deterministic shuffle.
    Returns:
        a tf Dataset that yields batches of (features, labels) pairs
    '''

    # Check
    if group not in ('train', 'val', 'test'):
        raise ValueError("group must be 'train', 'val' or 'test'")

    # Create
    if name == 'example':
        (data, size) = _iris_dataset(group)
    elif name == 'mnist':
        (data, size) = _mnist_dataset(group)
    elif name == 'cifar10':
        (data, size) = _cifar10_dataset(group)
    else:
        raise ValueError(name + ' is not a valid dataset')

    # Select batch size
    if not batch or batch < 1 or batch > size:
        batch = size

    # Input pipeline
    if group == 'train':
        data = data.shuffle(min(size, 10000), seed=seed) # shuffle up to 10000
        data = data.repeat()                             # infinite repetition
        data = data.batch(batch)                         # batches
        data = data.prefetch(1)                          # also fetch next batch
    else:
        data = data.batch(min(size, 300)) # predict up to 300 together

    return data

def iterator(name):
    '''\
    Returns a reinitializable tf Iterator for batches of the dataset 'name'.
    The shape of the returned objects depends on 'name'; the first dimension
    is left unspecified, because it is the batch size. The iterator is
    created from the structure of a generic dataset of the correct type, so
    it must be initialized with a concrete dataset before use.

    Args:
        name: name of the dataset to use.
    Returns:
        a tf Iterator
    '''

    if name == 'example':
        return _iris_iterator()
    elif name == 'mnist':
        return _mnist_iterator()
    elif name == 'cifar10':
        return _cifar10_iterator()
    else:
        raise ValueError(name + ' is not a valid dataset')

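# Example (a sketch added for illustration, not used by the module): the
# reinitializable-iterator workflow that dataset() and iterator() are designed
# for, assuming the TF1.x Session API and the MNIST files on disk:
#
#   data_train = dataset('mnist', 'train', batch=32, seed=1)
#   data_val = dataset('mnist', 'val')
#   it = iterator('mnist')
#   train_init = it.make_initializer(data_train)
#   val_init = it.make_initializer(data_val)
#   features, labels = it.get_next()
#   with tf.Session() as sess:
#       sess.run(train_init)      # point the iterator to the training split
#       x, y = sess.run((features, labels))
#       sess.run(val_init)        # switch to validation without rebuilding
#       x, y = sess.run((features, labels))
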
@with_persistent_vars(mean=None, std=None)
def _mnist_dataset(group):
    '''\
    See dataset().
    Returns:
        (dataset, n): dataset and number of samples available
    '''
    # This dataset is small enough to be loaded all at once and used in
    # batches

    # Read files
    if group == 'train':
        images, labels = _read_mnist(
                "datasets/MNIST/train-55k-images-idx3-ubyte",
                "datasets/MNIST/train-55k-labels-idx1-ubyte")
    elif group == 'val':
        images, labels = _read_mnist(
                "datasets/MNIST/val-5k-images-idx3-ubyte",
                "datasets/MNIST/val-5k-labels-idx1-ubyte")
    else:
        images, labels = _read_mnist(
                "datasets/MNIST/test-10k-images-idx3-ubyte",
                "datasets/MNIST/test-10k-labels-idx1-ubyte")

    # Standardization: statistics are computed once, on the first split
    # loaded, then reused for all splits ('is None' avoids recomputing them
    # even if the cached mean happens to be 0)
    if _mnist_dataset.mean is None:
        _mnist_dataset.mean = np.mean(images)
        _mnist_dataset.std = np.std(images)
    images = (images - _mnist_dataset.mean) / _mnist_dataset.std

    # To TF
    n = labels.shape[0]
    images = tf.constant(images, dtype=tf.float32, name='features')
    labels = tf.constant(labels, dtype=tf.int32, name='labels')
    data = tf.data.Dataset.from_tensor_slices((images, labels))
    return (data, n)

def _read_mnist(images_name, labels_name):
    '''\
    Reads one MNIST split from the original big-endian IDX files.
    '''
    # Labels: 8-byte header (magic number, count), then one byte per label
    with open(labels_name, 'rb') as lab:
        magic, n = struct.unpack('>II', lab.read(8))
        labels = np.fromfile(lab, dtype=np.uint8)
    # Images: 16-byte header (magic, count, rows, cols), then the pixels
    with open(images_name, 'rb') as img:
        magic, num, rows, cols = struct.unpack('>IIII', img.read(16))
        images = np.fromfile(img, dtype=np.uint8).reshape(len(labels), 784)
    ## image plot example
    #plt.imshow(np.reshape(images[0], (28, 28)))
    #plt.show()
    return images, labels

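# Note (added sketch): the magic numbers read above are currently unused; a
# stricter loader could verify them, since IDX label files start with 2049
# and image files with 2051, e.g.:
#
#   if magic != 2051:
#       raise ValueError(images_name + ' is not an IDX image file')
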
def _mnist_iterator():
    '''\
    See iterator().
    '''
    return tf.data.Iterator.from_structure(
            output_types=(tf.float32, tf.int32),
            output_shapes=((None, 784), (None,)),
            shared_name='MNIST_iterator')

@with_persistent_vars(mean=None, std=None)
def _cifar10_dataset(group):
    '''\
    See dataset().
    Returns:
        (dataset, n): dataset and number of samples available
    '''
    # This dataset is small enough to be loaded all at once

    # Read files
    if group == 'train':
        images, labels = _read_cifar10("datasets/CIFAR-10/train_data.bin")
    elif group == 'val':
        images, labels = _read_cifar10("datasets/CIFAR-10/val_data.bin")
    else:
        images, labels = _read_cifar10("datasets/CIFAR-10/test.bin")

    # Standardization: statistics are computed once, on the first split
    # loaded, then reused for all splits
    if _cifar10_dataset.mean is None:
        _cifar10_dataset.mean = np.mean(images)
        _cifar10_dataset.std = np.std(images)
    images = (images - _cifar10_dataset.mean) / _cifar10_dataset.std

    # To TF
    n = labels.shape[0]
    images = tf.constant(images, dtype=tf.float32, name='features')
    labels = tf.constant(labels, dtype=tf.int32, name='labels')
    data = tf.data.Dataset.from_tensor_slices((images, labels))
    return (data, n)

def _read_cifar10(data_path):
    '''\
    Reads one CIFAR-10 split from its binary format: one record per image,
    a label byte followed by 3*32*32 pixel bytes.
    '''
    # Load dataset
    dataset = np.fromfile(data_path, dtype=np.uint8)
    dataset = np.reshape(dataset, (-1, 1+3*32*32)) # label+channel*row*col
    images = dataset[:,1:]
    labels = dataset[:,0]

    # Reshape images as 3D tensors
    images = np.reshape(images, (-1, 3, 32, 32)) # batch, chs, rows, cols
    images = np.transpose(images, (0, 2, 3, 1))  # batch, rows, cols, chs
    return images, labels

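## image plot example (added sketch, assumes matplotlib; mirrors _read_mnist)
#import matplotlib.pyplot as plt
#plt.imshow(images[0]) # images[0] is already (32, 32, 3), channels last
#plt.show()
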
def _cifar10_iterator():
    '''\
    See iterator().
    '''
    return tf.data.Iterator.from_structure(
            output_types=(tf.float32, tf.int32),
            output_shapes=((None, 32, 32, 3), (None,)),
            shared_name='cifar10_iterator')

def _iris_dataset(group):
    '''\
    See dataset().
    Returns:
        (dataset, n): dataset and number of samples available
    '''
    # Read files
    if group == 'train':
        x = np.genfromtxt('datasets/IRIS/iris_train_features.csv', delimiter=',')
        y = np.genfromtxt('datasets/IRIS/iris_train_labels.csv', delimiter=',')
    elif group == 'val':
        x = np.genfromtxt('datasets/IRIS/iris_val_features.csv', delimiter=',')
        y = np.genfromtxt('datasets/IRIS/iris_val_labels.csv', delimiter=',')
    else:
        x = np.genfromtxt('datasets/IRIS/iris_test_features.csv', delimiter=',')
        y = np.genfromtxt('datasets/IRIS/iris_test_labels.csv', delimiter=',')

    # To TF
    n = y.size
    x = tf.constant(x, dtype=tf.float32, name='features')
    y = tf.constant(y, dtype=tf.int32, name='labels')

    # Return dataset
    data = tf.data.Dataset.from_tensor_slices((x, y))
    return (data, n)

def _iris_iterator():
    '''\
    See iterator().
    '''
    return tf.data.Iterator.from_structure(
            output_types=(tf.float32, tf.int32),
            output_shapes=((None, 4), (None,)),
            shared_name='Iris_iterator')
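

# Minimal smoke test (an added sketch, not part of the original interface):
# it assumes the IRIS csv files are present under datasets/ and a TF1.x
# runtime with the Session API.
if __name__ == '__main__':
    data = dataset('example', 'val')
    it = iterator('example')
    init = it.make_initializer(data)
    features, labels = it.get_next()
    with tf.Session() as sess:
        sess.run(init)
        x, y = sess.run((features, labels))
        print('features:', x.shape, 'labels:', y.shape)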