forked from tensorflow/models
-
Notifications
You must be signed in to change notification settings - Fork 0
/
input.py
420 lines (335 loc) · 13.7 KB
/
input.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
# Copyright 2016 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import cPickle
import gzip
import math
import numpy as np
import os
from scipy.io import loadmat as loadmat
from six.moves import urllib
import sys
import tarfile
from tensorflow.python.platform import gfile
from tensorflow.python.platform import flags
FLAGS = flags.FLAGS
def create_dir_if_needed(dest_directory):
    """
    Make sure a directory exists, creating it when it is missing
    :param dest_directory: path of the directory to check (and create)
    :return: True whenever the call completes without raising
    """
    # Nothing to do when the directory is already there
    if gfile.IsDirectory(dest_directory):
        return True

    gfile.MakeDirs(dest_directory)
    return True
def maybe_download(file_urls, directory):
    """
    Download a set of files into a local folder, skipping files already there
    :param file_urls: iterable of URLs; the text after the last '/' of each URL
                      is used as the local file name
    :param directory: the directory where to download (created if missing)
    :return: a list of local file paths, one per URL and in the same order
    """
    # Create the directory outside of an `assert` statement: asserts are
    # stripped under `python -O`, which would silently skip the creation.
    if not create_dir_if_needed(directory):
        raise AssertionError('Could not create directory %s' % directory)

    # This list will include the local path of every requested file
    result = []

    for file_url in file_urls:
        # The local name is the last path component of the URL
        filename = file_url.split('/')[-1]

        # The path is built with a literal '/' separator
        filepath = directory + '/' + filename
        result.append(filepath)

        # Skip the download when a local copy already exists
        if gfile.Exists(filepath):
            continue

        def _progress(count, block_size, total_size):
            """Report hook for urlretrieve: prints the download progress."""
            if total_size > 0:
                # The last block may overshoot the real size, so cap at 100%
                percent = min(
                    float(count * block_size) / float(total_size) * 100.0,
                    100.0)
                sys.stdout.write('\r>> Downloading %s %.1f%%' % (filename,
                                                                 percent))
            else:
                # Size unknown (reported as 0 or -1): no percentage to show,
                # and dividing by it would fail or be meaningless
                sys.stdout.write('\r>> Downloading %s' % filename)
            sys.stdout.flush()

        filepath, _ = urllib.request.urlretrieve(file_url, filepath, _progress)
        print()
        statinfo = os.stat(filepath)
        print('Successfully downloaded', filename, statinfo.st_size, 'bytes.')

    return result
def image_whitening(data):
    """
    Subtracts the mean of each image and divides by its adjusted standard
    deviation (for stability). Operations are per image but performed on the
    entire array at once. The input array is modified in place and returned.
    :param data: 4D numpy array (ID, Height, Width, Channel); the in-place
                 arithmetic expects a floating point dtype
    :return: the same 4D array object (ID, Height, Width, Channel), whitened
    """
    assert np.ndim(data) == 4

    # Statistics are computed over everything but the image index
    per_image_axes = (1, 2, 3)

    # Compute number of values in one image
    nb_pixels = np.shape(data)[1] * np.shape(data)[2] * np.shape(data)[3]

    # Subtract mean; keepdims leaves a (ID, 1, 1, 1) array that broadcasts
    # against the images, replacing the former per-image Python loop
    data -= np.mean(data, axis=per_image_axes, keepdims=True)

    # Compute adjusted standard deviation: it is floored at 1/sqrt(nb_pixels)
    # so that (nearly) constant images are not divided by ~0
    min_std = np.float32(1.0) / np.float32(math.sqrt(nb_pixels))
    adj_std_var = np.maximum(min_std,
                             np.std(data, axis=per_image_axes, keepdims=True))

    # Divide every image by its own adjusted standard deviation
    data /= adj_std_var

    print(np.shape(data))

    return data
def extract_svhn(local_url):
    """
    Extract a MATLAB matrix into two numpy arrays with data and labels
    :param local_url: path of the .mat file (opened through gfile)
    :return: tuple (data, labels); data is the float32 "X" entry with its last
             axis moved to the front, labels is the "y" entry as a 1-D int32
             array in which every 10 has been replaced by 0
    """
    # A .mat file is binary, so it is opened in 'rb' mode: text mode would try
    # to decode the bytes on Python 3
    with gfile.Open(local_url, mode='rb') as file_obj:
        # Load MATLAB matrix using scipy IO
        mat_contents = loadmat(file_obj)

    # Extract each entry (one for data, one for labels) and set np type
    data = np.asarray(mat_contents["X"], dtype=np.float32)
    labels = np.asarray(mat_contents["y"], dtype=np.int32)

    # Transpose data to match TF model input format (image index first)
    data = data.transpose(3, 0, 1, 2)

    # Fix the SVHN labels which label 0s as 10s
    labels[labels == 10] = 0

    # Fix label dimensions: flatten to one label per image
    labels = labels.reshape(len(labels))

    return data, labels
def unpickle_cifar_dic(file):
    """
    Helper function: unpickles a dictionary (used for loading CIFAR)
    :param file: filename of the pickle
    :return: tuple of (images, labels), i.e. the 'data' and 'labels' entries
             of the unpickled dictionary
    """
    # Resolve the pickle implementation locally so the helper works on both
    # Python 2 (cPickle) and Python 3 (pickle)
    try:
        import cPickle as pickle_impl
        load_kwargs = {}
    except ImportError:
        import pickle as pickle_impl
        # 'latin1' lets Python 3 read pickles written by Python 2, including
        # the numpy arrays they contain
        load_kwargs = {'encoding': 'latin1'}

    # NOTE(review): unpickling executes code embedded in the file; only use
    # this on archives obtained from a trusted source.
    # The `with` block closes the file even when unpickling raises
    with open(file, 'rb') as fo:
        batch = pickle_impl.load(fo, **load_kwargs)

    return batch['data'], batch['labels']
def extract_cifar10(local_url, data_dir):
    """
    Extracts the CIFAR-10 dataset and return numpy arrays with the different sets
    When the four pre-processed .npy dumps exist in data_dir they are reloaded;
    otherwise the archive is unpacked (if needed), the batches are converted
    and the dumps are written for next time.
    :param local_url: where the tar.gz archive is located locally
    :param data_dir: where to extract the archive's file
    :return: a tuple (train data, train labels, test data, test labels)
    """
    # These numpy dumps can be reloaded to avoid performing the pre-processing
    # if they exist in the working directory.
    # Changing the order of this list will ruin the indices below.
    preprocessed_files = ['/cifar10_train.npy',
                          '/cifar10_train_labels.npy',
                          '/cifar10_test.npy',
                          '/cifar10_test_labels.npy']
    cache_paths = [data_dir + name for name in preprocessed_files]

    if all(gfile.Exists(path) for path in cache_paths):
        # Reload pre-processed data from numpy dumps. They are binary files,
        # hence 'rb': text mode would try to decode the bytes on Python 3
        loaded = []
        for path in cache_paths:
            with gfile.Open(path, mode='rb') as file_obj:
                loaded.append(np.load(file_obj))
        train_data, train_labels, test_data, test_labels = loaded
        return train_data, train_labels, test_data, test_labels

    # Do everything from scratch
    # Folder (inside data_dir) that the batch files are read from
    batch_dir = data_dir + "/cifar-10-batches-py/"

    # Define lists of all files we should extract
    train_files = ["data_batch_" + str(i) for i in range(1, 6)]
    test_file = "test_batch"

    # Check if all files have already been extracted. The check must look in
    # the extraction folder: testing the bare file names would look in the
    # current working directory and always trigger a new extraction.
    already_unpacked = all(gfile.Exists(batch_dir + name)
                           for name in train_files + [test_file])

    # We have to unpack the archive
    if not already_unpacked:
        # NOTE(review): extractall trusts the member paths stored in the
        # archive; only use it on archives from a trusted source.
        with tarfile.open(local_url, 'r:gz') as archive:
            archive.extractall(data_dir)

    # Load training images and labels
    images = []
    labels = []
    for name in train_files:
        # Unpickle dictionary and extract images and labels
        images_tmp, labels_tmp = unpickle_cifar_dic(batch_dir + name)

        # Append to lists
        images.append(images_tmp)
        labels.append(labels_tmp)

    # Convert to numpy arrays and reshape in the expected format
    # NOTE(review): swapaxes(1, 3) turns (N, 3, 32, 32) into (N, 32, 32, 3)
    # by exchanging axes 1 and 3, which also exchanges the two 32-sized axes
    # relative to a transpose(0, 2, 3, 1). Kept as is because existing dumps
    # and models depend on it - confirm whether this is intended.
    train_data = np.asarray(images, dtype=np.float32)
    train_data = train_data.reshape((50000, 3, 32, 32))
    train_data = np.swapaxes(train_data, 1, 3)
    train_labels = np.asarray(labels, dtype=np.int32).reshape(50000)

    # Save so we don't have to do this again
    np.save(cache_paths[0], train_data)
    np.save(cache_paths[1], train_labels)

    # Load test images and labels
    test_images, test_label_list = unpickle_cifar_dic(batch_dir + test_file)

    # Convert to numpy arrays and reshape in the expected format
    test_data = np.asarray(test_images, dtype=np.float32)
    test_data = test_data.reshape((10000, 3, 32, 32))
    test_data = np.swapaxes(test_data, 1, 3)
    test_labels = np.asarray(test_label_list, dtype=np.int32).reshape(10000)

    # Save so we don't have to do this again
    np.save(cache_paths[2], test_data)
    np.save(cache_paths[3], test_labels)

    return train_data, train_labels, test_data, test_labels
def extract_mnist_data(filename, num_images, image_size, pixel_depth):
    """
    Extract the images into a 4D tensor [image index, y, x, channels].
    Values are rescaled as (value - pixel_depth / 2) / pixel_depth, which maps
    [0, 255] onto [-0.5, 0.5] when pixel_depth is 255.
    The result is cached next to the archive as filename + ".npy" and reloaded
    from there on later calls.
    :param filename: path of the gzip-compressed image file
    :param num_images: number of images to read
    :param image_size: height and width (in pixels) of each square image
    :param pixel_depth: value used to center and scale the pixel values
    :return: float32 array of shape (num_images, image_size, image_size, 1)
             when freshly extracted, or whatever array the cache file holds
    """
    cache_path = filename + ".npy"

    if gfile.Exists(cache_path):
        # The dump is binary, hence 'rb': text mode would try to decode the
        # bytes on Python 3
        with gfile.Open(cache_path, mode='rb') as file_obj:
            return np.load(file_obj)

    with gzip.open(filename) as bytestream:
        # Discard the first 16 bytes (presumably the file header) before
        # reading one byte per pixel
        bytestream.read(16)
        buf = bytestream.read(image_size * image_size * num_images)

    data = np.frombuffer(buf, dtype=np.uint8).astype(np.float32)
    data = (data - (pixel_depth / 2.0)) / pixel_depth
    data = data.reshape(num_images, image_size, image_size, 1)

    # np.save appends ".npy" to the name, producing cache_path
    np.save(filename, data)
    return data
def extract_mnist_labels(filename, num_images):
    """
    Extract the labels into a vector of int32 label IDs.
    The result is cached next to the archive as filename + ".npy" and reloaded
    from there on later calls.
    :param filename: path of the gzip-compressed label file
    :param num_images: number of labels to read (one byte each)
    :return: 1-D int32 array of labels when freshly extracted, or whatever
             array the cache file holds
    """
    cache_path = filename + ".npy"

    if gfile.Exists(cache_path):
        # The dump is binary, hence 'rb': text mode would try to decode the
        # bytes on Python 3
        with gfile.Open(cache_path, mode='rb') as file_obj:
            return np.load(file_obj)

    with gzip.open(filename) as bytestream:
        # Discard the first 8 bytes (presumably the file header) before
        # reading one byte per label
        bytestream.read(8)
        buf = bytestream.read(1 * num_images)

    labels = np.frombuffer(buf, dtype=np.uint8).astype(np.int32)

    # np.save appends ".npy" to the name, producing cache_path
    np.save(filename, labels)
    return labels
def ld_svhn(extended=False, test_only=False):
    """
    Download (if needed), load and whiten the SVHN data
    :param extended: when True, the extra training set is stacked onto the
                     training set instead of being returned separately
    :param test_only: when True, only the test set is loaded and returned
    :return: (test_data, test_labels) if test_only; otherwise
             (train_data, train_labels, test_data, test_labels) if extended,
             else (train_data, train_labels, test_data, test_labels,
             ext_data, ext_labels)
    """
    # Files to be downloaded
    # WARNING: the unpacking below relies on the order of this list
    file_urls = ['http://ufldl.stanford.edu/housenumbers/train_32x32.mat',
                 'http://ufldl.stanford.edu/housenumbers/test_32x32.mat',
                 'http://ufldl.stanford.edu/housenumbers/extra_32x32.mat']

    # Fetch whatever is missing and get the local paths back
    train_url, test_url, extra_url = maybe_download(file_urls, FLAGS.data_dir)

    if test_only:
        # Only the test set is needed: skip train and extra entirely
        test_data, test_labels = extract_svhn(test_url)
        test_data = image_whitening(test_data)
        return test_data, test_labels

    # Training set
    train_data, train_labels = extract_svhn(train_url)
    train_data = image_whitening(train_data)

    # Extra training set
    ext_data, ext_labels = extract_svhn(extra_url)
    ext_data = image_whitening(ext_data)

    # Test set
    test_data, test_labels = extract_svhn(test_url)
    test_data = image_whitening(test_data)

    if not extended:
        # Hand back the training and extra training sets separately
        return (train_data, train_labels, test_data, test_labels,
                ext_data, ext_labels)

    # Merge the extra training set into the training set
    train_data = np.vstack((train_data, ext_data))
    train_labels = np.hstack((train_labels, ext_labels))
    return train_data, train_labels, test_data, test_labels
def ld_cifar10(test_only=False):
    """
    Load the original CIFAR10 data
    :param test_only: when True, only the test set is whitened and returned
    :return: (test_data, test_labels) if test_only, otherwise
             (train_data, train_labels, test_data, test_labels)
    """
    # Define files to be downloaded
    file_urls = ['https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz']

    # Maybe download data and retrieve local storage urls
    local_urls = maybe_download(file_urls, FLAGS.data_dir)

    # Extract archives and return different sets
    dataset = extract_cifar10(local_urls[0], FLAGS.data_dir)

    # Unpack tuple
    train_data, train_labels, test_data, test_labels = dataset

    # Apply whitening to input data. The training set is skipped when only
    # the test set is requested, since it would be discarded anyway.
    if not test_only:
        train_data = image_whitening(train_data)
    test_data = image_whitening(test_data)

    if test_only:
        return test_data, test_labels

    return train_data, train_labels, test_data, test_labels
def ld_mnist(test_only=False):
    """
    Load the MNIST dataset
    :param test_only: when True, only the test set is extracted and returned
    :return: (test_data, test_labels) if test_only, otherwise
             (train_data, train_labels, test_data, test_labels)
    """
    # Define files to be downloaded
    # WARNING: changing the order of this list will break indices (cf. below)
    file_urls = ['http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz',
                 'http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz',
                 'http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz',
                 'http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz',
                 ]

    # Maybe download data and retrieve local storage urls
    local_urls = maybe_download(file_urls, FLAGS.data_dir)

    # Extract it into np arrays.
    # NOTE(review): pixel_depth is passed as 1, so extract_mnist_data computes
    # (value - 0.5) / 1 instead of mapping [0, 255] onto [-0.5, 0.5]. Left
    # unchanged because cached dumps and trained models depend on it - confirm
    # whether 255 was intended.
    if not test_only:
        # Skipped for test_only: the training set would be discarded anyway
        train_data = extract_mnist_data(local_urls[0], 60000, 28, 1)
        train_labels = extract_mnist_labels(local_urls[1], 60000)

    test_data = extract_mnist_data(local_urls[2], 10000, 28, 1)
    test_labels = extract_mnist_labels(local_urls[3], 10000)

    if test_only:
        return test_data, test_labels

    return train_data, train_labels, test_data, test_labels
def partition_dataset(data, labels, nb_teachers, teacher_id):
    """
    Simple partitioning algorithm that returns the right portion of the data
    needed by a given teacher out of a certain nb of teachers
    Partitions are contiguous and of equal size len(data) // nb_teachers; any
    remainder at the end of the data is not assigned to a teacher.
    :param data: input data to be partitioned
    :param labels: output data to be partitioned
    :param nb_teachers: number of teachers in the ensemble (affects size of each
                        partition)
    :param teacher_id: id of partition to retrieve, in [0, nb_teachers)
    :return: tuple (partition_data, partition_labels) sliced from the inputs
    """
    # Sanity check
    assert len(data) == len(labels)
    # A negative id would otherwise be accepted and silently yield an empty
    # or wrong slice, because negative slice bounds count from the end
    assert 0 <= int(teacher_id) < int(nb_teachers)

    # This will floor the possible number of batches
    batch_len = len(data) // int(nb_teachers)

    # Compute start, end indices of partition
    start = int(teacher_id) * batch_len
    end = start + batch_len

    # Slice partition off
    partition_data = data[start:end]
    partition_labels = labels[start:end]

    return partition_data, partition_labels