-
Notifications
You must be signed in to change notification settings - Fork 2
/
dataset_build.py
314 lines (248 loc) · 13.3 KB
/
dataset_build.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
import pickle
import numpy as np
from include.telegram_logger import *
from include.globals_and_functions import *
import gc
def calculate_dataset_statistics(data_mean, data_std, data_size):
"""
This function is used to calculate mean and standart deviantion of the dataset
without loading the full dataset into memory, as it would be loaded astype float64
The way used to calculate mean and std was:
1 - instead of calculating mean and std for all videos in the dataset,
calculate mean and std for each video in the dataset
2 - After this, calculate the joined mean and std of all videos, by
2.1 - combined_mean = weighted average of all means
2.2 - combined_std using the formula on https://www.statstodo.com/CombineMeansSDs_Pgm.php
"""
# If there is only on video, there is no meaning in combining multiple statistics
if len(data_size) == 1:
return data_mean, data_std
combined_mean = np.average(data_mean, axis=(0), weights=data_size)
tn = np.sum(data_size)
tx = np.array(data_mean)
for i in range(data_mean.shape[0]):
tx[i] = tx[i] * data_size[i]
tx = np.sum(tx, axis=0)
txx = np.square(data_std)
for i in range(data_std.shape[0]):
txx[i] = txx[i] * data_size[i]-1
A = (np.square(data_mean)[i] * data_size[i])
txx[i] = txx[i] + A
txx = np.sum(txx, axis=0)
combined_std = np.sqrt((txx - (np.square(tx) / tn))/(tn))
return combined_mean, combined_std
def normalize_dataset(data, mean, std):
"""
Reshape dataset from (n,224*224*3) to (n,224,224,3)
Normalize dataset by subtracting mean and dividing by std
"""
data = np.reshape(data, (data.shape[0],)+CONST_VEC_DATASET_OUTPUT_IMAGE_SHAPE).astype("float32")
data[:, :, :, 0] -= (mean[0])
data[:, :, :, 1] -= (mean[1])
data[:, :, :, 2] -= (mean[2])
data[:, :, :, 0] /= (std[0])
data[:, :, :, 1] /= (std[1])
data[:, :, :, 2] /= (std[2])
return data
def dataset_build(train_videos, test_videos):
try:
# -------------- Error checking
# load config file with all processed videos in disk
try: #load the config file
with open(os.path.join(CONST_STR_DATASET_BASE_PATH,CONST_STR_DATASET_DATAPATH,CONST_STR_DATASET_CONFIG_FILENAME), "rb") as fp:
processed_videos = pickle.load(fp)
except: # new dataset, config file does not exist
print_error("There is no config file on system. You should run the dataset_process.py script before atempting to build it")
exit(1)
# Check if i'm passing a video that is not processed in the dataset
unknown_videos = [item for item in train_videos if item not in processed_videos]
if unknown_videos != []:
print_error("The following videos were not processed in the dataset:")
for unknown_video in unknown_videos:
print('\t'+unknown_video)
print_error("Please run dataset_process.py before atempting to build")
exit(1)
unknown_videos = [item for item in test_videos if item not in processed_videos]
if unknown_videos != []:
print_error("The following videos were not processed in the dataset:")
for unknown_video in unknown_videos:
print('\t'+unknown_video)
print_error("Please run dataset_process.py before atempting to build")
exit(1)
# Check if a video is beeing used for both test and train dataset
common_videos = [item for item in test_videos if item in train_videos]
if common_videos != []:
print_error("The following videos are being used for both train and test dataset. You should not train and test on same videos!")
for common_video in common_videos:
print('\t'+common_video)
print_error("Please reevaluate your data")
exit(1)
# -------------- Build starting
# This Loop:
#
# get all number of frames: train_number_of_frames
# get all means and std: train_mean, train_std
# get all frames for dataset: input_train_data
# get all outputs on dataset: output_train_data
#
# Test and Train codes are the same, just duplicated
#
# -------------- Train dataset
#
print_info("Starting building process for train dataset")
telegramSendMessage("Starting building process for train dataset")
train_number_of_frames = [] # Vector with the total number of frames in each video. This is necessary to calculate a number of things such as total mean, std, data loading, etc...
first_video = True
for video_name in train_videos:
video_datapath = os.path.join(CONST_STR_DATASET_BASE_PATH,CONST_STR_DATASET_DATAPATH,video_name.replace(".", ""))
# Get total number of frames in video
with open(os.path.join(video_datapath,CONST_STR_DATASET_NMB_OF_FRAMES_FILENAME), "rb") as fp:
number_of_frames = pickle.load(fp)
train_number_of_frames.append(number_of_frames)
# Get the statistics in the video
# Put those in the train_mean and train_std ndarrays
with open(os.path.join(video_datapath,CONS_STR_DATASET_STATISTICS_FILENAME), "rb") as fp:
video_statistics = pickle.load(fp)
video_statistics = np.asarray(video_statistics)
if first_video:
train_mean = video_statistics[0]
train_std = video_statistics[1]
else:
train_mean = np.vstack((train_mean, video_statistics[0]))
train_std = np.vstack((train_std, video_statistics[1]))
# Load numpy video data.
video_data = np.load(os.path.join(video_datapath,CONS_STR_DATASET_STACKED_FRAMES_FILENAME))
if first_video:
# Declare the train data. this is the input data used to train the model
input_train_data = video_data
else:
input_train_data = np.vstack((input_train_data, video_data))
# Load numpy output data.
audio_data = np.load(os.path.join(video_datapath,CONS_STR_DATASET_AUDIODATA_FILENAME))
# If audio file is Stereo, I take the mean between both chanels and concatenate in one channel
try:
if audio_data.shape[1] == 2:
print_info("Audio file from video "+video_name+" is stereo. Taking the average of both channels as output")
audio_data = np.mean(audio_data, axis=1)
except:
pass
if first_video:
# Declare the train data. this is the input data used to train the model
output_train_data = audio_data
else:
output_train_data = np.concatenate((output_train_data, audio_data))
# Last of all, switch of the "first video" variable
if first_video:
first_video = False
# Calculate mean and std for all dataset:
# Dataset preprossesing. We normalize the dataset by subtracting mean and dividing by std
#
train_combined_mean, train_combined_std = calculate_dataset_statistics(train_mean, train_std, train_number_of_frames)
input_train_data = normalize_dataset(input_train_data, train_combined_mean, train_combined_std)
# For some reason, axis 3 (colour) is fliped
input_train_data = np.flip(input_train_data, axis=3)
#
# -------------- Test dataset
#
test_number_of_frames = [] # Vector with the total number of frames in each video. This is necessary to calculate a number of things such as total mean, std, data loading, etc...
first_video = True
for video_name in test_videos:
video_datapath = os.path.join(CONST_STR_DATASET_BASE_PATH,CONST_STR_DATASET_DATAPATH,video_name.replace(".", ""))
# Get total number of frames in video
with open(os.path.join(video_datapath,CONST_STR_DATASET_NMB_OF_FRAMES_FILENAME), "rb") as fp:
number_of_frames = pickle.load(fp)
test_number_of_frames.append(number_of_frames)
# Get the statistics in the video
# Put those in the train_mean and train_std ndarrays
with open(os.path.join(video_datapath,CONS_STR_DATASET_STATISTICS_FILENAME), "rb") as fp:
video_statistics = pickle.load(fp)
video_statistics = np.asarray(video_statistics)
if first_video:
test_mean = video_statistics[0]
test_std = video_statistics[1]
else:
test_mean = np.vstack((test_mean, video_statistics[0]))
test_std = np.vstack((test_std, video_statistics[1]))
# Load numpy video data.
video_data = np.load(os.path.join(video_datapath,CONS_STR_DATASET_STACKED_FRAMES_FILENAME))
if first_video:
# Declare the train data. this is the input data used to train the model
input_test_data = video_data
else:
input_test_data = np.vstack((input_test_data, video_data))
# Load numpy output data.
audio_data = np.load(os.path.join(video_datapath,CONS_STR_DATASET_AUDIODATA_FILENAME))
# If audio file is Stereo, I take the mean between both chanels and concatenate in one channel
try:
if audio_data.shape[1] == 2:
print_info("Audio file from video "+video_name+" is stereo. Taking the average of both channels as output")
audio_data = np.mean(audio_data, axis=1)
except:
pass
if first_video:
# Declare the train data. this is the input data used to train the model
output_test_data = audio_data
else:
output_test_data = np.concatenate((output_test_data, audio_data))
# Last of all, switch of the "first video" variable
if first_video:
first_video = False
# Calculate mean and std for all dataset:
# Dataset preprossesing. We normalize the dataset by subtracting mean and dividing by std
#
test_combined_mean, test_combined_std = calculate_dataset_statistics(test_mean, test_std, test_number_of_frames)
input_test_data = normalize_dataset(input_test_data, test_combined_mean, test_combined_std)
# For some reason, axis 3 (colour) is fliped
input_test_data = np.flip(input_test_data, axis=3)
# Return the builded dataset and the vectors with train and test video sizes
print_info("Dataset built completed")
return input_train_data, output_train_data, input_test_data, output_test_data, train_number_of_frames, test_number_of_frames
except Exception as e:
print_error('An error has occurred')
print_error(str(e))
telegramSendMessage('[ERROR]: An error has occurred')
telegramSendMessage(str(e))
if __name__ == "__main__":
#
# Script for generation of the "folds", so that training process is made upon various different scenarios
#
from folds import *
# Make directory to hold this folder information
try:
fold_path = os.path.join(CONST_STR_DATASET_BASE_PATH,CONST_STR_DATASET_DATAPATH,CONST_STR_DATASET_FOLDS_DATAPATH)
if not os.path.exists(fold_path):
os.makedirs(fold_path)
except OSError:
print_error("Could not make directory for folds")
telegramSendMessage("Error: Creating directory")
exit()
for fold in folds:
telegramSendMessage("Creating fold "+str(fold['number']))
print_info("Creating fold "+str(fold['number']))
[
input_train_data,
output_train_data,
input_test_data,
output_test_data,
train_number_of_frames,
test_number_of_frames
] = dataset_build(fold["training_videos"], fold["testing_videos"])
# Save this dataset to the corresponding fold path
telegramSendMessage("Saving fold "+str(fold['number'])+" to disk")
print_info("Saving fold "+str(fold['number'])+" to disk")
np.save(os.path.join(fold_path,"input_training_data_"+fold['name']), input_train_data)
np.save(os.path.join(fold_path,"output_training_data_"+fold['name']), output_train_data)
np.save(os.path.join(fold_path,"input_testing_data_"+fold['name']), input_test_data)
np.save(os.path.join(fold_path,"output_testing_data_"+fold['name']), output_test_data)
np.save(os.path.join(fold_path,"nof_train_"+fold['name']), train_number_of_frames)
np.save(os.path.join(fold_path,"nof_test_"+fold['name']), test_number_of_frames)
# Forcefully collect all garbage in memory
del input_train_data
del output_train_data
del input_test_data
del output_test_data
del train_number_of_frames
del test_number_of_frames
gc.collect()
telegramSendMessage("Script ended sucessfully")
print_info("Script ended sucessfully")