generated from BloomTech-Labs/template-ds
-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathvisualize.py
399 lines (292 loc) · 9.8 KB
/
visualize.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
import os
from os.path import join
import shutil
import librosa
import numpy as np
import random
import torch
from scipy.misc import toimage, imsave
from tqdm import tqdm
from pytorch_pretrained_biggan import (BigGAN, one_hot_from_names,
truncated_noise_sample,
save_as_images, display_in_terminal)
def model_resolution(resolution=128):
    """
    Load the pretrained BigGAN-deep generator for a given output resolution.

    Inputs:
        resolution: INT or STR; 128, 256, or 512 (default 128)
                    lower = faster generation, lower quality.
    Output:
        the model object returned by BigGAN.from_pretrained
    Raises:
        ValueError if resolution is not one of 128, 256, 512
    """
    # Normalise to text first so both 128 and "128" work; plain string
    # concatenation raised a TypeError for integer input.
    resolution = str(resolution).strip()
    if resolution not in ('128', '256', '512'):
        raise ValueError(
            f"resolution must be 128, 256, or 512, got {resolution!r}")
    return BigGAN.from_pretrained('biggan-deep-' + resolution)
def song_duration(duration=30):
    """
    Convert a song length in seconds into a batch limit (frame_lim).

    duration: song length in seconds, default = 30
    Returns frame_lim (INT): duration * 22050 samples, divided into frames of
    [frame_length] samples and then into batches of [batch_size] frames,
    rounded down.
    """
    total_samples = duration * 22050
    total_frames = total_samples / frame_length
    return int(np.floor(total_frames / batch_size))
# set pitch sensitivity
def sensitivity_pitch(pitch_sensitivity):
    """
    INT
    Rescale the user-facing pitch sensitivity (how quickly images move
    according to pitch) for the current [frame_length].
    Typical value: 220 (no default is defined in the signature)
    Recommended range: 200 – 295

    Returns (300 - pitch_sensitivity) * 512 / frame_length, so a larger input
    gives a smaller result.
    """
    inverted = 300 - pitch_sensitivity
    return inverted * 512 / frame_length
# set tempo sensitivity
def sensitivity_tempo(tempo_sensitivity):
    """
    FLOAT between 0 and 1
    Rescale the user-facing tempo sensitivity (how quickly images morph due
    to tempo) for the current [frame_length].
    Typical value: 0.25 (no default is defined in the signature)
    Recommended range: 0.05 – 0.8

    Returns tempo_sensitivity * frame_length / 512.
    """
    scaled = tempo_sensitivity * frame_length
    return scaled / 512
# Number of audio samples per analysis frame. It is passed as hop_length to
# the spectrogram and chromagram, so it also sets how many vectors (and
# therefore images) are produced per second of audio.
# Reduce this number to make clearer images or increase it to reduce
# computational load.
# default: 512
# range: multiples of 64
frame_length = 512
# BigGAN generates the images in batches of size [batch_size].
# default 32
# The only reason to lower this is if you run out of CUDA memory; generation
# will take slightly longer.
batch_size = 32
# Device used for generation: CUDA when torch reports it as available,
# otherwise CPU. Use CUDA or face a generation time in the hours.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
def smooth_rate(smooth_factor):
    """
    int > 0
    Rescale the smoothing bin size for the current [frame_length].
    Smoothing the class vectors prevents small fluctuations in pitch from
    causing the frames to go back and forth.
    Typical value: 20
    Recommended range: 10 – 30

    Values greater than 1 are scaled by 512 / frame_length, truncated to an
    integer and clamped to at least 1. Values of 1 or less are returned
    unchanged.
    """
    if smooth_factor > 1:
        # A large frame_length can truncate the scaled value to 0, and
        # smooth() divides by this number, so never go below 1 (which
        # smooth() treats as "no smoothing").
        return max(1, int(smooth_factor * 512 / frame_length))
    return smooth_factor
def new_jitters(jitter):
    """
    Build a fresh 128-unit jitter vector for the noise update.

    Each unit is drawn independently: with probability ~0.5 it keeps full
    sensitivity (1), otherwise it is lowered to 1 - jitter.
    Returns a float numpy array of length 128.
    """
    damped = 1 - jitter
    # one uniform draw per unit, taken in index order
    draws = [random.uniform(0, 1) for _ in range(128)]
    return np.array([1 if d < 0.5 else damped for d in draws], dtype=float)
def new_update_dir(nv2, update_dir, truncation, tempo_sensitivity):
    """
    Flip the update direction of noise units that have drifted too far.

    Inputs:
        nv2: current noise vector
        update_dir: per-unit direction (+1 / -1); modified in place
        truncation: FLOAT; sets the bounds at +/- 2 * truncation
        tempo_sensitivity: FLOAT; rescaled with sensitivity_tempo and used
                           as a margin inside those bounds
    Output:
        update_dir, with units at or above the upper bound set to -1 and
        units below the lower bound set to 1; other units are unchanged
    """
    # Both thresholds are the same for every unit, so compute them once
    # instead of calling sensitivity_tempo for each element.
    margin = sensitivity_tempo(tempo_sensitivity)
    upper = 2 * truncation - margin
    lower = -2 * truncation + margin
    for ni, n in enumerate(nv2):
        if n >= upper:
            update_dir[ni] = -1
        elif n < lower:
            update_dir[ni] = 1
    return update_dir
#
def smooth(class_vectors, smooth_factor):
    """
    Smooth class vectors by interpolating between the means of consecutive bins.

    The sequence is cut into bins of [smooth_factor] vectors. For each bin
    (except the last one) the output holds [smooth_factor] vectors that move
    linearly from that bin's mean to the next bin's mean. With
    smooth_factor == 1 the input is returned unchanged; otherwise a numpy
    array is returned, which is shorter than the input.
    """
    if smooth_factor == 1:
        return class_vectors

    n_bins = int(np.floor(len(class_vectors) / smooth_factor) - 1)
    last_step = smooth_factor - 1
    interpolated = []
    for bin_idx in range(n_bins):
        start = int(bin_idx * smooth_factor)
        mid = start + smooth_factor
        stop = mid + smooth_factor
        left_mean = np.mean(class_vectors[start:mid], axis=0)
        right_mean = np.mean(class_vectors[mid:stop], axis=0)
        for step in range(smooth_factor):
            # weight runs from 0 (all left mean) to 1 (all right mean)
            weight = step / last_step
            interpolated.append(left_mean * (1 - weight) + right_mean * weight)
    return np.array(interpolated)
def normalize_cv(cv2):
    """
    Normalize a class vector between 0-1.

    Zero entries are first lifted to the smallest non-zero value, then the
    vector is shifted by that value and divided by its peak-to-peak range.
    The input array is not modified; a new array is returned.

    Degenerate inputs return an all-zero vector of the same shape instead of
    failing or producing NaN:
        - no non-zero entries at all
        - all non-zero entries equal (range of 0)
    """
    values = np.asarray(cv2)
    nonzero = values[values != 0]
    if nonzero.size == 0:
        # nothing to rescale
        return np.zeros(values.shape)
    min_class_val = nonzero.min()
    # work on a filled copy so the caller's array is left untouched
    filled = np.where(values == 0, min_class_val, values)
    span = np.ptp(filled)
    if span == 0:
        # every entry equals the minimum; dividing would give 0/0 = NaN
        return np.zeros(values.shape)
    return (filled - min_class_val) / span
def song_analysis(song, classes, jitter, depth, truncation, pitch_sensitivity,
                  tempo_sensitivity, smooth_factor):
    """
    Create the noise and class vectors that drive image generation.

    Inputs:
        song: STR; path of 30 second mp3 file
        classes: LIST; classes by index from ImageNet1000 - 1 to 12 classes
        jitter: FLOAT 0 to 1
        depth: FLOAT 0 to 1
        truncation: FLOAT 0 to 1
        pitch_sensitivity: INT 1-299
        tempo_sensitivity: FLOAT 0 to 1
        smooth_factor: INT > 0
    Output:
        noise_vectors: LIST of noise vectors, one initial vector plus one per
                       audio frame
        class_vectors: class vectors after smoothing (see smooth)
    Raises:
        ValueError if classes does not hold between 1 and 12 entries
    """
    num_classes = len(classes)
    # Only 12 pitch classes are available to map onto. More than 12 used to
    # fail with an IndexError inside the frame loop, and 0 produced NaN
    # vectors, so reject both up front.
    if not 1 <= num_classes <= 12:
        raise ValueError(
            f"classes must contain between 1 and 12 entries, got {num_classes}")

    # read song: y = audio waveform, sr = sample rate
    y, sr = librosa.load(song)

    # create spectrogram
    spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128, fmax=8000,
                                          hop_length=frame_length)
    # get mean power at each time point
    specm = np.mean(spec, axis=0)

    # compute power gradient across time points and set its max to 1; skip
    # the division when the peak is not positive (it would flip signs or
    # divide by zero) - the clip below then zeroes everything
    gradm = np.gradient(specm)
    grad_peak = np.max(gradm)
    if grad_peak > 0:
        gradm = gradm / grad_peak
    # set negative gradient time points to zero
    gradm = gradm.clip(min=0)

    # normalize mean power between 0-1 (constant power maps to all zeros)
    power_range = np.ptp(specm)
    if power_range > 0:
        specm = (specm - np.min(specm)) / power_range
    else:
        specm = np.zeros_like(specm)

    # create chromagram of pitches X time points
    chroma = librosa.feature.chroma_cqt(y=y, sr=sr, hop_length=frame_length)
    # sort pitches by overall power
    chromasort = np.argsort(np.mean(chroma, axis=1))[::-1]
    # sorts classes by power so user chosen classes have more weight
    classes = [classes[s] for s in np.argsort(chromasort[:num_classes])]

    # earliest frame at which any pitch row has a positive value; the same
    # for every class, so computed once
    first_active = np.min([np.where(chrow > 0)[0][0] for chrow in chroma])

    # initialize first class vector
    cv1 = np.zeros(1000)
    for pi, p in enumerate(chromasort[:num_classes]):
        target = classes[pi] if num_classes < 12 else classes[p]
        cv1[target] = chroma[p][first_active]

    # initialize first noise vector
    nv1 = truncated_noise_sample(truncation=truncation)[0]

    # initialize list of class and noise vectors
    class_vectors = [cv1]
    noise_vectors = [nv1]

    # previous-frame vectors
    cvlast = cv1
    nvlast = nv1

    # initial direction of noise unit updates: negative units move up,
    # all other units move down
    update_dir = np.where(nv1 < 0, 1.0, -1.0)

    # initialize noise unit update
    update_last = np.zeros(128)

    # sensitivities do not change between frames, so rescale them once
    tempo_step = np.full(128, sensitivity_tempo(tempo_sensitivity))
    pitch_step = sensitivity_pitch(pitch_sensitivity)

    for i in tqdm(range(len(gradm))):
        # draw a new jitter vector every 200 frames (including frame 0)
        if i % 200 == 0:
            jitters = new_jitters(jitter)

        # set noise vector update based on direction, sensitivity, jitter,
        # and combination of overall power and gradient of power
        update = tempo_step * (gradm[i] + specm[i]) * update_dir * jitters
        # smooth the update with the previous update (to avoid overly sharp
        # frame transitions)
        update = (update + update_last * 3) / 4
        update_last = update

        # update noise vector and remember it for the next frame
        nv2 = nvlast + update
        noise_vectors.append(nv2)
        nvlast = nv2

        # update the direction of noise units
        update_dir = new_update_dir(nv2, update_dir, truncation,
                                    tempo_sensitivity)

        # generate new class vector from the previous one and the chroma
        cv2 = np.zeros(1000)
        for j in range(num_classes):
            cls = classes[j]
            cv2[cls] = ((cvlast[cls] + chroma[chromasort[j]][i] / pitch_step)
                        / (1 + 1 / pitch_step))

        # if more than 6 classes, normalize new class vector between 0 and 1,
        # else simply set max class val to 1
        if num_classes > 6:
            cv2 = normalize_cv(cv2)
        else:
            cv_peak = np.max(cv2)
            # an all-zero vector stays all-zero instead of becoming NaN
            if cv_peak > 0:
                cv2 = cv2 / cv_peak

        # adjust depth
        cv2 = cv2 * depth

        # this prevents rare bugs where all classes are the same value
        active = cv2[np.where(cv2 != 0)]
        if active.size and np.std(active) < 0.0000001:
            cv2[classes[0]] = cv2[classes[0]] + 0.01

        # append new class vector and remember it for the next frame
        class_vectors.append(cv2)
        cvlast = cv2

    # interpolate between class vectors of bin size [smooth_factor] to smooth
    # frames
    class_vectors = smooth(class_vectors, smooth_rate(smooth_factor))

    # save record of vectors for current video
    # TODO: have deezer_id prepended to file for saving in s3
    # np.save('class_vectors.npy', class_vectors)
    # np.save('noise_vectors.npy', noise_vectors)
    return noise_vectors, class_vectors
def generate_images(video_id, noise_vectors, class_vectors, resolution,
                    truncation):
    """
    Render the vectors from song_analysis into PNG frames on disk.

    Inputs:
        video_id: STR; used to make unique files, avoids overwriting files
        noise_vectors: NUMPY ARRAY; formed during song analysis
        class_vectors: NUMPY ARRAY; formed during song analysis
        resolution: STR; 128, 256, 512; determines resolution of video
        truncation: FLOAT 0 to 1; should be same as passed to song analysis
    Output:
        tmp_folder_path: points to location of frames on disk

    Frames are written to "<cwd>/<video_id>_frames" as 0.png, 1.png, ...
    An existing folder of that name is deleted first. At most
    song_duration() batches are rendered, and only full batches are used.
    """
    # convert to Tensor
    noise_vectors = torch.Tensor(np.array(noise_vectors))
    class_vectors = torch.Tensor(np.array(class_vectors))

    # initialize bigGAN model and move everything to the selected device
    # (CUDA if running on GPU - YOU SHOULD REALLY DO THIS)
    model = model_resolution(resolution=resolution).to(device)
    noise_vectors = noise_vectors.to(device)
    class_vectors = class_vectors.to(device)

    # start from an empty temp folder for the frames
    tmp_folder_path = os.path.join(os.getcwd(), f"{video_id}_frames")
    if os.path.exists(tmp_folder_path):
        shutil.rmtree(tmp_folder_path)
    os.mkdir(tmp_folder_path)

    frame_no = 0
    for batch_idx in tqdm(range(song_duration())):
        start = batch_idx * batch_size
        stop = (batch_idx + 1) * batch_size

        # stop once there are not enough class vectors for a full batch
        if stop > len(class_vectors):
            torch.cuda.empty_cache()
            break

        noise_batch = noise_vectors[start:stop]
        class_batch = class_vectors[start:stop]

        with torch.no_grad():
            output = model(noise_batch, class_batch, truncation)

        # bring the generated frames back as a numpy array
        frames = output.cpu().data.numpy()

        # convert each frame to an image array and save it under a running index
        for frame in frames:
            im = np.array(toimage(frame))
            imsave(os.path.join(tmp_folder_path, str(frame_no) + ".png"), im)
            frame_no += 1

        # empty cuda cache
        torch.cuda.empty_cache()

    return tmp_folder_path