-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathutils.py
118 lines (94 loc) · 4.25 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
import os
from os.path import exists, join, expanduser
import torch
import numpy as np
import librosa
import librosa.display
from torch.utils.data import Dataset
# need this for English text processing frontend
import nltk
import pickle
# import dv3.synthesis
# import train
# from deepvoice3_pytorch import frontend
# from train import build_model
# from train import restore_parts, load_checkpoint
from dv3.synthesis import tts as _tts
def tts(model, text, p=0, speaker_id=0, fast=True, figures=True):
    """Synthesize ``text`` with ``model`` and display the resulting audio.

    The arguments ``model``, ``text``, ``p``, ``speaker_id`` and ``fast`` are
    forwarded positionally to ``dv3.synthesis.tts``, which is unpacked into a
    waveform, an alignment, a spectrogram and a mel. When ``figures`` is
    truthy the alignment and spectrogram are handed to ``visualize``. The
    waveform is then wrapped in ``Audio(..., rate=fs)`` and passed to
    ``IPython.display.display``. Nothing is returned.

    NOTE(review): ``IPython``, ``Audio`` and ``fs`` are not imported or
    defined in the visible part of this module; presumably they are expected
    to be provided by a notebook environment -- confirm.
    """
    from dv3.synthesis import tts as synthesize

    audio, attention, linear_spec, _mel = synthesize(
        model, text, p, speaker_id, fast
    )

    if figures:
        visualize(attention, linear_spec)

    show = IPython.display.display
    show(Audio(audio, rate=fs))
def visualize(alignment, spectrogram):
    """Plot an alignment matrix and a spectrogram in a two-row figure.

    The first subplot shows ``alignment.T`` via ``imshow``; the second shows
    ``spectrogram.T`` via ``librosa.display.specshow``. Each subplot gets
    axis labels and a colorbar. Nothing is returned.

    NOTE(review): ``figure``, ``subplot``, ``imshow``, ``xlabel``, ``ylabel``,
    ``colorbar``, ``tight_layout``, ``fs`` and ``hop_length`` are not defined
    in the visible part of this module; presumably the plotting names are the
    matplotlib/pylab functions and the other two are audio settings supplied
    by the calling environment -- confirm.
    """
    font = {"fontsize": 16}

    def _label_axes(x_text, y_text):
        # Both subplots label their axes with the same font size.
        xlabel(x_text, **font)
        ylabel(y_text, **font)

    figure(figsize=(16, 16))

    # First subplot: the alignment matrix.
    subplot(2, 1, 1)
    imshow(alignment.T, aspect="auto", origin="lower", interpolation=None)
    _label_axes("Decoder timestamp", "Encoder timestamp")
    colorbar()

    # Second subplot: the spectrogram.
    subplot(2, 1, 2)
    librosa.display.specshow(
        spectrogram.T,
        sr=fs,
        hop_length=hop_length,
        x_axis="time",
        y_axis="linear",
    )
    _label_axes("Time", "Hz")
    tight_layout()
    colorbar()
def generate_cloned_samples(model, cloning_text_path=None, no_speakers=108, fast=True, p=0):
    """Synthesize every cloning text for every speaker and pickle the mels.

    Args:
        model: Passed through unchanged to the synthesis call (``_tts``).
        cloning_text_path: Path of a text file holding one sentence per line.
            When ``None``, ``./Cloning_Audio/cloning_text.txt`` is used.
        no_speakers: Speaker ids ``0 .. no_speakers - 1`` are synthesized.
        fast: Forwarded to the synthesis call.
        p: Forwarded to the synthesis call.

    Returns:
        A nested list indexed as ``all_speakers[speaker_id][text_index]``;
        every entry is the two-element list ``[speaker_id, mel]``.

    Side effects:
        Prints progress, and writes the returned list with ``pickle`` to
        ``./Cloning_Audio/speakers_cloned_voices_mel.p``.
    """
    if cloning_text_path is None:
        cloning_text_path = "./Cloning_Audio/cloning_text.txt"

    # Honour the requested path (it used to be ignored in favour of the
    # default) and close the file handle deterministically.
    with open(cloning_text_path) as text_file:
        cloning_texts = text_file.read().splitlines()

    all_speakers = []
    for speaker_id in range(no_speakers):
        print("The Speaker being cloned speaker-{}".format(speaker_id))
        speaker_cloning_mel = []
        for text in cloning_texts:
            # Only the mel output of the synthesis call is kept.
            _waveform, _alignment, _spectrogram, mel = _tts(
                model, text, p, speaker_id, fast
            )
            speaker_cloning_mel.append([speaker_id, mel])
        all_speakers.append(speaker_cloning_mel)

    with open("./Cloning_Audio/speakers_cloned_voices_mel.p", "wb") as fp:
        pickle.dump(all_speakers, fp)

    # Report the nesting sizes directly. Converting the nested
    # ``[speaker_id, mel]`` entries with ``np.array`` fails on recent NumPy
    # releases because the entries are not homogeneous.
    print("Shape of all speakers:", (len(all_speakers), len(cloning_texts), 2))

    return all_speakers
class Speech_Dataset(Dataset):
    """Dataset of zero-padded spectrograms paired with per-speaker embeddings.

    Item ``idx`` is the tuple ``(voices[idx], embeddings[idx])``. When the
    dataset was built with ``sampler`` enabled, only a randomly drawn subset
    of that speaker's utterances is returned instead of all of them.
    """

    # Inclusive range of utterance indices drawn when sampling.
    # NOTE(review): index 0 is never drawn and at least 23 utterances per
    # speaker are assumed. Both are kept exactly as before -- confirm intent.
    _SAMPLE_INDEX_MIN = 1
    _SAMPLE_INDEX_MAX = 22
    # Largest number of utterances drawn for a single item (inclusive).
    _MAX_SAMPLES = 9

    def __init__(self, mfccs, embeddings, sampler, n_channels=80):
        """Pad all spectrograms to a common length and store them.

        Args:
            mfccs: List (one entry per speaker) of lists of numpy arrays;
                each array is one mel spectrogram whose first axis is the
                length that gets padded.
            embeddings: Indexable collection; ``embeddings[idx]`` is returned
                alongside the spectrograms of item ``idx``.
            sampler: Truthy to return a random subset of utterances per item,
                falsy to return all of them.
            n_channels: Size of the second axis of every spectrogram.

        Raises:
            ValueError: If ``mfccs`` contains no spectrogram at all.
        """
        super().__init__()
        lengths = [spec.shape[0] for speaker in mfccs for spec in speaker]
        if not lengths:
            raise ValueError("mfccs must contain at least one spectrogram")

        self.voices = mfccs
        self._pad(max(lengths), n_channels)
        self.embeddings = embeddings
        # Always define the flag; it used to be missing when sampling was
        # disabled, which made __getitem__ fail with AttributeError.
        self.sampler = bool(sampler)

    def _pad(self, maximum_size, n_channels=80):
        """Zero-pad every spectrogram in ``self.voices`` to ``maximum_size``.

        Args:
            maximum_size: Target length of the first axis; shorter
                spectrograms are filled with trailing zeros.
            n_channels: Size of the second axis of the padded arrays.

        The padded spectrograms are collected into new lists, so the nested
        lists passed in by the caller are left untouched. The result is
        converted with ``np.array`` and stored back on ``self.voices``, and
        its shape is printed. Nothing is returned.

        Original author's rationale for padding with zeros (not verifiable
        from this file): zeros should not affect the downstream convolutions,
        and an average is later taken along the padded dimension.
        """
        padded = []
        for speaker in self.voices:
            speaker_padded = []
            for spec in speaker:
                canvas = np.zeros((maximum_size, n_channels))
                canvas[:spec.shape[0], :] += spec
                speaker_padded.append(canvas)
            padded.append(speaker_padded)

        self.voices = np.array(padded)
        print(self.voices.shape)

    def __len__(self):
        """Return the total number of speakers."""
        return len(self.voices)

    def __getitem__(self, idx):
        """Return ``(spectrograms, embedding)`` for speaker ``idx``.

        Without sampling, all of the speaker's padded spectrograms are
        returned. With sampling, between 1 and ``_MAX_SAMPLES`` utterance
        indices are drawn (with replacement) from the inclusive range
        ``_SAMPLE_INDEX_MIN .. _SAMPLE_INDEX_MAX`` and only those are returned.
        """
        if not self.sampler:
            return (self.voices[idx], self.embeddings[idx])

        # randint's upper bound is exclusive, hence the "+ 1" on both calls.
        count = np.random.randint(1, self._MAX_SAMPLES + 1)
        sample = np.random.randint(
            self._SAMPLE_INDEX_MIN, self._SAMPLE_INDEX_MAX + 1, size=count
        )
        return (self.voices[idx, sample, :, :], self.embeddings[idx])