cspl.py
import os
import re
import time
import argparse

import numpy as np
import nltk
from nltk.corpus import cmudict

import audio as a  # local module: Audio wrapper used to load, play and plot wav data
import Dict        # local module: dictionary-like container with insert() and [] lookup

parser = argparse.ArgumentParser(description='text to speech')
parser.add_argument('--speak', '-s', action="store_true", default=False, help="speaks the output audio")
parser.add_argument('text', nargs=1, help="The text to be synthesised")
args = parser.parse_args()
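
# The local "audio" and "Dict" helper modules are not included in this file; from their
# usage below they are assumed to expose roughly the following (a sketch, not their code):
#   audio.Audio(rate) -> object with .data (np.ndarray), .nptype, .load(path),
#                        .play() and .plot_waveform()
#   Dict.Dict()       -> a dictionary-like container supporting .insert(key, value)
#                        and item lookup via d[key]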

class Synthesis:
    def __init__(self, wav_folder):
        self.concatenatedsound = Dict.Dict()
        # Make sure cmudict is available locally; download it on the first run only.
        if not os.path.isdir("./corpora/cmudict"):
            nltk.download('cmudict', download_dir="./")
        dl = nltk.downloader.Downloader()
        dl._update_index()
        dl._status_cache['cmudict'] = 'installed'  # trick the index into treating cmudict as already installed
        nltk.data.path = ["./"]  # nltk.data.path is a list of search directories
        tmpaudio = a.Audio(rate=16000)  # Audio object used to load each diphone wav
        for file in os.listdir(wav_folder):
            if file.endswith(".wav"):
                try:
                    sound = file.split(".")[0]  # e.g. "hh-ah.wav" is stored under the key "hh-ah"
                    wav_path = os.path.join(wav_folder, file)
                    tmpaudio.load(wav_path)
                    self.concatenatedsound.insert(sound, tmpaudio.data)
                except Exception as e:
                    # print the exception as an error message and carry on with the next file
                    print(str(e))
        # Silence entries for punctuation, at a sample rate of 16 kHz:
        length = 16000 * 0.1  # 0.1 s pause for a word boundary
        self.concatenatedsound.insert(' ', np.zeros(int(length), tmpaudio.nptype))
        length = 16000 * 0.2  # 0.2 s pause for a comma
        self.concatenatedsound.insert(',', np.zeros(int(length), tmpaudio.nptype))
        length = 16000 * 0.4  # 0.4 s pause for sentence-level punctuation
        self.concatenatedsound.insert(':', np.zeros(int(length), tmpaudio.nptype))
        self.concatenatedsound.insert('?', np.zeros(int(length), tmpaudio.nptype))
        self.concatenatedsound.insert('!', np.zeros(int(length), tmpaudio.nptype))
        self.concatenated_wavs = self.concatenatedsound

class Speech:
    def __init__(self, text):
        print(text)
        self.text = text.lower()
        # Split on punctuation and runs of whitespace, keeping the separators as tokens.
        self.text = re.split(r"([?!.,:]|\s+)", self.text)
        self.sequence = []
        print(str(self.text))
        pronunciations = cmudict.dict()
        sequence = []
        for word in self.text:
            if len(word) < 1:
                continue
            if word.isspace():
                # any whitespace run becomes a single word-boundary pause token
                sequence.append([' '])
            elif word in '?!,.:':
                sequence.append([word])
            elif word in pronunciations:
                phones = pronunciations[word][0]
                for i in range(len(phones)):
                    # strip stress digits from the ARPAbet phones, e.g. "AH0" -> "ah"
                    phones[i] = re.sub(r"[^a-zA-Z\s\-]", "", phones[i]).lower()
                sequence.append(phones)
        self.sequence = sequence
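
    # For example (a sketch; the exact phones depend on the installed cmudict entries),
    # Speech("hello world.") would produce a sequence roughly like:
    #   [['hh', 'ah', 'l', 'ow'], [' '], ['w', 'er', 'l', 'd'], ['.']]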

    def concatenate_diphones(self):
        global tmp
        leading_whitespace = False
        sentence_1 = False
        sentence_number = 0
        for i, sequence in enumerate(self.sequence):
            if (len(sequence) == 1) and (sequence[0] in ' ,.:?!'):
                if sequence[0] == " ":
                    leading_whitespace = True
                elif "?" in sequence or '.' in sequence or "!" in sequence:
                    sentence_1 = True
                    sentence_number = i + 1  # index of the first word of the next sentence
                    # close the current sentence: last phone of the previous diphone joined to a pause
                    concatenatedsound.insert(len(concatenatedsound),
                                             concatenatedsound[len(concatenatedsound) - 1].split("-")[1] + "-pau")
                else:
                    # other punctuation (',' and ':') maps directly to a silence entry
                    concatenatedsound.append(sequence[0])
            else:
                if sentence_1 and i == sentence_number:
                    # open the new sentence with a leading pause diphone
                    concatenatedsound.insert(len(concatenatedsound), "pau-" + sequence[0])
                for j in range(1, len(sequence)):
                    # pair neighbouring phones of the word into diphone names, e.g. "hh-ah"
                    concatenatedsound.append(sequence[j - 1] + '-' + sequence[j])
                if leading_whitespace:
                    # bridge the word boundary: last phone of the previous word joined to
                    # the first phone of this word (self.sequence[i][0])
                    concatenatedsound.insert(len(concatenatedsound) - j,
                                             self.sequence[i - 2][-1] + "-" + self.sequence[i][0])
                    leading_whitespace = False
                if i == 0:
                    # prepend a pause diphone at the very start of the utterance
                    concatenatedsound.insert(0, "pau-" + self.sequence[i][0])
                elif i == (len(self.sequence) - 1):
                    # append a pause diphone at the very end of the utterance
                    concatenatedsound.insert(len(concatenatedsound), sequence[len(sequence) - 1] + "-pau")
        print(concatenatedsound)
        for sound in concatenatedsound:
            try:
                # look up the waveform for the diphone (or silence) and append it to the output buffer
                if isinstance(diphone_synthesis.concatenatedsound[sound], np.ndarray):
                    tmp = np.append(tmp, diphone_synthesis.concatenatedsound[sound])
            except Exception as e:
                print(str(e))

if __name__ == "__main__":
    nltk.data.path = ["./"]
    concatenatedsound = []
    tmp = np.array([])  # output sample buffer; np.append below always returns an ndarray
    start_time = time.time()
    diphone_synthesis = Synthesis(wav_folder="./cstr_en_us")
    out = a.Audio(rate=16000)
    speech = Speech(args.text[0])
    speech.concatenate_diphones()
    out.data = tmp.astype(np.int16)
    print(out.data, type(out.data))
    if args.speak:
        out.play()
    print("--- %s seconds ---" % (time.time() - start_time))
    out.plot_waveform()
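
# Example invocation (assumes a ./cstr_en_us folder containing the diphone wav
# recordings, named like "hh-ah.wav", plus the local audio and Dict modules):
#   python cspl.py --speak "hello world"
#   python cspl.py "a longer sentence, with a pause: and a question?"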