forked from dusty-nv/jetson-voice
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtts.py
executable file
·105 lines (81 loc) · 3.06 KB
/
tts.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
#!/usr/bin/env python3
# coding: utf-8
from jetson_voice.utils import load_resource
def TTS(resource, *args, **kwargs):
    """
    Factory entry point for TTS services and models.

    Builds the table that maps a backend name ('riva', 'tensorrt',
    'onnxruntime') to the dotted path of the implementing class, then
    delegates to load_resource() and returns whatever it produces.

    Parameters:
      resource -- forwarded unchanged to load_resource() as its first argument.
      *args, **kwargs -- forwarded unchanged to load_resource().

    See the TTSService class for the signature that implementations use.
    """
    # tensorrt and onnxruntime are both served by the same engine class
    engine_path = 'jetson_voice.models.tts.TTSEngine'

    backends = dict(
        riva='jetson_voice.backends.riva.RivaTTSService',
        tensorrt=engine_path,
        onnxruntime=engine_path,
    )

    return load_resource(resource, backends, *args, **kwargs)
class TTSService:
    """
    Base class describing the interface of a TTS service.

    The methods here are placeholders: apart from storing the config,
    they do nothing and return None.
    """

    def __init__(self, config, *args, **kwargs):
        """
        Create the service instance and remember its configuration.

        Parameters:
          config -- stored as-is on self.config.
          *args, **kwargs -- accepted and ignored by this base class.
        """
        self.config = config

    def __call__(self, text):
        """
        Generate audio from text.

        Parameters:
          text (string) -- The phrase to convert to audio.

        Implementations return audio samples in a numpy array;
        this base-class placeholder returns None.
        """
        return None

    @property
    def sample_rate(self):
        """
        The output sample rate (in Hz).

        This base-class placeholder returns None.
        """
        return None
if __name__ == "__main__":
    # Command-line demo: load a TTS model, synthesize a phrase a number of
    # times while timing each run, then optionally play the last result on
    # an audio device and/or save it to a wav file.
    from jetson_voice import list_audio_devices, ConfigArgParser
    from soundfile import SoundFile

    import pprint
    import pyaudio
    import time

    parser = ConfigArgParser()

    parser.add_argument('--model', default='fastpitch_hifigan', type=str)
    parser.add_argument('--text', default='Hello, how are you today?', type=str)
    parser.add_argument('--warmup', type=int, default=9, help='the number of warmup runs')
    parser.add_argument("--output-device", type=int, default=None, help='output audio device to use')
    parser.add_argument("--output-wav", type=str, default=None, help='output wav file to write to')
    parser.add_argument('--list-devices', action='store_true', help='list audio input devices')

    args = parser.parse_args()
    print(args)

    # list audio devices
    if args.list_devices:
        list_audio_devices()

    # load the model
    tts = TTS(args.model)

    # display the text
    print(f"\n'{args.text}'\n")

    # run the TTS
    # Clamp the warmup count so at least one run always happens; otherwise a
    # negative --warmup would leave `audio` undefined for the output stage.
    num_runs = max(args.warmup, 0) + 1

    for run in range(num_runs):
        start = time.perf_counter()
        audio = tts(args.text)
        stop = time.perf_counter()

        latency = stop - start
        duration = audio.shape[0] / tts.sample_rate  # seconds of generated audio

        print(f"Run {run} -- Time to first audio: {latency:.3f}s. Generated {duration:.2f}s of audio. RTFx={duration/latency:.2f}.")

    # output the audio (from the final run)
    if args.output_device is not None:
        # NOTE(review): the stream is opened as mono float32, so this assumes
        # the model returns single-channel float32 samples -- confirm.
        p = pyaudio.PyAudio()
        try:
            stream = p.open(output_device_index=args.output_device,
                            format=pyaudio.paFloat32,
                            channels=1, rate=tts.sample_rate, output=True)
            try:
                stream.write(audio.tobytes())
                # PyAudio streams expose stop_stream(); there is no close_stream()
                stream.stop_stream()
            finally:
                stream.close()
        finally:
            # release PortAudio even if opening or writing the stream failed
            p.terminate()

    if args.output_wav is not None:
        # the context manager closes the file even if the write raises
        with SoundFile(args.output_wav, mode='w', samplerate=tts.sample_rate, channels=1) as wav:
            wav.write(audio)

        print(f"Wrote audio to {args.output_wav}")