-
Notifications
You must be signed in to change notification settings - Fork 0
/
AudioManager.py
166 lines (140 loc) · 6.34 KB
/
AudioManager.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
import numpy as np
import pvporcupine
import pyaudio
import speech_recognition as sr
import struct
from threading import Thread
import time
import configparser
from queue import Queue
class AudioManager:
    """Microphone front-end: detects the "Hey Fetch" wakeword, records the
    utterance that follows it, transcribes the audio with Google Speech
    Recognition, and publishes each transcription on ``output_queue``.

    The constructor starts two worker threads:
      * ``sample_loop`` -- continuously reads raw int16 frames from the mic.
      * ``run``         -- consumes those frames, watching for the wakeword.

    Call ``stop()`` to shut both threads down.
    """

    def __init__(self, logger, config_file, display_man, sound_man):
        """Set up Porcupine, baseline the ambient noise level, and start the
        sampling and wakeword-listening threads.

        logger      -- object with a ``log(msg, level)`` method
        config_file -- path to an INI file with an [Audio] section
        display_man -- UI callbacks (wakeword_detected, talking_started, ...)
        sound_man   -- feedback player with ``play_blocking(name)``
        """
        self.l = logger
        # Per-instance state.  These used to be class-level attributes, which
        # meant (most harmfully) a single Queue shared by every instance.
        self.stop_rec = False        # set by stop(); both threads poll it
        self.samps_stale = True      # True until sample_loop writes a fresh frame
        self.current_samps = None    # most recent raw frame (bytes) from the mic
        self.output_queue = Queue()  # finished transcriptions are put here

        # Porcupine handles the wakewords.
        # SECURITY NOTE(review): the access key is committed in source; it
        # should be loaded from the config file or an environment variable.
        self.porc = pvporcupine.create(
            access_key="ozyxNjip5m0gzCt7axK2B9+x1UOnzZrYIEJs0eSJmp4KTKh7xbC6sg==",
            keyword_paths=["Hey-Fetch_en_linux_v2_1_0.ppn"])
        self.fs = self.porc.sample_rate          # sample rate Porcupine expects
        self.frame_len = self.porc.frame_length  # samples per Porcupine frame

        self.config = configparser.ConfigParser()
        self.config.read(config_file)
        self.parse_config()
        self.display_man = display_man
        self.sound_man = sound_man

        # Continuously collect samples in a separate thread, and make sure
        # they're always fresh before delivering them.
        sample_thread = Thread(target=self.sample_loop)
        sample_thread.start()

        # Let things settle, then measure the ambient noise floor for
        # initial_thresh_time seconds; speech is later detected as an RMS
        # level dev_thresh times above this baseline.
        baseline_samps = int(self.initial_thresh_time * self.fs)
        time.sleep(0.5)
        samps = self.get_samps(baseline_samps)
        self.base_level = self.rms(samps)
        self.l.log(f"Base RMS Level: {self.base_level}", "DEBUG")

        # Continuously wait for the wakeword in a separate thread.
        run_thread = Thread(target=self.run)
        run_thread.start()

    def sample_loop(self):
        """Worker thread: pull one Porcupine-sized frame at a time from the
        default input device until ``stop_rec`` is set, then release the
        PyAudio resources."""
        pa = pyaudio.PyAudio()
        audio_stream = pa.open(
            rate=self.fs,
            channels=1,
            format=pyaudio.paInt16,
            input=True,
            frames_per_buffer=self.frame_len)
        try:
            while not self.stop_rec:
                self.current_samps = \
                    audio_stream.read(self.porc.frame_length,
                                      exception_on_overflow=False)
                self.samps_stale = False
        finally:
            # Release the audio device even if the read loop raises.
            audio_stream.stop_stream()
            audio_stream.close()
            pa.terminate()

    def run(self):
        """Worker thread: wait for the wakeword, record until the speaker's
        RMS level falls back to the baseline, then transcribe and publish."""
        r = sr.Recognizer()
        self.l.log("Started to listen...", "RUN")
        wait_speech_nsamp = int(self.fs * self.wait_speech_buffer_time)
        transcription_nsamp = int(self.fs * self.transcription_buffer_time)
        # Short buffers while waiting for speech to start, longer buffers
        # once the speaker is actually talking.
        current_nsamp = wait_speech_nsamp
        while not self.stop_rec:
            samps = self.get_samps_single()
            pcm = struct.unpack_from("h" * self.porc.frame_length, samps)
            keyword_index = self.porc.process(pcm)
            if keyword_index < 0:
                # No wakeword in this frame.
                # Eventually implement continuous RMS updating here
                # with a list of past RMS values that gets averaged.
                continue

            # Wakeword detected.
            self.display_man.wakeword_detected()
            self.l.log("Wakeword Detected. Waiting for speech.", "RUN")
            self.sound_man.play_blocking("wakeword")

            # Continuously read samples until the RMS value returns to the
            # baseline (i.e. the speaker has finished).  Unlike the original
            # `while True`, this loop also honors stop().
            to_transcribe = []
            still_quiet = True
            while not self.stop_rec:
                samps = self.get_samps(current_nsamp)
                quiet = self.rms(samps) < self.dev_thresh * self.base_level
                if quiet and still_quiet:
                    # Speech hasn't started yet; keep waiting.
                    continue
                if quiet:
                    # Speech just ended; reset for the next wakeword.
                    current_nsamp = wait_speech_nsamp
                    break
                if still_quiet:
                    # First loud buffer: speech just started.
                    current_nsamp = transcription_nsamp
                    self.display_man.talking_started()
                    self.l.log("You started talking!", "DEBUG")
                    still_quiet = False
                to_transcribe.append(samps)

            if self.stop_rec or not to_transcribe:
                # Shutting down mid-capture (or nothing captured): bail out.
                break

            to_transcribe = np.hstack(to_transcribe)
            self.display_man.talking_finished()
            self.l.log("Done talking. Transcribing...", "DEBUG")
            audio = sr.AudioData(to_transcribe, self.fs, 2)  # 2 bytes/sample
            try:
                transcription = r.recognize_google(audio)
                self.l.log(f"You said: {transcription}", "RUN")
                self.output_queue.put(transcription)
                self.display_man.transcription_finished(transcription)
                self.sound_man.play_blocking("transcription success")
            except sr.UnknownValueError:
                self.l.log("No audio found in segment", "DEBUG")
                self.display_man.transcription_finished("")
                self.sound_man.play_blocking("transcription failed")
            except sr.RequestError as e:
                # A network/API failure must not kill the listener thread.
                self.l.log(f"Speech API request failed: {e}", "DEBUG")
                self.display_man.transcription_finished("")
                self.sound_man.play_blocking("transcription failed")

    def stop(self):
        """Signal both worker threads to exit."""
        self.stop_rec = True

    def get_samps_single(self):
        """Return the newest raw frame (bytes), blocking until sample_loop
        delivers one that hasn't been consumed yet.  Returns promptly (with
        the last or a silent frame) once stop() has been called, so callers
        can't hang during shutdown."""
        while self.samps_stale and not self.stop_rec:
            # Spin-wait; sample_loop clears the flag when a frame arrives.
            time.sleep(0.001)
        self.samps_stale = True  # mark consumed so the next call blocks
        if self.current_samps is None:
            # Stopped before the first frame ever arrived: return silence.
            return bytes(2 * self.frame_len)
        return self.current_samps

    # Returns at least nsamp samps
    def get_samps(self, nsamp):
        """Collect and return at least ``nsamp`` samples as an int16 array."""
        all_vals = []
        total = 0
        while total < nsamp:
            samps = np.frombuffer(self.get_samps_single(), dtype=np.int16)
            all_vals.append(samps)
            total += samps.size
        if not all_vals:
            # nsamp <= 0: original np.hstack([]) would raise.
            return np.array([], dtype=np.int16)
        return np.hstack(all_vals)

    def rms(self, samps):
        """Root-mean-square level of ``samps`` (0.0 for empty input).

        We're using 16-bit integers, so cast to 64-bit before squaring --
        int16 arithmetic silently overflows otherwise.
        """
        larger = np.asarray(samps, dtype=np.int64)
        if larger.size == 0:
            return 0.0  # avoid NaN / RuntimeWarning from mean of empty
        return np.sqrt(np.mean(larger ** 2))

    def parse_config(self):
        """Pull the [Audio] tuning parameters out of the already-read config.

        Raises KeyError/ValueError if the section or an option is missing
        or not numeric.
        """
        audio = self.config["Audio"]
        self.transcription_buffer_time = float(audio["transcription_buffer_time"])
        self.wait_speech_buffer_time = float(audio["wait_speech_buffer_time"])
        self.dev_thresh = float(audio["rms_deviation_thresh"])
        self.initial_thresh_time = float(audio["initial_thresh_time"])