-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutils.py
129 lines (114 loc) · 4.39 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
import pyaudio
import sys
import time
import numpy as np
import matplotlib.pyplot as plt
from scipy.io import wavfile
import os
from pydub import AudioSegment
# Preprocess the audio to the correct format
def preprocess_audio(filename):
    """
    Rewrite a WAV file in place as a fixed-length, fixed-rate clip.

    The first 10000 ms of the file are overlaid onto 10000 ms of silence
    (this is how the clip is trimmed or padded to 10000 ms), the frame
    rate is set to 44100, and the result is exported over the same path.

    Argument:
    filename -- path of the WAV file to read and then overwrite
    """
    target_ms = 10000
    silent_base = AudioSegment.silent(duration=target_ms)
    clip = AudioSegment.from_wav(filename)[:target_ms]
    fixed_length = silent_base.overlay(clip)
    # Resample, then overwrite the input file with the normalized clip.
    fixed_length.set_frame_rate(44100).export(filename, format='wav')
# Calculate and plot spectrogram for a wav audio file
def graph_spectrogram(wav_file):
    """
    Calculate and plot the spectrogram of a WAV file via plt.specgram.

    When the samples are two-dimensional only column 0 (the first
    channel) is analysed.

    Argument:
    wav_file -- path of the WAV file, forwarded to get_wav_info

    Returns:
    pxx -- the first value returned by plt.specgram (the spectrogram)
    """
    # The rate stored in the file is read but not used; Fs is fixed below.
    _rate, samples = get_wav_info(wav_file)
    spec_settings = {
        "NFFT": 200,      # Length of each window segment
        "Fs": 8000,       # Sampling frequencies
        "noverlap": 120,  # Overlap between windows
    }
    ndim = samples.ndim
    # Only 1-D and 2-D sample arrays are handled here.
    if ndim == 1:
        signal = samples
    elif ndim == 2:
        signal = samples[:, 0]
    pxx, _freqs, _bins, _image = plt.specgram(signal, **spec_settings)
    return pxx
# Load a wav file
def get_wav_info(wav_file):
    """
    Read a WAV file with scipy.io.wavfile.read.

    Argument:
    wav_file -- WAV file to load, passed straight to wavfile.read

    Returns:
    rate, data -- the two values produced by wavfile.read
    """
    return wavfile.read(wav_file)
# Used to standardize volume of audio clip
def match_target_amplitude(sound, target_dBFS):
    """
    Standardize the volume of an audio clip.

    Argument:
    sound -- object exposing ``dBFS`` and ``apply_gain``
    target_dBFS -- desired level; the gain applied is this minus sound.dBFS

    Returns:
    the result of sound.apply_gain for that gain difference
    """
    return sound.apply_gain(target_dBFS - sound.dBFS)
# Load raw audio files for speech synthesis
def load_raw_audio():
    """
    Load the raw audio clips used for speech synthesis.

    Every entry whose name ends with "wav" is read from the activates,
    backgrounds and negatives folders under ./raw_data, in the order
    os.listdir reports them.

    Returns:
    activates, negatives, backgrounds -- one list of loaded clips per folder
    """
    def read_clips(folder):
        # One AudioSegment per matching entry in ``folder``.
        return [
            AudioSegment.from_wav(folder + "/" + entry)
            for entry in os.listdir(folder)
            if entry.endswith("wav")
        ]

    activates = read_clips("./raw_data/activates")
    backgrounds = read_clips("./raw_data/backgrounds")
    negatives = read_clips("./raw_data/negatives")
    # Note the return order: negatives come before backgrounds.
    return activates, negatives, backgrounds
def has_new_triggerword(predictions, chunk_duration, feed_duration, threshold=0.5):
    """
    Detect a new trigger word in the latest chunk of input audio.

    The predictions are thresholded and the trailing portion that
    corresponds to the latest chunk is searched for a rising edge, i.e. a
    below-threshold sample immediately followed by an above-threshold one.
    An edge that happened before the latest chunk is not reported.

    Argument:
    predictions -- predicted labels from model (array-like, flattened before use)
    chunk_duration -- time in second of a chunk
    feed_duration -- time in second of the input to model
    threshold -- probability above which a prediction counts as positive

    Returns:
    True if new trigger word detected in the latest chunk, False otherwise
    (including when the latest chunk covers no prediction samples at all)
    """
    above = np.asarray(predictions).reshape(-1) > threshold
    chunk_samples = int(len(above) * chunk_duration / feed_duration)
    if chunk_samples <= 0:
        # Slicing with [-0:] would select the whole feed rather than nothing,
        # so an empty chunk has to be handled explicitly.
        return False
    chunk = above[-chunk_samples:]
    # Rising edge: a sample is positive while its predecessor is not.
    return bool(np.any(chunk[1:] & ~chunk[:-1]))
def callback(in_data, frame_count, time_info, status):
    """
    Audio stream callback that buffers non-silent input for the detector.

    Relies on module-level state defined outside this function: ``run``,
    ``timeout``, ``data``, ``silence_threshold``, ``feed_samples`` and ``q``.
    Prints '-' for a buffer judged silent and '.' otherwise.

    Argument:
    in_data -- raw input bytes, interpreted as int16 samples
    frame_count -- unused
    time_info -- unused
    status -- unused

    Returns:
    (in_data, pyaudio.paContinue) in every case
    """
    global run, timeout, data, silence_threshold
    if time.time() > timeout:
        run = False
    samples = np.frombuffer(in_data, dtype='int16')
    is_silent = np.abs(samples).mean() < silence_threshold
    sys.stdout.write('-' if is_silent else '.')
    if is_silent:
        # Silent buffers are not appended to the rolling window.
        return (in_data, pyaudio.paContinue)
    data = np.append(data, samples)
    if len(data) > feed_samples:
        # Keep only the newest feed_samples samples.
        data = data[-feed_samples:]
        # Process data async by sending a queue.
        q.put(data)
    return (in_data, pyaudio.paContinue)
def detect_triggerword(filename, model):
    """
    Run the model on the spectrogram of a WAV file.

    Argument:
    filename -- path of the WAV file, forwarded to graph_spectrogram
    model -- object exposing ``predict`` (presumably the trained Keras
             model; confirm against the caller)

    Returns:
    predictions -- whatever model.predict returns for the prepared input
    """
    spectrogram = graph_spectrogram(filename)
    # The spectrogram outputs (freqs, Tx) and we want (Tx, freqs) to input
    # into the model, with one extra leading axis added in front.
    model_input = np.expand_dims(spectrogram.swapaxes(0, 1), axis=0)
    print("Time start")
    return model.predict(model_input)
def rec_to_spec(data):
    """
    Convert recorded samples into a spectrogram.

    The samples are scaled so that the largest absolute value maps to
    32767 and cast to int16; the first 441000 of them are then passed to
    plt.specgram.

    Argument:
    data -- numpy array of audio samples (presumably 1-D mono; confirm
            against the caller)

    Returns:
    the first value returned by plt.specgram (the spectrogram)
    """
    peak = np.max(np.abs(data))
    if peak == 0:
        # An all-zero recording has nothing to scale; dividing by the zero
        # peak would yield NaN values that cannot be cast to int16 sensibly.
        normalized = np.zeros_like(data, dtype=np.int16)
    else:
        normalized = np.int16(data / peak * 32767)
    pxx, _freqs, _bins, _image = plt.specgram(
        normalized[:441000],
        NFFT=200,      # Length of each window segment
        Fs=8000,       # Sampling frequencies
        noverlap=120,  # Overlap between windows
    )
    return pxx