-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathHarmonicPercussiveVocal.m
192 lines (142 loc) · 4.67 KB
/
HarmonicPercussiveVocal.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
function HarmonicPercussiveVocal(filename, varargin)
p = inputParser;
WindowSizeP = 1024;
HopSizeP = 256;
Power = 2;
LHarmSTFT = 17;
LPercSTFT = 17;
LHarmCQT = 17;
LPercCQT = 7;
defaultOutDir = '.';
addRequired(p, 'filename', @ischar);
addOptional(p, 'OutDir', defaultOutDir, @ischar);
parse(p, filename, varargin{:});
[x, fs] = audioread(p.Results.filename);
%%%%%%%%%%%%%%%%%%%
% FIRST ITERATION %
%%%%%%%%%%%%%%%%%%%
% CQT of original signal
[cfs1,~,g1,fshifts1] = cqt(x, 'SamplingFrequency', fs, 'BinsPerOctave', 96);
cmag1 = abs(cfs1); % use the magnitude CQT for creating masks
H1 = movmedian(cmag1, LHarmCQT, 2);
P1 = movmedian(cmag1, LPercCQT, 1);
% soft masks, Fitzgerald 2010 - p is usually 1 or 2
Hp1 = H1 .^ Power;
Pp1 = P1 .^ Power;
total1 = Hp1 + Pp1;
Mh1 = Hp1 ./ total1;
Mp1 = Pp1 ./ total1;
% recover the complex STFT H and P from S using the masks
H1 = Mh1 .* cfs1;
P1 = Mp1 .* cfs1;
% finally istft to convert back to audio
xh1 = icqt(H1, g1, fshifts1);
xp1 = icqt(P1, g1, fshifts1);
%%%%%%%%%%%%%%%%%%%%%%%%%%%
% SECOND ITERATION, VOCAL %
%%%%%%%%%%%%%%%%%%%%%%%%%%%
xim2 = xp1;
% CQT of original signal
[cfs2,~,g2,fshifts2] = cqt(xim2, 'SamplingFrequency', fs, 'BinsPerOctave', 24);
cmag2 = abs(cfs2); % use the magnitude CQT for creating masks
H2 = movmedian(cmag2, LHarmCQT, 2);
P2 = movmedian(cmag2, LPercCQT, 1);
% soft mask
Hp2 = H2 .^ Power;
Pp2 = P2 .^ Power;
total2 = Hp2 + Pp2;
Mh2 = Hp2 ./ total2;
Mp2 = Pp2 ./ total2;
% todo - set bins of mask below 100hz to 0
% recover the complex STFT H and P from S using the masks
H2 = Mh2 .* cfs2;
P2 = Mp2 .* cfs2;
% finally istft to convert back to audio
xh2 = icqt(H2, g2, fshifts2);
xp2 = icqt(P2, g2, fshifts2);
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% THIRD ITERATION, PERCUSSIVE %
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
xim3 = xp1 + xp2;
% STFT parameters
winLen3 = WindowSizeP;
fftLen3 = winLen3 * 2;
overlapLen3 = HopSizeP;
win3 = sqrt(hann(winLen3, "periodic"));
% STFT of original signal
S3 = stft(xim3, "Window", win3, "OverlapLength", overlapLen3, ...
"FFTLength", fftLen3, "Centered", true);
halfIdx3 = 1:ceil(size(S3, 1) / 2); % only half the STFT matters
Shalf3 = S3(halfIdx3, :);
Smag3 = abs(Shalf3); % use the magnitude STFT for creating masks
% median filters
H3 = movmedian(Smag3, LHarmSTFT, 2);
P3 = movmedian(Smag3, LPercSTFT, 1);
% binary masks with separation factor, Driedger et al. 2014
% soft masks, Fitzgerald 2010 - p is usually 1 or 2
Hp3 = H3 .^ Power;
Pp3 = P3 .^ Power;
total3 = Hp3 + Pp3;
Mp3 = Pp3 ./ total3;
% recover the complex STFT H and P from S using the masks
P3 = Mp3 .* Shalf3;
% we previously dropped the redundant second half of the fft
P3 = cat(1, P3, flipud(conj(P3)));
% finally istft to convert back to audio
xp3 = istft(P3, "Window", win3, "OverlapLength", overlapLen3,...
"FFTLength", fftLen3, "ConjugateSymmetric", true);
% fix up some lengths
if size(xh1, 1) < size(x, 1)
xh1 = [xh1; x(size(xh1, 1)+1:size(x, 1))];
end
if size(xp3, 1) < size(x, 1)
xp3 = [xp3; x(size(xp3, 1)+1:size(x, 1))];
end
if size(xh2, 1) < size(x, 1)
xh2 = [xh2; x(size(xh2, 1)+1:size(x, 1))];
xp2 = [xp2; x(size(xp2, 1)+1:size(x, 1))];
end
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% FOURTH ITERATION, REFINE HARMONIC %
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
if size(xp3, 1) < size(x, 1)
xp3 = [xp3; x(size(xp3, 1)+1:size(x, 1))];
end
if size(xh2, 1) < size(x, 1)
xh2 = [xh2; x(size(xh2, 1)+1:size(x, 1))];
end
% use 2nd iter vocal estimation to improve harmonic sep
x_vocal = xh2;
x_harmonic = xh1;
x_percussive = xp3;
% CQT of harmonic signal
% use a high frequency resolution here as well
[cfs4,~,g4,fshifts4] = cqt(x_harmonic, 'SamplingFrequency', fs, 'BinsPerOctave', 12);
[cfs4_vocal,~,~,~] = cqt(x_vocal, 'SamplingFrequency', fs, 'BinsPerOctave', 12);
[cfs4_percussive,~,~,~] = cqt(x_percussive, 'SamplingFrequency', fs, 'BinsPerOctave', 12);
cmag4 = abs(cfs4); % use the magnitude CQT for creating masks
cmag4_vocal = abs(cfs4_vocal);
cmag4_percussive = abs(cfs4_percussive);
% soft masks, Fitzgerald 2010 - p is usually 1 or 2
H4 = cmag4 .^ Power;
V4 = cmag4_vocal .^ Power;
P4 = cmag4_percussive .^ Power;
total4 = H4 + V4 + P4;
Mh4 = H4 ./ total4;
H4 = Mh4 .* cfs4;
% finally istft to convert back to audio
xh4 = icqt(H4, g4, fshifts4);
[~,fname,~] = fileparts(p.Results.filename);
splt = split(fname, "_");
prefix = splt{1};
% fix up some lengths
if size(xh4, 1) < size(x, 1)
xh4 = [xh4; x(size(xh4, 1)+1:size(x, 1))];
end
xhOut = sprintf("%s/%s_harmonic.wav", p.Results.OutDir, prefix);
xpOut = sprintf("%s/%s_percussive.wav", p.Results.OutDir, prefix);
xvOut = sprintf("%s/%s_vocal.wav", p.Results.OutDir, prefix);
audiowrite(xhOut, xh4, fs);
audiowrite(xpOut, xp3, fs);
audiowrite(xvOut, xh2, fs);
end