Commit

Add more slides
sevagh committed Apr 20, 2021
1 parent 832bfed commit ba30e6f
Showing 9 changed files with 141 additions and 9 deletions.
13 changes: 13 additions & 0 deletions latex/citations.bib
@@ -548,3 +548,16 @@ @article{renyi
journal = {Proceedings IV Berkeley Symposium on Mathematical Statistics and Probability, Berkeley, 20-30 June 1961},
volume = {1},
pages = {547--561}}

@article{cdae,
author = {Emad M. Grais and
Mark D. Plumbley},
title = {Single Channel Audio Source Separation using Convolutional Denoising
Autoencoders},
journal = {CoRR},
volume = {abs/1703.08019},
year = {2017},
url = {http://arxiv.org/abs/1703.08019},
archivePrefix = {arXiv},
eprint = {1703.08019},
}
Binary file added latex/elmestizo.wav
Binary file not shown.
Binary file added latex/elmestizo_harm_501.wav
Binary file not shown.
Binary file added latex/elmestizo_harm_622.wav
Binary file not shown.
Binary file added latex/elmestizo_perc_501.wav
Binary file not shown.
Binary file added latex/elmestizo_perc_622.wav
Binary file not shown.
Binary file modified latex/final-presentation.pdf
Binary file not shown.
123 changes: 121 additions & 2 deletions latex/presentation.tex
@@ -1,5 +1,6 @@
\documentclass{beamer}
\usetheme{Boadilla}
\usepackage{minted}
\usepackage{hyperref}
\usepackage{tikz}
\usetikzlibrary{shapes,positioning}
@@ -27,6 +28,13 @@
\setbeameroption{show notes on second screen=right}
%\setbeameroption{hide notes}

\newlength{\mintednumbersep}
\AtBeginDocument{%
\sbox0{\tiny00}%
\setlength\mintednumbersep{8pt}%
\addtolength\mintednumbersep{-\wd0}%
}

\def\footshortciteintern[#1][#2]#3{%
\ifx#1\empty
% Nur Autor
@@ -49,7 +57,6 @@
\optparams{\footshortciteintern}{[][]}
}


\title[TF representations for music separation]{Time-Frequency Representations for Music Source Separation}
\subtitle{Final project presentation}
\author{Sevag Hanssian}
@@ -162,6 +169,45 @@
\end{figure}
\end{frame}

\begin{frame}[fragile]
\frametitle{HPSS MATLAB pseudocode}
\begin{figure}[h]
\centering
\begin{minipage}{0.48\textwidth}
\centering
\begin{minted}[numbersep=\mintednumbersep,linenos,mathescape=true,breaklines,frame=single,escapeinside=||]{text}
|$s = \text{mixed audio}$|
|$\hat{S} = \text{STFT}(s)\text{\textbf{ or CQT}}$|
|$S = \text{abs}(\hat{S})$|
|$H = \text{medianfilter}(S, l_{H}, \text{axis}=2)$|
|$P = \text{medianfilter}(S, l_{P}, \text{axis}=1)$|
|$\text{\textbf{soft} } M_{H} = \frac{H^{p}}{H^{p} + P^{p}}, M_{P} = \frac{P^{p}}{H^{p} + P^{p}}$|
|$\text{\textbf{hard} } M_{H} = \frac{H}{P + \epsilon} \ge \beta, M_{P} = \frac{P}{H + \epsilon} > \beta$|
|$\hat{H} = \hat{S} \cdot M_{H}$|
|$\hat{P} = \hat{S} \cdot M_{P}$|
|$h = \text{ISTFT}(\hat{H})\text{\textbf{ or ICQT}}$|
|$p = \text{ISTFT}(\hat{P})\text{\textbf{ or ICQT}}$|
\end{minted}
\end{minipage}
\hspace{0.02\textwidth}
\begin{minipage}{0.48\textwidth}
\centering
\begin{minted}[numbersep=\mintednumbersep,linenos,mathescape=true,breaklines,frame=single,escapeinside=||]{text}
|$s = \text{mixed audio}$|
|$\hat{S1} = \text{STFT}(s)\text{\textbf{ or CQT}}$|
|$ \text{1-pass algorithm } \rightarrow \hat{H1}, \hat{P1} $|
|$\text{\textbf{final harmonic} } h1 = \text{ISTFT}(\hat{H1})\text{\textbf{ or ICQT}}$|
|$p1 = \text{ISTFT}(\hat{P1})\text{\textbf{ or ICQT}}$|
|$\hat{S2} = \text{STFT}(p1)\text{\textbf{ or CQT}}$|
|$ \text{1-pass algorithm } \rightarrow \hat{H2}, \hat{P2} $|
|$h2 = \text{ISTFT}(\hat{H2})\text{\textbf{ or ICQT}}$|
|$\text{\textbf{final percussive} } p2 = \text{ISTFT}(\hat{P2})\text{\textbf{ or ICQT}}$|
\end{minted}
\end{minipage}
\captionof{listing}{1- and 2-pass median filtering HPSS algorithms}
\end{figure}
\end{frame}
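
For readers who want to try this outside MATLAB, here is a rough NumPy/SciPy sketch of the 1-pass, soft-mask variant (illustrative only, not the project's code; the STFT parameters and median-filter lengths are placeholder values):
\begin{minted}[breaklines,frame=single]{python}
import numpy as np
import scipy.ndimage
import scipy.signal

def hpss_1pass(s, fs, n_fft=2048, l_harm=17, l_perc=17, p=2.0):
    """1-pass median-filtering HPSS with soft masks."""
    # STFT: rows = frequency bins, columns = time frames
    _, _, S_hat = scipy.signal.stft(s, fs, nperseg=n_fft)
    S = np.abs(S_hat)
    # median-filter along time (harmonic) and along frequency (percussive)
    H = scipy.ndimage.median_filter(S, size=(1, l_harm))
    P = scipy.ndimage.median_filter(S, size=(l_perc, 1))
    # soft (Wiener-like) masks
    eps = np.finfo(float).eps
    M_h = H**p / (H**p + P**p + eps)
    M_p = P**p / (H**p + P**p + eps)
    # mask the complex STFT and invert
    _, h = scipy.signal.istft(S_hat * M_h, fs, nperseg=n_fft)
    _, perc = scipy.signal.istft(S_hat * M_p, fs, nperseg=n_fft)
    return h, perc
\end{minted}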

\begin{frame}
\frametitle{STFT, CQT, and TF resolution}
STFT vs. CQT\footnote{\url{https://www.mathworks.com/help/wavelet/ref/cqt.html}} (based on NSGT\footmediumcite{balazs}):
@@ -223,6 +269,26 @@
Use 2 WMDCT transforms (wide + narrow window) + Group-LASSO to shrink input signal into significant coefficients in ``time'' and ``frequency'' groups
\end{frame}

\begin{frame}[fragile]
\frametitle{Audioshrink MATLAB pseudocode}
\begin{figure}[h]
\centering
\begin{minted}[numbersep=\mintednumbersep,linenos,mathescape=true,breaklines,frame=single,escapeinside=||]{text}
|$f = \text{mixed audio}$|
|$F1 = \text{frametight}(\text{frame}(\text{wmdct}, \text{gauss}, \text{winsize}_{h}))\qquad\text{\textbf{WMDCT} }$|
|$F2 = \text{frametight}(\text{frame}(\text{wmdct}, \text{gauss}, \text{winsize}_{p}))$|
|$c1 = \text{franagrouplasso}(F1, f, \lambda_{h}, \text{soft}, \text{freq})$|
|$c2 = \text{franagrouplasso}(F2, f, \lambda_{p}, \text{soft}, \text{time})$|
|$xh = \text{frsyn}(F1, c1)\qquad\qquad\qquad\text{\textbf{Inverse WMDCT} }$|
|$xp = \text{frsyn}(F2, c2)$|
\end{minted}
\captionof{listing}{WMDCTLasso tonal/transient separation algorithm}
\end{figure}
\textit{franagrouplasso} in LTFAT solves the Group-LASSO regression problem in the time-frequency domain\\
Soft thresholding vs. hard thresholding -- similar to soft/hard masking
\end{frame}
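
This is not the LTFAT implementation, but as an illustration of what the Group-LASSO shrinkage does, here is a NumPy sketch of a single group soft-thresholding pass on a coefficient matrix arranged as frequency x time (an assumption for this sketch):
\begin{minted}[breaklines,frame=single]{python}
import numpy as np

def group_soft_threshold(C, lam, groups="freq"):
    """One group soft-thresholding pass on a TF coefficient matrix C.

    groups='freq' treats each frequency channel (row) as a group, which
    favours tonal ridges; groups='time' treats each time column as a
    group, which favours transients. lam plays the role of lambda_h/lambda_p.
    """
    axis = 1 if groups == "freq" else 0
    norms = np.linalg.norm(C, axis=axis, keepdims=True)
    # shrink whole groups by the factor max(0, 1 - lam / ||group||_2)
    scale = np.maximum(0.0, 1.0 - lam / (norms + np.finfo(float).eps))
    return C * scale
\end{minted}
Roughly speaking, LTFAT's \textit{franagrouplasso} iterates a step like this over the WMDCT analysis/synthesis frames until the coefficients converge.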

\begin{frame}
\frametitle{R{\'e}nyi entropy vs. Lasso}
\begin{itemize}
@@ -281,6 +347,30 @@
\end{itemize}
}

\begin{frame}[fragile]
\frametitle{TFJigsaw MATLAB pseudocode}
\begin{figure}[h]
\centering
\begin{minted}[numbersep=\mintednumbersep,linenos,mathescape=true,breaklines,frame=single,escapeinside=||]{text}
|$f = \text{mixed audio}$|
|$a,M,\text{winsize},b\{1,2\} = \text{Gabor systems 1 and 2 configuration}$|
|$r\{1,2\} = \text{significance level of tonal and transient layer re: white noise ref}$|
|$[\text{ref}1, \text{ref}2] = \text{generate estimate of random white noise entropy within supertile}$|
|$[\text{tau}1, \text{tau}2] = [\text{ref}1 \cdot r1, \text{ref}2 \cdot r2]$|
|$c1 = \text{DGTReal}(f, \text{winsize}1, a1, M1)\qquad\text{\textbf{Discrete Gabor Transform}}$|
|$c2 = \text{DGTReal}(f, \text{winsize}2, a2, M2)$|
|$\text{for all time and frequency supertiles}$|
|$\qquad f\{1,2\} = \text{frequency supertile location, Gabor system 1,2}$|
|$\qquad t\{1,2\} = \text{time supertile location, Gabor system 1,2}$|
|$\qquad [c1, c2] = \text{decision}(c1,c2,f1,f2,t1,t2,\text{tau}1,\text{tau}2)$|
|$\text{endfor}$|
|$f_{\text{tonal}} = \text{IDGTReal}(c1)\qquad\qquad\text{\textbf{Inverse discrete Gabor Transform}}$|
|$f_{\text{transient}} = \text{IDGTReal}(c2)$|
\end{minted}
\captionof{listing}{TFJigsaw tonal/transient separation algorithm}
\end{figure}
\end{frame}
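
The per-supertile decision is the core of the loop above; the following Python sketch shows one plausible way to implement it with a R{\'e}nyi entropy criterion (illustrative only -- the entropy estimator, tiling, and tie-breaking of the actual LTFAT TFJigsaw demo differ in detail):
\begin{minted}[breaklines,frame=single]{python}
import numpy as np

def renyi_entropy(c, alpha=3.0):
    """Renyi entropy of the normalized energy distribution of coefficients c."""
    e = np.abs(np.ravel(c)) ** 2
    p = e / (e.sum() + np.finfo(float).eps)
    return np.log2((p ** alpha).sum() + np.finfo(float).eps) / (1.0 - alpha)

def decide_supertile(c1_tile, c2_tile, tau1, tau2, alpha=3.0):
    """Assign one supertile to the tonal or transient layer.

    c1_tile, c2_tile: coefficients of the same supertile in Gabor systems
    1 (tonal) and 2 (transient); tau1, tau2: entropy thresholds derived
    from the white-noise reference (ref * r).
    """
    h1 = renyi_entropy(c1_tile, alpha)
    h2 = renyi_entropy(c2_tile, alpha)
    if h1 <= tau1 and h1 <= h2:
        # tile is well concentrated in system 1: keep it in the tonal layer
        return c1_tile, np.zeros_like(c2_tile)
    if h2 <= tau2:
        # tile is well concentrated in system 2: keep it in the transient layer
        return np.zeros_like(c1_tile), c2_tile
    # neither layer is significant vs. white noise: leave both unchanged
    # (sending such tiles to a residual layer is one possible handling)
    return c1_tile, c2_tile
\end{minted}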

\begin{frame}
\frametitle{Evaluation testbench}
Inspired by SigSep\footnote{\url{https://sigsep.github.io/}}, SISEC (Signal Separation Evaluation Campaign):
@@ -409,6 +499,23 @@
\end{table}
\end{frame}

\begin{frame}
\frametitle{NSGT in STFT-based neural network}
\begin{figure}
\includegraphics[width=10cm]{./cdae_arch.png}
\caption{Convolutional denoising autoencoders\footfullcite{cdae}}
\end{figure}
Idea: replace the STFT spectrogram with an NSGT spectrogram. Both the MATLAB Wavelet Toolbox CQT (based on the NSGT) and the reference Python implementation of the NSGT (\href{https://github.com/grrrr/nsgt}{https://github.com/grrrr/nsgt}) can produce a rectangular matrix of TF coefficients, solving a common difficulty of using the CQT in spectrogram-based algorithms.
\end{frame}
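
A minimal sketch of producing such a rectangular NSGT matrix with the reference Python package is shown below (assuming the NSGT class, LogScale, and the matrixform option behave as described in the grrrr/nsgt README; names and signatures may differ between versions, and the scale parameters here are placeholders):
\begin{minted}[breaklines,frame=single]{python}
import numpy as np
from nsgt import NSGT, LogScale  # https://github.com/grrrr/nsgt

fs = 44100
sig = np.random.randn(5 * fs)        # placeholder: a real mixture would go here

# roughly CQT-like log-frequency scale: 96 bins between 50 Hz and Nyquist
scale = LogScale(50.0, fs / 2.0, 96)
nsgt = NSGT(scale, fs, len(sig), real=True, matrixform=True)

coeffs = list(nsgt.forward(sig))     # per-channel NSGT coefficients
C = np.array(coeffs)                 # matrixform -> equal lengths, stackable
S = np.abs(C)                        # rectangular NSGT "spectrogram" for the CDAE
rec = nsgt.backward(coeffs)          # inverse NSGT for reconstruction
\end{minted}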

\begin{frame}
\frametitle{NSGT-spectrogram neural network}
\begin{figure}
\includegraphics[width=9cm]{./mixin_arch.png}
\caption{Toy/demo in \href{https://github.com/sevagh/MiXiN}{https://github.com/sevagh/MiXiN}}
\end{figure}
\end{frame}

\begin{frame}
\frametitle{Conclusions}
\begin{itemize}
@@ -419,8 +526,20 @@
\item
Competitive PEASS separation results in hybrid algorithms based on advanced DSP/time-frequency analysis (not so good in BSSv4)
\item
Swap STFT for NSGT in a both traditional DSP algorithms, and machine/deep learning networks\footnote{\url{https://github.com/sevagh/MiXiN}} -- lots of future potential
Swap STFT for NSGT in both traditional DSP algorithms and machine/deep learning networks -- lots of future potential
\end{itemize}
MUMT 501 to MUMT 622:
\begin{table}[ht]
\centering
\begin{tabular}{l c c}
\hline\hline
Algorithm & Harmonic & Percussive \\ [0.5ex]
\hline\hline
HPSS (MUMT 501) \href{run:./elmestizo.wav}{\faVolumeUp \ mix} & \href{run:./elmestizo_harm_501.wav}{\faVolumeUp \ h} & \href{run:./elmestizo_perc_501.wav}{\faVolumeUp \ p} \\ [0.5ex]
MiXiN (MUMT 622) & \href{run:./elmestizo_harm_622.wav}{\faVolumeUp \ h} & \href{run:./elmestizo_perc_622.wav}{\faVolumeUp \ p} \\ [0.5ex]
\hline
\end{tabular}
\end{table}
\end{frame}

\end{document}
14 changes: 7 additions & 7 deletions latex/report.tex
@@ -739,19 +739,19 @@ \subsubsection{TFJigsaw}

\begin{figure}[h]
\centering
\centering
\begin{minted}[numbersep=\mintednumbersep,linenos,mathescape=true,breaklines,frame=single,escapeinside=||]{text}
|$f = \text{mixed audio}$|
|$a,M,\text{winsize},b\{1,2\} = \text{Gabor systems 1 and 2 configuration}$|
|$r\{1,2\} = \text{significance level of tonal and transient layer re: white noise ref}$|
|$[\text{ref}1, \text{ref}2] = \text{generate estimate of random white noise entropy within supertile}$|
|$[\text{tau}1, \text{tau}2] = [\text{ref}1 \cdot r1, \text{ref}2 \cdot r2]$|
|$c1 = \text{DGTReal}(f, \text{winsize}1, a1, M1)\qquad\text{\textbf{Discrete Gabor Transform}}$|
|$c2 = \text{DGTReal}(f, \text{winsize}2, a2, M2)$|
|$f1 = \text{frequency supertile location, Gabor system 1}$|
|$f2 = \text{frequency supertile location, Gabor system 2}$|
|$t1 = \text{time supertile location, Gabor system 1}$|
|$t2 = \text{time supertile location, Gabor system 2}$|
|$[c1, c2] = \text{decision}(c1,c2,f1,f2,t1,t2,\text{tau}1,\text{tau}2)$|
|$\text{for all time and frequency supertiles}$|
|$\qquad f\{1,2\} = \text{frequency supertile location, Gabor system 1,2}$|
|$\qquad t\{1,2\} = \text{time supertile location, Gabor system 1,2}$|
|$\qquad [c1, c2] = \text{decision}(c1,c2,f1,f2,t1,t2,\text{tau}1,\text{tau}2)$|
|$\text{endfor}$|
|$f_{\text{tonal}} = \text{IDGTReal}(c1)\qquad\qquad\text{\textbf{Inverse discrete Gabor Transform}}$|
|$f_{\text{transient}} = \text{IDGTReal}(c2)$|
\end{minted}
@@ -842,7 +842,7 @@ \subsubsection{WMDCTLasso}
\label{lst:wmdctlasso}
\end{figure}

The \Verb#franagrouplasso# function is provided in the LFAT, and solves the Group-LASSO regression problem in the time-frequency domain. The theoretical background was covered in section \ref{sec:theorysparsity}. It is used in the WMDCTLasso algorithm to search for tonal components (with the `freq' argument), and transient components (with the `time' argument). The parameters of WMDCT are $\lambda$, or sparsity, for the Group Lasso regression for each of the tonal and transient systems, and Gaussian window sizes for the WMDCT frames for each system. A variety of parameter combinations were tested, shown in table \ref{table:round2wmdct}. The defaults, taken from the demo, are window sizes of 256 (tonal) and 32 (transient), and lambda values of 0.8 (tonal) and 0.5 (transient) for the percussive system.
The \Verb#franagrouplasso# function is provided in the LTFAT toolbox, and solves the Group-LASSO regression problem in the time-frequency domain. The theoretical background was covered in section \ref{sec:theorysparsity}. It is used in the WMDCTLasso algorithm to search for tonal components (with the `freq' argument), and transient components (with the `time' argument). The parameters of WMDCTLasso are $\lambda$, or sparsity, for the Group Lasso regression for each of the tonal and transient systems, and Gaussian window sizes for the WMDCT frames for each system. A variety of parameter combinations were tested, shown in table \ref{table:round2wmdct}. The defaults, taken from the demo, are window sizes of 256 (tonal) and 32 (transient), and lambda values of 0.8 (tonal) and 0.5 (transient).

\begin{table}[ht]
\centering
