\documentclass{beamer}
%
% Choose how your presentation looks.
%
% For more themes, color themes and font themes, see:
% http://deic.uab.es/~iblanes/beamer_gallery/index_by_theme.html
%
\mode<presentation>
{
\usetheme{default} % or try Darmstadt, Madrid, Warsaw, ...
\usecolortheme{default} % or try albatross, beaver, crane, ...
\usefonttheme{default} % or try serif, structurebold, ...
\setbeamertemplate{navigation symbols}{}
\setbeamertemplate{caption}[numbered]
}
\usepackage[english]{babel}
\usepackage[utf8x]{inputenc}
\title[Multivariate Probability and Statistics]{Multivariate Probability and Statistics}
\author{Jussi Martin}
%\institute{Where You're From}
\date{October 24, 2016}
\begin{document}
\begin{frame}
\titlepage
\end{frame}
% Uncomment these lines for an automatically generated outline.
%\begin{frame}{Outline}
% \tableofcontents
%\end{frame}
\section{Introduction}
\begin{frame}{Introduction}
This presentation is a brief summary of some of the topics introduced in
chapter 4 of the book \emph{Natural Image Statistics} (reference on
the last slide).
Since there won't be enough time to cover the whole of chapter 4, I have decided
to go through the things that are needed for understanding \emph{Bayes' rule},
which I think is one of the core topics of this chapter.
I will also introduce \emph{expectation}, \emph{variance}, \emph{covariance},
\emph{likelihood}, \emph{log-likelihood}, the \emph{maximum likelihood estimate}
and the \emph{maximum a posteriori estimate} in these slides, since some of them
are needed in the exercises.
\end{frame}
\section{Random Variables}
\begin{frame}{Random Variables}
A discrete random variable is a variable which can take any value from some given
countable set, each with a given probability. Example: the outcome of throwing a die.
The outcome can be any integer value from 1 to 6, each with probability
$\frac{1}{6}$.
A continuous random variable is a variable which can take any value from some given
continuous range of values, with a given probability of the value being in
any given interval. Example: the uniform probability distribution on the interval
$[0, 1]$. Here the outcome is in an interval $[a, b]$ with probability $b-a$
for any given values $0 \le a \le b \le 1$.
\end{frame}
\section{Random Vectors}
\begin{frame}{Random Vectors}
A random vector is an $n$-tuple of random variables. Its components may all be of
the same type or of different types. Formally:
\[ \mathbf{z} = \begin{pmatrix} z_1 \\ z_2 \\ \vdots \\ z_n \end{pmatrix} \]
where $\mathbf{z}$ is the random vector and the random variables $z_i$ are
its components.
\end{frame}
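\begin{frame}{Random Vectors: Example}
As a simple illustration (this particular example is my own, chosen just for
concreteness), let $z_1$ be the outcome of throwing a die and $z_2$ a uniformly
distributed value on $[0, 1]$. Then
\[ \mathbf{z} = \begin{pmatrix} z_1 \\ z_2 \end{pmatrix} \]
is a random vector with one discrete and one continuous component.
\end{frame}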
\section{Probability Density Function (pdf)}
\begin{frame}{Probability Density Function (pdf)}
Let $P(\text{$z$ is in $[a, b]$})$ be the probability that the value of a
random variable $z$ is in the interval $[a, b]$. Then the probability density
function $p_z(a)$ can be defined approximately as follows:
\[ p_z(a) \approx \frac{P(\textnormal{$z$ is in $[a, a + \nu]$})}{\nu}\]
for very small values of $\nu$.
Similarly, if $\mathbf{z}$ is a random vector:
\[ p_{\mathbf{z}}(\mathbf{a}) \approx
\frac{P(\textnormal{$z_i$ is in $[a_i, a_i + \nu]$ for all $i$})}{\nu^n}\]
for very small values of $\nu$. Rigorous definitions are obtained by taking
the one-sided limit $\nu \to 0^+$.
\end{frame}
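\begin{frame}{Probability Density Function: Example}
As a quick check of the definition, consider again the uniform distribution on
$[0, 1]$. For any $a$ in $[0, 1 - \nu]$ we have
\[ P(\textnormal{$z$ is in $[a, a + \nu]$}) = \nu, \]
so
\[ p_z(a) \approx \frac{\nu}{\nu} = 1. \]
Hence the pdf of the uniform distribution equals $1$ on the interval $[0, 1]$
(and $0$ outside it).
\end{frame}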
\section{Joint and Marginal pdfs}
\begin{frame}{Joint and Marginal pdfs}
The pdf $p(z_1, z_2, \ldots, z_n)$ of a random vector $\mathbf{z}$ is also
called the joint pdf, since it depends on all the components $z_i$.
The marginal pdfs $p_{z_i}(z_i)$ are obtained by integrating over the other
variables (that is, over the $z_j$ with $j \ne i$). Example in two dimensions:
\[ p_{z_1}(z_1) = \int p_{\mathbf{z}}(z_1, z_2)dz_2.\]
% Here we have used same notation for the random variable and its value. More
% rigorous way would be to write:
% \[ p_{z_1}(\nu_1) = \int p(\nu_1, \nu_2)d\nu_2.\]
% Earlier notation is often used in practice and we will also use it later.
\end{frame}
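\begin{frame}{Joint and Marginal pdfs: Example}
A small worked example (my own, for illustration): let $\mathbf{z} = (z_1, z_2)$
be uniformly distributed on the unit square, that is $p_{\mathbf{z}}(z_1, z_2) = 1$
for $z_1, z_2$ in $[0, 1]$ and $0$ otherwise. Then
\[ p_{z_1}(z_1) = \int_0^1 1 \, dz_2 = 1 \ \text{for $z_1$ in $[0, 1]$}, \]
so the marginal distribution of $z_1$ is again the uniform distribution on $[0, 1]$.
\end{frame}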
\section{Conditional Probabilities}
\begin{frame}{Conditional Probabilities}
Conditional pdfs are obtained from any joint pdf by fixing the value of one (or
several) of its components and dividing the result by a normalization
factor. Example (the conditional pdf of $z_2$ with $z_1$ given):
\[
p(z_2|z_1 = a) =
\frac{ p_{\mathbf{z}}(a, z_2)}{\int p_{\mathbf{z}}(a, z_2)dz_2}.
\]
Using the definition of marginal pdf, this can be written as
\[
p(z_2|z_1 = a) =
\frac{ p_{\mathbf{z}}(a, z_2)}{p_{z_1}(a)},
\]
which can be simplified further by omitting the subscripts
\[
p(z_2|z_1 = a) =
\frac{ p(a, z_2)}{p(a)}
\]
\end{frame}
\begin{frame}{Conditional Probabilities}
or, by not introducing the quantity $a$ at all, we can write it as
\[
p(z_2|z_1) =
\frac{ p(z_1, z_2)}{p(z_1)}.
\]
In the discrete case the integral is replaced by a sum, that is
\[
P(z_2|z_1) = \frac{P_{\mathbf{z}}(z_1, z_2)}{P_{z_1}(z_1)}
\]
where
\[
P_{z_1}(z_1) = \sum_{z_2}P_{\mathbf{z}}(z_1, z_2).
\]
\end{frame}
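\begin{frame}{Conditional Probabilities: Example}
A small discrete example (the numbers below are chosen here just for
illustration): let the joint probabilities of two binary variables be
% Illustrative numbers, not from the book.
\[
P(0, 0) = 0.1, \quad P(0, 1) = 0.3, \quad P(1, 0) = 0.2, \quad P(1, 1) = 0.4.
\]
Then $P_{z_1}(0) = 0.1 + 0.3 = 0.4$ and
\[
P(z_2 = 1 | z_1 = 0) = \frac{P(0, 1)}{P_{z_1}(0)} = \frac{0.3}{0.4} = 0.75.
\]
\end{frame}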
\section{Independence}
\begin{frame}{Independence}
Two random variables $z_1$ and $z_2$ are said to be statistically independent
if information about the value of one of them does not give any information
about the value of the other. Formally this can be written as
\[
p(z_2|z_1) = p(z_2) \ \text{for every $z_1$ and $z_2$},
\]
since the conditional probability assumes that the value of the other variable is
fixed (and hence known). Substituting the formula for $p(z_2|z_1)$ we obtain
two alternative ways to define independence:
\[
\frac{p(z_1, z_2)}{p(z_1)} = p(z_2) \ \text{for every $z_1$ and $z_2$}
\]
or
\[
p(z_1, z_2) = p(z_1)p(z_2) \ \text{for every $z_1$ and $z_2$}.
\]
\end{frame}
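\begin{frame}{Independence: Example}
For example, the outcomes $z_1$ and $z_2$ of two separate throws of a die are
independent: every pair of values has probability
\[
P(z_1, z_2) = \frac{1}{36} = \frac{1}{6} \cdot \frac{1}{6} = P(z_1)P(z_2),
\]
so knowing the first outcome tells nothing about the second.
\end{frame}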
%
% \section{Independence and covariance}
%
% \begin{frame}{Independence and covariance}
% For independent variables $z_1$ and $z_2$ the relation
% \[
% E\{g_1(z_1)g_2(z_2)\} = E\{g_1(z_1)\}E\{g_2(z_2)\}
% \]
% holds for any functions $g_1$ and $g_2$
% (rigorously speaking: for any measurable functions), which implies that
% independent variables are uncorrelated (take $g_1(z) = g_2(z) = z$).
% \end{frame}
\section{Bayes' Rule}
\begin{frame}{Bayes' Rule}
Bayes' rule gives us the probability distribution $p(\mathbf{s}|\mathbf{z})$
when we know the distributions $p(\mathbf{z}|\mathbf{s})$ and
$p_{\mathbf{s}}(\mathbf{s})$. For a continuously distributed random vector it is
written as follows:
\[
p(\mathbf{s}|\mathbf{z}) =
\frac{p(\mathbf{z}|\mathbf{s})p_{\mathbf{s}}(\mathbf{s})}{\int p(\mathbf{z}|\mathbf{s})p_{\mathbf{s}}(\mathbf{s})d\mathbf{s}}.
\]
The distribution on the left is called the \emph{posterior distribution} and the
distribution $p_{\mathbf{s}}(\mathbf{s})$ on the right-hand side is called the
\emph{prior distribution}.
In the case of a discretely distributed random vector the rule takes the form:
\[
P(\mathbf{s}|\mathbf{z}) =
\frac{P(\mathbf{z}|\mathbf{s})P_{\mathbf{s}}(\mathbf{s})}{\sum_{\mathbf{s}} P(\mathbf{z}|\mathbf{s})P_{\mathbf{s}}(\mathbf{s})}.
\]
\end{frame}
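\begin{frame}{Bayes' Rule: Example}
A small numerical example (the numbers are chosen here only for illustration):
suppose a coin is either fair or biased, with prior probabilities
$P(\text{fair}) = P(\text{biased}) = 0.5$, and that
$P(\text{heads}|\text{fair}) = 0.5$ while $P(\text{heads}|\text{biased}) = 0.9$.
% Illustrative numbers, not from the book.
After observing a single heads, Bayes' rule gives
\[
P(\text{biased}|\text{heads}) =
\frac{0.9 \cdot 0.5}{0.9 \cdot 0.5 + 0.5 \cdot 0.5} = \frac{0.45}{0.7} \approx 0.64.
\]
\end{frame}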
% \section{Non informative priors}
%
% \begin{frame}{Non informative priors}
% definition
% \[
% p(\mathbf{s}|\mathbf{z}) =
% \frac{p(\mathbf{z}|\mathbf{s})c}{\int p(\mathbf{z}|\mathbf{s})cds} =
% \frac{p(\mathbf{z}|\mathbf{s})}{\int p(\mathbf{z}|\mathbf{s})ds}
% \]
% \end{frame}
%
% \section{Bayesian inference as a incremental learning process}
%
% \begin{frame}{Bayesian inference as a incremental learning process}
% Explanation
% \end{frame}
%
\section{Expectation}
\begin{frame}{Expectation}
The expectation $E\{z\}$ of a random variable $z$ is a weighted average of the
outcomes which the variable can take, with the weights being the individual
probabilities (or the values of the pdf in the continuous case).
\[
\text{Discrete case:} \qquad
E\{z\} = \sum_{z} P_{z}(z)z
\]
\[
\text{Continuous case:} \qquad
E\{z\} = \int p_{z}(z)zdz
\]
The expectation can also be defined for random vectors componentwise:
\[
E\{\mathbf{z}\} =
\begin{pmatrix} E\{z_1\} \\ E\{z_2\} \\ \vdots \\ E\{z_n\} \end{pmatrix}
\]
where $E\{z_i\}$ is defined as above, with $z$ replaced by $z_i$ for all $i$.
% (write the components?)\\
% it is linear:
% \[
% E\{a\mathbf{z} + b\mathbf{z}\} = aE\{\mathbf{z}\} + bE\{\mathbf{z}\}.
% \]
% Similarily, for any matrix $\mathbf{M}$
% \[
% E\{\mathbf{Mz}\} = \mathbf{M}E\{\mathbf{z}\}.
% \]
\end{frame}
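\begin{frame}{Expectation: Example}
Returning to the earlier examples: for a die throw
\[
E\{z\} = \sum_{z=1}^{6} \frac{1}{6}\, z = \frac{1 + 2 + 3 + 4 + 5 + 6}{6} = 3.5,
\]
and for the uniform distribution on $[0, 1]$
\[
E\{z\} = \int_0^1 1 \cdot z \, dz = \frac{1}{2}.
\]
\end{frame}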
\section{Variance and Covariance}
\begin{frame}{Variance and Covariance}
The variance tells, in some sense, how closely the probability mass is concentrated
around the expected value. It is defined as follows:
\[
var(z)= E\{z^2\}-(E\{z\})^2.
\]
Alternatively it can be written as
\[
var(z) = E\{(z-E\{z\})^2\}.
\]
When we have two variables we can define their covariance:
\[
cov(z_1, z_2) = E\{z_1 z_2\} - E\{z_1\}E\{z_2\}
\]
which measures how well we can predict the value of one variable from the other
by using a simple linear predictor.
% \section{Correlation Coefficient}
The covariance is often normalized; the resulting quantity is the correlation
coefficient:
\[
corr(z_1,z_2) = \frac{cov(z_1, z_2)}{\sqrt{var(z_1)var(z_2)}}.
\]
\end{frame}
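\begin{frame}{Variance: Example}
Continuing the die example: $E\{z\} = 3.5$ and
\[
E\{z^2\} = \frac{1 + 4 + 9 + 16 + 25 + 36}{6} = \frac{91}{6},
\]
so
\[
var(z) = \frac{91}{6} - 3.5^2 = \frac{35}{12} \approx 2.92.
\]
\end{frame}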
% \section{Covariance Matrix}
%
% \begin{frame}{Covariance Matrix}
% \[
% \mathbf{C}(\mathbf{z}) =
% \begin{pmatrix}
% cov(z_1, z_1) \ cov(z_1, z_2) \ldots cov(z_1, z_n) \\
% cov(z_2, z_1) \ cov(z_2, z_2) \ldots cov(z_2, z_n) \\
% \vdots \qquad \qquad \ddots \quad \vdots \\
% cov(z_n, z_1) \ cov(z_n, z_2) \ldots cov(z_n, z_n)
% \end{pmatrix}
% \]
% or more concisely:
% \[ \mathbf{C}(\mathbf{z}) =
% E\{\mathbf{zz}^T\} - E\{\mathbf{z}\} E\{\mathbf{z}\}^T\]
% \end{frame}
\section{Likelihood and Log-likelihood}
\begin{frame}{Likelihood and Log-likelihood}
Given $n$ samples $z(1), \ldots , z(n)$ of a random variable $z$, which
is assumed to have probability distribution $p(z|\alpha)$ for some unknown
parameter $\alpha$, we can define the likelihood:
\[
p(z(1), \ldots , z(n)|\alpha) = p(z(1)|\alpha)\times \ldots \times p(z(n)|\alpha)
\]
and the log-likelihood:
\[
\log p(z(1), \ldots , z(n)|\alpha) = \log p(z(1)|\alpha) + \ldots + \log p(z(n)|\alpha).
\]
These are functions of the parameter $\alpha$, when the model $p(z|\alpha)$
and the samples $z(1), \ldots , z(n)$ are fixed.
\end{frame}
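\begin{frame}{Likelihood and Log-likelihood: Example}
As a simple example (a coin-flipping model chosen here for illustration), let the
samples be $n$ flips of a coin whose probability of heads is the unknown
parameter $\alpha$.
% Illustrative model, not from the book.
If $k$ of the $n$ flips are heads, the likelihood is
\[
p(z(1), \ldots , z(n)|\alpha) = \alpha^{k}(1 - \alpha)^{n - k}
\]
and the log-likelihood is
\[
\log p(z(1), \ldots , z(n)|\alpha) = k \log \alpha + (n - k)\log(1 - \alpha).
\]
\end{frame}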
\section{Maximum Likelihood and Maximum a Posteriori}
\begin{frame}{Maximum Likelihood and Maximum a Posteriori}
In maximum likelihood and maximum a posteriori estimation one tries to find
the value of the parameter $\alpha$ which is, in some sense, optimal for
the observed samples $z(1), \ldots , z(n)$.
Using Bayes' rule we see that
\[
p(\alpha| z(1), \ldots , z(n)) = \frac{p(z(1), \ldots , z(n)|\alpha)p(\alpha)}{p(z(1), \ldots , z(n))}.
\]
The maximum a posteriori estimate $\hat{\alpha}$ is the one that maximizes
the value of the posterior probability for the given prior $p(\alpha)$.
The maximum likelihood estimate $\hat{\alpha}$, on the other hand, is the one
which maximizes the likelihood.
If we choose a flat prior, that is, $p(\alpha)$ constant, these estimates are
the same.
\end{frame}
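\begin{frame}{Maximum Likelihood: Example}
Continuing the coin-flipping example: setting the derivative of the
log-likelihood with respect to $\alpha$ to zero,
\[
\frac{d}{d\alpha}\left( k \log \alpha + (n - k)\log(1 - \alpha) \right)
= \frac{k}{\alpha} - \frac{n - k}{1 - \alpha} = 0,
\]
gives the maximum likelihood estimate
\[
\hat{\alpha} = \frac{k}{n},
\]
that is, the observed proportion of heads. With a flat prior this is also the
maximum a posteriori estimate.
\end{frame}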
\section{References}
\begin{frame}
\frametitle{References}
\footnotesize{
\begin{thebibliography}{09}
\bibitem[1]{p1} Aapo Hyvärinen, Jarmo Hurri, and Patrik O. Hoyer.
\newblock \emph{Natural Image Statistics: A Probabilistic Approach to Early Computational Vision}.
\newblock Springer-Verlag, 2009.
\end{thebibliography}
}
\end{frame}
\end{document}