\documentclass[avery5388,grid,frame]{flashcards}
\cardfrontstyle[\large\slshape]{headings}
\usepackage{fancyhdr}
\usepackage{graphicx}
\usepackage{amsmath,amssymb}\DeclareMathOperator*{\argmax}{\arg\!\max}\DeclareMathOperator*{\argmin}{\arg\!\min}
\usepackage{wasysym}
\usepackage{needspace,setspace,relsize,url}
\cardbackstyle{empty}
\begin{document}
\cardfrontfoot{Fundamentals of Probability}
\begin{flashcard}[Definition]{Bonferroni's Inequality}
\begin{center}
\bigskip\bigskip\bigskip
\begin{equation*}
P(A, B)\geq P(A)+P(B)-1
\end{equation*}
This is useful if you are asked to give the minimum possible value of $P(A,B)$
\end{center}
\end{flashcard}
\begin{flashcard}[Definition]{Bayes' Rule}
\begin{center}
\bigskip\bigskip\bigskip
\begin{equation*}
P(A|B)=\frac{P(B|A)P(A)}{P(B)}
\end{equation*}
\end{center}
\end{flashcard}
\begin{flashcard}[Definition]{Sensitivity and Specificity}
\begin{center}
\bigskip\bigskip\bigskip
Sensitivity=$P(T=1|D=1)$\\
\bigskip
Specificity=$P(T=0|D=0)$
\end{center}
\end{flashcard}
\begin{flashcard}[Equation]{Change of Variable}
\begin{center}
\bigskip\bigskip\bigskip
\begin{equation*}
f_Y(y)=f_X(g^{-1}(y))\Big|\frac{d}{dy}g^{-1}(y)\Big|
\end{equation*}
\bigskip\\
Note that this only works for 1-1 monotonic functions
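\bigskip\\
A quick worked check (using $X\sim Unif(0,1)$ and $Y=-\log X$ as an example): $g^{-1}(y)=e^{-y}$, so $f_Y(y)=1\cdot\big|-e^{-y}\big|=e^{-y}$ for $y>0$, i.e.\ $Y\sim Exp(1)$.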
\end{center}
\end{flashcard}
\begin{flashcard}[Equation]{Moment Generating Function}
\begin{center}
\bigskip\bigskip\bigskip
\begin{equation*}
M_X(t)=E[e^{tX}]=\int e^{tx}f(x)\,dx
\end{equation*}
\bigskip\\
Then evaluate the derivative for each moment at $t=0$. For example, the second moment would be the second derivative of the mgf evaluated at $t=0$.
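\bigskip\\
For example, taking $X\sim Exp(\beta)$ (rate parameterization, as on the Exponential card): $M_X(t)=\int_0^\infty e^{tx}\beta e^{-\beta x}dx=\frac{\beta}{\beta-t}$ for $t<\beta$, and the first derivative evaluated at $t=0$ gives $E(X)=\frac{1}{\beta}$.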
\end{center}
\end{flashcard}
\begin{flashcard}[Equation]{Location-Scale shift}
\begin{center}
\bigskip\bigskip\bigskip
\begin{equation*}
f_X(x)=\frac{1}{\sigma}f_Z\left(\frac{x-\mu}{\sigma}\right)
\end{equation*}
\bigskip\\
Basically, multiply the pdf by $\frac{1}{\sigma}$ and replace z with $\frac{x-\mu}{\sigma}$
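\bigskip\\
For example, with $Z\sim N(0,1)$ and $X=\mu+\sigma Z$: $f_X(x)=\frac{1}{\sigma}\cdot\frac{1}{\sqrt{2\pi}}\exp\left\{-\frac{(x-\mu)^2}{2\sigma^2}\right\}$, which is exactly the $N(\mu,\sigma^2)$ pdf.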
\end{center}
\end{flashcard}
\begin{flashcard}[Form]{Exponential Family}
\begin{center}
\bigskip\bigskip\bigskip
\begin{equation*}
f(x|\theta)=h(x)c(\theta)\exp\left\{\sum_{i=1}^kw_i(\theta)t_i(x)\right\}
\end{equation*}
\bigskip\\
If a family is exponential and the interior of the parameter space is non-empty, it is considered ``complete''. Therefore $\left(\sum_i t_1(x_i),\dots,\sum_i t_k(x_i)\right)$ is a CSS and hence the MSS.
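\bigskip\\
For example, $Pois(\lambda)$ fits this form: $f(x|\lambda)=\frac{e^{-\lambda}\lambda^x}{x!}=\underbrace{\tfrac{1}{x!}}_{h(x)}\underbrace{e^{-\lambda}}_{c(\lambda)}\exp\{\underbrace{\log\lambda}_{w(\lambda)}\cdot\underbrace{x}_{t(x)}\}$.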
\end{center}
\end{flashcard}
\begin{flashcard}[Pro Tip]{Mgf when X $\perp$ Y }
\begin{center}
\bigskip\bigskip\bigskip
\begin{equation*}
M_{X+Y}(t)=M_X(t)M_Y(t)
\end{equation*}
\bigskip\\
This is cool because it shows that if $X\perp Y$ and $X\sim N(\mu_x, \sigma_x^2)$ and $Y\sim N(\mu_y,\sigma_y^2)$, then $X+Y\sim N(\mu_x+\mu_y,\sigma_x^2+\sigma_y^2)$
\end{center}
\end{flashcard}
\begin{flashcard}[Equation]{Bivariate Transformations}
\begin{center}
\bigskip\bigskip\bigskip
\begin{equation*}
f_{u,v}(u,v)=f_{x,y}(h_1(u,v),h_2(u,v))|J|
\end{equation*}
\bigskip
This is for the continuous case. DON'T FORGET THE JACOBIAN, or Jacob Marley will come after you.
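\bigskip\\
For example, take $X,Y\overset{iid}{\sim}Exp(1)$ with $U=X+Y$ and $V=\frac{X}{X+Y}$, so $h_1(u,v)=uv$, $h_2(u,v)=u(1-v)$ and $|J|=u$. Then $f_{u,v}(u,v)=e^{-uv}e^{-u(1-v)}\cdot u=ue^{-u}$: $U\sim Gamma(2,1)$ and $V\sim Unif(0,1)$, independent.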
\end{center}
\end{flashcard}
\begin{flashcard}[Equation]{Iterative Expectations}
\begin{center}
\bigskip\bigskip\bigskip
\begin{equation*}
E[Y]=E_X[E_{Y|X}[Y|X]]
\end{equation*}
\end{center}
\end{flashcard}
\begin{flashcard}[Equation]{Iterative Variance}
\begin{center}
\bigskip\bigskip\bigskip
\begin{equation*}
Var[Y]=E_X[Var[Y|X]]+Var_X[E[Y|X]]
\end{equation*}
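\bigskip\\
For example, take $N\sim Pois(\lambda)$ and $Y|N\sim Bin(N,p)$: $E[Y]=E_N[Np]=\lambda p$ and $Var[Y]=E_N[Np(1-p)]+Var_N[Np]=\lambda p(1-p)+\lambda p^2=\lambda p$.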
\end{center}
\end{flashcard}
\begin{flashcard}[Definition]{Covariance}
\bigskip\bigskip\bigskip
{\begin{align*}
cov(X,Y)&=E[(X-\mu_X)(Y-\mu_Y)]\\
&=E[XY]-\mu_X\mu_Y
\end{align*}}
\end{flashcard}
\begin{flashcard}[Definition]{Correlation}
\bigskip\bigskip\bigskip
{\begin{align*}
corr(X,Y)=\frac{cov(X,Y)}{\sigma_x\sigma_y}
\end{align*}}
\end{flashcard}
\begin{flashcard}[Definition]{Chebyshev's Inequality}
\bigskip\bigskip\bigskip
{\begin{align*}
P(g(X)\geq r)&\leq \frac{E(g(X))}{r}\textrm{ (for non-negative $g$)}\\
P(|X-\mu|\geq t\sigma)&\leq\frac{1}{t^2}
\end{align*}}
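\bigskip\\
For example, taking $t=2$ in the second line gives $P(|X-\mu|\geq2\sigma)\leq\frac{1}{4}$ for any distribution with finite variance.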
\end{flashcard}
\begin{flashcard}[Definition]{Jensen's Inequality}
\bigskip\bigskip\bigskip
{\begin{align*}
E(g(x))\geq g(E(x))
\end{align*}}
\bigskip\\
This holds if $g(x)$ is a convex function. That means it has a non-negative 2nd derivative, like a smile. I have high expectations!
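\bigskip\\
For example, $g(x)=x^2$ is convex, so $E(X^2)\geq(E(X))^2$, i.e.\ $Var(X)\geq0$.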
\end{flashcard}
\begin{flashcard}[Definition]{H\"older's Inequality}
\bigskip\bigskip\bigskip
{\begin{align*}
|E(XY)|\leq E|XY|\leq (E(|X|^p))^{1/p}(E(|Y|^q))^{1/q}\\
\frac{1}{p}+\frac{1}{q}=1
\end{align*}}
\bigskip\\
\begin{center}
Cauchy-Schwarz is the special case where $p=q=2$. Can use it to show correlation is bounded by $-1$ and $1$.
\end{center}
\end{flashcard}
\begin{flashcard}[Proof]{Correlation is Bounded by -1 and 1}
\bigskip\bigskip\bigskip
{\begin{align*}
\Big|cov(X,Y)\Big|&=\Big|E[(X-\mu_x)(Y-\mu_y)]\Big|\\
&\leq(E|X-\mu_x|^2)^{1/2}(E|Y-\mu_y|^2)^{1/2}\leq\sqrt{\sigma_x^2}\sqrt{\sigma_y^2}\\
&\leq\sigma_x\sigma_y\\
\Big|corr(X,Y)\Big|&=\Big|\rho\Big|=\Big|\frac{cov(X,Y)}{\sigma_x\sigma_y}\Big|=\frac{\Big|cov(X,Y)\Big|}{\sigma_x\sigma_y}\leq\frac{\sigma_x\sigma_y}{\sigma_x\sigma_y}=1
\end{align*}}
\end{flashcard}
\begin{flashcard}[Equation]{Order Statistics PDF}
\bigskip\bigskip\bigskip
{\begin{align*}
\frac{n!}{(j-1)!(n-j)!}f(x)[F(x)]^{j-1}[1-F(x)]^{n-j}
\end{align*}}
\end{flashcard}
\begin{flashcard}[Equation]{Order Statistics CDF}
\bigskip\bigskip\bigskip
{\begin{align*}
\sum_{i=j}^n{n\choose i}F(x)^i[1-F(x)]^{n-i}
\end{align*}}
\end{flashcard}
\begin{flashcard}[Equation]{Joint PDF of Order Statistics}
\bigskip\bigskip\bigskip
{\begin{align*}
&\frac{n!}{(l-1)!(m-l-1)!(n-m)!}\\
&F(x_l)^{l-1}f(x_l)f(x_m)[F(x_m)-F(x_l)]^{m-l-1}[1-F(x_m)]^{n-m}
\end{align*}}
\bigskip\\
soooo it's so long it can't fit on one line...but this is all one equation. memorize it fooooool.
\end{flashcard}
\begin{flashcard}[Pro Tip]{If $X_i\overset{iid}{\sim}Unif(0,1)$ the pdf of the kth order statistic}
\bigskip\bigskip\bigskip
{\begin{align*}
Beta(k,n-k+1)
\end{align*}}
\end{flashcard}
\begin{flashcard}[Proof]{Convergence in Probability}
\bigskip
\begin{center}
Convergence in probability means that the estimator is consistent. We can prove something converges in probability using Chebyshev's. Also, the Weak Law of Large Numbers is that $\bar{X}\overset{p}{\rightarrow}\mu$.
{\begin{align*}
P(|\bar{X}_n-\mu|\geq \epsilon)&\leq\frac{E[(\bar{X}_n-\mu)^2]}{\epsilon^2}\\
&=\frac{Var(\bar{X}_n)}{\epsilon^2}=\frac{\sigma^2}{n\epsilon^2}\\
\lim_{n\rightarrow\infty}P(|\bar{X}_n-\mu|\geq \epsilon)&\leq\lim_{n\rightarrow\infty}\frac{\sigma^2}{n\epsilon^2}=0\\
\therefore\lim_{n\rightarrow\infty}P(|\bar{X}_n-\mu|\geq \epsilon)&=0
\end{align*}}
\end{center}
\end{flashcard}
\begin{flashcard}[Definition]{Convergence Almost Surely}
\bigskip
\begin{center}
\bigskip\bigskip\bigskip
{\begin{align*}
P(\lim_{n\rightarrow\infty}|\bar{X}_n-X|\geq \epsilon)=0
\end{align*}}
Convergence almost surely implies convergence in probability which implies convergence in distribution.
\end{center}
\end{flashcard}
\begin{flashcard}[Definition]{Convergence in Distribution}
\bigskip
\begin{center}
\bigskip\bigskip\bigskip
{\begin{align*}
X_n\overset{d}{\rightarrow}X\iff \lim_{n\rightarrow\infty}F_{x_n}(x)=F_x(x) \textrm{ for all x where $F_x(x)$ is continuous}
\end{align*}}
The CLT is convergence in distribution. Coming to a flashcard near you - proof of the CLT using MGFs. Yeah. You can't wait.
\end{center}
\end{flashcard}
\begin{flashcard}[Definition]{Slutsky's}
\bigskip
\begin{center}
If $X_n\overset{d}{\rightarrow}X$ and $Y_n\overset{p}{\rightarrow}a$ then:\\
$Y_nX_n\overset{d}{\rightarrow}aX$\\
and $Y_n+X_n\overset{d}{\rightarrow}a+X$
{\begin{align*}
\frac{\sqrt{n}(\bar{X}-\mu)}{S_n}&=\frac{\sqrt{n}(\bar{X}-\mu)}{\sigma}\frac{\sigma}{S_n}\overset{d}{\rightarrow}N(0,1)
\intertext{because}
\frac{\sqrt{n}(\bar{X}-\mu)}{\sigma}&\overset{d}{\rightarrow}N(0,1)\\
\frac{\sigma}{S_n}&\overset{p}{\rightarrow}1
\end{align*}}
\end{center}
\end{flashcard}
\begin{flashcard}[Definition]{Delta Method}
\begin{center}
\bigskip\bigskip\bigskip
1st order: if $\sqrt{n}(X_n-\theta)\overset{d}{\rightarrow}N(0,\sigma^2)$ and $g'(\theta)\neq0$,
\begin{equation*}
\sqrt{n}(g(X_n)-g(\theta))\overset{d}{\rightarrow}N(0,g'(\theta)^2\sigma^2)
\end{equation*}
2nd order: if $g'(\theta)=0$,
\begin{equation*}
n(g(X_n)-g(\theta))\overset{d}{\rightarrow}\frac{\sigma^2}{2}g''(\theta)\chi^2_1
\end{equation*}
\end{center}
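\bigskip\\
For example, if $X_i\overset{iid}{\sim}Pois(\lambda)$ so that $\sqrt{n}(\bar{X}_n-\lambda)\overset{d}{\rightarrow}N(0,\lambda)$, then taking $g(x)=\sqrt{x}$ gives $\sqrt{n}(\sqrt{\bar{X}_n}-\sqrt{\lambda})\overset{d}{\rightarrow}N(0,\tfrac{1}{4})$ (a variance-stabilizing transformation).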
\end{flashcard}
\begin{flashcard}[Definition]{Accept-Reject Algorithm}
\bigskip\bigskip\bigskip
To generate Y from $f(y)$:
\begin{enumerate}
\item Generate $v$ from a known distribution, $f_V(v)$, with support that contains the support of $Y$.
\item Calculate $M=\sup_y\frac{f_Y(y)}{f_V(y)}$ (the supremum of the density ratio over the support)
\item Generate $U\sim Unif(0,1)$; if $U\leq\frac{1}{M}\frac{f_Y(v)}{f_V(v)}$ then accept $v$ as a draw of $Y$, otherwise reject and repeat.
\end{enumerate}
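\bigskip\\
For example, to draw $Y\sim Beta(2,2)$ (so $f_Y(y)=6y(1-y)$ on $(0,1)$) using $V\sim Unif(0,1)$: $M=\sup_y6y(1-y)=\frac{3}{2}$, so accept $v$ whenever $U\leq\frac{1}{M}\frac{f_Y(v)}{f_V(v)}=4v(1-v)$.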
\end{flashcard}
\begin{flashcard}[Distribution]{Bernoulli}
\bigskip\bigskip\bigskip
{\begin{align*}
X&\sim Bin(1,p)\\
f(x)&=p^x(1-p)^{1-x}\textrm{ for } x=0,1\\
E(x)&=p\\
Var(x)&=p(1-p)
\end{align*}}
\end{flashcard}
\begin{flashcard}[Distribution]{Binomial}
\bigskip\bigskip\bigskip
{\begin{align*}
X&\sim Bin(n,p)\\
f(x)&={n\choose x}p^x(1-p)^{n-x}\\
E(x)&=np\\
Var(x)&=np(1-p)
\end{align*}}
\end{flashcard}
\begin{flashcard}[Distribution]{Geometric}
\bigskip\bigskip\bigskip
{\begin{align*}
X&\sim Geometric(p) \textrm{ or } X\sim NegBinom(1,p)\\
f(x)&=p(1-p)^{x-1}\textrm{ for x=1,...}\\
E(x)&=\frac{1}{p}\\
Var(x)&=\frac{1-p}{p^2}
\end{align*}}
\end{flashcard}
\begin{flashcard}[Distribution]{Negative Binomial}
\bigskip\bigskip\bigskip
{\begin{align*}
X&\sim NegBinom(r,p)\\
f(x)&={x-1\choose r-1}p^r(1-p)^{x-r} \textrm{ for x=r,r+1,...}\\
E(x)&=\frac{r}{p}\\
Var(x)&=\frac{r(1-p)}{p^2}
\end{align*}}
\bigskip\\
where x is the number of experiments needed to get r successes
\end{flashcard}
\begin{flashcard}[Distribution]{Hypergeometric}
\bigskip\bigskip\bigskip
$N=\#$ of balls, $K=\#$ selected, $M=\#$ of successes, $X=\#$ of successes in your sample
{\begin{align*}
f(x)&=\frac{{M\choose X}{N-M\choose K-X}}{{N\choose K}}\\
E(x)&=\frac{KM}{N}\\
Var(x)&=\frac{KM}{N}\left(\frac{N-M}{N}\right)\left(\frac{N-K}{N-1}\right)
\end{align*}}
\end{flashcard}
\begin{flashcard}[Distribution]{Discrete Uniform}
\bigskip\bigskip\bigskip
{\begin{align*}
X&\sim DUnif(a,b)\\
f(x)&=\frac{1}{b-a+1} \textrm{ for x=a,...,b}\\
E(x)&=\frac{a+b}{2}\\
Var(x)&=\frac{(b-a+1)^2-1}{12}
\end{align*}}
\end{flashcard}
\begin{flashcard}[Distribution]{Poisson}
\bigskip\bigskip\bigskip
{\begin{align*}
X&\sim Pois(\lambda)\\
f(x)&=\frac{e^{-\lambda}\lambda^x}{x!}\textrm{ for x=0,1,2...}\\
E(x)&=\lambda\\
Var(x)&=\lambda
\end{align*}}
\end{flashcard}
\begin{flashcard}[Distribution]{Uniform}
\bigskip\bigskip\bigskip
{\begin{align*}
X&\sim Unif(a,b)\\
f(x)&=\frac{1}{b-a}\textrm{ for $a<x<b$}\\
E(x)&=\frac{a+b}{2}\\
Var(x)&=\frac{(b-a)^2}{12}
\end{align*}}
\end{flashcard}
\begin{flashcard}[Distribution]{Gamma}
\bigskip\bigskip\bigskip
{\begin{align*}
X&\sim Gamma(\alpha,\beta)\\
f(x)&=\frac{1}{\Gamma(\alpha)\beta^\alpha}x^{\alpha-1}e^{-x/\beta}\textrm{ for $x>0$, $\alpha>0$, $\beta>0$}\\
E(x)&=\alpha\beta\\
Var(x)&=\alpha\beta^2
\end{align*}}
\end{flashcard}
\begin{flashcard}[Distribution]{Chi-square}
\bigskip\bigskip\bigskip
{\begin{align*}
X&\sim \chi^2(p)\\
f(x)&=\frac{1}{\Gamma(p/2)2^{p/2}}x^{p/2-1}e^{-x/2}\\
E(x)&=p\\
Var(x)&=2p
\end{align*}}
special case of Gamma where $\alpha=p/2$ and $\beta=2$.
\end{flashcard}
\begin{flashcard}[Distribution]{Exponential}
\bigskip\bigskip\bigskip
{\begin{align*}
X&\sim Exp(\beta)\\
f(x)&=\beta e^{-x\beta}\textrm{ for $x>0$, $\beta>0$}\\
E(x)&=\frac{1}{\beta}\\
Var(x)&=\frac{1}{\beta^2}
\end{align*}}
Special case of Gamma with $\alpha=1$ (careful: here $\beta$ is a rate, while the Gamma card uses $\beta$ as a scale, so this is $Gamma(1,1/\beta)$ in that notation)
\end{flashcard}
\begin{flashcard}[Distribution]{Normal}
\bigskip\bigskip\bigskip
{\begin{align*}
X&\sim N(\mu,\sigma^2)\\
f(x)&=\frac{1}{\sqrt{2\pi\sigma^2}}\exp\left\{-\frac{(x-\mu)^2}{2\sigma^2}\right\}\textrm{ for $-\infty< x<\infty$, $-\infty<\mu<\infty$, $\sigma>0$}\\
E(x)&=\mu\\
Var(x)&=\sigma^2
\end{align*}}
\end{flashcard}
\begin{flashcard}[Distribution]{Beta}
\bigskip\bigskip\bigskip
{\begin{align*}
X&\sim Beta(\alpha,\beta)\\
f(x)&=\frac{\Gamma(\alpha+\beta)}{\Gamma(\alpha)\Gamma(\beta)}x^{\alpha-1}(1-x)^{\beta-1}\textrm{ for $0<x<1$, $\alpha>0$, $\beta>0$}\\
E(x)&=\frac{\alpha}{\alpha+\beta}\\
Var(x)&=\frac{\alpha\beta}{(\alpha+\beta)^2(\alpha+\beta+1)}
\end{align*}}
\end{flashcard}
\begin{flashcard}[Distribution]{Cauchy}
\bigskip\bigskip\bigskip
{\begin{align*}
X&\sim Cauchy(\theta)\\
f(x)&=\frac{1}{\pi(1+(x-\theta)^2)}\textrm{ for $-\infty<x<\infty$, $-\infty<\theta<\infty$}
\end{align*}}
\end{flashcard}
\begin{flashcard}[Pro Tip]{$e^t$}
\bigskip\bigskip\bigskip
{\begin{align*}
\lim_{n\rightarrow\infty}\left(1+\frac{t}{n}\right)^n
\end{align*}}
\end{flashcard}
\begin{flashcard}[Pro Tip]{Binomial Series}
\bigskip\bigskip\bigskip
{\begin{align*}
\sum_{v=0}^u\frac{u!}{(u-v)!v!}\theta^{u-v}\lambda^v=(\theta+\lambda)^u
\end{align*}}
\end{flashcard}
\begin{flashcard}[Pro Tip]{Geometric Series}
\bigskip\bigskip\bigskip
{\begin{align*}
\sum_{i=1}^nq^{i-1}=\frac{1-q^n}{1-q}
\end{align*}}
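\bigskip\\
Letting $n\rightarrow\infty$ with $|q|<1$ gives the infinite-series version, $\sum_{i=1}^\infty q^{i-1}=\frac{1}{1-q}$, which is what makes the Geometric pmf sum to 1.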
\end{flashcard}
\cardfrontfoot{Statistical Inference}
\begin{flashcard}[Definition]{Positive Predictive Value}
\bigskip\bigskip\bigskip
{\begin{align*}
P(D^+|T^+)=\left[1+\frac{P(T^+|D^-)}{P(T^+|D^+)}\frac{P(D^-)}{P(D^+)}\right]^{-1}
\end{align*}}
\end{flashcard}
\begin{flashcard}[Definition]{Kullback-Leibler Divergence}
\bigskip\bigskip\bigskip
{\begin{align*}
KLD(g,f)=E_g\left[\log\frac{g(X)}{f(X)}\right]\geq0
\end{align*}}
Measures how much information you lose by using $f$ when the data really come from $g$; it equals 0 only when $f=g$.
\end{flashcard}
\begin{flashcard}[Definition]{Hellinger Distance}
\bigskip\bigskip\bigskip
{\begin{align*}
KLD(g,f)\geq2[H(f,g)]^2
\end{align*}}
Lower bound for the KLD
\end{flashcard}
\begin{flashcard}[Proof]{The LR exceeds $k$ with probability at most $1/k$}
\bigskip\bigskip\bigskip
{\begin{align*}
P_g\left(\frac{\prod f(x_i)}{\prod g(x_i)}>k\right)&\leq\frac{E_g\left[\frac{\prod f(x_i)}{\prod g(x_i)}\right]}{k}=\frac{1}{k}\\\\
E_g\left[\frac{\prod f(x_i)}{\prod g(x_i)}\right]&=\int \frac{\prod f(x_i)}{\prod g(x_i)}\prod g(x_i)dx\\
&=\int\prod f(x_i)dx=\int P_f(X_1,...,X_n)dx=1
\end{align*}}
Proof by Markov's inequality. This bound holds if we look at the data as it accumulates.
\end{flashcard}
\begin{flashcard}[Proof]{Asymptotic behavior of LR}
\bigskip
As evidence accumulates (when the data truly come from $g$), the LR converges to 0:
{\begin{align*}
LR_n=\exp\left\{\log\prod\frac{f(x_i)}{g(x_i)}\right\}&=\exp\left\{\sum\log f(x_i)-\sum \log g(x_i)\right\}\\
&=\exp\left\{n\left[\frac{1}{n}\sum \log f(x_i)-\frac{1}{n}\sum\log g(x_i)\right]\right\}\\
\textrm{From the LLN: }\frac{1}{n}\sum\log\frac{f(x_i)}{g(x_i)}&\rightarrow E_g\left[\log\frac{f(x_i)}{g(x_i)}\right]\leq \log E_g\left[\frac{f(x_i)}{g(x_i)}\right] \textrm{ by Jensen's }\\
& \log E_g\left[\frac{f(x_i)}{g(x_i)}\right] =\log(1)=0\\
\textrm{ Therefore this portion converges to}&\textrm{ some negative number, call it $-c$}\\
\prod\frac{f(x_i)}{g(x_i)}\rightarrow \lim_{n\rightarrow\infty}e^{n[-c]}=0
\end{align*}}
\end{flashcard}
\begin{flashcard}[Proof]{Convergence of the posterior}
Due to the LR convergence properties, the posterior converges as well. (Note this proof is in the discrete case).
{\begin{align*}
X_1,...,X_n\overset{iid}{\sim}f(X;\theta_0)\\
P(\theta=\theta_0|\underbar{X})=\left[1+\sum_{\theta\neq\theta_0}\frac{P(\underbar{X}|\theta)}{P(\underbar{X}|\theta_0)}\frac{P(\theta)}{P(\theta_0)}\right]^{-1}\\
\frac{P(\underbar{X}|\theta)}{P(\underbar{X}|\theta_0)}\rightarrow 0\textrm{ By LR convergence principle}\\
\frac{P(\underbar{X}|\theta)}{P(\underbar{X}|\theta_0)}\frac{P(\theta)}{P(\theta_0)}\rightarrow 0 \textrm{ as } n\rightarrow \infty\textrm{ therefore }\sum_{\theta\neq\theta_0}\frac{P(\underbar{X}|\theta)}{P(\underbar{X}|\theta_0)}\frac{P(\theta)}{P(\theta_0)}\rightarrow 0\\
P(\theta=\theta_0|\underbar{X})\rightarrow 1\textrm{ as } n\rightarrow\infty
\end{align*}}
\end{flashcard}
\begin{flashcard}[Definition]{Rationale for Maximum Likelihood}
\bigskip\bigskip\bigskip\bigskip
Because $\hat{\theta}$ maximizes the likelihood function, it is the parameter value that is best supported by the data by the Law of Likelihood.
\end{flashcard}
\begin{flashcard}[Definition]{Invariance of the MLE}
\bigskip\bigskip\bigskip\bigskip
If $\hat{\theta}$ is the MLE for $\theta$ then $g(\hat{\theta})$ is the MLE for $g(\theta)$ as long as $g(\theta)$ is a 1-1 function of $\theta$.
\end{flashcard}
\begin{flashcard}[Definition]{Bias}
\bigskip\bigskip\bigskip
{\begin{align*}
E[\hat{\theta}-\theta]=b(\hat{\theta})
\end{align*}}
\end{flashcard}
\begin{flashcard}[Definition]{Variance}
\bigskip\bigskip\bigskip
{\begin{align*}
E[(\hat{\theta} -E[\hat{\theta}])^2]
\end{align*}}
\end{flashcard}
\begin{flashcard}[Definition]{MSE}
\bigskip\bigskip\bigskip
{\begin{align*}
MSE(\hat{\theta})&=E[(\hat{\theta}-\theta)^2]\\
&=Var[\hat{\theta}]+b^2(\hat{\theta})
\end{align*}}
\end{flashcard}
\begin{flashcard}[Definition]{Consistency}
\bigskip\bigskip\bigskip
{\begin{align*}
\hat{\theta}\rightarrow\theta \textrm{ as } n\rightarrow\infty \textrm{ in probability, a.s., etc.}
\end{align*}}
This implies that the limiting bias is 0.
\end{flashcard}
\begin{flashcard}[Definition]{Biases of MLEs}
\bigskip\bigskip\bigskip
MLEs are often biased. For example:
\begin{itemize}
\item The MLE of the variance in the Normal case has a slight \textbf{negative} bias $-\frac{\sigma^2}{n}$. This goes to 0 in large samples
\item The MLE of the inverse Poisson mean, $1/\bar{X}$, has undefined bias because $P(\bar{X}=0)>0$! zabert alert!
\end{itemize}
\end{flashcard}
\begin{flashcard}[Definition]{Bayes Estimator}
\bigskip\bigskip\bigskip
Trade some bias for a reduction in variance.
{\begin{align*}
f(\theta|\underbar{x})=\frac{f(\underbar{X}|\theta)f(\theta)}{\int_{\Theta}f(\underbar{X}|\theta)f(\theta)d\theta}
\end{align*}}
Here, the posterior mean shrinks the sample mean towards the prior mean.
\end{flashcard}
\begin{flashcard}[Proof]{Consistency of MLEs}
\bigskip\bigskip\bigskip
To show consistency:
{\begin{align*}
\hat{\theta}_n\overset{p}{\rightarrow}\theta \textrm{ as } n\rightarrow \infty\\
\textrm{In other words } \hat{\theta}_n-\theta=o_p(1)\\
\textrm{\textbf{Method 1} }P(|\hat{\theta}_n-\theta|\geq\epsilon)\rightarrow 0\\
\textrm{\textbf{Method 2} quadratic mean } MSE(\hat{\theta}_n)=Var(\hat{\theta}_n)+b^2(\hat{\theta}_n)\rightarrow 0
\end{align*}}
If you can show that bias$\rightarrow 0$ and var$\rightarrow 0$ then $\hat{\theta}_n\overset{qm}{\rightarrow}\theta$ which implies $\hat{\theta}_n\overset{p}{\rightarrow}\theta$.
\end{flashcard}
\begin{flashcard}[Pro Tip]{When will the MLE not be consistent?}
\bigskip
When the number of parameters is increasing as $n\rightarrow\infty$. Here is an example where the MLE is not consistent from Neyman-Scott:
{\begin{align*}
Y_{11},Y_{12}&\sim N(\mu_1,\sigma^2)\\
Y_{21},Y_{22}&\sim N(\mu_2,\sigma^2)\\
&\;\;\vdots\\
Y_{n1},Y_{n2}&\sim N(\mu_n,\sigma^2)\\
\hat{\sigma}^2&=\sum_{i=1}^n\sum_{j=1}^2\frac{(Y_{ij}-\bar{Y}_i)^2}{2n}\\
\hat{\sigma}^2\overset{p}{\rightarrow}\frac{\sigma^2}{2}
\end{align*}}
\end{flashcard}
\begin{flashcard}[Definition]{Continuous Mapping Theorem}
\bigskip
This is basically the best theorem out there.
{\begin{align*}
X_n\overset{d}{\rightarrow}X&\Rightarrow g(X_n)\overset{d}{\rightarrow}g(X)\\
X_n\overset{p}{\rightarrow}X&\Rightarrow g(X_n)\overset{p}{\rightarrow}g(X)\\
X_n\overset{a.s.}{\rightarrow}X&\Rightarrow g(X_n)\overset{a.s.}{\rightarrow}g(X)\\
\end{align*}}
Obviously the function has to be continuous for this to work.
\end{flashcard}
\begin{flashcard}[Definition]{Conditions for MLE consistency}
\bigskip
\begin{enumerate}
\item Identifiability
\item Compactness of the parameter space (it is sufficient to assume that the log-likelihood is concave and that the MLE is not on the boundary of the parameter space)
\item Continuity of $L(\theta)$ in $\theta$ - to ensure smoothness and existence of derivatives
\item Dominance: $|\log f(x;\theta)|<D(x)$ $\forall\theta\in\Theta$ with $E[D(X)]<\infty$
\end{enumerate}
\end{flashcard}
\begin{flashcard}[Definition]{Score Function}
\bigskip
\begin{itemize}
\item First derivative of the log-likelihood function
\item Unbiased estimator of zero
\end{itemize}
\end{flashcard}
\begin{flashcard}[Definition]{Fisher's Information}
\bigskip
Information is the variance of the score function.
{\begin{align*}
\mathcal{I}(\theta)=Var(S_i)=E[S_i^2]\\
\mathcal{I}_n(\theta)=Var\left(\sum S_i\right)=n\mathcal{I}(\theta)
\end{align*}}
It can be estimated by:
{\begin{align*}
\frac{\sum S_i^2}{n}=\frac{1}{n}\sum_i\left(\frac{\partial \log f(x_i;\theta)}{\partial\theta}\right)^2
\end{align*}}
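\bigskip\\
For example, for $X_i\overset{iid}{\sim}Pois(\lambda)$: $S_i=\frac{x_i}{\lambda}-1$, so $\mathcal{I}(\lambda)=Var(S_i)=\frac{Var(x_i)}{\lambda^2}=\frac{1}{\lambda}$ and $\mathcal{I}_n(\lambda)=\frac{n}{\lambda}$.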
\end{flashcard}
\begin{flashcard}[Definition]{Bartlett's Second Identity}
\bigskip \bigskip\bigskip\bigskip
Under the correct model:
{\begin{align*}
Var(S_i)=E[S_i^2]=-E[S'_i]
\end{align*}}
\end{flashcard}
\begin{flashcard}[Proof]{Asymptotic Normality of MLE}
{\begin{align*}
l_i&=\log f(x_i;\theta)\\
\intertext{By Taylor Series Expansion:}
0&=l'_n(\hat{\theta}_n)\approx l'_n(\theta)+(\hat{\theta}_n-\theta)l''_n(\theta)+R_n\\
(\hat{\theta}_n-\theta)&\approx\frac{l'_n(\theta)}{-l''_n(\theta)}\Rightarrow \sqrt{n}(\hat{\theta}_n-\theta)\approx\frac{\frac{1}{\sqrt{n}}l'_n(\theta)}{-\frac{1}{n}l''_n(\theta)}\\
\sqrt{n}\frac{1}{n}\sum l'_i(\theta)&\overset{d}{\rightarrow}N(0,\mathcal{I}(\theta))\textrm{ by CLT and } -\frac{1}{n}l''_n(\theta)\overset{p}{\rightarrow}\mathcal{I}(\theta)\textrm{ by LLN}\\
\textrm{By Slutsky's }\sqrt{n}(\hat{\theta}_n-\theta)&\overset{d}{\rightarrow}N(0,\frac{1}{\mathcal{I}(\theta)})\\
\sqrt{n\mathcal{I}(\hat{\theta}_n)}(\hat{\theta}_n-\theta)&\overset{d}{\rightarrow}N(0,1)
\end{align*}}
\end{flashcard}
\begin{flashcard}[Definition]{What happens to the MLE when the working model fails?}
\bigskip
The MLE $\hat{\theta}_n$ converges to $\theta_g$, the value of $\theta$ that minimizes $KLD(g,f(\cdot;\theta))$, where:
{\begin{align*}
\hat{\theta}_n=\argmax_{\theta\in\Theta}\frac{\sum_i\log f(x_i;\theta)}{n}\rightarrow\argmax_{\theta\in\Theta}E_g[\log f(x_i;\theta)]=\theta_g
\end{align*}}
\end{flashcard}
\begin{flashcard}[Definition]{Asymptotic Normality of the MLE under Model Failure}
\bigskip\bigskip\bigskip
{\begin{align*}
\sqrt{n}(\hat{\theta}_n-\theta)&\overset{d}{\rightarrow}N(0,a^{-1}ba^{-1}) \textrm{ as } n\rightarrow\infty\\
\sqrt{\frac{\hat{a}^2n}{\hat{b}}}(\hat{\theta}_n-\theta)&\overset{d}{\rightarrow}N(0,1)
\intertext{You can make a likelihood robust by:}
L_R(\theta)=L(\theta)^{\hat{a}/\hat{b}}
\end{align*}}
\end{flashcard}
\begin{flashcard}[Equation]{Making a likelihood robust}
\bigskip\bigskip\bigskip
{\begin{align*}
L_R(\theta)&=L(\theta)^{\hat{a}/\hat{b}}\\
\hat{a}&=-\frac{1}{n}\sum\frac{\partial^2\log f(x_i;\hat{\theta}_n)}{\partial\theta^2}\\
\hat{b}&=\frac{1}{n}\sum\left(\frac{\partial \log f(x_i;\hat{\theta}_n)}{\partial\theta}\right)^2
\end{align*}}
\end{flashcard}
\begin{flashcard}[Definition]{Unbiased Estimating Equation}
\bigskip\bigskip\bigskip
{\begin{align*}
E[g(\underbar{X};\theta)]=0\textrm{ }\forall\theta\in\Theta
\end{align*}}
\end{flashcard}
\begin{flashcard}[Definition]{Standardized}
\bigskip\bigskip\bigskip
{\begin{align*}
g_s(\underbar{X};\theta)=\frac{g(\underbar{X};\theta)}{E\left[\frac{\partial g(\underbar{X};\theta)}{\partial\theta}\right] }\textrm{ } \forall\theta\in\Theta
\end{align*}}
\end{flashcard}
\begin{flashcard}[Examples]{Natural Estimating Equations}
\bigskip\bigskip\bigskip
\begin{itemize}
\item score functions
\item equations from MOM estimation
\end{itemize}
\end{flashcard}
\begin{flashcard}[Theorem]{Optimality of the Score Function}
\bigskip
This is literally the Godambe Theorem of 1960. NOT A JOKE (but what an awesome name!)
\begin{enumerate}
\item The variance of a standardized estimating equation is bounded below by $1/\mathcal{I}_n(\theta)$,
\begin{equation*}
Var[g_s(\underbar{X};\theta)]=\frac{E_\theta[g^2]}{\left\{E_\theta\left[\frac{\partial g}{\partial\theta}\right]\right\}^2}\geq\frac{1}{E_\theta\left[\left(\frac{\partial\log f}{\partial\theta}\right)^2\right]}
\end{equation*}
\item It follows that $\forall$ $g\in G$,
\begin{equation*}
\frac{E_\theta[g^2]}{\left\{E_\theta\left[\frac{\partial g}{\partial\theta}\right]\right\}^2}\geq\frac{E_\theta[(g^*)^2]}{\left\{E_\theta\left[\frac{\partial g^*}{\partial\theta}\right]\right\}^2}
\end{equation*}
\end{enumerate}
\end{flashcard}
\begin{flashcard}[Definition]{Variance of the Standardized Score Function}
\bigskip\bigskip\bigskip
\begin{equation*}
\frac{1}{\mathcal{I}_n(\theta)}
\end{equation*}
\end{flashcard}
\begin{flashcard}[Definition]{What form does the estimating equation have to be to achieve the variance lower bound?}
\bigskip\bigskip\bigskip
\begin{equation*}
g(\underbar{X};\theta)=a(\theta)\left\{T(\underbar{X})-\underbrace{E_\theta[T(\underbar{X})]}_{h(\theta)}\right\}
\end{equation*}
\bigskip\\
This implies that $T(\underbar{X})$ is the best unbiased estimator for $h(\theta)$. This achieves the CRLB.
\end{flashcard}
\begin{flashcard}[Definition]{Cramer-Rao Lower Bound}
\bigskip\bigskip\bigskip
\begin{equation*}
Var[T(\underbar{X})]\geq\frac{\{h'(\theta)\}^2}{\mathcal{I}_n(\theta)}
\end{equation*}
\bigskip\\
This is the smallest possible variance for any unbiased estimator of $h(\theta)$
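\bigskip\\
For example, for $X_i\overset{iid}{\sim}N(\mu,\sigma^2)$ with $\sigma^2$ known and $h(\mu)=\mu$: the bound is $\frac{1}{\mathcal{I}_n(\mu)}=\frac{\sigma^2}{n}$, which $Var(\bar{X}_n)$ attains, so $\bar{X}_n$ is the best unbiased estimator of $\mu$.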
\end{flashcard}
\begin{flashcard}[Definition]{Sufficient Statistic}
\bigskip\bigskip\bigskip
\begin{equation*}
f_{\underbar{X}}(\underbar{X};\theta)=g(T(\underbar{X});\theta)h(\underbar{X})
\end{equation*}
\bigskip\\
If the pdf can be factorized as above, then $T(\underbar{X})$ is a sufficient statistic for $\theta$
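\bigskip\\
For example, for $X_i\overset{iid}{\sim}Pois(\lambda)$: $f_{\underbar{X}}(\underbar{X};\lambda)=\underbrace{e^{-n\lambda}\lambda^{\sum x_i}}_{g(T(\underbar{X});\lambda)}\underbrace{\left(\prod x_i!\right)^{-1}}_{h(\underbar{X})}$, so $T(\underbar{X})=\sum x_i$ is sufficient for $\lambda$.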
\end{flashcard}
\begin{flashcard}[Pro Tip]{The most famous MSS (oh yeahhh!)}
\bigskip\bigskip\bigskip
The likelihood function. le duh, whose class is this anyway?
\end{flashcard}
\begin{flashcard}[Definition]{Minimal Sufficient Statistic}
\bigskip\bigskip\bigskip
A sufficient statistic is minimally sufficient if it is a function of every other sufficient statistic.
\end{flashcard}
\begin{flashcard}[Pro Tip]{Technique to find the MSS}
\bigskip\bigskip\bigskip
\begin{equation*}
\frac{f(\underbar{x};\theta)}{f(\underbar{y};\theta)}=c(\underbar{x},\underbar{y})
\end{equation*}
\bigskip\\
Here $c(\underbar{x},\underbar{y})$ is free of $\theta$. $T(\underbar{X})$ is the MSS if this ratio is free of $\theta$ exactly when $T(\underbar{x})=T(\underbar{y})$, i.e.\ the $\theta$s cross out precisely in that case.
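\bigskip\\
For example, for $X_i\overset{iid}{\sim}Pois(\lambda)$: $\frac{f(\underbar{x};\lambda)}{f(\underbar{y};\lambda)}=\lambda^{\sum x_i-\sum y_i}\frac{\prod y_i!}{\prod x_i!}$, which is free of $\lambda$ exactly when $\sum x_i=\sum y_i$, so $\sum x_i$ is the MSS.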
\end{flashcard}
\begin{flashcard}[Definition]{Rao-Blackwellization}
\bigskip\bigskip\bigskip
Conditioning an estimator on a sufficient statistic never hurts: the new estimator has the same expectation and a variance less than or equal to that of the original estimator.
\end{flashcard}
\begin{flashcard}[Definition]{Ancillary Statistic}
\bigskip\bigskip\bigskip
A statistic is ancillary if its distribution does not depend on $\theta$.
\end{flashcard}
\begin{flashcard}[Definition]{Completeness}
\bigskip\bigskip\bigskip
A family is complete if
\begin{equation*}
E_\theta(g(t))=0\textrm{ }\forall\theta\Rightarrow P_\theta(g(t)=0)=1\textrm{ }\forall\theta
\end{equation*}
\bigskip\\
Exponential families whose parameter space has a non-empty interior are complete.
\end{flashcard}
\begin{flashcard}[Definition]{Basu's Theorem}
\bigskip\bigskip\bigskip
If $T(\underbar{X})$ is complete and a minimally sufficient statistic, then $T(\underbar{X})$ is independent of every ancillary statistic.
\end{flashcard}
\begin{flashcard}[Definition]{MSS/CSS Lemma}
\bigskip\bigskip\bigskip
If a MSS exists, then any CSS is also the MSS.
\end{flashcard}
\begin{flashcard}[Definition]{Lehmann-Scheffe Theorem}
\bigskip\bigskip\bigskip
If $T(\underbar{X})$ is a CSS (and therefore a MSS), then any statistic $h[T(\underbar{X})]$ with finite variance is the MVUE of its expectation $E[h[T(\underbar{X})]]$. In other words, if an estimator is a function of a CSS, then it has the smallest variance among all estimators of its expected value.
\end{flashcard}
\begin{flashcard}[Definition]{Checking for completeness}
\bigskip\bigskip\bigskip
\begin{enumerate}
\item Exponential families are complete as long as the interior of the parameter space is non-empty
\item A sufficient statistic $T(\underbar{X})$ is complete if no non-constant function of it is first-order ancillary.
\end{enumerate}
\end{flashcard}
\begin{flashcard}[Proof]{Lehmann-Scheffe Theorem}
\textbf{Proof by contradiction}. Suppose $h[T(\underbar{X})]$ is unbiased for $\gamma$ and $h[T(\underbar{X})]$ is not the MVUE of $\gamma$. Then there exists another estimator, say $W(\underbar{X})$ such that $E[W(\underbar{X})]=\gamma$ and $Var[W(\underbar{X})]<Var[h[T(\underbar{X})]]$\\
Using Rao-Blackwellization, we can create a new estimator $r[T(\underbar{X})]=E[W(\underbar{X})|T(\underbar{X})]$ such that $E[r[T(\underbar{X})]]=\gamma$ and
\begin{equation*}
Var[r[T(\underbar{X})]]\leq Var[W(\underbar{X})]<Var[h[T(\underbar{X})]]
\end{equation*}
Notice both $r[T(\underbar{X})]$ and $h[T(\underbar{X})]$ are unbiased for $\gamma$, so $E[r[T(\underbar{X})]-h[T(\underbar{X})]]=0 \textrm{ }\forall\gamma$\\
But completeness implies that $r[T(\underbar{X})]=h[T(\underbar{X})]$ with probability 1, so we must have
\begin{equation*}
Var[r[T(\underbar{X})]]=Var[h[T(\underbar{X})]]
\end{equation*}
This contradicts the previous inequality and completes the proof that an unbiased function of the CSS is the MVUE.
\end{flashcard}
\begin{flashcard}[Proof]{Uniqueness of the MVUE}
If $T(\underbar{X})$ and $S(\underbar{X})$ are MVUE for $\gamma$ then $E[T(\underbar{X})]=E[S(\underbar{X})]=E\left[\frac{T(\underbar{X})+S(\underbar{X})}{2}\right]$. It follows that $Var[T(\underbar{X})]=Var[S(\underbar{X})]$ but
{\begin{align*}
Var\left[\frac{T(\underbar{X})+S(\underbar{X})}{2}\right]&=\frac{1}{4}\left[Var[T(\underbar{X})]+Var[S(\underbar{X})]+2Cov[T(\underbar{X}),S(\underbar{X})]\right]\\
&=\frac{1}{4}[2Var[S(\underbar{X})]+2\rho Var[S(\underbar{X})]]=Var[S(\underbar{X})]\left(\frac{1+\rho}{2}\right)
\end{align*}}
This implies that $Var\left[\frac{T(\underbar{X})+S(\underbar{X})}{2}\right]\leq Var[S(\underbar{X})]$. Because $S(\underbar{X})$ is the MVUE, this must be an equality and $Var\left[\frac{T(\underbar{X})+S(\underbar{X})}{2}\right]=Var[S(\underbar{X})]$, which forces $\rho=1$. By the \textbf{Cauchy-Schwarz inequality}, this equality only holds when $S(\underbar{X})=aT(\underbar{X})+b$. We know that $E[T(\underbar{X})]=E[S(\underbar{X})]$ and $Var[T(\underbar{X})]=Var[S(\underbar{X})]$, so $a=1$ and $b=0$, therefore $P(T(\underbar{X})=S(\underbar{X}))=1$
\end{flashcard}
\begin{flashcard}[Pro Tip]{Conditionality Principle}
\bigskip\bigskip\bigskip\bigskip
Always condition on Ancillary Statistics!
\end{flashcard}
\begin{flashcard}[Pro Tip]{Likelihood Principle}
\bigskip\bigskip\bigskip\bigskip
If two experiments yield likelihood functions that are proportional, then those two sets of data are equivalent as statistical evidence. If likelihoods are the same, evidence should be the same. Inferences can of course be different.
\end{flashcard}
\begin{flashcard}[Definition]{Criteria for Confidence Intervals}
\bigskip\bigskip
\begin{enumerate}
\item Consistent estimator of the parameter: $\hat{\theta}_n\overset{p}{\rightarrow}\theta$
\item Asymptotic Normality: $\sqrt{\mathcal{I}_n(\theta)}(\hat{\theta}_n-\theta)\overset{d}{\rightarrow}N(0,1)$
\item Consistent estimator of the information: $\frac{\mathcal{I}_n(\hat{\theta}_n)}{\mathcal{I}_n(\theta)}\overset{p}{\rightarrow}1$
\item (also) Expected Length
\item Unbiasedness
\item Selectivity
\end{enumerate}
\end{flashcard}
\begin{flashcard}[Proof]{Quantile Convergence}
\bigskip
\textbf{Proof by contradiction}. Assume that $C_n\not\rightarrow Z$; then either
\begin{enumerate}
\item $\exists\delta$ s.t. $\forall n$ $\exists$ $n'>n\Rightarrow C_{n'}>Z+\delta$
\item $\exists\delta$ s.t. $\forall n$ $\exists$ $n'>n\Rightarrow C_{n'}<Z-\delta$
\end{enumerate}
If 1, then $\forall n$ $\exists n'>n$ s.t. $F_{n'}(C_{n'})\geq F_{n'}(Z+\delta)$ and \\
$\therefore \lim_{n'\rightarrow\infty} F_{n'}(C_{n'})\geq\lim_{n'\rightarrow\infty}F_{n'}(Z+\delta)=F(Z+\delta)>F(Z)$\\\\
If 2, then $\therefore \lim_{n'\rightarrow\infty} F_{n'}(C_{n'})\leq F(Z-\delta)<F(Z)$\\\\
However, we know that $F_n(C_n)=\alpha$ $\forall n$ (by definition) and $F(Z)=\alpha$. So both cases lead to a contradiction therefore \\\\$Y_n\overset{d}{\rightarrow}Y\Rightarrow C_n\rightarrow Z$
\end{flashcard}
\begin{flashcard}[Proof]{Use of estimate of information in MLE CI}
\bigskip\bigskip
For the approximate large-sample CI for the MLE:
{\begin{align*}
\sqrt{n\mathcal{I}(\hat{\theta}_n)}(\hat{\theta}_n-\theta)=\underbrace{\sqrt{n\mathcal{I}(\theta)}(\hat{\theta}_n-\theta)}_{\overset{d}{\rightarrow}N(0,1)}\underbrace{\sqrt{\frac{\mathcal{I}(\hat{\theta}_n)}{\mathcal{I}(\theta)}}}_{\overset{p}{\rightarrow}1}\overset{d}{\rightarrow}N(0,1)
\end{align*}}
\bigskip\\This is a consequence of asymptotic normality of the MLE, Slutsky's and CMT (because $\mathcal{I}(\hat{\theta}_n)$ is a continuous function of $\theta$ so if $\hat{\theta}_n\overset{p}{\rightarrow}\theta$ then $\mathcal{I}(\hat{\theta}_n)\overset{p}{\rightarrow}\mathcal{I}(\theta)$.)
\end{flashcard}
\begin{flashcard}[Definition]{Mean Value Theorem}
\bigskip\bigskip\bigskip
\begin{equation*}
\gamma(\hat{\theta}_n)=\gamma(\theta)+\gamma'(\tilde{\theta})(\hat{\theta}_n-\theta)
\end{equation*}
where $\tilde{\theta}$ lies between $\hat{\theta}_n$ and $\theta$.
\bigskip\\
This is helpful because you can rearrange it to be:
\begin{equation*}
\sqrt{n}(\gamma(\hat{\theta}_n)-\gamma(\theta))=\gamma'(\tilde{\theta})\sqrt{n}(\hat{\theta}_n-\theta)
\end{equation*}
\bigskip\\which allows you to show asymptotic normality of $\gamma(\hat{\theta}_n)$.
\end{flashcard}
\begin{flashcard}[Proofish]{Why $\bar{X}_n\pm Z_{\alpha/2}s/\sqrt{n}$ works}
\bigskip\bigskip\bigskip
{\begin{align*}
\frac{\sqrt{n}(\bar{X}_n-\theta)}{s}=\underbrace{\frac{\sqrt{n}(\bar{X}_n-\theta)}{\sigma}}_{\overset{d}{\rightarrow}N(0,1)}\underbrace{\frac{\sigma}{s}}_{\overset{p}{\rightarrow}1}\overset{d}{\rightarrow}N(0,1) \textrm{ as } n\rightarrow\infty
\end{align*}}
\end{flashcard}
\begin{flashcard}[Pro Tip]{When is the $t$-interval exact?}
\bigskip\bigskip\bigskip
The $t$-interval is exact when $X_i\sim Normal$ because the pivot $\sqrt{n}(\bar{X}_n-\theta)/s$ is exactly $t_{n-1}$-distributed. It is approximately correct in large samples when the normality assumption fails because $t_{\alpha/2}^{n-1}\rightarrow Z_{\alpha/2}$ by quantile convergence.
\end{flashcard}
\begin{flashcard}[Proof]{$t$-interval is robust to non-normality in large samples because...}
\bigskip\bigskip\bigskip
\begin{equation*}
P\left(\frac{\sqrt{n}|\bar{X}_n-\theta|}{s}\leq t_{\alpha/2}^{n-1}\right)=P\left(\underbrace{\frac{\sqrt{n}|\bar{X}_n-\theta|}{\sigma}}_{\overset{d}{\rightarrow}N(0,1)}\underbrace{\frac{\sigma}{s}}_{\overset{p}{\rightarrow}1}\underbrace{\frac{Z_{\alpha/2}}{t_{\alpha/2}^{n-1}}}_{\overset{p}{\rightarrow}1}\leq Z_{\alpha/2}\right)\rightarrow1-\alpha
\end{equation*}
\end{flashcard}
\begin{flashcard}[Equation]{Robust variance estimator of Normal Linear Regression}
\bigskip\bigskip\bigskip
\begin{equation*}
\hat{\Lambda}=n(X'W^{-1}X)^{-1}X'W^{-1}diag\{r_i^2\}W^{-1}X(X'W^{-1}X)^{-1}
\end{equation*}
Weighted least squares is just least squares after rescaling each observation by $1/\sqrt{w_i}$ (the inverse standard deviation).
\end{flashcard}
\begin{flashcard}[Definition]{Robust Large Sample Intervals}
\bigskip\bigskip\bigskip
Basically, the same as before: just raise the likelihood to the $\hat{a}/\hat{b}$ power; in other words, our new variance is just $b/a^2$. To estimate this:
\begin{equation*}
\hat{\lambda}=\frac{n\sum\left(\frac{\partial l_i(\hat{\theta}_n)}{\partial\theta}\right)^2}{[\mathcal{I}_n(\hat{\theta}_n)]^2}
\end{equation*}
\bigskip\\ Here this $\mathcal{I}_n(\hat{\theta}_n)$ is the observed information: $-\sum\frac{\partial^2\log f(y_i;\hat{\theta}_n)}{\partial\theta^2}$
\end{flashcard}
\begin{flashcard}[Pro Tip]{Why does the robust large sample interval work?}
\bigskip\bigskip\bigskip
\begin{equation*}
P\left(\underbrace{\frac{\sqrt{n}|\hat{\theta}_n-\theta_0|}{\sqrt{\lambda}}}_{\overset{d}{\rightarrow}N(0,1)}\underbrace{\frac{\sqrt{\lambda}}{\sqrt{\hat{\lambda}}}}_{\overset{p}{\rightarrow}1}\leq Z_{\alpha/2}\right)\rightarrow 1-\alpha
\end{equation*}
\end{flashcard}
\begin{flashcard}[Definition]{One parameter exponential family general case of robust large sample intervals}
{\begin{align*}
l_i(\theta;Y_i)&=a(\theta)b(Y_i)+c(\theta)\\
\textrm{Where: }
\sum l'_i(\hat{\theta}_n;Y_i)&=0\textrm{ and }
\sum b(Y_i)=-n\frac{\partial c(\hat{\theta}_n)}{\partial\theta}\left(\frac{\partial a(\hat{\theta}_n)}{\partial\theta}\right)^{-1}\\
\textrm{Here: }
\frac{n}{\mathcal{I}_n(\hat{\theta}_n)}&=-\left\{\frac{\partial^2 a(\hat{\theta}_n)}{\partial\theta^2}\frac{\sum b(Y_i)}{n}+\frac{\partial^2 c(\hat{\theta}_n)}{\partial\theta^2}\right\}^{-1}\\
\textrm{therefore: }
\hat{\lambda}&=\frac{n\left[\frac{\partial a(\hat{\theta}_n)}{\partial\theta}\right]^2\sum\left[b(Y_i)-\frac{\sum b(Y_i)}{n}\right]^2}{[\mathcal{I}_n(\hat{\theta}_n)]^2}
\end{align*}}
\end{flashcard}
\begin{flashcard}[Pro Tip]{Weighted Least Squares Linear Regression if W misspecified}
{\begin{align*}
Y&\sim MVN(X\underbar{$\beta$}, \sigma^2W)\textrm{ and } W=diag\{w_i\}=
\begin{bmatrix}
w_1&\dots&0\\
\vdots&\ddots&\vdots\\
0&\dots&w_n
\end{bmatrix}
\intertext{The weighted least squares estimate of $\underbar{$\beta$}$ is}
\hat{\beta}_{wls}&=(X'W^{-1}X)^{-1}X'W^{-1}Y\\
\intertext{This is a \textbf{consistent} estimator of $\underbar{$\beta$}$ even if the $Y$s are not normal and the covariance matrix is not proportional to $W$. BUT if $W$ is misspecified, then the variance-covariance matrix is not estimated by $n/\mathcal{I}$. So you need the robust variance estimator:}
\hat{\Lambda}&=n(X'W^{-1}X)^{-1}X'W^{-1}diag\{r_i^2\}W^{-1}X(X'W^{-1}X)^{-1}
\end{align*}}
\end{flashcard}
\begin{flashcard}[Pro Tip]{If $\underbar{Z}\sim MVN(\underbar{$\mu$},\Sigma)$ how is $AZ$ distributed?}
\bigskip\bigskip\bigskip
\begin{equation*}
AZ\sim MVN(A\underbar{$\mu$},A\Sigma A')
\end{equation*}
\end{flashcard}
\begin{flashcard}[Pro Tip]{If $\underbar{Z}\sim MVN(\underbar{$\mu$},\Sigma)$ what would be distributed as $\chi^2_k$?}
\bigskip\bigskip\bigskip
\begin{equation*}
(\underbar{Z}-\underbar{$\mu$})'\Sigma^{-1}(\underbar{Z}-\underbar{$\mu$})\sim \chi^2_k
\end{equation*}
This is only when $\Sigma$ is full rank.
{\begin{align*}
\Sigma&=\sigma^2\underbar{$\mathcal{I}$}\\
(\underbar{Z}-\underbar{$\mu$})'\Sigma^{-1}(\underbar{Z}-\underbar{$\mu$})&=\sum_i(Z_i-\mu_i)^2/\sigma^2\\
f_{\underbar{z}}(Z)&=\frac{1}{(2\pi)^{k/2}|\Sigma|^{1/2}}\exp\left\{-\frac{1}{2}(Z-\underbar{$\mu$})'\Sigma^{-1}(Z-\underbar{$\mu$})\right\}
\end{align*}}
\end{flashcard}
\begin{flashcard}[Definition]{Size of a test}
\bigskip\bigskip
{\begin{align*}
P(\textrm{Choose $H_1$ when $H_0$ is true})&=P(\underbar{X}\in C_\delta|H_0)=P_0(\underbar{X}\in C_\delta)\\
&=P_0(\delta(\underbar{X})=1)\\
&=E_0[\delta(\underbar{X})]=E[\delta(\underbar{X})|H_0]\\
&=\int_{C_\delta}f(\underbar{X};\theta_0)d\underbar{X}\\
&=\alpha
\end{align*}}
\end{flashcard}
\begin{flashcard}[Definition]{Power}
\bigskip\bigskip
{\begin{align*}
1-P(\textrm{Choose $H_0$ when $H_1$ is true})&=P(\textrm{Choose $H_1$ when $H_1$ is true})\\
&=P_1(\underbar{X}\in C_\delta)=P_1(\delta(\underbar{X})=1)\\
&=E_1[\delta(\underbar{X})]\\
&=\int_{C_\delta}f(\underbar{X};\theta_1)d\underbar{X}\\
&=1-\beta
\end{align*}}
\end{flashcard}
\begin{flashcard}[Pro Tip]{General Setup for size and power}
{\begin{align*}
X_1,...,X_n&\overset{iid}{\sim}N(\theta,\sigma^2)\\
H_0:\theta=0&\textrm{ and } H_1:\theta=\mu_1>0\\
\textrm{Test Stat: }\delta(\underbar{X})&=\begin{cases}
1&\bar{X}_n>c\\
0&\bar{X}_n\leq c\\
\end{cases}\\
\textrm{Critical Region: }C_\delta&=\{\underbar{X}:\bar{X}_n>c\}\\
\alpha&=P_0(\bar{X}_n>c)=P_0\left(\frac{\sqrt{n}\bar{X}_n}{\sigma}>\frac{\sqrt{n}c}{\sigma}\right)=1-\Phi\left[\frac{\sqrt{n}c}{\sigma}\right]\\
\beta&=P_1(\bar{X}_n\leq c)=P_1\left(\sqrt{n}\frac{(\bar{X}_n-\mu_1)}{\sigma}\leq\sqrt{n}\frac{(c-\mu_1)}{\sigma}\right)\\
&=\Phi\left[\sqrt{n}\frac{(c-\mu_1)}{\sigma}\right]
\end{align*}}
\end{flashcard}
%%Print from here
\begin{flashcard}[Definition]{Neyman-Pearson Lemma}
\bigskip\bigskip\bigskip
For simple vs.\ simple hypotheses, rejecting $H_0$ when the LR $\frac{f(\underbar{X};\theta_1)}{f(\underbar{X};\theta_0)}>k$ (with $k$ chosen to give size $\alpha$) yields the most powerful test of size $\alpha$.
\end{flashcard}
\begin{flashcard}[Definition]{Significance Testing}
\begin{itemize}
\item Statistical procedure for measuring the strength of evidence against the null hypothesis - R.A. Fisher
\item takes a test stat $T(\underbar{X})$ where
\begin{enumerate}
\item Larger values of $T(\underbar{X})$ represent stronger evidence of departure from $H_0$.
\item Distribution of $T(\underbar{X})$ under $H_0$ is known
\item For given observations $\underbar{x}$, the p-value is
\begin{equation*}
\textrm{p-value}=P(T(\underbar{X})\geq T(\underbar{x})|H_0)
\end{equation*}
\end{enumerate}
\item there are no rejection regions or alternative hypotheses.
\item p-values are always in the tails of the null distribution
\item answers ``How do I interpret these observations as evidence"
\end{itemize}
\end{flashcard}
\begin{flashcard}[Definition]{The Power Function}
\bigskip\bigskip
The power function is the probability of rejecting $H_0$ (defined over \\$\Theta=\Theta_0\cup \Theta_1$)
\begin{equation*}
1-\beta(\theta)=E_\theta[\delta(\underbar{X})] \textrm{ for }\theta\in\Theta
\end{equation*}
\end{flashcard}
\begin{flashcard}[Definition]{Uniformly Most Powerful test}
\bigskip\bigskip\bigskip
If the N-P test $\delta(\underbar{X})$ is the same for every alternative in $\Theta_1$ (i.e., free of the particular alternative), then $\delta(\underbar{X})$ is UMP.
\end{flashcard}
\begin{flashcard}[Definition]{Unbiasedness of test}
\bigskip\bigskip\bigskip
As long as the power $\geq$ size of the test, it is unbiased. In fancy speak: a size $\alpha$ test of $H_0:\theta\in\Theta_0$ vs. $H_1:\theta\in\Theta_1$ is unbiased if
\begin{equation*}
\inf_{\theta\in\Theta_1}1-\beta(\theta)\geq\alpha
\end{equation*}
\bigskip\\
UMP tests are unbiased. If a UMP test does not exist (like in 2-sided case) you can use UMPU - among unbiased tests, the UMP.
\end{flashcard}
\begin{flashcard}[Definition]{Composite Hypothesis Power Function}
\bigskip\bigskip\bigskip
\begin{equation*}
P_\theta(Reject\:H_0)=1-\beta(\theta)=E_\theta[\delta(\underbar{X})]\textrm{ for }\theta\in\Theta
\end{equation*}
\end{flashcard}
\begin{flashcard}[Definition]{Composite Hypothesis Size}
\bigskip\bigskip\bigskip
\begin{equation*}
\alpha=\sup_{\theta\in\Theta_0}1-\beta(\theta)=\sup_{\theta\in\Theta_0}E_\theta[\delta(\underbar{X})]
\end{equation*}
\end{flashcard}
\begin{flashcard}[Definition]{Composite Hypothesis Consistency}
\bigskip\bigskip\bigskip
A sequence of tests $\delta_1,...,\delta_n$ is consistent versus the alternative $\theta\in\Theta_1$ if $1-\beta_{\delta_n}(\theta)\rightarrow1$ as $n\rightarrow \infty$
\end{flashcard}
\begin{flashcard}[Definition]{Generalized Likelihood Ratio Test}
\bigskip\bigskip\bigskip
Reject $H_0$ if\\
\begin{equation*}
\lambda(\underbar{X})=\frac{\sup_{\theta\in\Theta_0}f(\underbar{X};\theta)}{\sup_{\theta\in\Theta}f(\underbar{X};\theta)}=\frac{f(\underbar{X};\hat{\theta}_0)}{f(\underbar{X};\hat{\theta})}
\end{equation*}
is too small.\\\\Specifically, reject $H_0:\theta\in\Theta_0$ if $\lambda(\underbar{X})\leq\lambda_0$ where $\alpha=\sup_{\theta\in\Theta_0}P_\theta(\lambda(\underbar{X})\leq\lambda_0)$.
\end{flashcard}
\begin{flashcard}[Pro Tip]{Interpreting GLRT}
\bigskip\bigskip\bigskip
Do not interpret the test as ``evidence'' for or against a composite hypothesis. It just says you can find one simple alternative that is better supported than each null hypothesis. It does not mean that the alternative as a set is better supported than the set of null hypotheses.
\end{flashcard}
\begin{flashcard}[Definition]{Monotone Likelihood Ratio Property}
\bigskip\bigskip\bigskip
A family of pdfs or pmfs $\{g(t|\theta)\}$ for a univariate statistic $t$ and a scalar parameter $\theta$ has the MLR property if
\begin{equation*}
\forall\:\theta_2>\theta_1\textrm{ we have that } g(t|\theta_2)/g(t|\theta_1) \textrm{ is a monotone }(\uparrow\textrm{ or }\downarrow) \textrm{ function of t.}
\end{equation*}
\end{flashcard}
\begin{flashcard}[Definition]{Karlin-Rubin Theorem}
\bigskip\bigskip\bigskip
Let $T(\underbar{X})$ be a sufficient statistic for $\theta$. If $\{g(t|\theta);\theta\in\Theta\}$ has the MLR property, then for any $t_0$ the test of $H_0:\theta\leq\theta_0$ vs $H_1:\theta>\theta_0$ that rejects when $T(\underbar{X})>t_0$ is a UMP test of size $\alpha=P_{\theta_0}(T(\underbar{X})>t_0)$
\end{flashcard}
\begin{flashcard}[Pro Tip]{Exponential Family MLRs}
\bigskip
In the exponential family, we have $h(\underbar{X})c(\theta)\exp\{w(\theta)T(\underbar{X})\}$. If $w(\theta)$ is increasing, then by Karlin-Rubin the test:
{\begin{align*}
\delta(\underbar{X})=\begin{cases}
1&T(\underbar{X})>t_0\\
0&T(\underbar{X})<t_0\\