Automatically generated by Mendeley Desktop 1.14
Any changes to this file will be lost if it is regenerated by Mendeley.
BibTeX export options can be customized via Preferences -> BibTeX in Mendeley Desktop
@book{Devroye1997,
abstract = {Pattern recognition presents one of the most significant challenges for scientists and engineers, and many different approaches have been proposed. The aim of this book is to provide a self-contained account of probabilistic analysis of these approaches. The book includes a discussion of distance measures, nonparametric methods based on kernels or nearest neighbors, Vapnik-Chervonenkis theory, epsilon entropy, parametric classification, error estimation, tree classifiers, and neural networks. Wherever possible, distribution-free properties and inequalities are derived. A substantial portion of the results or the analysis is new. Over 430 problems and exercises complement the material.},
author = {Devroye, Luc and Györfi, László and Lugosi, Gábor},
doi = {10.1007/978-1-4612-0711-5},
file = {:Users/guillaume/Dropbox/Mendeley Desktop/Devroye, Györfi, Lugosi - 1997 - A Probabilistic Theory of Pattern Recognition.pdf:pdf},
isbn = {978-0387946184},
pages = {661},
title = {{A Probabilistic Theory of Pattern Recognition}},
year = {1997}
}
@inproceedings{Leistner2010,
author = {Leistner, Christian and Saffari, Amir and Bischof, Horst},
file = {:Users/guillaume/Dropbox/Mendeley Desktop/Leistner, Saffari, Bischof - 2010 - MIForests Multiple-instance learning with randomized trees.pdf:pdf},
booktitle = {Proceedings of the 11th European Conference on Computer Vision (ECCV)},
number = {825840},
pages = {29--42},
title = {{MIForests: Multiple-instance learning with randomized trees}},
year = {2010}
}
@techreport{Breiman2004a,
author = {Breiman, Leo},
file = {:Users/guillaume/Dropbox/Mendeley Desktop/Breiman - 2004 - Consistency for a simple model of random forests.pdf:pdf},
institution = {Statistics Department, University of California, Berkeley},
number = {670},
title = {{Consistency for a simple model of random forests}},
url = {http://scholar.google.com/scholar?hl=en\&btnG=Search\&q=intitle:CONSISTENCY+FOR+A+SIMPLE+MODEL+OF+RANDOM+FORESTS\#0},
year = {2004}
}
@article{Lopez2014,
abstract = {A wide number of real world applications presents a class distribution where examples belonging to one class heavily outnumber the examples in the other class. This is an arduous situation where standard classification techniques usually decrease their performance, creating a handicap to correctly identify the minority class, which is precisely the case under consideration in these applications. In this work, we propose the usage of the Iterative Instance Adjustment for Imbalanced Domains (IPADE-ID) algorithm. It is an evolutionary framework, which uses an instance generation technique, designed to face the existing imbalance modifying the original training set. The method iteratively learns the appropriate number of examples that represent the classes and their particular positioning. The learning process contains three key operations in its design: a customized initialization procedure, an evolutionary optimization of the positioning of the examples and a selection of the most representative examples for each class. An experimental analysis is carried out with a wide range of highly imbalanced datasets over the proposal and recognized solutions to the problem. The results obtained, which have been contrasted through non-parametric statistical tests, show that our proposal outperforms previously proposed methods. © 2013 Elsevier B.V.},
author = {L\'{o}pez, Victoria and Triguero, Isaac and Carmona, Crist\'{o}bal J. and Garc\'{\i}a, Salvador and Herrera, Francisco},
doi = {10.1016/j.neucom.2013.01.050},
file = {:Users/guillaume/Dropbox/Mendeley Desktop/L\'{o}pez et al. - 2014 - Addressing imbalanced classification with instance generation techniques IPADE-ID.pdf:pdf},
issn = {09252312},
journal = {Neurocomputing},
keywords = {Decision tree,Differential evolution,Imbalanced datasets,Instance generation,Nearest neighbor},
pages = {15--28},
title = {{Addressing imbalanced classification with instance generation techniques: IPADE-ID}},
volume = {126},
year = {2014}
}
@article{Swersky2014,
abstract = {In machine learning, the term “training” is used to describe the procedure of fitting a model to data. In many popular models, this fitting procedure is framed as an optimization problem, in which a loss is minimized as a function of the parameters. In all but the simplest machine learning models, this minimization must be performed with an iterative algorithm such as stochastic gradient descent or the nonlinear conjugate gradient method. Another aspect of training involves fitting model “hyperparameters.” These are parameters that in some way govern the model space or fitting procedure; in both cases they are typically difficult to minimize directly in terms of the training loss and are usually evaluated in terms of generalization performance via held-out data. Hyperparameters are often regularization penalties such as $\ell_p$ norms on model parameters, but can also capture model capacity as in the number of hidden units in a neural network. These hyperparameters help determine the appropriate bias-variance tradeoff for a given model family and data set. On the other hand, hyperparameters of the fitting procedure govern algorithmic aspects of training, such as the learning rate schedule of stochastic gradient descent, or the width of a Monte Carlo proposal distribution. The goal of fitting both kinds of hyperparameters is to identify a model and an optimization procedure in which successful minimization of training loss is likely to result in good generalization performance. When a held-out validation set is used to evaluate the quality of hyperparameters, the overall optimization proceeds as a double loop, where the outer loop sets the hyperparameters and the inner loop applies an iterative training procedure to fit the model to data. Often this outer hyperparameter optimization is performed by hand, which—even if rigorously systematized—can be a difficult and laborious process. Simple alternatives include the application of heuristics and intuition, grid search, which scales poorly with dimension, or random search [1], which is computationally expensive due to the need to train many models. In light of this, Bayesian optimization [2] has recently been proposed as an effective method for systematically and intelligently setting the hyperparameters of machine learning models [3, 4]. Using a principled characterization of model uncertainty, Bayesian optimization attempts to find the best hyperparameter settings with as few model evaluations as possible. One issue with previously proposed approaches to Bayesian optimization for machine learning is that a model must be fully trained before the quality of its hyperparameters can be assessed. Human experts, however, appear to be able to rapidly assess whether or not a model is likely to eventually be useful, even when the inner-loop training is only partially completed. When such an assessment can be made accurately, it is possible to explore the hyperparameter space more effectively by aborting model fits that are likely to be low quality.
The goal of this paper is to take advantage of the partial information provided by iterative training procedures, within the Bayesian optimization framework for hyperparameter search. We propose a new technique that makes it possible to estimate when to pause the training of one model in favor of starting a new one with different hyperparameters, or resuming a partially-completed training procedure from an old model. We refer to our approach as freeze-thaw Bayesian optimization, as the algorithm maintains a set of “frozen” (partially completed but not being actively trained) models and uses an information-theoretic criterion to determine which ones to “thaw” and continue training. Our approach hinges on the assumption that, for many models, the training loss during the fitting procedure roughly follows an exponential decay towards an unknown final value. We build a Bayesian nonparametric prior around this assumption by developing a new kernel that is an infinite mixture of exponentially-decaying basis functions, with the goal of characterizing these training curves. Using this kernel with a novel and efficient temporal Gaussian process prior, we are able to forecast the final result of partially trained models and use this during Bayesian optimization to determine the most promising action. We demonstrate that freeze-thaw Bayesian optimization can find good hyperparameter settings for many different models in considerably less time than ordinary Bayesian optimization.},
archivePrefix = {arXiv},
arxivId = {arXiv:1406.3896v1},
author = {Swersky, Kevin and Snoek, Jasper and Adams, Ryan P.},
eprint = {arXiv:1406.3896v1},
file = {:Users/guillaume/Dropbox/Mendeley Desktop/Swersky, Snoek, Adams - 2014 - Freeze-Thaw Bayesian Optimization.pdf:pdf},
journal = {arXiv preprint},
pages = {1--12},
title = {{Freeze-Thaw Bayesian Optimization}},
url = {http://arxiv.org/abs/1406.3896},
year = {2014}
}
@article{Paleologo2010,
abstract = {The logistic regression framework has been for long time the most used statistical method when assessing customer credit risk. Recently, a more pragmatic approach has been adopted, where the first issue is credit risk prediction, instead of explanation. In this context, several classification techniques have been shown to perform well on credit scoring, such as support vector machines among others. While the investigation of better classifiers is an important research topic, the specific methodology chosen in real world applications has to deal with the challenges arising from the real world data collected in the industry. Such data are often highly unbalanced, part of the information can be missing and some common hypotheses, such as the i.i.d. one, can be violated. In this paper we present a case study based on a sample of IBM Italian customers, which presents all the challenges mentioned above. The main objective is to build and validate robust models, able to handle missing information, class unbalancedness and non-iid data points. We define a missing data imputation method and propose the use of an ensemble classification technique, subagging, particularly suitable for highly unbalanced data, such as credit scoring data. Both the imputation and subagging steps are embedded in a customized cross-validation loop, which handles dependencies between different credit requests. The methodology has been applied using several classifiers (kernel support vector machines, nearest neighbors, decision trees, Adaboost) and their subagged versions. The use of subagging improves the performance of the base classifier and we will show that subagging decision trees achieve better performance, still keeping the model simple and reasonably interpretable. © 2009 Elsevier B.V. All rights reserved.},
author = {Paleologo, Giuseppe and Elisseeff, Andr\'{e} and Antonini, Gianluca},
doi = {10.1016/j.ejor.2009.03.008},
file = {:Users/guillaume/Dropbox/Mendeley Desktop/Paleologo, Elisseeff, Antonini - 2010 - Subagging for credit scoring models.pdf:pdf},
issn = {03772217},
journal = {European Journal of Operational Research},
keywords = {Classification,Credit scoring,Decision Support Systems,Risk analysis},
number = {2},
pages = {490--499},
publisher = {Elsevier B.V.},
title = {{Subagging for credit scoring models}},
url = {http://dx.doi.org/10.1016/j.ejor.2009.03.008},
volume = {201},
year = {2010}
}
@article{Wright,
author = {Wright, Jonathan H},
file = {:Users/guillaume/Dropbox/Mendeley Desktop/Wright - Unknown - Bayesian Model Averaging and Exchange Rate Forecasts Jonathan H. Wright.pdf:pdf},
keywords = {bootstrap,exchange rates,forecasting,model uncertainty,shrinkage,jel classification,c32,c53,f31},
number = {1983},
title = {{Bayesian Model Averaging and Exchange Rate Forecasts}}
}
@article{Ruiz2008,
abstract = {The aim of this study is to compare two supervised classification methods on a crucial meteorological problem. The data consist of satellite measurements of cloud systems which are to be classified either in convective or non convective systems. Convective cloud systems correspond to lightning and detecting such systems is of main importance for thunderstorm monitoring and warning. Because the problem is highly unbalanced, we consider specific performance criteria and different strategies. This case study can be used in an advanced course of data mining in order to illustrate the use of logistic regression and random forest on a real data set with unbalanced classes.},
archivePrefix = {arXiv},
arxivId = {0804.0650},
author = {Ruiz, Anne and Villa, Nathalie},
eprint = {0804.0650},
file = {:Users/guillaume/Dropbox/Mendeley Desktop/Ruiz, Villa - 2008 - Storms prediction Logistic regression vs random forest for unbalanced data.pdf:pdf},
pages = {1--11},
title = {{Storms prediction: Logistic regression vs random forest for unbalanced data}},
url = {http://arxiv.org/abs/0804.0650},
year = {2008}
}
@article{Wilson1972,
abstract = {The convergence properties of a nearest neighbor rule that uses an editing procedure to reduce the number of preclassified samples and to improve the performance of the rule are developed. Editing of the preclassified samples using the three-nearest neighbor rule followed by classification using the single-nearest neighbor rule with the remaining preclassified samples appears to produce a decision procedure whose risk approaches the Bayes' risk quite closely in many problems with only a few preclassified samples. The asymptotic risk of the nearest neighbor rules and the nearest neighbor rules using edited preclassified samples is calculated for several problems.},
author = {Wilson, Dennis L.},
doi = {10.1109/TSMC.1972.4309137},
file = {:Users/guillaume/Dropbox/Mendeley Desktop/Wilson - 1972 - Asymptotic Properties of Nearest Neighbor Rules Using Edited Data.pdf:pdf},
isbn = {0018-9472},
issn = {0018-9472},
journal = {IEEE Transactions on Systems, Man, and Cybernetics},
number = {3},
pages = {408--421},
title = {{Asymptotic Properties of Nearest Neighbor Rules Using Edited Data}},
volume = {2},
year = {1972}
}
@article{Achlioptas2001,
author = {Achlioptas, Dimitris},
file = {:Users/guillaume/Dropbox/Mendeley Desktop/Achlioptas - 2001 - Database-friendly Random Projections.pdf:pdf},
isbn = {1581133618},
pages = {274--281},
title = {{Database-friendly Random Projections}},
year = {2001}
}
@article{Fawcett2008,
abstract = {Rules are commonly used for classification because they are modular, intelligible and easy to learn. Existing work in classification rule learning assumes the goal is to produce categorical classifications to maximize classification accuracy. Recent work in machine learning has pointed out the limitations of classification accuracy: when class distributions are skewed, or error costs are unequal, an accuracy maximizing classifier can perform poorly. This paper presents a method for learning rules directly from ROC space when the goal is to maximize the area under the ROC curve (AUC). Basic principles from rule learning and computational geometry are used to focus search for promising rule combinations. The result is a system that can learn intelligible rulelists with good ROC performance.},
author = {Fawcett, Tom},
doi = {10.1007/s10618-008-0089-y},
file = {:Users/guillaume/Dropbox/Mendeley Desktop/Fawcett - 2008 - PRIE A system for generating rulelists to maximize ROC performance.pdf:pdf},
issn = {13845810},
journal = {Data Mining and Knowledge Discovery},
keywords = {Classification,Cost-sensitive learning,ROC analysis,Rule learning},
number = {2},
pages = {207--224},
title = {{PRIE: A system for generating rulelists to maximize ROC performance}},
volume = {17},
year = {2008}
}
@article{Hoeting1999b,
abstract = {Standard statistical practice ignores model uncertainty. Data analysts typically select a model from some class of models and then proceed as if the selected model had generated the data. This approach ignores the uncertainty in model selection, leading to over-confident inferences and decisions that are more risky than one thinks they are. Bayesian model averaging (BMA) provides a coherent mechanism for accounting for this model uncertainty. Several methods for implementing BMA have recently emerged. We discuss these methods and present a number of examples. In these examples, BMA provides improved out-of-sample predictive performance. We also provide a catalogue of currently available BMA software.},
author = {Hoeting, Jennifer A. and Madigan, David and Raftery, Adrian E. and Volinsky, C. T.},
doi = {10.2307/2676803},
file = {:Users/guillaume/Dropbox/Mendeley Desktop/Hoeting et al. - 1999 - Bayesian model averaging a tutorial.pdf:pdf},
issn = {08834237},
journal = {Statistical Science},
keywords = {Bayesian graphical models,Bayesian model averaging,Markov chain Monte Carlo.,learning,model uncertainty},
number = {4},
pages = {382--417},
title = {{Bayesian model averaging: a tutorial}},
url = {http://www.jstor.org/stable/2676803},
volume = {14},
year = {1999}
}
@inproceedings{Swersky,
abstract = {Bayesian optimization has recently been proposed as a framework for automatically tuning the hyperparameters of machine learning models and has been shown to yield state-of-the-art performance with impressive ease and efficiency. In this paper, we explore whether it is possible to transfer the knowledge gained from previous optimizations to new tasks in order to find optimal hyperparameter settings more efficiently. Our approach is based on extending multi-task Gaussian processes to the framework of Bayesian optimization. We show that this method significantly speeds up the optimization process when compared to the standard single-task approach. We further propose a straightforward extension of our algorithm in order to jointly minimize the average error across multiple tasks and demonstrate how this can be used to greatly speed up k-fold cross-validation. Lastly, we propose an adaptation of a recently developed acquisition function, entropy search, to the cost-sensitive, multi-task setting. We demonstrate the utility of this new acquisition function by leveraging a small dataset to explore hyperparameter settings for a large dataset. Our algorithm dynamically chooses which dataset to query in order to yield the most information per unit cost.},
author = {Swersky, Kevin and Snoek, Jasper and Adams, Ryan P.},
booktitle = {Advances in Neural Information Processing Systems 26},
file = {:Users/guillaume/Dropbox/Mendeley Desktop/Swersky, Snoek, Adams - 2013 - Multi-Task Bayesian Optimization.pdf:pdf},
pages = {2004--2012},
title = {{Multi-Task Bayesian Optimization}},
url = {http://papers.nips.cc/paper/5086-multi-task-bayesian-optimization.pdf},
year = {2013}
}
@article{Ataman2007,
author = {Ataman, Kaan},
file = {:Users/guillaume/Dropbox/Mendeley Desktop/Ataman - 2007 - Learning to rank by maximizing the AUC with linear programming for problems with binary output.pdf:pdf},
title = {{Learning to rank by maximizing the AUC with linear programming for problems with binary output}},
year = {2007}
}
@article{Lam2008,
author = {Lam, Patrick},
file = {:Users/guillaume/Dropbox/Mendeley Desktop/Lam - 2008 - MCMC Methods Gibbs Sampling and the Metropolis-Hastings Algorithm.pdf:pdf},
title = {{MCMC Methods: Gibbs Sampling and the Metropolis-Hastings Algorithm}},
year = {2008}
}
@article{Wolpert1997,
abstract = {A framework is developed to explore the connection between effective optimization algorithms and the problems they are solving. A number of “no free lunch” (NFL) theorems are presented which establish that for any algorithm, any elevated performance over one class of problems is offset by performance over another class. These theorems result in a geometric interpretation of what it means for an algorithm to be well suited to an optimization problem. Applications of the NFL theorems to information-theoretic aspects of optimization and benchmark measures of performance are also presented. Other issues addressed include time-varying optimization problems and a priori “head-to-head” minimax distinctions between optimization algorithms, distinctions that result despite the NFL theorems' enforcing of a type of uniformity over all algorithms},
author = {Wolpert, David H. and Macready, William G.},
doi = {10.1109/4235.585893},
file = {:Users/guillaume/Dropbox/Mendeley Desktop/Wolpert, Macready - 1997 - No free lunch theorems for optimization.pdf:pdf},
isbn = {1089778X},
issn = {1089778X},
journal = {IEEE Transactions on Evolutionary Computation},
keywords = {Evolutionary algorithms,Information theory,Optimization},
number = {1},
pages = {67--82},
pmid = {97063701362},
title = {{No free lunch theorems for optimization}},
volume = {1},
year = {1997}
}
@article{Liaw2002,
abstract = {Recently there has been a lot of interest in “ensemble learning” — methods that generate many classifiers and aggregate their results. Two well-known methods are boosting (see, e.g., Shapire et al., 1998) and bagging Breiman (1996) of classification trees. In boosting, successive trees give extra weight to points incorrectly predicted by earlier predictors. In the end, a weighted vote is taken for prediction. In bagging, successive trees do not depend on earlier trees — each is independently constructed using a bootstrap sample of the data set. In the end, a simple majority vote is taken for prediction.},
archivePrefix = {arXiv},
arxivId = {1609-3631},
author = {Liaw, Andy and Wiener, Matthew},
doi = {10.1177/154405910408300516},
eprint = {1609-3631},
file = {:Users/guillaume/Dropbox/Mendeley Desktop/Liaw, Wiener - 2002 - Classification and Regression by randomForest.pdf:pdf},
isbn = {1609-3631},
issn = {16093631},
journal = {R news},
number = {December},
pages = {18--22},
pmid = {21196786},
title = {{Classification and Regression by randomForest}},
volume = {2},
year = {2002}
}
@article{Hand1997,
abstract = {Credit scoring is the term used to describe formal statistical methods used for classifying applicants for credit into 'good' and 'bad' risk classes. Such methods have become increasingly important with the dramatic growth in consumer credit in recent years. A wide range of statistical methods has been applied, though the literature available to the public is limited for reasons of commercial confidentiality. Particular problems arising in the credit scoring context are examined and the statistical methods which have been applied are reviewed.},
author = {Hand, David J. and Henley, W. E.},
file = {:Users/guillaume/Dropbox/Mendeley Desktop/Hand, Henley - 1997 - Statistical Classification Methods in Consumer Credit Scoring a Review.pdf:pdf},
journal = {Journal of the Royal Statistical Society},
keywords = {classification,consumer loans,credit control,credit scoring,discriminant analysis,finance,reject inference,risk assessment},
number = {3},
pages = {523--541},
title = {{Statistical Classification Methods in Consumer Credit Scoring: a Review}},
volume = {160},
year = {1997}
}
@article{Xu2014,
author = {Xu, Wei and Chen, Xi and Coleman, Thomas F},
file = {:Users/guillaume/Dropbox/Mendeley Desktop/Xu, Chen, Coleman - 2014 - The Efficient Application of Automatic Differentiation for Computing Gradients in Financial Applications ∗.pdf:pdf},
pages = {1--24},
title = {{The Efficient Application of Automatic Differentiation for Computing Gradients in Financial Applications}},
year = {2014}
}
@article{Park2008,
abstract = {The Lasso estimate for linear regression parameters can be interpreted as a Bayesian posterior mode estimate when the regression parameters have independent Laplace (i.e., double-exponential) priors. Gibbs sampling from this posterior is possible using an expanded hierarchy with conjugate normal priors for the regression parameters and independent exponential priors on their variances. A connection with the inverse-Gaussian distribution provides tractable full conditional distributions. The Bayesian Lasso provides interval estimates (Bayesian credible intervals) that can guide variable selection. Moreover, the structure of the hierarchical model provides both Bayesian and likelihood methods for selecting the Lasso parameter. Slight modifications lead to Bayesian versions of other Lasso-related estimation methods, including bridge regression and a robust variant.},
author = {Park, Trevor and Casella, George},
doi = {10.1198/016214508000000337},
file = {:Users/guillaume/Dropbox/Mendeley Desktop/Park, Casella - 2008 - The Bayesian Lasso.pdf:pdf},
isbn = {0162-1459},
issn = {0162-1459},
journal = {Journal of the American Statistical Association},
keywords = {empirical bayes,gibbs sampler,hierarchical model,inverse gaussian,linear regression,penalized regression,scale},
number = {482},
pages = {681--686},
pmid = {21156729},
title = {{The Bayesian Lasso}},
volume = {103},
year = {2008}
}
@article{Brefeld2005,
abstract = {The area under the ROC curve (AUC) is a natural performance measure when the goal is to find a discriminative decision function. We present a rigorous derivation of an AUC maximizing Support Vector Machine; its optimization criterion is composed of a convex bound on the AUC and a margin term. The number of constraints in the optimization problem grows quadratically in the number of examples. We discuss an approximation for large data sets that clusters the constraints. Our experiments show that the AUC maximizing Support Vector Machine does in fact lead to higher AUC values.},
author = {Brefeld, Ulf and Scheffer, Tobias},
file = {:Users/guillaume/Dropbox/Mendeley Desktop/Brefeld, Scheffer - 2005 - AUC maximizing support vector learning.pdf:pdf},
journal = {\ldots workshop on ROC Analysis in Machine Learning},
number = {1},
title = {{AUC maximizing support vector learning}},
url = {http://scholar.google.com/scholar?hl=en\&btnG=Search\&q=intitle:AUC+Maximizing+Support+Vector+Learning\#0},
year = {2005}
}
@article{Shotton2013a,
abstract = {Randomized decision trees and forests have a rich history in machine learning and have seen considerable success in application, perhaps particularly so for computer vision. However, they face a fundamental limitation: given enough data, the number of nodes in decision trees will grow exponentially with depth. For certain applications, for example on mobile or embedded processors, memory is a limited resource, and so the exponential growth of trees limits their depth, and thus their potential accuracy. This paper proposes decision jungles, revisiting the idea of ensembles of rooted decision directed acyclic graphs (DAGs), and shows these to be compact and powerful discriminative models for classification. Unlike conventional decision trees that only allow one path to every node, a DAG in a decision jungle allows multiple paths from the root to each leaf. We present and compare two new node merging algorithms that jointly optimize both the features and the structure of the DAGs efficiently. During training, node splitting and node merging are driven by the minimization of exactly the same objective function, here the weighted sum of entropies at the leaves. Results on varied datasets show that, compared to decision forests and several other baselines, decision jungles require dramatically less memory while considerably improving generalization.},
author = {Shotton, Jamie and Nowozin, Sebastian and Sharp, Toby and Winn, John and Kohli, Pushmeet and Criminisi, Antonio},
file = {:Users/guillaume/Dropbox/Mendeley Desktop/Shotton et al. - 2013 - Decision Jungles Compact and Rich Models for Classification.pdf:pdf},
journal = {Advances in Neural \ldots},
pages = {1--9},
title = {{Decision Jungles: Compact and Rich Models for Classification}},
url = {http://papers.nips.cc/paper/5199-decision-jungles-compact-and-rich-models-for-classification},
year = {2013}
}
@article{Smith2002,
abstract = {This tutorial is designed to give the reader an understanding of Principal Components Analysis (PCA). PCA is a useful statistical technique that has found application in fields such as face recognition and image compression, and is a common technique for finding patterns in data of high dimension. Before getting to a description of PCA, this tutorial first introduces mathematical concepts that will be used in PCA. It covers standard deviation, covariance, eigenvectors and eigenvalues. This background knowledge is meant to make the PCA section very straightforward, but can be skipped if the concepts are already familiar. There are examples all the way through this tutorial that are meant to illustrate the concepts being discussed. If further information is required, the mathematics textbook Elementary Linear Algebra 5e by Howard Anton, Publisher John Wiley \& Sons Inc, ISBN 0-471-85223-6 is a good source of information regarding the mathematical background.},
author = {Smith, Lindsay I},
doi = {10.1080/03610928808829796},
file = {:Users/guillaume/Dropbox/Mendeley Desktop/Smith - 2002 - A tutorial on Principal Components Analysis Introduction.pdf:pdf},
isbn = {0471852236},
issn = {03610926},
journal = {Statistics},
pages = {52},
pmid = {16765218},
title = {{A tutorial on Principal Components Analysis}},
url = {http://www.mendeley.com/research/computational-genome-analysis-an-introduction-statistics-for-biology-and-health/},
volume = {51},
year = {2002}
}
@techreport{Goncalves2015,
author = {Gon\c{c}alves, Paulo},
file = {:Users/guillaume/Dropbox/Mendeley Desktop/Gon\c{c}alves - 2015 - Echantillonneurs de Monte Carlo Application \`{a} l’Estimation Statistique Bay\'{e}sienne.pdf:pdf},
title = {{Echantillonneurs de Monte Carlo Application \`{a} l’Estimation Statistique Bay\'{e}sienne}},
year = {2015}
}
@article{Nowozin2012,
abstract = {Ensembles of classification and regression trees remain popular machine learning methods because they define flexible nonparametric models that predict well and are computationally efficient both during training and testing. During induction of decision trees one aims to find predicates that are maximally informative about the prediction target. To select good predicates most approaches estimate an information-theoretic scoring function, the information gain, both for classification and regression problems. We point out that the common estimation procedures are biased and show that by replacing them with improved estimators of the discrete and the differential entropy we can obtain better decision trees. In effect our modifications yield improved predictive performance and are simple to implement in any decision tree code.},
archivePrefix = {arXiv},
arxivId = {1206.4620},
author = {Nowozin, Sebastian},
eprint = {1206.4620},
file = {:Users/guillaume/Dropbox/Mendeley Desktop/Nowozin - 2012 - Improved Information Gain Estimates for Decision Tree Induction.pdf:pdf},
isbn = {978-1-4503-1285-1},
journal = {Proceedings of the 29th International Conference on Machine Learning (ICML-12)},
pages = {297--304},
title = {{Improved Information Gain Estimates for Decision Tree Induction}},
year = {2012}
}
@article{Hinton,
author = {Hinton, Geoffrey and Vinyals, Oriol and Dean, Jeffrey},
file = {:Users/guillaume/Dropbox/Mendeley Desktop/Hinton, Vinyals, Dean - Unknown - Dark knowledge.pdf:pdf},
title = {{Dark knowledge}}
}
@techreport{Jahrer2009a,
author = {T\"{o}scher, Andreas and Jahrer, Michael and Bell, Robert M.},
file = {:Users/guillaume/Dropbox/Mendeley Desktop/T\"{o}scher, Jahrer, Bell - 2009 - The BigChaos Solution to the Netflix Grand Prize.pdf:pdf},
pages = {1--52},
title = {{The BigChaos Solution to the Netflix Grand Prize}},
year = {2009}
}
@article{Stefanowski,
author = {Stefanowski, Jerzy},
file = {:Users/guillaume/Dropbox/Mendeley Desktop/Stefanowski - Unknown - Random forest.pdf:pdf},
title = {{Random forest}}
}
@article{Wilson1997,
abstract = {Instance-based learning techniques typically handle continuous and linear input values well, but often do not handle nominal input attributes appropriately. The Value Difference Metric (VDM) was designed to find reasonable distance values between nominal attribute values, but it largely ignores continuous attributes, requiring discretization to map continuous values into nominal values. This paper proposes three new heterogeneous distance functions, called the Heterogeneous Value Difference Metric (HVDM), the Interpolated Value Difference Metric (IVDM), and the Windowed Value Difference Metric (WVDM). These new distance functions are designed to handle applications with nominal attributes, continuous attributes, or both. In experiments on 48 applications the new distance metrics achieve higher classification accuracy on average than three previous distance functions on those datasets that have both nominal and continuous attributes.},
archivePrefix = {arXiv},
arxivId = {cs/9701101},
author = {Wilson, D. Randall and Martinez, Tony R.},
doi = {10.1613/jair.346},
eprint = {9701101},
file = {:Users/guillaume/Dropbox/Mendeley Desktop/Wilson, Martinez - 1997 - Improved heterogeneous distance functions.pdf:pdf},
issn = {10769757},
journal = {Journal of Artificial Intelligence Research},
pages = {1--34},
primaryClass = {cs},
title = {{Improved heterogeneous distance functions}},
volume = {6},
year = {1997}
}
@article{Arlot2014,
abstract = {Random forests are a very effective and commonly used statistical method, but their full theoretical analysis is still an open problem. As a first step, simplified models such as purely random forests have been introduced, in order to shed light on the good performance of random forests. In this paper, we study the approximation error (the bias) of some purely random forest models in a regression framework, focusing in particular on the influence of the number of trees in the forest. Under some regularity assumptions on the regression function, we show that the bias of an infinite forest decreases at a faster rate (with respect to the size of each tree) than a single tree. As a consequence, infinite forests attain a strictly better risk rate (with respect to the sample size) than single trees. Furthermore, our results allow to derive a minimum number of trees sufficient to reach the same rate as an infinite forest. As a by-product of our analysis, we also show a link between the bias of purely random forests and the bias of some kernel estimators. 1},
archivePrefix = {arXiv},
arxivId = {arXiv:1407.3939v1},
author = {Arlot, Sylvain and Genuer, Robin},
eprint = {arXiv:1407.3939v1},
file = {:Users/guillaume/Dropbox/Mendeley Desktop/Arlot, Genuer - 2014 - Analysis of purely random forests bias.pdf:pdf},
number = {2008},
title = {{Analysis of purely random forests bias}},
year = {2014}
}
@article{Bengio2013,
abstract = {The success of machine learning algorithms generally depends on data representation, and we hypothesize that this is because different representations can entangle and hide more or less the different explanatory factors of variation behind the data. Although specific domain knowledge can be used to help design representations, learning with generic priors can also be used, and the quest for AI is motivating the design of more powerful representation-learning algorithms implementing such priors. This paper reviews recent work in the area of unsupervised feature learning and deep learning, covering advances in probabilistic models, auto-encoders, manifold learning, and deep networks. This motivates longer-term unanswered questions about the appropriate objectives for learning good representations, for computing representations (i.e., inference), and the geometrical connections between representation learning, density estimation and manifold learning.},
archivePrefix = {arXiv},
arxivId = {1206.5538},
author = {Bengio, Yoshua and Courville, Aaron and Vincent, Pascal},
doi = {10.1109/TPAMI.2013.50},
eprint = {1206.5538},
file = {:Users/guillaume/Dropbox/Mendeley Desktop/Bengio, Courville, Vincent - 2013 - Representation learning A review and new perspectives.pdf:pdf},
isbn = {0162-8828 VO - 35},
issn = {01628828},
journal = {Pattern Analysis and \ldots},
number = {1993},
pages = {1--30},
pmid = {23459267},
title = {{Representation learning: A review and new perspectives}},
url = {http://arxiv.org/abs/1206.5538$\backslash$nhttp://ieeexplore.ieee.org/xpls/abs\_all.jsp?arnumber=6472238},
year = {2013}
}
@book{Vapnik2000,
abstract = {The aim of this book is to discuss the fundamental ideas which lie behind the statistical theory of learning and generalization. It considers learning as a general problem of function estimation based on empirical data. Omitting proofs and technical details, the author concentrates on discussing the main results of learning theory and their connections to fundamental problems in statistics. These include: * the setting of learning problems based on the model of minimizing the risk functional from empirical data * a comprehensive analysis of the empirical risk minimization principle including necessary and sufficient conditions for its consistency * non-asymptotic bounds for the risk achieved using the empirical risk minimization principle * principles for controlling the generalization ability of learning machines using small sample sizes based on these bounds * the Support Vector methods that control the generalization ability when estimating function using small sample size. The second edition of the book contains three new chapters devoted to further development of the learning theory and SVM techniques. These include: * the theory of direct method of learning based on solving multidimensional integral equations for density, conditional probability, and conditional density estimation * a new inductive principle of learning. Written in a readable and concise style, the book is intended for statisticians, mathematicians, physicists, and computer scientists. Vladimir N. Vapnik is Technology Leader AT\&T Labs-Research and Professor of London University. He is one of the founders of statistical learning theory, and the author of seven books published in English, Russian, German, and Chinese.},
author = {Vapnik, Vladimir},
file = {:Users/guillaume/Dropbox/Mendeley Desktop/Vapnik - 2000 - The Nature of Statistical Learning Theory.pdf:pdf},
isbn = {0387987800},
pages = {314},
title = {{The Nature of Statistical Learning Theory}},
url = {http://books.google.com/books?hl=es\&id=sna9BaxVbj8C\&pgis=1$\backslash$nhttp://infoscience.epfl.ch/record/82790/files/com02-04.pdf},
year = {2000}
}
@article{Aronszajn1950,
author = {Aronszajn, N.},
doi = {10.1090/S0002-9947-1950-0051437-7},
file = {:Users/guillaume/Dropbox/Mendeley Desktop/Aronszajn - 1950 - Theory of reproducing kernels.pdf:pdf},
issn = {0002-9947},
journal = {Transactions of the American Mathematical Society},
keywords = {RKHS},
mendeley-tags = {RKHS},
month = mar,
number = {3},
pages = {337--404},
title = {{Theory of reproducing kernels}},
url = {http://www.ams.org/tran/1950-68-03/S0002-9947-1950-0051437-7/},
volume = {68},
year = {1950}
}
@techreport{Gonzalez,
author = {Gonzalez, Pierre-louis},
file = {:Users/guillaume/Dropbox/Mendeley Desktop/Gonzalez - Unknown - Segmentation.pdf:pdf},
title = {{Segmentation}}
}
@article{Barros2014,
abstract = {Decision-tree induction algorithms are widely used in knowledge discovery and data mining, specially in scenarios where model comprehensibility is desired. A variation of the traditional univariate approach is the so-called oblique decision tree, which allows multivariate tests in its non-terminal nodes. Oblique decision trees can model decision boundaries that are oblique to the attribute axes, whereas univariate trees can only perform axis-parallel splits. The vast majority of the oblique and univariate decision-tree induction algorithms employ a top-down strategy for growing the tree, relying on an impurity-based measure for splitting nodes. In this paper, we propose BUTIF - a novel Bottom-Up Oblique Decision-Tree Induction Framework. BUTIF does not rely on an impurity-measure for dividing nodes, since the data resulting from each split is known a priori. For generating the initial leaves of the tree and the splitting hyperplanes in its internal nodes, BUTIF allows the adoption of distinct clustering algorithms and binary classifiers, respectively. It is also capable of performing embedded feature selection, which may reduce the number of features in each hyperplane, thus improving model comprehension. Different from virtually every top-down decision-tree induction algorithm, BUTIF does not require the further execution of a pruning procedure in order to avoid overfitting, due to its bottom-up nature that does not overgrow the tree. We compare distinct instances of BUTIF to traditional univariate and oblique decision-tree induction algorithms. Empirical results show the effectiveness of the proposed framework. © 2014 Elsevier B.V.},
author = {Barros, Rodrigo C. and Jaskowiak, Pablo A. and Cerri, Ricardo and {De Carvalho}, Andre C. P. L. F.},
doi = {10.1016/j.neucom.2013.01.067},
file = {:Users/guillaume/Dropbox/Mendeley Desktop/Barros et al. - 2014 - A framework for bottom-up induction of oblique decision trees.pdf:pdf},
issn = {18728286},
journal = {Neurocomputing},
keywords = {Bottom-up induction,Clustering,Oblique decision trees},
number = {August 2015},
pages = {3--12},
title = {{A framework for bottom-up induction of oblique decision trees}},
volume = {135},
year = {2014}
}
@inproceedings{Ram2011,
abstract = {In this paper we develop density estimation trees (DETs), the natural analog of classification trees and regression trees, for the task of density estimation. We consider the estimation of a joint probability density function of a d-dimensional random vector X and define a piecewise constant estimator structured as a decision tree. The integrated squared error is minimized to learn the tree. We show that the method is nonparametric: under standard conditions of nonparametric density estimation, DETs are shown to be asymptotically consistent. In addition, being decision trees, DETs perform automatic feature selection. They empirically exhibit the interpretability, adaptability and feature selection properties of supervised decision trees while incurring slight loss in accuracy over other nonparametric density estimators. Hence they might be able to avoid the curse of dimensionality if the true density is sparse in dimensions. We believe that density estimation trees provide a new tool for exploratory data analysis with unique capabilities.},
author = {Ram, Parikshit and Gray, Alexander G.},
doi = {10.1145/2020408.2020507},
file = {:Users/guillaume/Dropbox/Mendeley Desktop/Ram et al. - 2011 - Density Estimation Trees.pdf:pdf},
isbn = {9781450308137},
booktitle = {Proceedings of the 17th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining (KDD)},
keywords = {data analysis,decision trees,density estimation},
pages = {627--635},
title = {{Density Estimation Trees}},
url = {http://users.cis.fiu.edu/~lzhen001/activities/KDD2011Program/docs/p627.pdf},
year = {2011}
}
@article{Ogutu2011,
abstract = {Genomic selection (GS) involves estimating breeding values using molecular markers spanning the entire genome. Accurate prediction of genomic breeding values (GEBVs) presents a central challenge to contemporary plant and animal breeders. The existence of a wide array of marker-based approaches for predicting breeding values makes it essential to evaluate and compare their relative predictive performances to identify approaches able to accurately predict breeding values. We evaluated the predictive accuracy of random forests (RF), stochastic gradient boosting (boosting) and support vector machines (SVMs) for predicting genomic breeding values using dense SNP markers and explored the utility of RF for ranking the predictive importance of markers for pre-screening markers or discovering chromosomal locations of QTLs.},
author = {Ogutu, Joseph O and Piepho, Hans-Peter and Schulz-Streeck, Torben},
doi = {10.1186/1753-6561-5-S3-S11},
file = {:Users/guillaume/Dropbox/Mendeley Desktop/Ogutu, Piepho, Schulz-Streeck - 2011 - A comparison of random forests, boosting and support vector machines for genomic selection.pdf:pdf},
isbn = {1753-6561 (Electronic)$\backslash$r1753-6561 (Linking)},
issn = {1753-6561},
journal = {BMC proceedings},
number = {Suppl 3},
pages = {S11},
pmid = {21624167},
publisher = {BioMed Central Ltd},
title = {{A comparison of random forests, boosting and support vector machines for genomic selection.}},
url = {http://www.biomedcentral.com/1753-6561/5/S3/S11},
volume = {5 Suppl 3},
year = {2011}
}
@inproceedings{Menze2011,
abstract = {In his original paper on random forests, Breiman proposed two different decision tree ensembles: one generated from “orthogonal” trees with thresholds on individual features in every split, and one from “oblique” trees separating the feature space by randomly oriented hyperplanes. In spite of a rising interest in the random forest framework, however, ensembles built from orthogonal trees (RF) have gained most, if not all, attention so far. In the present work we propose to employ “oblique” random forests (oRF) built from multivariate trees which explicitly learn optimal split directions at internal nodes using linear discriminative models, rather than using random coefficients as the original oRF. This oRF outperforms RF, as well as other classifiers, on nearly all data sets but those with discrete factorial features. Learned node models perform distinctively better than random splits. An oRF feature importance score shows to be preferable over standard RF feature importance scores such as Gini or permutation importance. The topology of the oRF decision space appears to be smoother and better adapted to the data, resulting in improved generalization performance. Overall, the oRF proposed here may be preferred over standard RF on most learning tasks involving numerical and spectral data.},
author = {Menze, Bjoern H. and Kelm, B. Michael and Splitthoff, Daniel N. and Koethe, Ullrich and Hamprecht, Fred A.},
booktitle = {Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)},
doi = {10.1007/978-3-642-23783-6\_29},
file = {:Users/guillaume/Dropbox/Mendeley Desktop/Menze et al. - 2011 - On oblique random forests.pdf:pdf},
isbn = {9783642237829},
issn = {03029743},
number = {PART 2},
pages = {453--469},
title = {{On oblique random forests}},
volume = {6912 LNAI},
year = {2011}
}
@article{Lin2013,
abstract = {A class-imbalanced classifier is a decision rule to predict the class membership of new samples from an available data set where the class sizes differ considerably. When the class sizes are very different, most standard classification algorithms may favor the larger (majority) class resulting in poor accuracy in the minority class prediction. A class-imbalanced classifier typically modifies a standard classifier by a correction strategy or by incorporating a new strategy in the training phase to account for differential class sizes. This article reviews and evaluates some most important methods for class prediction of high-dimensional imbalanced data. The evaluation addresses the fundamental issues of the class-imbalanced classification problem: imbalance ratio, small disjuncts and overlap complexity, lack of data and feature selection. Four class-imbalanced classifiers are considered. The four classifiers include three standard classification algorithms each coupled with an ensemble correction strategy and one support vector machines (SVM)-based correction classifier. The three algorithms are (i) diagonal linear discriminant analysis (DLDA), (ii) random forests (RFs) and (iii) SVMs. The SVM-based correction classifier is SVM threshold adjustment (SVM-THR). A Monte-Carlo simulation and five genomic data sets were used to illustrate the analysis and address the issues. The SVM-ensemble classifier appears to perform the best when the class imbalance is not too severe. The SVM-THR performs well if the imbalance is severe and predictors are highly correlated. The DLDA with a feature selection can perform well without using the ensemble correction.},
author = {Lin, Wei Jiun and Chen, James J.},
doi = {10.1093/bib/bbs006},
file = {:Users/guillaume/Dropbox/Mendeley Desktop/Lin, Chen - 2013 - Class-imbalanced classifiers for high-dimensional data.pdf:pdf},
isbn = {1477-4054},
issn = {14675463},
journal = {Briefings in Bioinformatics},
keywords = {Class-imbalanced prediction,Feature selection,Lack of data,Performance metrics,Threshold adjustment,Under-sampling ensemble},
number = {1},
pages = {13--26},
pmid = {22408190},
title = {{Class-imbalanced classifiers for high-dimensional data}},
volume = {14},
year = {2013}
}
@article{Furry,
abstract = {We introduce canonical correlation forests (CCFs), a new decision tree ensemble method for classification. Individual canonical correlation trees are binary decision trees with hyperplane splits based on canonical correlation components. Unlike axis-aligned alternatives, the decision surfaces of CCFs are not restricted to the coordinate system of the input features and therefore more naturally represents data with correlation between the features. Additionally we introduce a novel alternative to bagging, the projection bootstrap, which maintains use of the full dataset in selecting split points. CCFs do not require parameter tuning and our experiments show that they significantly out-perform axis-aligned random forests and other state-of-the-art tree ensemble methods.},
archivePrefix = {arXiv},
arxivId = {1507.05444},
author = {Rainforth, Tom and Wood, Frank},
eprint = {1507.05444},
file = {:Users/guillaume/Dropbox/Mendeley Desktop/Rainforth, Wood - Unknown - Canonical Correlation Forests.pdf:pdf},
pages = {1--13},
title = {{Canonical Correlation Forests}}
}
@article{Chipman2008,
abstract = {We develop a Bayesian "sum-of-trees" model where each tree is constrained by a regularization prior to be a weak learner, and fitting and inference are accomplished via an iterative Bayesian backfitting MCMC algorithm that generates samples from a posterior. Effectively, BART is a nonparametric Bayesian regression approach which uses dimensionally adaptive random basis elements. Motivated by ensemble methods in general, and boosting algorithms in particular, BART is defined by a statistical model: a prior and a likelihood. This approach enables full posterior inference including point and interval estimates of the unknown regression function as well as the marginal effects of potential predictors. By keeping track of predictor inclusion frequencies, BART can also be used for model-free variable selection. BART's many features are illustrated with a bake-off against competing methods on 42 different data sets, with a simulation experiment and on a drug discovery classification problem.},
archivePrefix = {arXiv},
arxivId = {0806.3286},
author = {Chipman, Hugh A. and George, Edward I. and McCulloch, Robert E.},
doi = {10.1214/09-AOAS285},
eprint = {0806.3286},
file = {:Users/guillaume/Dropbox/Mendeley Desktop/Chipman, George, McCulloch - 2008 - BART Bayesian additive regression trees.pdf:pdf},
isbn = {1932-6157},
issn = {19326157},
journal = {Annals of Applied Statistics},
keywords = {Bayesian backfitting,Boosting,CART,Classification,Ensemble,MCMC,Nonparametric regression,Probit model,Random basis,Regularization,Sum-of-trees model,Variable selection,Weak learner},
number = {1},
pages = {266--298},
title = {{BART: Bayesian additive regression trees}},
volume = {6},
year = {2008}
}
@article{Capriotti2010,
abstract = {We show how Algorithmic Differentiation can be used to implement efficiently the Pathwise Derivative method for the calculation of option sensitivities with Monte Carlo. The main practical difficulty of the Pathwise Derivative method is that it requires the differentiation of the payout function. For the type of structured options for which Monte Carlo simulations are usually employed, these derivatives are typically cumbersome to calculate analytically, and too time consuming to evaluate with standard finite-differences approaches. In this paper we address this problem and show how Algorithmic Differentiation can be employed to calculate very efficiently and with machine precision accuracy these derivatives. We illustrate the basic workings of this computational technique by means of simple examples, and we demonstrate with several numerical tests how the Pathwise Derivative method combined with Algorithmic Differentiation – especially in the adjoint mode – can provide speed-ups of several orders of magnitude with respect to standard methods},
author = {Capriotti, Luca},
file = {:Users/guillaume/Dropbox/Mendeley Desktop/Capriotti - 2010 - Fast Greeks by Algorithmic Differentiation.pdf:pdf},
journal = {SSRN},
keywords = {algorithmic differentiation,derivatives pricing,monte carlo simulations},
number = {3},
pages = {1--15},
title = {{Fast Greeks by Algorithmic Differentiation}},
url = {http://papers.ssrn.com/sol3/papers.cfm?abstract\_id=1619626 http://www.luca-capriotti.net/pdfs/Finance/GD11LucaCapriotti.pdf},
volume = {14},
year = {2010}
}
@techreport{Orhan2014,
author = {Orhan, Emin},
file = {:Users/guillaume/Dropbox/Mendeley Desktop/Orhan - 2014 - Cover’s Function Counting Theorem ( 1965 ).pdf:pdf},
number = {1965},
pages = {1--2},
title = {{Cover’s Function Counting Theorem (1965)}},
year = {2014}
}
@misc{Buuren2015,
author = {van Buuren, Stef},
file = {:Users/guillaume/Dropbox/Mendeley Desktop/Buuren - 2015 - Package ‘ mice ’.pdf:pdf},
title = {{Package ‘mice’}},
year = {2015}
}
@article{Wolpert1992,
abstract = {This paper introduces stacked generalization, a scheme for minimizing the generalization error rate of one or more generalizers. Stacked generalization works by deducing the biases of the generalizer(s) with respect to a provided learning set. This deduction proceeds by generalizing in a second space whose inputs are (for example) the guesses of the original generalizers when taught with part of the learning set and trying to guess the rest of it, and whose output is (for example) the correct guess. When used with multiple generalizers, stacked generalization can be seen as a more sophisticated version of cross-validation, exploiting a strategy more sophisticated than cross-validation's crude winner-takes-all for combining the individual generalizers. When used with a single generalizer, stacked generalization is a scheme for estimating (and then correcting for) the error of a generalizer which has been trained on a particular learning set and then asked a particular question. After...},
author = {Wolpert, David H.},
doi = {10.1016/S0893-6080(05)80023-1},
file = {:Users/guillaume/Dropbox/Mendeley Desktop/Wolpert - 1992 - Stacked generalization.pdf:pdf},
isbn = {0893-6080},
issn = {08936080},
journal = {Neural Networks},
number = {2},
pages = {241--259},
pmid = {17947137},
title = {{Stacked generalization}},
volume = {5},
year = {1992}
}
@article{Thakur2015,
abstract = {In this paper, we propose AutoCompete, a highly automated machine learning framework for tackling machine learning competitions. This framework has been learned by us, validated and improved over a period of more than two years by participating in online machine learning competitions. It aims at minimizing human interference required to build a first useful predictive model and to assess the practical difficulty of a given machine learning challenge. The proposed system helps in identifying data types, choosing a machine learning model, tuning hyper-parameters, avoiding over-fitting and optimization for a provided evaluation metric. We also observe that the proposed system produces better (or comparable) results with less runtime as compared to other approaches.},
archivePrefix = {arXiv},
arxivId = {1507.02188},
author = {Thakur, Abhishek and Krohn-Grimberghe, Artus},
eprint = {1507.02188},
file = {:Users/guillaume/Dropbox/Mendeley Desktop/Thakur, Krohn-Grimberghe - 2015 - AutoCompete A Framework for Machine Learning Competition.pdf:pdf},
keywords = {auto-machine learning,predictive modelling},
title = {{AutoCompete: A Framework for Machine Learning Competition}},
url = {http://arxiv.org/abs/1507.02188},
year = {2015}
}
@article{Gneiting2007,
abstract = {Scoring rules assess the quality of probabilistic forecasts, by assigning a numerical score based on the predictive distribution and on the event or value that materializes. A scoring rule is proper if the forecaster maximizes the expected score for an observation drawn from the distribution F if he or she issues the probabilistic forecast F, rather than G ≠ F. It is strictly proper if the maximum is unique. In prediction problems, proper scoring rules encourage the forecaster to make careful assessments and to be honest. In estimation problems, strictly proper scoring rules provide attractive loss and utility functions that can be tailored to the problem at hand. This article reviews and develops the theory of proper scoring rules on general probability spaces, and proposes and discusses examples thereof. Proper scoring rules derive from convex functions and relate to information measures, entropy functions, and Bregman divergences. In the case of categorical variables, we prove a rigorous version of the Savage representation. Examples of scoring rules for probabilistic forecasts in the form of predictive densities include the logarithmic, spherical, pseudospherical, and quadratic scores. The continuous ranked probability score applies to probabilistic forecasts that take the form of predictive cumulative distribution functions. It generalizes the absolute error and forms a special case of a new and very general type of score, the energy score. Like many other scoring rules, the energy score admits a kernel representation in terms of negative definite functions, with links to inequalities of Hoeffding type, in both univariate and multivariate settings. Proper scoring rules for quantile and interval forecasts are also discussed. We relate proper scoring rules to Bayes factors and to cross-validation, and propose a novel form of cross-validation known as random-fold cross-validation. A case study on probabilistic weather forecasts in the North American Pacific Northwest illustrates the importance of propriety. We note optimum score approaches to point and quantile estimation, and propose the intuitively appealing interval score as a utility function in interval estimation that addresses width as well as coverage.},
author = {Gneiting, Tilmann and Raftery, Adrian E},
doi = {10.1198/016214506000001437},
file = {:Users/guillaume/Dropbox/Mendeley Desktop/Gneiting, Raftery - 2007 - Strictly Proper Scoring Rules, Prediction, and Estimation.pdf:pdf},
isbn = {0162-1459},
issn = {0162-1459},
journal = {Journal of the American Statistical Association},
number = {477},
pages = {359--378},
title = {{Strictly Proper Scoring Rules, Prediction, and Estimation}},
volume = {102},
year = {2007}
}
@article{Nan2015,
abstract = {We seek decision rules for prediction-time cost reduction, where complete data is available for training, but during prediction-time, each feature can only be acquired for an additional cost. We propose a novel random forest algorithm to minimize prediction error for a user-specified average feature acquisition budget. While random forests yield strong generalization performance, they do not explicitly account for feature costs and furthermore require low correlation among trees, which amplifies costs. Our random forest grows trees with low acquisition cost and high strength based on greedy minimax cost-weighted-impurity splits. Theoretically, we establish near-optimal acquisition cost guarantees for our algorithm. Empirically, on a number of benchmark datasets we demonstrate competitive accuracy-cost curves against state-of-the-art prediction-time},
author = {Nan, Feng and Wang, Joseph and Saligrama, Venkatesh},
file = {:Users/guillaume/Dropbox/Mendeley Desktop/Nan, Wang, Saligrama - 2015 - Feature-Budgeted Random Forest.pdf:pdf},
title = {{Feature-Budgeted Random Forest}},
volume = {37},
year = {2015}
}
@article{Cortes2003,
abstract = {The area under an ROC curve (AUC) is a criterion used in many applications to measure the quality of a classification algorithm. However, the objective function optimized in most of these algorithms is the error rate and not the AUC value. We give a detailed statistical analysis of the relationship between the AUC and the error rate, including the first exact expression of the expected value and the variance of the AUC for a fixed error rate. Our results show that the average AUC is monotonically increasing as a function of the classification accuracy, but that the standard deviation for uneven distributions and higher error rates is noticeable. Thus, algorithms designed to minimize the error rate may not lead to the best possible AUC values. We show that, under certain conditions, the global function optimized by the RankBoost algorithm is exactly the AUC. We report the results of our experiments with RankBoost in several datasets demonstrating the benefits of an algorithm specifically designed to globally optimize the AUC over other existing algorithms optimizing an approximation of the AUC or only locally optimizing the AUC.},
author = {Cortes, Corinna and Mohri, Mehryar},
doi = {10.1.1.9.3518},
file = {:Users/guillaume/Dropbox/Mendeley Desktop/Cortes, Mohri - 2003 - AUC Optimization vs. Error Rate Minimization.pdf:pdf},
isbn = {0262201526},
issn = {10495258},
journal = {Advances in Neural Information Processing Systems},
pages = {313--320},
title = {{AUC Optimization vs. Error Rate Minimization.}},
url = {https://papers.nips.cc/paper/2518-auc-optimization-vs-error-rate-minimization.pdf},
year = {2003}
}
@article{Menon2009,
abstract = {Support Vector Machines (SVMs) are a very popular method for binary classification. Traditional training algorithms for SVMs, such as chunking and SMO},
author = {Menon, Aditya Krishna},
file = {:Users/guillaume/Dropbox/Mendeley Desktop/Menon - 2009 - Large-scale support vector machines algorithms and theory.pdf:pdf},
journal = {Research Exam, University of California, San Diego},
title = {{Large-scale support vector machines: algorithms and theory}},
url = {http://cseweb.ucsd.edu/~akmenon/ResearchExamTalk.pdf},
year = {2009}
}
@article{Snoek2012a,
abstract = {Machine learning algorithms frequently require careful tuning of model hyperparameters, regularization terms, and optimization parameters. Unfortunately, this tuning is often a "black art" that requires expert experience, unwritten rules of thumb, or sometimes brute-force search. Much more appealing is the idea of developing automatic approaches which can optimize the performance of a given learning algorithm to the task at hand. In this work, we consider the automatic tuning problem within the framework of Bayesian optimization, in which a learning algorithm's generalization performance is modeled as a sample from a Gaussian process (GP). The tractable posterior distribution induced by the GP leads to efficient use of the information gathered by previous experiments, enabling optimal choices about what parameters to try next. Here we show how the effects of the Gaussian process prior and the associated inference procedure can have a large impact on the success or failure of Bayesian optimization. We show that thoughtful choices can lead to results that exceed expert-level performance in tuning machine learning algorithms. We also describe new algorithms that take into account the variable cost (duration) of learning experiments and that can leverage the presence of multiple cores for parallel experimentation. We show that these proposed algorithms improve on previous automatic procedures and can reach or surpass human expert-level optimization on a diverse set of contemporary algorithms including latent Dirichlet allocation, structured SVMs and convolutional neural networks.},
archivePrefix = {arXiv},
arxivId = {1206.2944},
author = {Snoek, Jasper and Larochelle, Hugo and Adams, Ryan P.},
eprint = {1206.2944},
file = {:Users/guillaume/Dropbox/Mendeley Desktop/Snoek, Larochelle, Adams - 2012 - Practical Bayesian Optimization of Machine Learning Algorithms.pdf:pdf},
isbn = {9781627480031},
journal = {arXiv preprint arXiv:1206.2944},
pages = {1--12},
title = {{Practical Bayesian Optimization of Machine Learning Algorithms}},
url = {http://arxiv.org/abs/1206.2944},
year = {2012}
}
@article{Taddy2015,
abstract = {We derive ensembles of decision trees through a nonparametric Bayesian model, allowing us to view random forests as samples from a posterior distribution. This insight provides large gains in interpretability, and motivates a class of Bayesian forest (BF) algorithms that yield small but reliable performance gains. Based on the BF framework, we are able to show that high-level tree hierarchy is stable in large samples. This leads to an empirical Bayesian forest (EBF) algorithm for building approximate BFs on massive distributed datasets and we show that EBFs outperform subsampling based alternatives by a large margin.},
archivePrefix = {arXiv},
arxivId = {arXiv:1502.02312v2},
author = {Taddy, Matt and Chen, Chun-sheng and Wyle, Mitch},
eprint = {arXiv:1502.02312v2},
file = {:Users/guillaume/Dropbox/Mendeley Desktop/Taddy et al. - 2015 - Bayesian and Empirical Bayesian Forests.pdf:pdf},
title = {{Bayesian and Empirical Bayesian Forests}},
volume = {37},
year = {2015}
}
@book{Bengio2015,
author = {Bengio, Yoshua and Goodfellow, Ian J. and Courville, Aaron},
publisher = {MIT Press},
title = {{Deep Learning}},
url = {http://www.iro.umontreal.ca/~bengioy/dlbook},
year = {2015}
}
@article{Bergstra2012,
abstract = {Grid search and manual search are the most widely used strategies for hyper-parameter optimization. This paper shows empirically and theoretically that randomly chosen trials are more efficient for hyper-parameter optimization than trials on a grid. Empirical evidence comes from a comparison with a large previous study that used grid search and manual search to configure neural networks and deep belief networks. Compared with neural networks configured by a pure grid search, we find that random search over the same domain is able to find models that are as good or better within a small fraction of the computation time. Granting random search the same computational budget, random search finds better models by effectively searching a larger, less promising configuration space. Compared with deep belief networks configured by a thoughtful combination of manual search and grid search, purely random search over the same 32-dimensional configuration space found statistically equal performance on four of seven data sets, and superior performance on one of seven. A Gaussian process analysis of the function from hyper-parameters to validation set performance reveals that for most data sets only a few of the hyper-parameters really matter, but that different hyper-parameters are important on different data sets. This phenomenon makes grid search a poor choice for configuring algorithms for new data sets. Our analysis casts some light on why recent “High Throughput” methods achieve surprising success—they appear to search through a large number of hyper-parameters because most hyper-parameters do not matter much. We anticipate that growing interest in large hierarchical models will place an increasing burden on techniques for hyper-parameter optimization; this work shows that random search is a natural baseline against which to judge progress in the development of adaptive (sequential) hyper-parameter optimization algorithms.},
author = {Bergstra, James and Bengio, Yoshua},
file = {:Users/guillaume/Dropbox/Mendeley Desktop/Bergstra, Bengio - 2012 - Random Search for Hyper-Parameter Optimization.pdf:pdf},
isbn = {1532-4435},
issn = {1532-4435},
journal = {Journal of Machine Learning Research},
keywords = {deep learning,global optimization,model selection,neural networks,response surface},
pages = {281--305},
title = {{Random Search for Hyper-Parameter Optimization}},
volume = {13},
year = {2012}
}
@article{Taylor2015,
author = {Taylor, Phillip and Griffiths, Nathan and Bhalerao, Abhir},
file = {:Users/guillaume/Dropbox/Mendeley Desktop/Taylor, Griffiths, Bhalerao - 2015 - Redundant Feature Selection using Permutation Methods.pdf:pdf},
journal = {ICML 2014 AutoML Workshop},
title = {{Redundant Feature Selection using Permutation Methods}},
year = {2015}
}
@article{Pittman2004,
abstract = {Classification tree models are flexible analysis tools which have the ability to evaluate interactions among predictors as well as generate predictions for responses of interest. We describe Bayesian analysis of a specific class of tree models in which binary response data arise from a retrospective case-control design. We are also particularly interested in problems with potentially very many candidate predictors. This scenario is common in studies concerning gene expression data, which is a key motivating example context. Innovations here include the introduction of tree models that explicitly address and incorporate the retrospective design, and the use of nonparametric Bayesian models involving Dirichlet process priors on the distributions of predictor variables. The model specification influences the generation of trees through Bayes' factor based tests of association that determine significant binary partitions of nodes during a process of forward generation of trees. We describe this constructive process and discuss questions of generating and combining multiple trees via Bayesian model averaging for prediction. Additional discussion of parameter selection and sensitivity is given in the context of an example which concerns prediction of breast tumour status utilizing high-dimensional gene expression data; the example demonstrates the exploratory/explanatory uses of such models as well as their primary utility in prediction. Shortcomings of the approach and comparison with alternative tree modelling algorithms are also discussed, as are issues of modelling and computational extensions.},
author = {Pittman, Jennifer and Huang, Erich and Nevins, Joseph and Wang, Quanli and West, Mike},
doi = {10.1093/biostatistics/kxh011},
file = {:Users/guillaume/Dropbox/Mendeley Desktop/Pittman et al. - 2004 - Bayesian analysis of binary prediction tree models for retrospectively sampled outcomes.pdf:pdf},
isbn = {1465-4644 (Print)$\backslash$n1465-4644 (Linking)},
issn = {14654644},
journal = {Biostatistics},
keywords = {Bayesian analysis,Binary classification tree,Bioinformatics,Case-control design,Metagenes,Molecular classification,Predictive classification,Retrospective sampling,Tree models},
number = {4},
pages = {587--601},
pmid = {15475421},
title = {{Bayesian analysis of binary prediction tree models for retrospectively sampled outcomes}},
volume = {5},
year = {2004}
}
@article{Rakotomamonjy2004,
abstract = {For many years now, there has been a growing interest around the ROC curve for characterizing machine learning performances. This is particularly due to the fact that in real-world problems misclassification costs are not known and thus, the ROC curve and related metrics such as the Area Under ROC curve (AUC) can be more meaningful performance measures. In this paper, we propose a SVMs based algorithm for AUC maximization and show that under certain conditions this algorithm is related to 2-norm soft margin Support Vector Machines. We present experiments that compare SVMs performances to those of other AUC maximization based algorithms and provide empirical analysis of SVMs behavior with regards to ROC-based metrics. Our main conclusion is that SVMs can maximize both AUC and accuracy.},
author = {Rakotomamonjy, Alain},
file = {:Users/guillaume/Dropbox/Mendeley Desktop/Rakotomamonjy - 2004 - Optimizing Area Under Roc Curve with SVMs.pdf:pdf},
journal = {Optimization},
title = {{Optimizing Area Under Roc Curve with SVMs}},
year = {2004}
}
@article{Bostr,
author = {Bostr\"{o}m, Henrik},
file = {:Users/guillaume/Dropbox/Mendeley Desktop/Bostr - Unknown - Estimating Class Probabilities in Random Forests.pdf:pdf},
title = {{Estimating Class Probabilities in Random Forests}}
}
@article{Laurikkala2001,
abstract = {We studied three methods to improve identification of difficult small classes by balancing imbalanced class distribution with data reduction. The new method, neighborhood cleaning rule (NCL), outperformed simple random and one-sided selection methods in experiments with ten data sets. All reduction methods improved identification of small classes (20-30\%), but the differences were insignificant. However, significant differences in accuracies, true-positive rates and true-negative rates obtained with the 3-nearest neighbor method and C4.5 from the reduced data favored NCL. The results suggest that NCL is a useful method for improving the modeling of difficult small classes, and for building classifiers to identify these classes from the real-world data.},
author = {Laurikkala, Jorma},
doi = {10.1007/3-540-48229-6\_9},
file = {:Users/guillaume/Dropbox/Mendeley Desktop/Laurikkala - 2001 - Improving Identification of Difficult Small Classes by Balancing Class Distribution.pdf:pdf},
isbn = {0302-9743},
journal = {8th Conference on Artificial Intelligence in Medicine in Europe},
pages = {63--66},
title = {{Improving Identification of Difficult Small Classes by Balancing Class Distribution}},
year = {2001}
}
@article{Gubinelli2010,
author = {Gubinelli, Massimiliano},
file = {:Users/guillaume/Dropbox/Mendeley Desktop/Massimiliano - 2010 - III Th\'{e}or\`{e}mes limites pour les cha\^{\i}nes de Markov.pdf:pdf},
pages = {1--5},
title = {{III Th\'{e}or\`{e}mes limites pour les cha\^{\i}nes de Markov}},
year = {2010}
}
@article{Liu2009,
abstract = {Under-sampling is a class-imbalance learning method which uses only a subset of major class examples and thus is very efficient. The main deficiency is that many major class examples are ignored. We propose two algorithms to overcome the deficiency. EasyEnsemble samples several subsets from the major class, trains a learner using each of them, and combines the outputs of those learners. BalanceCascade is similar to EasyEnsemble except that it removes correctly classified major class examples of trained learners from further consideration. Experiments show that both of the proposed algorithms have better AUC scores than many existing class-imbalance learning methods. Moreover, they have approximately the same training time as that of under-sampling, which trains significantly faster than other methods.},
author = {Liu, Xu-Ying and Wu, Jianxin and Zhou, Zhi-Hua},
file = {:Users/guillaume/Dropbox/Mendeley Desktop/Liu, Wu, Zhou - 2009 - Exploratory Undersampling for Class Imbalance Learning.pdf:pdf},
journal = {IEEE Transactions on Systems, Man, and Cybernetics, Part B (Cybernetics)},
keywords = {Ensemble learning,sampling},
number = {2},
pages = {539--550},
title = {{Exploratory Undersampling for Class Imbalance Learning}},
volume = {39},
year = {2009}
}
@article{Blum,
abstract = {The empirical error on a test set, the hold-out estimate, often is a more reliable estimate of generalization error than the observed error on the training set, the training estimate. K-fold cross validation is used in practice with the hope of being more accurate than the hold-out estimate without reducing the number of training examples. We argue that the k-fold estimate does in fact achieve this goal. Specifically, we show that for any nontrivial learning problem and learning algorithm that is insensitive to example ordering, the k-fold estimate is strictly more accurate than a single hold-out estimate on 1/k of the data (k = m is leave-one-out), based on its variance and all higher moments. Previous bounds were termed sanity-check because they compared the k-fold estimate to the training estimate and, further, restricted the VC dimension and required a notion of hypothesis stability [2]. In order to avoid these dependencies, we consider a k-fold hypothesis that is a randomized combination or average of the individual hypotheses. We introduce progressive validation as another possible improvement on the hold-out estimate. This estimate of the generalization error is, in many ways, as good as that of a single hold-out, but it uses an average of half as many examples for testing. The procedure also involves a hold-out set, but after an example has been tested, it is added to the training set and the learning algorithm is rerun.},
author = {Blum, Avrim and Kalai, Adam and Langford, John},
file = {:Users/guillaume/Dropbox/Mendeley Desktop/Blum, Kalai, Langford - Unknown - Beating the Hold-Out Bounds for K-Fold and Progressive Cross-Validation.pdf:pdf},
pages = {6--11},
title = {{Beating the Hold-Out : Bounds for K-Fold and Progressive Cross-Validation}}
}
@article{Leistner,
abstract = {Random Forests (RFs) have become commonplace in many computer vision applications. Their popularity is mainly driven by their high computational efficiency during both training and evaluation while still being able to achieve state-of-the-art accuracy. This work extends the usage of Random Forests to Semi-Supervised Learning (SSL) problems. We show that traditional decision trees are optimizing multi-class margin maximizing loss functions. From this intuition, we develop a novel multi-class margin definition for the unlabeled data, and an iterative deterministic annealing-style training algorithm maximizing both the multi-class margin of labeled and unlabeled samples. In particular, this allows us to use the predicted labels of the unlabeled data as additional optimization variables. Furthermore, we propose a control mechanism based on the out-of-bag error, which prevents the algorithm from degradation if the unlabeled data is not useful for the task. Our experiments demonstrate state-of-the-art semi-supervised learning performance in typical machine learning problems and constant improvement using unlabeled data for the Caltech-101 object categorization task.},
author = {Leistner, Christian and Saffari, Amir and Santner, Jakob and Bischof, Horst},
doi = {10.1109/ICCV.2009.5459198},
file = {:Users/guillaume/Dropbox/Mendeley Desktop/Leistner et al. - 2009 - Semi-Supervised Random Forests.pdf:pdf},
isbn = {9781424444205},
issn = {1550-5499},
journal = {Proceedings of the IEEE International Conference on Computer Vision},
pages = {506--513},
title = {{Semi-Supervised Random Forests}},
year = {2009}
}
@article{Rodriguez2006,
abstract = {We propose a method for generating classifier ensembles based on feature extraction. To create the training data for a base classifier, the feature set is randomly split into K subsets (K is a parameter of the algorithm) and Principal Component Analysis (PCA) is applied to each subset. All principal components are retained in order to preserve the variability information in the data. Thus, K axis rotations take place to form the new features for a base classifier. The idea of the rotation approach is to encourage simultaneously individual accuracy and diversity within the ensemble. Diversity is promoted through the feature extraction for each base classifier. Decision trees were chosen here because they are sensitive to rotation of the feature axes, hence the name "forest." Accuracy is sought by keeping all principal components and also using the whole data set to train each base classifier. Using WEKA, we examined the Rotation Forest ensemble on a random selection of 33 benchmark data sets from the UCI repository and compared it with Bagging, AdaBoost, and Random Forest. The results were favorable to Rotation Forest and prompted an investigation into diversity-accuracy landscape of the ensemble models. Diversity-error diagrams revealed that Rotation Forest ensembles construct individual classifiers which are more accurate than these in AdaBoost and Random Forest, and more diverse than these in Bagging, sometimes more accurate as well.},
author = {Rodr\'{\i}guez, Juan J. and Kuncheva, Ludmila I. and Alonso, Carlos J.},
doi = {10.1109/TPAMI.2006.211},
file = {:Users/guillaume/Dropbox/Mendeley Desktop/Rodr\'{\i}guez, Kuncheva, Alonso - 2006 - Rotation forest A New classifier ensemble method.pdf:pdf},
isbn = {0162-8828 (Print)},
issn = {01628828},
journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
keywords = {AdaBoost,Bagging,Classifier ensembles,Feature extraction,Kappa-error diagrams,PCA,Random forest},
number = {10},
pages = {1619--1630},
pmid = {16986543},
title = {{Rotation forest: A New classifier ensemble method}},
volume = {28},
year = {2006}
}
@article{Janitza2012,
abstract = {The Random Forest (RF) algorithm by Leo Breiman has become a standard data analysis tool in bioinformatics. It has shown excellent performance in settings where the number of variables is much larger than the number of observations, can cope with complex interaction structures as well as highly correlated variables and returns measures of variable importance. This paper synthesizes ten years of RF development with emphasis on applications to bioinformatics and computational biology. Special attention is given to practical aspects such as the selection of parameters, available RF implementations, and important pitfalls and biases of RF and its variable importance measures (VIMs). The paper surveys recent developments of the methodology relevant to bioinformatics as well as some representative examples of RF applications in this context and possible directions for future research.},
author = {Boulesteix, Anne-laure and Janitza, Silke and Kruppa, Jochen and K\"{o}nig, Inke R.},
file = {:Users/guillaume/Dropbox/Mendeley Desktop/Boulesteix et al. - 2012 - Overview of Random Forest Methodology and Practical Guidance with Emphasis on Computational Biologyand Bioinf.pdf:pdf},
journal = {Technical Report - Institut f\"{u}r Statistik - University of Munich},
number = {129},
title = {{Overview of Random Forest Methodology and Practical Guidance with Emphasis on Computational Biology and Bioinformatics}},
year = {2012}
}
@article{Gehrke1998,
abstract = {Classification of large datasets is an important data mining problem. Many classification algorithms have been proposed in the literature, but studies have shown that so far no algorithm uniformly outperforms all other algorithms in terms of quality. In this paper, we present a unifying framework for decision tree classifiers that separates the scalability aspects of algorithms for constructing a decision tree from the central features that determine the quality of the tree. This generic algorithm is easy to instantiate with specific algorithms from the literature (including C4.5, CART, CHAID, FACT, ID3 and extensions, SLIQ, Sprint and QUEST). In addition to its generality, in that it yields scalable versions of a wide range of classification algorithms, our approach also offers performance improvements of over a factor of five over the Sprint algorithm, the fastest scalable classification algorithm proposed previously. In contrast to Sprint, however, our generic algorithm requires a certain minimum amount of main memory, proportional to the set of distinct values in a column of the input relation. Given current main memory costs, this requirement is readily met in most if not all workloads.},
author = {Gehrke, Johannes and Ramakrishnan, R and Ganti, V},
file = {:Users/guillaume/Dropbox/Mendeley Desktop/Gehrke, Ramakrishnan, Ganti - 1998 - RainForest—a framework for fast decision tree construction of large datasets.pdf:pdf},
journal = {24th International Conference on Very Large Data Bases},
pages = {416--427},
title = {{RainForest—a framework for fast decision tree construction of large datasets}},
url = {http://www.springerlink.com/index/G652567687177764.pdf},
year = {1998}
}
@article{Friedman1999a,
author = {Friedman, Jerome H},
doi = {10.2307/2699986},
file = {:Users/guillaume/Dropbox/Mendeley Desktop/Friedman - 1999 - Greedy Function Approximation A Gradient Boosting Machine 1 Function estimation 2 Numerical optimization in function.pdf:pdf},
isbn = {0090-5364},
issn = {00905364},
journal = {North},
number = {3},
pages = {1--10},
title = {{Greedy Function Approximation: A Gradient Boosting Machine}},
volume = {1},
year = {1999}
}
@article{Ghattas2000,
author = {Ghattas, B},
file = {:Users/guillaume/Dropbox/Mendeley Desktop/Ghattas - 2000 - Importance des variables dans les m\'{e}thodes CART.pdf:pdf},
title = {{Importance des variables dans les m\'{e}thodes CART}},
url = {http://greqam.univ-mrs.fr/IMG/working\_papers/2000/00b04.pdf},
year = {2000}
}
@article{Culver2006a,
author = {Culver, Matt and Deng, Kun and Scott, Stephen},
file = {:Users/guillaume/Dropbox/Mendeley Desktop/Culver, Kun, Scott - 2006 - Active Learning to Maximize Area Under the ROC Curve.pdf:pdf},
isbn = {0769527019},
journal = {Proceedings of the Sixth International Conference on Data Mining},
pages = {149--158},
title = {{Active Learning to Maximize Area Under the ROC Curve}},
year = {2006}
}
@article{Pavlidis2012,
abstract = {Credit scoring methods for predicting creditworthiness have proven very effective in consumer finance. In light of the present financial crisis, such methods will become even more important. One of the outstanding issues in credit risk classification is population drift. This term refers to changes occurring in the population due to unexpected changes in economic conditions and other factors. In this paper, we propose a novel methodology for the classification of credit applications that has the potential to adapt to population drift as it occurs. This provides the opportunity to update the credit risk classifier as new labelled data arrives. Assorted experimental results suggest that the proposed method has the potential to yield significant performance improvement over standard approaches, without sacrificing the classifier's descriptive capabilities.},
author = {Pavlidis, N. G. and Tasoulis, Dimitrios K. and Adams, N. M. and Hand, David J.},
doi = {10.1057/jors.2012.15},
file = {:Users/guillaume/Dropbox/Mendeley Desktop/Pavlidis et al. - 2012 - Adaptive consumer credit classification.pdf:pdf},
issn = {0160-5682},
journal = {Journal of the Operational Research Society},
keywords = {HB Economic Theory},
number = {12},
pages = {1645--1654},
publisher = {Nature Publishing Group},
title = {{Adaptive consumer credit classification}},
url = {http://dx.doi.org/10.1057/jors.2012.15},
volume = {63},
year = {2012}
}
@article{CorinnaCortes2005,
abstract = {In many applications, good ranking is a highly desirable performance for a classifier. The criterion commonly used to measure the ranking quality of a classification algorithm is the area under the ROC curve (AUC). To report it properly, it is crucial to determine an interval of confidence for its value. This paper provides confidence intervals for the AUC based on a statistical and combinatorial analysis using only simple parameters such as the error rate and the number of positive and negative examples. The analysis is distribution-independent, it makes no assumption about the distribution of the scores of negative or positive examples. The results are of practical use and can be viewed as the equivalent for AUC of the standard confidence intervals given in the case of the error rate. They are compared with previous approaches in several standard classification tasks demonstrating the benefits of our analysis.},
author = {Cortes, Corinna and Mohri, Mehryar},
file = {:Users/guillaume/Dropbox/Mendeley Desktop/Cortes, Mohri - 2005 - Confidence intervals for the area under the ROC curve.pdf:pdf},
journal = {Advances in Neural Information Processing Systems},
pages = {305--312},
title = {{Confidence intervals for the area under the ROC curve}},
url = {http://books.google.com/books?hl=en\&lr=\&id=etp-l5VrbHsC\&oi=fnd\&pg=PA305\&dq=Confidence+Intervals+for+the+Area+under+the+ROC+Curve\&ots=\_K6x0GtGxG\&sig=\_clX-1y-IV17gIDSI5c63gBewSg},
volume = {17},
year = {2005}
}
@article{Dupret2001,
abstract = {This paper presents a technical framework to assess the impact of re-sampling on the ability of a supervised learning to correctly learn a classification problem. We use the bootstrap expression of the prediction error to identify the optimal re-sampling proportions in binary classification experiments using artificial neural networks. Based on Bayes decision rule and the a priori distribution of the objective data, an estimate for the optimal re-sampling proportion is derived as well as upper and lower bounds for the exact optimal proportion. The analytical considerations to extend the present method to cross-validation and multiple classes are also illustrated. © 2001 Elsevier Science B.V.},
author = {Dupret, Georges and Koda, Masato},
doi = {10.1016/S0377-2217(00)00244-7},
file = {:Users/guillaume/Dropbox/Mendeley Desktop/Dupret, Koda - 2001 - Bootstrap re-sampling for unbalanced data in supervised learning.pdf:pdf},
issn = {03772217},
journal = {European Journal of Operational Research},
keywords = {Data mining,Decision support systems,Neural networks,Simulation},
number = {1},
pages = {141--156},
title = {{Bootstrap re-sampling for unbalanced data in supervised learning}},
volume = {134},
year = {2001}
}
@article{Shannon1957,
archivePrefix = {arXiv},
arxivId = {chao-dyn/9411012},
author = {Shannon, C.},
doi = {10.1145/584091.584093},
eprint = {9411012},
file = {:Users/guillaume/Dropbox/Mendeley Desktop/Shannon - 1948 - A mathematical theory of communication.pdf:pdf},
isbn = {1559-1662},
issn = {07246811},
journal = {Bell System Technical Journal},
pages = {379--423, 623--656},
pmid = {9230594},
primaryClass = {chao-dyn},
title = {{A mathematical theory of communication}},
volume = {27},
year = {1948}
}
@article{Freund2003,
abstract = {We study the problem of learning to accurately rank a set of objects by combining a given collection of ranking or preference functions. This problem of combining preferences arises in several applications, such as that of combining the results of different search engines, or the “collaborative-filtering” problem of ranking movies for a user based on the movie rankings provided by other users. In this work, we begin by presenting a formal framework for this general problem. We then describe and analyze an efficient algorithm called RankBoost for combining preferences based on the boosting approach to machine learning. We give theoretical results describing the algorithm’s behavior both on the training data, and on new test data not seen during training. We also describe an efficient implementation of the algorithm for a particular restricted but common case. We next discuss two experiments we carried out to assess the performance of RankBoost. In the first experiment, we used the algorithm to combine different web search strategies, each of which is a query expansion for a given domain. The second experiment is a collaborative-filtering task for making movie recommendations.},
author = {Freund, Yoav and Iyer, Raj and Schapire, Robert E and Singer, Yoram},
doi = {10.1162/jmlr.2003.4.6.933},
file = {:Users/guillaume/Dropbox/Mendeley Desktop/Freund et al. - 2003 - An Efficient Boosting Algorithm for Combining Preferences.pdf:pdf},
isbn = {1581134924},
issn = {15324435},
journal = {The Journal of Machine Learning Research},
pages = {933--969},
pmid = {345},
title = {{An Efficient Boosting Algorithm for Combining Preferences}},
volume = {4},
year = {2003}
}
@article{Herschtal2004,
author = {Herschtal, Alan and Raskutti, Bhavani},
file = {:Users/guillaume/Dropbox/Mendeley Desktop/Herschtal, Raskutti - 2004 - Optimising area under the ROC curve using gradient descent.pdf:pdf},
journal = {Proceedings of the Twenty-First International Conference on Machine Learning},
title = {{Optimising area under the ROC curve using gradient descent}},
url = {http://dl.acm.org/citation.cfm?id=1015366},
year = {2004}
}
@article{Bernard2012,
abstract = {In this paper, we introduce a new Random Forest (RF) induction algorithm called Dynamic Random Forest (DRF) which is based on an adaptative tree induction procedure. The main idea is to guide the tree induction so that each tree will complement as much as possible the existing trees in the ensemble. This is done here through a resampling of the training data, inspired by boosting algorithms, and combined with other randomization processes used in traditional RF methods. The DRF algorithm shows a significant improvement in terms of accuracy compared to the standard static RF induction algorithm. © 2012 Elsevier B.V. All rights reserved.},
author = {Bernard, Simon and Adam, S\'{e}bastien and Heutte, Laurent},
doi = {10.1016/j.patrec.2012.04.003},
file = {:Users/guillaume/Dropbox/Mendeley Desktop/Bernard, Adam, Heutte - 2012 - Dynamic Random Forests.pdf:pdf},
isbn = {0167-8655},
issn = {01678655},
journal = {Pattern Recognition Letters},
keywords = {Dynamic induction,Ensemble of classifiers,Random feature selection,Random forests},
number = {12},
pages = {1580--1586},
title = {{Dynamic Random Forests}},
volume = {33},
year = {2012}
}
@article{Khalilia2011,
author = {Khalilia, Mohammed and Chakraborty, Sounak and Popescu, Mihail},
doi = {10.1186/1472-6947-11-51},
file = {:Users/guillaume/Dropbox/Mendeley Desktop/Khalilia, Chakraborty, Popescu - 2011 - Predicting disease risks from highly imbalanced data using random forest.pdf:pdf},
isbn = {1472-6947 (Electronic)$\backslash$r1472-6947 (Linking)},
issn = {1472-6947},
journal = {BMC medical informatics and decision making},
number = {1},
pages = {51},
pmid = {21801360},
publisher = {BioMed Central Ltd},
title = {{Predicting disease risks from highly imbalanced data using random forest.}},
url = {http://www.biomedcentral.com/1472-6947/11/51},
volume = {11},
year = {2011}
}
@article{Deng,
abstract = {Tree ensembles such as random forests and boosted trees are accurate but difficult to understand, debug and deploy. In this work, we provide the inTrees (interpretable trees) framework that extracts, measures, prunes and selects rules from a tree ensemble, and calculates frequent variable interactions. A rule-based learner, referred to as the simplified tree ensemble learner (STEL), can also be formed and used for future prediction. The inTrees framework can be applied to both classification and regression problems, and is applicable to many types of tree ensembles, e.g., random forests, regularized random forests, and boosted trees. We implemented the inTrees algorithms in the “inTrees” R package.},
archivePrefix = {arXiv},
arxivId = {arXiv:1408.5456v1},
author = {Deng, Houtao},
eprint = {arXiv:1408.5456v1},
file = {:Users/guillaume/Dropbox/Mendeley Desktop/Deng - Unknown - Interpreting Tree Ensembles with inTrees.pdf:pdf},
keywords = {decision tree,random forest,rule extraction,rule-based learner},
title = {{Interpreting Tree Ensembles with inTrees}}
}
@article{Chipman1998,
abstract = {In this article we put forward a Bayesian approach for finding classification and regression tree (CART) models. The two basic components of this approach consist of prior specification and stochastic search. The basic idea is to have the prior induce a posterior distribution that will guide the stochastic search toward more promising CART models. As the search proceeds, such models can then be selected with a variety of criteria, such as posterior probability, marginal likelihood, residual sum of squares or misclassification rates. Examples are used to illustrate the potential superiority of this approach over alternative methods.},
author = {Chipman, Hugh A. and George, Edward I. and McCulloch, Robert E.},
doi = {10.2307/2669832},
file = {:Users/guillaume/Dropbox/Mendeley Desktop/Chipman, George, McCulloch - 1998 - Bayesian CART Model Search.pdf:pdf},
issn = {0162-1459},
journal = {Journal of the American Statistical Association},
keywords = {binary trees,markov chain monte carlo,mixture,stochastic search},
number = {443},
pages = {935--948},
title = {{Bayesian CART Model Search}},
url = {http://www.jstor.org/stable/2669832},
volume = {93},
year = {1998}
}
@article{Yan2003,
abstract = {When the goal is to achieve the best correct classification rate, cross entropy and mean squared error are typical cost functions used to optimize classifier performance. However, for many real-world classification problems, the ROC curve is a more meaningful performance measure. We demonstrate that minimizing cross entropy or mean squared error does not necessarily maximize the area under the ROC curve (AUC). We then consider alternative objective functions for training a classifier to maximize the AUC directly. We propose an objective function that is an approximation to the Wilcoxon-Mann-Whitney statistic, which is equivalent to the AUC. The proposed objective function is differentiable, so gradient-based methods can be used to train the classifier. We apply the new objective function to real-world customer behavior prediction problems for a wireless service provider and a cable service provider, and achieve reliable improvements in the ROC curve.},
author = {Yan, Lian and Dodier, Robert and Mozer, Michael C and Wolniewicz, Richard},
file = {:Users/guillaume/Dropbox/Mendeley Desktop/Yan et al. - 2003 - Optimizing Classifier Performance via an Approximation to the Wilcoxon-Mann-Whitney Statistic.pdf:pdf},
isbn = {1577351894},
journal = {Proceedings of the Twentieth International Conference on Machine Learning},
number = {2},
pages = {848},
title = {{Optimizing Classifier Performance via an Approximation to the Wilcoxon-Mann-Whitney Statistic}},
url = {http://scholar.google.com/scholar?hl=en\&btnG=Search\&q=intitle:Optimizing+Classifier+Performance+via+an+Approximation+to+the+Wilcoxon-Mann-Whitney+Statistic\#0},
volume = {20},
year = {2003}
}
@article{Dahinden2009,
abstract = {Random Forests is a popular ensemble technique developed by Breiman (2001) which yields exceptional performance. These excellent results are achieved with little need to fine-tune parameters. The method is computationally effective, does not overfit, is robust to noise and can also be applied when the number of variables is much larger than the number of samples. We propose a slightly modified Random Forests scheme, with cross-validation as a means for tuning parameters and estimating error-rates. This simple and computationally very efficient approach was found to yield better predictive performance on the WCCI 2006 Performance Prediction Challenge datasets than many algorithms of much higher complexity},
author = {Dahinden, Corinne},
file = {:Users/guillaume/Dropbox/Mendeley Desktop/Dahinden - 2009 - An improved Random Forests approach with application to the performance prediction challenge datasets.pdf:pdf},
journal = {Hands on Pattern Recognition. Microtome},
keywords = {ensemble methods,random forests},
pages = {1--6},
title = {{An improved Random Forests approach with application to the performance prediction challenge datasets}},
url = {http://stat.ethz.ch/~dahinden/Paper/Bookchapter.pdf},
year = {2009}
}
@article{Blagus2010,
abstract = {The goal of class prediction studies is to develop rules to accurately predict the class membership of new samples. The rules are derived using the values of the variables available for each subject: the main characteristic of high-dimensional data is that the number of variables greatly exceeds the number of samples. Frequently the classifiers are developed using class-imbalanced data, i.e., data sets where the number of samples in each class is not equal. Standard classification methods used on class-imbalanced data often produce classifiers that do not accurately predict the minority class; the prediction is biased towards the majority class. In this paper we investigate if the high-dimensionality poses additional challenges when dealing with class-imbalanced prediction. We evaluate the performance of six types of classifiers on class-imbalanced data, using simulated data and a publicly available data set from a breast cancer gene-expression microarray study. We also investigate the effectiveness of some strategies that are available to overcome the effect of class imbalance.},
author = {Blagus, Rok and Lusa, Lara},
doi = {10.1186/1471-2105-11-523},
file = {:Users/guillaume/Dropbox/Mendeley Desktop/Blagus, Lusa - 2010 - Class prediction for high-dimensional class-imbalanced data.pdf:pdf},
issn = {1471-2105},
journal = {BMC bioinformatics},
number = {1},
pages = {523},
pmid = {20961420},
publisher = {BioMed Central Ltd},
title = {{Class prediction for high-dimensional class-imbalanced data.}},
url = {http://www.biomedcentral.com/1471-2105/11/523},
volume = {11},
year = {2010}
}
@article{Homescu2011,
abstract = {Two of the most important areas in computational finance: Greeks and, respectively, calibration, are based on efficient and accurate computation of a large number of sensitivities. This paper gives an overview of adjoint and automatic differentiation (AD), also known as algorithmic differentiation, techniques to calculate these sensitivities. When compared to finite difference approximation, this approach can potentially reduce the computational cost by several orders of magnitude, with sensitivities accurate up to machine precision. Examples and a literature survey are also provided.},
archivePrefix = {arXiv},
arxivId = {1107.1831},
author = {Homescu, Cristian},
eprint = {1107.1831},
file = {:Users/guillaume/Dropbox/Mendeley Desktop/Homescu - 2011 - Adjoints and Automatic (Algorithmic) Differentiation in Computational Finance.pdf:pdf},
pages = {23},
title = {{Adjoints and Automatic (Algorithmic) Differentiation in Computational Finance}},
url = {http://arxiv.org/abs/1107.1831},
year = {2011}
}
@unpublished{Yeh2014,
author = {Yeh, Shu-hao and Wang, Chuan-ju and Tsai, Ming-Feng},
booktitle = {International Symposium on Forecasting},
file = {:Users/guillaume/Dropbox/Mendeley Desktop/Yeh, Wang, Tsai - 2014 - Corporate Default Prediction via Deep Learning.pdf:pdf},
keywords = {deep learning,default prediction},
title = {{Corporate Default Prediction via Deep Learning}},
url = {http://forecasters.org/wp/wp-content/uploads/gravity\_forms/7-2a51b93047891f1ec3608bdbd77ca58d/2014/07/Yeh\_Shu-Hao\_ISF2014.pdf},
year = {2014}
}
@article{Wolpert1999,
abstract = {Bagging (Breiman, 1994a) is a technique that tries to improve a learning algorithm's performance by using bootstrap replicates of the training set (Efron \& Tibshirani, 1993, Efron, 1979). The computational requirements for estimating the resultant generalization error on a test set by means of cross-validation are often prohibitive: for leave-one-out cross-validation one needs to train the underlying algorithm on the order of m times the number of replicates, where m is the size of the training set. This paper presents several techniques for estimating the generalization error of a bagged learning algorithm without invoking yet more training of the underlying learning algorithm (beyond that of the bagging itself), as is required by cross-validation-based estimation. These techniques all exploit the bias-variance decomposition (Geman, Bienenstock \& Doursat, 1992, Wolpert, 1996). The best of our estimators also exploits stacking (Wolpert, 1992). In a set of experiments reported here, it was found to be more accurate than both the alternative cross-validation-based estimator of the bagged algorithm's error and the cross-validation-based estimator of the underlying algorithm's error. This improvement was particularly pronounced for small test sets. This suggests a novel justification for using bagging—more accurate estimation of the generalization error than is possible without bagging.},
author = {Wolpert, David H. and Macready, William G.},
doi = {10.1023/A:1007519102914},
file = {:Users/guillaume/Dropbox/Mendeley Desktop/Wolpert, Macready - 1999 - Efficient method to estimate Bagging's generalization error.pdf:pdf},
isbn = {0885-6125},
issn = {08856125},
journal = {Machine Learning},
keywords = {bagging,bootstrap,cross-validation,generalization error,stacking},
number = {1},
pages = {41--55},
title = {{Efficient method to estimate Bagging's generalization error}},
volume = {35},
year = {1999}
}