/* super_learner_macro.sas */
/* Write the macro version to the SAS log when this file is submitted. */
%PUT super learner macro v1.1.10;
/**********************************************************************************************************************
* Author: Alex Keil
* Program: super_learner_macro.sas
* Version: 1.1.10
* Contact: akeil@unc.edu
* Tasks: general purpose macro to get cross validated predictions from super learner using parametric, semiparametric,
and machine learning functions in SAS
* Requirements:
Base SAS 9.X (subversion X varies depending on learners used, 9.4+ recommended)
SAS/OR (14.1+ recommended)
SAS/STAT (14.1+ recommended)
* Tested on:
OS: Windows 8.1
Base SAS: 9.4_TS1M3, 9.4_TS1M5
SAS Analytics: 14.1, 14.3
R: 3.30
* Highly recommended:
SAS Enterprise Miner High Performance Procedures (required for many learners)
* Optional for enhancement:
SAS/IML, R v3.3+ and RLANG system option enabled in sasv9.cfg
* Description:
super learner macro: this is a macro for making predictions of a binary or continuous variable
using the super learner algorithm developed by Mark van der Laan and colleagues
This macro implements a version of super learner to generate predictions of a continuous or
binary variable. The cross-validation probability of the event from each learner is used to
estimate the superlearner coefficients that describe the weighted combination of algorithms
that go into the superlearner probability/regression function.
These coefficients are used with the algorithms fit to the full data to generate predictions.
Note that an additional cross-validation step can be used to estimate the cross-validated
superlearner algorithm risk to assess the potential for overfit of the superlearner coefficients.
This step is accessible through the CVSuperLearner macro
More info on the algorithm can be found in section 3.3.2 of "Targeted Learning" by van der Laan and Rose (1ed);
specifically, this macro implements the algorithm shown in figure 3.2
This macro is accompanied by a manuscript:
Keil, A. P. (2018). Super Learning in the SAS system. arXiv preprint arXiv:1805.08058.
https://arxiv.org/abs/1805.08058
Currently available learners:
linear model, logistic model, lasso (linear and logit), least angle regression,
elastic net, generalized additive models, neural net, random forest
classification/regression trees (cart), boosted cart, bagged cart, multivariate adaptive
regression spline
The structure of this program is:
0) main "SuperLearner" and "CVSuperLearner" macro definition (with advanced versions: _SuperLearner and _CVSuperLearner)
1) helper functions
2) learners (standardized macro input to allow modularity in adding new learners)
3) main subfunctions
4) sample usage with test (comment out for using this in an %include statement)
* Keywords: prediction, causal inference, machine learning, gam, random forest, super learner, cross validation
*
* Released under the GNU General Public License: http://www.gnu.org/copyleft/gpl.html
* MAJOR UPDATES (MINOR UPDATES LISTED AT BOTTOM OF PROGRAM):
* 26-Jun-2018: v1.0 released
**********************************************************************************************************************/
/* File-level switch for SAS-version warnings; its consumers are not in this part of the file.
   NOTE(review): presumably expanded in front of warning statements so that a value of *
   turns them into comments - confirm against usage further down. */
%LET suppresswarn = *; *set to * to turn off all warnings about the correct sas version for each proc, empty otherwise;
%MACRO SuperLearner(
    Y=,
    X=,
    by=,
    intvars=,
    binary_predictors=,
    ordinal_predictors=,
    nominal_predictors=,
    continuous_predictors=,
    id=,
    weight=,
    indata=,
    preddata=,
    outdata=sl_out,
    dist=GAUSSIAN,
    library= glm,
    trtstrat=FALSE,
    folds=10,
    method=BNNLS,
    /* deprecated */
    predvar=,
    gfstrat=,
    risk=,
    cv=
);
  /*
  SuperLearner: user-facing wrapper. It forwards every argument to the advanced
  _SuperLearner macro and fixes the remaining _SuperLearner options (output table
  names, seed, checks, reporting) to the values shown in the call at the bottom.

  Main Super Learner macro parameters:
   Y=,                     * dependent variable, the variable you are trying to predict
   X=,                     * predictors given without a type; _SuperLearner sorts each one into binary,
                             nominal or continuous by its number of levels. Use either X or the four
                             typed predictor lists below, not both
   by=,                    * by variable (do super learner for every value of the by variable - must be only a single variable, and must be sorted!)
   intvars=,               * variables upon which you are intervening (only capable of 1/0 contrasts at this point)
   binary_predictors=,     * binary variables, (including binary intervention variables)
   ordinal_predictors=,    * ordinal variables, (including ordinal intervention variables)
   nominal_predictors=,    * nominal variables, (including nominal intervention variables)
   continuous_predictors=, * continuous variables, (including continuous intervention variables)
   id=,                    * name of variable of a unique identifier for units (optional)
   weight=,                * variable name for observation weights/importance weights/frequency weights
   indata=,                * training data (sample data)
   preddata=,              * (optional) data in which you wish to make predictions (if blank, will make predictions in 'indata')
   outdata=sl_out,         * name of output dataset with SL + library specific predictions
   dist=GAUSSIAN,          * assumed distribution of Y: only supports GAUSSIAN or BERNOULLI
   library= glm,           * superlearner library (* Enterprise Miner procedure), CASE SENSITIVE (use lowercase), limited to space separated list of:
       back (backward selection by BIC)
       bagging (* Bootstrap aggregation of regression/classification trees)
       bayesnet (* Bayesian network [binary only])
       boxcox (Box-Cox transformation of target variables, positively bound continuous Y only)
       boost (* Gradient boosting of regression/classification trees)
       cvcart (classification/regression tree with cross validated selection of meta parameters)
       cart (classification/regression tree, no cross validation)
       enet (elastic net - warning for binary outcome: does not respect [0,1] probability space)
       gam (generalized additive model with 3 df splines on all continuous variables)
       gampl (faster generalized additive model with 3 df splines on all continuous variables)
       glm (linear or logistic regression: slower wrappers for logit and linreg)
       knn ([bernoulli only] k-nearest neighbors classification)
       lar (least angle regression - warning for binary outcome: does not respect [0,1] probability space)
       lasso (LASSO)
       lassob (LASSO with glmselect [use caution with binary variables] - may be appropriate for older sas versions)
       cvlasso (LASSO with cross validated selection of shrinkage parameter)
       logit (main term logistic regression)
       linreg (main term linear regression)
       mars (multivariate adaptive regression splines)
       mean (marginal mean of the prediction variable)
       nbayes (* naive Bayes)
       bspline (b spline regression)
       pbspline (penalized basis spline regression [SINGLE CONTINUOUS PREDICTOR ONLY])
       probit (main term probit regression)
       ridge (ridge regression - warning for binary outcome: does not respect [0,1] probability space)
       quantreg (Quantile regression for the median - warning for binary outcome: does not respect [0,1] probability space)
       rf (random forest)
       rfoob (random forest, using out of bag predictions and modified selection criteria)
       nn (* neural network)
       sherwood (* random forest, using proc ARBOR, sampling with replacement)
       swise (Stepwise model selection with HPGENSELECT - may be appropriate alternative to lasso for older sas versions)
       **** note that lasso, rf and cart may require >= SAS 9.4 TS1M3 + optional high powered data mining procedures to work
     ALSO includes multiple versions that include all first order interaction terms:
       backint, logitint, linregint, lassoint, lassobint, cvlassoint, swiseint, larint, enetint, gamint, gamplint, marsint, probitint
     Included R functions (requires SAS/IML and RLANG system option enabled) allows a limited set of functions that call learners in the R programming language:
       r_bagging: Bootstrap aggregation of regression/classification trees (requires ipred, rpart packages)
       r_bart: Bayesian additive regression/classification trees (requires dbarts package)
       r_boost: Gradient boosting regression/classification (requires xgboost)
       r_enet: elastic net regression/classification with cross validated selection of shrinkage parameter (requires glmnet package)
       r_gam: generalized additive model (requires gam, foreach, splines packages)
       r_lasso: LASSO regression/classification with cross validated selection of shrinkage parameter (requires glmnet package)
       r_mars: MARS - multivariate adaptive regression splines (requires earth package)
       r_polymars: MARS - multivariate adaptive polynomial regression splines (requires polspline package)
       r_ridge: ridge regression/classification with cross validated selection of shrinkage parameter (requires glmnet package)
       r_rf: random forest using R superlearner defaults (requires randomForest package)
       r_sl: R based super learner with several default learners (glm, lasso, bart, rf, mars) (requires SuperLearner package)
       r_svm: support vector machine regression/classification (requires e1071)
       **** note: package installation should happen automatically when these are used for the first time
            if RLANG configured correctly
   trtstrat=FALSE,         * whether to use stratified (by treatment variable) models (true or false) - deprecated version is gfstrat
   folds=10,               * number of folds for cross validation
   method=BNNLS,           * fitting approach to estimate coefficients for super learner, the level-1 model using Wolperts term
       Documented choices: NNLS, NNLOGLIK, CCLOGLIK, LOGLIK, CCLS, OLS, CCLAE, NNLAE, LAE
       (_SuperLearner also accepts B-prefixed names such as BNNLS, the default of this wrapper)
       Methods are possibly indexed by prefixes: NN, CC, [none],
        where NN implies non-negative coefficients that are standardized after fitting to sum to 1.
        CC implies a convexity constraint where the super learner fit is subject to a constraint
        that forces the coefficients to fall in [0,1] and sum to 1.0. No prefix implies no
        constraints (which results in some loss of asymptotic properties such as the oracle property).
       Note: OLS violates this naming convention, but LS will also be accepted and is equivalent to OLS
       OLS/LS methods use an L2 loss function (least squares)
       LOGLIK methods use a loss function corresponding to the binomial likelihood with a logit
        link function
       LAE methods use an L1 loss function (least absolute error), which will not penalize
        outliers as much as L2 methods, and is also non-differentiable at the minimum
        which may cause computational difficulties
       AUC method also exists for binary outcomes, but current implementation is slow and fails frequently
        and is not recommended
   * output tables fixed by this wrapper (call _SuperLearner directly to rename them):
   outresults=sl_summary,
   outslrisks=sl_cvrisk,
   outslcoefs=sl_coefs,
   * other deprecated parameters
   gfstrat=,               * [DEPRECATED] whether to use stratified (by treatment variable) models (true or false), changed to 'trtstrat'
   cv=,                    * [DEPRECATED] whether to cross validate (true or false) - always true
   risk=,                  * [DEPRECATED] loss function for superlearner coefficient estimation (can be L2 [not case sensitive] or ENTROPY) - replaced by 'dist' and 'method'
   predvar=                * [DEPRECATED] passed through unchanged to _SuperLearner

  NOTES:
   GLOBAL MACRO VARIABLES CREATED (IF USER DEFINED VERSIONS OF THESE ARE CREATED THEY WILL BE OVERWRITTEN):
    SLseed, SLShowOnce, SLSampSize, SLsas94, SLsas93, SLsas92, SLPredMiss, SLPBinary, SLIXterms

  SAMPLE USAGE (simple confounding problem with all binary covariates, using 5 learners):
   %SuperLearner(Y=y,
                 intvars=x,
                 binary_predictors= x l,
                 ordinal_predictors=,
                 nominal_predictors=,
                 continuous_predictors=,
                 weight=,
                 indata=a,
                 preddata=,
                 outdata=sl_testdata,
                 library= logit logitint cart nn rf,
                 cv=true,
                 trtstrat=true,
                 method=NNLOGLIK,
                 dist=BERNOULLI);
  */
  /***************************************************************************
   Future plans:
    1) Save model fits to allow prediction with trained algorithms
    2) Enrich the possible treatment interventions
    3) Make custom learners easier to create (needs stable syntax, so not possible with some learners)
    4) Improve documentation
    5) Add learners: better variable selection (hpreduce)
    6) More robust R coding
  ***************************************************************************/

  /* Delegate to the advanced macro; everything below the user arguments is a fixed setting. */
  %_SuperLearner(
      /* ---- user arguments, passed through unchanged ---- */
      Y                     = &Y,
      X                     = &X,
      by                    = &by,
      intvars               = &intvars ,
      binary_predictors     = &binary_predictors ,
      ordinal_predictors    = &ordinal_predictors ,
      nominal_predictors    = &nominal_predictors ,
      continuous_predictors = &continuous_predictors,
      id                    = &id,
      weight                = &weight,            /* observation weights/importance weights/frequency weights */
      indata                = &indata ,
      preddata              = &preddata ,
      outdata               = &outdata ,
      dist                  = &dist ,
      library               = %str(&library) ,
      trtstrat              = &trtstrat ,
      folds                 = &folds ,
      method                = &method ,
      /* ---- fixed output table names ---- */
      outslrisks            = sl_cvrisk ,
      outslcoefs            = sl_coefs ,
      outresults            = sl_summary,
      /* ---- fixed reporting and fitting settings ---- */
      printres              = TRUE ,
      printlearner          = FALSE,
      cleanup               = TRUE ,
      trimbound             = 0.001 ,
      shuffle               = TRUE,
      seed                  = 12345,
      slridgepen            = 0.3,                /* ridge regression penalty for methods: RIDGE, NNRIDGE, CCRIDGE */
      /* ---- kid gloves: all checks switched on ---- */
      runchecks             = TRUE ,
      checkvalid            = TRUE,
      logreport             = TRUE,
      printfolds            = TRUE,
      verbose               = FALSE,
      speedup               = FALSE,
      simple                = TRUE,
      quietwarning          = TRUE,
      quietnotes            = TRUE,
      timer                 = TRUE,               /* report the total run time in the log */
      forcer                = FALSE,
      /* ---- deprecated arguments, forwarded as given ---- */
      getfoldrisks          = ,
      predvar               = &predvar ,
      risk                  = &risk ,
      cv                    = &cv ,
      gfstrat               = &gfstrat
  );
%MEND SuperLearner;
%MACRO CVSuperLearner(
    Y=,
    X=,
    binary_predictors= ,
    ordinal_predictors= ,
    nominal_predictors= ,
    continuous_predictors= ,
    id=,
    weight=, /* observation weights/importance weights/frequency weights */
    indata= ,
    dist= GAUSSIAN,
    library= ,
    folds= 10,
    method= BNNLS
);
  /*
   CVSuperLearner: user-facing wrapper around the advanced _CVSuperLearner macro.
   All arguments are forwarded unchanged; the single folds value is used for both
   the inner super learner folds (slfolds) and the outer cross-validation folds
   (cvslfolds). Every other _CVSuperLearner option is fixed to the value shown in
   the call below; call _CVSuperLearner directly for finer control.
  */
  %_CVSuperLearner(
      /* ---- user arguments, passed through unchanged ---- */
      Y                     = &Y,
      X                     = &X,
      binary_predictors     = &binary_predictors ,
      ordinal_predictors    = &ordinal_predictors ,
      nominal_predictors    = &nominal_predictors ,
      continuous_predictors = &continuous_predictors ,
      id                    = &id,
      weight                = &weight,            /* observation weights/importance weights/frequency weights */
      indata                = &indata ,
      dist                  = &dist ,
      library               = %str(&library) ,
      slfolds               = &folds ,
      cvslfolds             = &folds,
      method                = &method ,
      /* ---- fixed output and fitting settings ---- */
      outcvresults          = cvsl_summary,
      printfoldres          = FALSE ,
      printlearner          = FALSE,
      cleanup               = TRUE ,
      trimbound             = 0.001 ,
      shuffle               = TRUE,
      seed                  = 12345,
      slridgepen            = 0.3,                /* ridge regression penalty for methods: RIDGE, NNRIDGE, CCRIDGE */
      /* ---- kid gloves ---- */
      cvrunchecks           = TRUE ,
      cvcheckvalid          = TRUE,
      logreport             = FALSE,
      printfolds            = FALSE,
      reportcvfolds         = TRUE,               /* report CV fold of folds in log */
      verbose               = FALSE,
      speedup               = FALSE,
      simple                = TRUE,
      quietwarning          = TRUE,
      quietnotes            = TRUE,
      timer                 = TRUE
  );
%MEND CVSuperLearner;
/*
main work horse macros: _SuperLearner and _CVSuperLearner;
*/
/*
 _SuperLearner: advanced entry point behind the SuperLearner and _CVSuperLearner callers.
 Steps performed in this macro, in order:
  1) optional log/ODS quieting, timer start and seed selection (random seed when seed is blank)
  2) validation of method against the recognized names (LS and BLS are aliases for OLS and BOLS)
  3) when runchecks=TRUE: normalize the case of arguments, sort the variables in X into
     binary/nominal/continuous predictors by their number of levels, and run the helper checks
  4) define the nested _main macro and run it once, or once per level of the by variable
  5) sort the output data, optionally clean up temporary tables, print or store the summary,
     and report the run time when the timer is on
 Helper macros invoked here (names starting with __ or _) are defined elsewhere in this file.
*/
%MACRO _SuperLearner(Y=,
                     X=,
                     by=,
                     intvars=,
                     binary_predictors=,
                     ordinal_predictors=,
                     nominal_predictors=,
                     continuous_predictors=,
                     id=,
                     weight=, /* observation weights/importance weights/frequency weights */
                     indata=,
                     preddata=,
                     outdata=,
                     dist=,
                     library= logit,
                     trtstrat=false,
                     folds=10,
                     method=,
                     outslrisks=sl_cvrisk,
                     outslcoefs=sl_coefs,
                     outresults=sl_summary,
                     printres=FALSE, /* print summary + coefs and loss of each algorithm */
                     printlearner=FALSE, /* print learner to log each time it is used */
                     cleanup=FALSE, /* get rid of temporary datasets created by macro */
                     trimbound=0.001, /*trimming of logit variables to ensure boundedness */
                     shuffle=TRUE, /*randomly shuffle data prior to setting CV folds*/
                     seed=, /*seed for super learner (shuffling + learners with randomness)*/
                     slridgepen=0.3, /* ridge regression penalty for methods: RIDGE, NNRIDGE, CCRIDGE */
                     runchecks=FALSE, /*check for missing data, proper distribution settings, case of parameter values*/
                     checkvalid=FALSE, /*remove learners that produce missing values */
                     logreport=FALSE, /* report useful information, e.g. SL coefficients to the sas log */
                     printfolds=TRUE, /*report fold of folds in log */
                     verbose=FALSE, /* print out a few extra things*/
                     speedup=FALSE, /* do some things to make for faster computation */
                     simple=FALSE, /* if true, print out a note in final summary about speed boost with workhorse macro */
                     quietwarning=FALSE, /* suppress some warnings being printed to the log */
                     quietnotes=FALSE, /* suppress notes being printed to the log */
                     timer=FALSE, /* if true, report the total run time (in minutes) in the log */
                     forcer=FALSE, /* if true, will force re-install current version of all r packages called by the macro + dependencies */
                     /*deprecated*/
                     getfoldrisks=, /* report fold specific risks - too slow to use regularly */
                     predvar= ,
                     cv=,
                     risk=,
                     gfstrat=
                     ) / MINOPERATOR MINDELIMITER=' ';
  * _SuperLearner macro: workhorse function that can be called by the user and contains finer control
  and less error checking than the SuperLearner macro;
  %PUT NOTE: Super learner started;
  %IF &speedup=TRUE %THEN %DO;
    *eventually there will be more here;
    OPTIONS COMPRESS=BINARY;
  %END;
  %IF &quietwarning=TRUE %THEN ODS SELECT NONE;;
  %IF &quietnotes=TRUE %THEN OPTIONS NONOTES;;
  /* Same truth test as the timer report at the end of the macro, so __SLtime
     exists whenever it is read back; RUN closes the step right away. */
  %IF %__TrueCheck(&timer) %THEN %DO;
    DATA __SLtime; start = time(); RUN;
  %END;
  %GLOBAL SLseed;
  %IF &seed= %THEN %DO;
    /* no seed supplied: draw one at random; RUN makes SLseed available immediately */
    DATA _NULL_;
      seed = ROUND(RAND('UNIFORM')*(2147483647));
      CALL SYMPUT("SLseed", PUT(seed, 18.0));
    RUN;
  %END;
  %ELSE %LET slseed=&seed;
  * check methods/dist specification;
  %IF &dist^= AND &method= %THEN %DO;
    %__SLwarning(%STR(No method specified, setting to default NNLS.));
    %LET method=NNLS;;
  %END;
  %IF NOT (&method IN(NNLOGLIK CCLOGLIK NNLS CCLS OLS LS NNLAE CCLAE LAE LOGLIK AUC ADAPTRF RIDGE CCRIDGE NNRIDGE
    BNNLOGLIK BCCLOGLIK BLOGLIK BNNLS BCCLS BOLS BLS BNNRIDGE BCCRIDGE BRIDGE BNNLASSO BCCLASSO BLASSO BNNLAE BCCLAE BLAE))
    %THEN %__badSLerror(%str(Method &method not recognized));
  %IF &method=LS %THEN %LET method=OLS;; *allow naming conventions to be followed strictly;
  %IF &method=BLS %THEN %LET method=BOLS;; *allow naming conventions to be followed strictly;
  * format to reverse outcome coding for some procs (e.g. probit);
  PROC FORMAT; VALUE SLrevf 1='0' 0='1';RUN; * by default models outcome = 0;
  %IF &runchecks=TRUE %THEN %DO;
    * startup message;
    %IF %SYMEXIST(SLShowOnce)=0 %THEN %DO;
      %GLOBAL SLShowOnce;
      %__SLnote(%str(SuperLearner Macro: please report bugs to akeil@unc.edu));
      %LET SLShowOnce='shown';
    %END;
    *allow for some variation in how macro is called;
    %IF &trtstrat^= %THEN %LET trtstrat=%UPCASE(&trtstrat);
    %IF &risk^= %THEN %LET risk = %SYSFUNC(UPCASE(%SYSFUNC(DEQUOTE(&risk))));;
    %IF &method^= %THEN %LET method = %SYSFUNC(UPCASE(%SYSFUNC(DEQUOTE(&method))));;
    %IF &dist^= %THEN %LET dist = %SYSFUNC(UPCASE(%SYSFUNC(DEQUOTE(&dist))));;
    %IF &library^= %THEN %LET library = %SYSFUNC(LOWCASE(%SYSFUNC(DEQUOTE(&library))));;
    %IF &dist IN (BERN BIN BINARY B) %THEN %LET dist=BERNOULLI;
    %IF &dist IN (GAUSS NORM NORMAL G N) %THEN %LET dist=GAUSSIAN;
    *announce deprecations and set some parameters by default;
    %__deprecations();
    %IF (&x^= AND (&binary_predictors^= OR &ordinal_predictors^= OR &nominal_predictors^= OR &continuous_predictors^=)) %THEN %DO;
      /* %STR keeps the commas in the message from splitting it into several arguments */
      %__SLWarning(%STR(Macro parameters X and one of: binary_predictors, ordinal_predictors, nominal_predictors, continuous_predictors are specified. Only one should be specified. Setting X to be empty));
      %LET x=;
    %END;
    %ELSE %IF &x^= %THEN %DO;
      %LET predictors = &X;
      %__SLNote(%STR(Macro parameter X is specified. All variables with num levels > .05*N assumed continuous, otherwise nominal or binary));
      /* cutoff = N/20, i.e. 5 percent of the non-missing outcome count */
      PROC SQL NOPRINT; SELECT COUNT(&Y)/20 into :SLcutoff from &indata ;quit;
      %LET xidx=1;
      %DO %WHILE(%SCAN(&X, &xidx)^=);
        %__SLPNumlevels(&indata, %SCAN(&X, &xidx));
        * variable %SCAN(&X, &xidx) has at least &SLPNumLevels levels;
        %IF %TRIM(&SLPNumLevels)=2 %THEN %LET binary_predictors = &binary_predictors %SCAN(&X, &xidx);
        %ELSE %IF %SYSEVALF(%TRIM(&SLPNumLevels)<=&SLcutoff) %THEN %LET nominal_predictors = &nominal_predictors %SCAN(&X, &xidx);
        %ELSE %IF %SYSEVALF(%TRIM(&SLPNumLevels)>&SLcutoff) %THEN %LET continuous_predictors = &continuous_predictors %SCAN(&X, &xidx);
        %LET xidx=%EVAL(&xidx+1);
      %END;
    %END;
    %ELSE %DO;
      %LET predictors = &binary_predictors &ordinal_predictors &nominal_predictors &continuous_predictors;
    %END;
    *install r packages if needed;
    %__installR();
    *or force install;
    %IF &forcer=TRUE %THEN %__forceinstallR();;
    %IF &predvar = AND &Y = %THEN %DO;
      %__badSLerror(must set 'Y' to the name of the variable you wish to predict);
      %RETURN;
    %END;
    %ELSE %__SLnote(Making predictions of &Y);
    %IF &predictors = %THEN %DO;
      /* %STR keeps the commas in the message from splitting it into several arguments */
      %__badSLerror(%STR(Must have variables for at least one of: binary_predictors, ordinal_predictors, nominal_predictors, continuous_predictors));
    %END;
    * check for any missing values in predictors, and fail if any;
    %__CheckMissing(&predictors, &indata);
    *collect some information;
    DATA __sltm0017_; SET &indata; /*IF _N_<300*/; RUN; *todo: refactor so less expensive and less error prone;
    %__SLPBinary(__sltm0017_, &Y); *check whether or not predicting binary variable (assumed continuous otherwise);
    *catch some common errors (some will stop the macro, some will result in automatic corrections);
    %__commonerrors();
    %__checklibrary(&library);
  %END; *runchecks;
  %IF %__TrueCheck(&trtstrat) %THEN %__intspeccheck();
  %IF &preddata = %THEN %DO;
    *predictions based on sample data;
    %LET preddata = &indata;
    %LET insample = true;
    %__SLnote(Making in-sample SL predictions for "&Y" in dataset "&indata");
  %END;
  %ELSE %DO;
    *predictions based on external data;
    %LET insample = false;;
    PROC SQL NOPRINT; SELECT COUNT(%SCAN(&x &binary_predictors &ordinal_predictors &nominal_predictors &continuous_predictors, 1)) into :pSLSampSize from &preddata ;quit;
    %__SLnote(Making SL predictions for "&Y" in dataset "&preddata" and "&indata");
  %END;
  *define some fcmp functions;
  %__MKFUNCS();
  *prepare interaction terms;
  %__makeintx(bins = &binary_predictors, others = &ordinal_predictors &nominal_predictors &continuous_predictors);
  /*
  _main: the absolute barebones super learner macro (not to be called on its own)
  It is defined inside _SuperLearner and reads several of its parameters directly
  (weight, intvars, checkvalid, logreport) in addition to its own positional arguments.
  */
  %MACRO _main(indata, y, id, folds, insample, preddata, library, risk, method, dist, outdata, outslcoefs, outslrisks);
    * the bulk of the superlearner functions go in here;
    %GLOBAL SLSampSize;
    %IF &id = %THEN %DO;
      PROC SQL NOPRINT; SELECT COUNT(&Y) into :SLSampSize from &indata ;quit;
    %END;
    %IF &id ^= %THEN %DO;
      PROC SQL NOPRINT; SELECT COUNT(UNIQUE &ID) into :SLSampSize from &indata ;quit;
    %END;
    *create AUC methods;
    %IF &method=AUC %THEN %_mkmethods();;
    %__SLnote(total observations/IDs in training data: %TRIM(&SLSampSize).);
    *cross validation estimation of superlearner coefficients for prediction;
    %__SLnote(Using cross validation with &folds fold);
    %__SLnote(Approximately %EVAL(&SLSampSize/&folds) per fold);
    *prepare data for cross validation (split by training/validation sets);
    %_cvsetup(Y=&Y, id=&id, indata=&indata, outdata=__sltm0013_, folds=&folds);
    * set up data for possible interventions (1 or 0 only);
    %_pred_setup(Y=&Y, intvars=&intvars, indata=__sltm0013_, outdata=__sltm001_, preddata=&preddata, insample=&insample);
    * get cross validated predictions from each learner;
    OPTIONS NOQUOTELENMAX;
    %_getcvpreds&dist(Y=&Y, indata=__sltm001_, outdata=__sltm002_, library=&library, weight=&weight, id=&id, folds=&folds, seed=&slseed);;
    *get (cross-validated) risk estimates for each fold and superlearner coefficients for overall fit;
    %_sl_cvrisk(indata=__sltm002_, outdata=__sltm002_, library=&library, outcoef=&outslcoefs, risk=&risk, method=&method, outcvrisks=&outslrisks, seed=&slseed);
    OPTIONS QUOTELENMAX;
    *restrict to valid predictions (non missing);
    %IF %__TrueCheck(&checkvalid) %THEN %LET library = &validlib;
    %ELSE %LET droplib=none;
    *final predictions from super-learner;
    %_SLPredict(indata=__sltm002_, outdata=&outdata, library=&library, incoef=&outslcoefs, inrisk=&outslrisks);
    *write cross validated risk (loss function) to the log;
    %IF %__TrueCheck(&logreport) %THEN %_cvriskreport(indata=&outdata, library=&library);;
  %MEND _main;
  %IF &by ^= %THEN %DO;
    %LOCAL byj bylevs;
    %__CLEANUP(%str(&outdata, &outslcoefs, &outslrisks));
    %IF %SCAN(&by, 2)^= %THEN %__badSLerror(%str("by" statement only supports one by variable));
    PROC SQL NOPRINT; SELECT COUNT(DISTINCT &by) INTO :bylevs FROM &indata; QUIT;
    *start by loop;
    %LET byj=1;
    %DO %WHILE(%EVAL(&byj <= &bylevs));
      * find current value of by variable;
      * count all levels of by variable (this table gets deleted repeatedly) - todo: refactor;
      PROC FREQ DATA = &indata NOPRINT;
        TABLE &by / OUT=__sltm0018(KEEP=&by);
      RUN;
      /* BEST32. keeps every digit of wide by-values so the WHERE clauses below still match
         (a 9 character width switches to a rounded form for longer values) */
      DATA _NULL_;
        SET __sltm0018;
        IF _n_ = &byj THEN CALL SYMPUT('currbyval', PUT(&by, BEST32.));
      RUN;
      *restrict dataset to current level of by variable;
      DATA __sltm0000; SET &indata; WHERE &by = &currbyval; RUN;
      %IF &preddata ^= %THEN %DO; DATA __sltm1000; SET &preddata; WHERE &by = &currbyval; RUN; %END;
      *run super learner;
      %PUT By processing &byj of %trim(&bylevs);
      /**/
      %_main(__sltm0000, &Y, &id, &folds, &insample, __sltm1000, &library, &risk, &method, &dist, __sltm0100, __sltm0101, __sltm0102);
      /**/
      /* tag the per-level coefficient and risk tables with the by value before stacking */
      DATA __sltm0101; SET __sltm0101; &by=&currbyval;
      DATA __sltm0102; SET __sltm0102; &by=&currbyval;
      *append data;
      PROC APPEND DATA = __sltm0100 BASE = &OUTDATA FORCE;
      PROC APPEND DATA = __sltm0101 BASE = &outslcoefs FORCE;
      PROC APPEND DATA = __sltm0102 BASE = &outslrisks FORCE; RUN;
      PROC SQL NOPRINT; DROP TABLE __sltm0100, __sltm0101, __sltm0102, __sltm0000, __sltm1000; QUIT;
      *end by loop;
      %LET byj=%EVAL(&byj+1);
    %END;
  *end if by;
  %END;
  %IF &by = %THEN %DO;
    /**/
    %_main(&indata, &Y, &id, &folds, &insample, &preddata, &library, &risk, &method, &dist, &outdata, &outslcoefs, &outslrisks);
    /**/
  %END;
  PROC SORT DATA = &outdata; BY &BY __seqid__ __cvid__ __int; RUN;
  %IF %__TrueCheck(&cleanup) %THEN %DO;
    * delete some temporary datasets, fcmp functions;
    %__FUNCLEANUP;
    %__CLEANUP(%str(__sltm001_, __sltm001_0, __sltm001_1, __sltm002_, __sltm003_, __sltm004_, __sltm005_, __sltm006_, __sltm007_, __sltm008_,
    __sltm009_, __sltm0010_, __sltm0011_, __sltm0012_, __sltm0013_, __sltm0014_, __sltm0015_, __sltm0016_, __sltm0017_, _namedat));
  %END;
  /* undo the ODS SELECT NONE issued at the top when quietwarning=TRUE */
  ODS SELECT ALL;
  %IF %__TrueCheck(&printres) %THEN %__printsummary(Y=&Y,
    predictors=&binary_predictors &ordinal_predictors &nominal_predictors &continuous_predictors,
    library=&library, folds=&folds, method=&method, dist=&dist, shuffle=&shuffle, preddata=&preddata, indata=&indata,
    outcoef=&outslcoefs,outcvrisk=&outslrisks,outresults=&outresults, n=&SLSampSize);;
  %IF %__FalseCheck(&printres) %THEN %__noprintsummary(Y=&Y,
    predictors=&binary_predictors &ordinal_predictors &nominal_predictors &continuous_predictors,
    library=&library, folds=&folds, method=&method, dist=&dist, shuffle=&shuffle, preddata=&preddata, indata=&indata,
    outcoef=&outslcoefs,outcvrisk=&outslrisks,outresults=&outresults, n=&SLSampSize);;
  %IF %__TrueCheck(&timer) %THEN %DO;
    /* elapsed minutes since the timer table was created at the top of the macro */
    DATA __SLtime; SET __Sltime;
      end = time();
      duration = (end-start)/60;
    DATA _NULL_; SET __Sltime; CALL SYMPUT("runtime", PUT(duration, 10.3));
    PROC SQL NOPRINT; DROP TABLE __SLTIME;
  %END;
  RUN; QUIT; RUN;
  OPTIONS NOTES;
  %PUT NOTE: Super learner completed, predictions available in dataset &outdata ;
  %IF %__TrueCheck(&timer) %THEN %PUT NOTE: %str(Super learner took %TRIM(&runtime) minutes to complete);
%MEND _SuperLearner;
/*
 _CVSuperLearner: cross-validated super learner.

 Puts the rows of indata in random order, assigns each row to one of cvslfolds outer folds,
 and for every outer fold calls _SuperLearner with the other folds as training data (indata=)
 and the held-out fold as prediction data (preddata=). After each call, _get_slcvrisk is run on
 the held-out predictions and its output is appended to __cvsltmp006__. Finally PROC MEANS
 summarises CVrisk (mean, stderr, min, max) and Coefficient (mean) by Learner and order; the
 merged summary is written to outcvresults and printed.

 Parameters as used in the visible body:
   Y, X, binary_predictors, ordinal_predictors, nominal_predictors, continuous_predictors,
   id, weight, dist, library, method, trimbound, slridgepen, printlearner, printfolds,
   logreport, verbose, quietwarning, quietnotes, timer
                 : handed to _SuperLearner under the same names
                   (Y, library, weight and method are also handed to _get_slcvrisk)
   indata        : input data set that is shuffled and split into outer folds
   slfolds       : passed to _SuperLearner as folds=
   cvslfolds     : number of outer folds
   outcvresults  : name of the summary data set that is created and printed
   reportcvfolds : when true, a note is written to the log for each outer fold
   printfoldres  : passed to _SuperLearner as printres=
   cleanup       : passed to _SuperLearner; when true the __cvsltmpNNN__ work tables are dropped
   seed          : seed for the random ordering, also passed to _SuperLearner as seed=
   cvrunchecks   : passed to _SuperLearner as runchecks=
   cvcheckvalid  : passed to _SuperLearner as checkvalid=

 NOTE(review): shuffle, speedup and simple are accepted but never referenced in this macro; the
 inner call passes shuffle=TRUE, speedup=FALSE and simple=FALSE literally. Confirm that this is
 intended.
*/
%MACRO _CVSuperLearner(
Y=,
X=,
binary_predictors= ,
ordinal_predictors= ,
nominal_predictors= ,
continuous_predictors= ,
id=,
weight= ,
indata= ,
dist= ,
library= ,
slfolds=10 ,
cvslfolds=10,
method=BNNLS,
outcvresults=cvsl_summary,
reportcvfolds=FALSE,
printfoldres=FALSE ,
printfolds=FALSE, /*report fold of folds in log */
printlearner=FALSE, /* print learner to log each time it is used */
cleanup=TRUE ,
trimbound=0.001 ,
shuffle=TRUE,
seed=12345,
slridgepen=0.3, /* ridge regression penalty for methods: RIDGE, NNRIDGE, CCRIDGE */
/* kid gloves */
cvrunchecks=FALSE ,
cvcheckvalid=FALSE,
logreport=FALSE,
verbose=FALSE,
speedup=FALSE,
simple=FALSE,
quietwarning=FALSE,
quietnotes=FALSE,
timer=TRUE);
*todo: add in by group processing;
/* log notes are switched off while the fold assignment tables are built */
OPTIONS NONOTES;
%LOCAL CVSLV cvSLseed;
/* take the supplied seed; if the result is blank, draw a random seed in a DATA step */
%LET cvSLseed = %EVAL(&SEED);
%IF &cvSLseed= %THEN %DO;
DATA _NULL_;
seed = ROUND(RAND('UNIFORM')*(2147483647));
CALL SYMPUT("cvSLseed", PUT(seed, 18.0));
RUN;
%END;
/* copy the input data, sorted by a RANUNI draw, so that the row order is random */
PROC SQL NOPRINT;
CREATE TABLE __cvsltmp001__ AS SELECT *, RANUNI(&cvSLseed) AS __cvslsrt__
FROM &indata ORDER BY __cvslsrt__;
/* number the shuffled rows and store the row count in macro variable CVSLsampsize */
DATA __cvsltmp001__;
SET __cvsltmp001__ END=eof;
__cvslid__ = _N_;
IF eof THEN CALL SYMPUT("CVSLsampsize", PUT(_N_, 12.0));
/* fold k receives the rows whose number lies in ((k-1)*n/K, k*n/K], n = row count, K = cvslfolds */
DATA __cvsltmp001__;
SET __cvsltmp001__;
DO k = 1 TO &CVSLfolds;
IF &CVSLsampsize/&CVSLfolds*(k-1) < __cvslid__ <= &CVSLsampsize/&CVSLfolds*k THEN __CVSLfold= k;
END;
DROP k;
/* NOTE(review): __CleanUp is defined elsewhere; presumably this removes a leftover __cvsltmp006__ so the PROC APPEND below starts from an empty base - confirm */
%__CleanUp(%STR(__cvsltmp006__)); QUIT;
OPTIONS NOTES;
/* outer cross-validation loop: CVSLV is the index of the held-out fold */
%LET CVSLV = 1;
%DO %WHILE (%EVAL(&CVSLV <= &CVSLfolds));
OPTIONS NONOTES;
/* __cvsltmp002__ = all rows outside the current fold, __cvsltmp003__ = rows of the current fold */
DATA __cvsltmp002__; SET __cvsltmp001__(WHERE=(__CVSLfold^=&CVSLV));run;
DATA __cvsltmp003__; SET __cvsltmp001__(WHERE=(__CVSLfold=&CVSLV));run;
OPTIONS NOTES;
%IF %__truecheck(&reportcvfolds) %THEN %__SLnote(CVSuperLearner fold &CVSLV of &CVSLfolds);;
/* fit on the training rows, predict for the held-out rows; per-fold outputs carry the fold index in their names */
%_SuperLearner(Y=&Y,
X=&X,
by=,
binary_predictors= &binary_predictors ,
ordinal_predictors= &ordinal_predictors ,
nominal_predictors= &nominal_predictors ,
continuous_predictors= &continuous_predictors ,
id=&id,
weight= &weight,
indata= __cvsltmp002__ ,
preddata=__cvsltmp003__,
outdata= __cvsltmp004__ ,
dist=&dist ,
library=%str(&library) ,
intvars= , /* should be disabled here if you want CV risk */
trtstrat= FALSE, /* should be disabled here if you want CV risk */
folds=&slfolds ,
method=&method ,
outslrisks=cvsl_cvrisk&CVSLV ,
outslcoefs=cvsl_coefs&CVSLV ,
outresults=cvsl_summary&CVSLV,
printlearner=&printlearner ,
printres=&printfoldres ,
printfolds=&printfolds,
cleanup=&cleanup ,
trimbound=&trimbound ,
shuffle=TRUE,
slridgepen=&slridgepen, /* ridge regression penalty for methods: RIDGE, NNRIDGE, CCRIDGE */
seed=&cvSLseed,
/* kid gloves */
runchecks= &cvrunchecks ,
checkvalid= &cvcheckvalid,
logreport=&logreport,
getfoldrisks=FALSE,
verbose=&verbose,
speedup=FALSE,
simple=FALSE,
quietwarning=&quietwarning,
quietnotes=&quietnotes,
timer=&timer
);
OPTIONS NONOTES;
/**/
%_get_slcvrisk(indata=__cvsltmp004__(WHERE=(__CVSLfold=&CVSLV)), Y=&y, library=&library, outrisk=__cvsltmp005__, weight=&weight, method=&METHOD, debug=FALSE)
/**/
/* stack the results of this fold onto the running table, then drop the per-fold outputs of the inner call */
PROC APPEND DATA = __cvsltmp005__ BASE = __cvsltmp006__ FORCE; RUN;
PROC SQL NOPRINT; DROP TABLE cvsl_summary&CVSLV, cvsl_coefs&CVSLV, cvsl_cvrisk&CVSLV; QUIT;
%LET CVSLV = %EVAL(&CVSLV + 1);
%END;
/* summarise CVrisk over the outer folds; the WHERE= keeps only output rows in which both class variables are set */
PROC MEANS DATA = __cvsltmp006__ NOPRINT;
CLASS Learner order;
VAR CVrisk ;
OUTPUT OUT = &outcvresults(WHERE=(Learner^="" AND order ^=.) DROP=_:) MEAN= STDERR= MIN= MAX=/ AUTONAME;
RUN;
/* same grouping for the mean of Coefficient */
PROC MEANS DATA = __cvsltmp006__ NOPRINT;
CLASS Learner order;
VAR Coefficient ;
OUTPUT OUT = __cvsltmp007__(WHERE=(Learner^="" AND order ^=.) DROP=_:) MEAN=coef_mean;
RUN;
/* the MERGE below has no BY statement, so the two summaries are combined row by row; the MERGENOBY warning is switched off for it */
OPTIONS MERGENOBY = NOWARN;
DATA &outcvresults;
LENGTH Learner $39 coef_mean 8 CVrisk_mean 8 CVrisk_stderr 8 CVrisk_min 8 CVrisk_max 8;
MERGE __cvsltmp007__ &outcvresults;
RUN;
PROC SORT DATA = &outcvresults;
BY order;
RUN;
PROC PRINT DATA = &outcvresults(DROP=order) NOOBS;
RUN;
/* NOTE(review): __cvsltmp008__ and __cvsltmp009__ are not created anywhere in this macro; presumably a helper creates them - confirm, otherwise this DROP refers to tables that do not exist */
%IF %__TrueCheck(&CLEANUP) %THEN %DO;
PROC SQL NOPRINT; DROP TABLE __cvsltmp001__, __cvsltmp002__, __cvsltmp003__, __cvsltmp004__, __cvsltmp005__, __cvsltmp006__, __cvsltmp007__, __cvsltmp008__, __cvsltmp009__; QUIT;
%END;
/* NOTES and MERGENOBY=WARN are set unconditionally here, whatever their values were before the call */
OPTIONS NOTES MERGENOBY = WARN;
%__SLnote(CVSuperLearner finished);
%MEND;
********************************************************;
* Part 1: helper functions;
********************************************************;
/*
 _mkmethods: compiles two PROC FCMP functions into work.__funcs.LOGFUNCS.

   AUC(coef, label, preds)       approximate AUC of the combined prediction preds*coef
   AUCL(idx, coef, label, preds) the same quantity for the single column idx of preds
                                 (a unit coefficient vector is built internally)

 Both functions use a fixed grid of ncutoffs thresholds (j-1)/ncutoffs and sum the resulting
 sens/fpr points with the same trapezoid-style formula.

 Requires the macro variable slsampsize to be defined when the macro is called, because it
 sizes the work arrays and bounds the row loops.

 Fixes relative to the previous version of AUCL:
   - the trapezoid sum was accumulated inside the loop that fills sens/fpr, so it read
     sens[j+1] before that element had been computed; it now runs in a separate loop after all
     points are filled, exactly as AUC does
   - ncoef is resized once to DIM(coef) before it is filled instead of once per iteration
*/
%MACRO _mkmethods();
    %LOCAL ncutoffs;
    /* note: allowing for IDs completely broke AUC methods */
    %LET ncutoffs = 500; /* number of cutoff points for approximate AUC calculation */
    PROC FCMP OUTLIB = work.__funcs.LOGFUNCS;
        FUNCTION AUC(coef[*], label[*], preds[*,*]) VARARGS;
            /* implementing with fixed cutoffs, yielding approximate AUC */
            ARRAY __slp[&slsampsize] / NOSYMBOLS;
            CALL MULT(preds, coef, __slp);
            ARRAY sens[%EVAL(2+&ncutoffs)] / NOSYMBOLS;
            ARRAY fpr[%EVAL(2+&ncutoffs)] / NOSYMBOLS;
            /* totals of label and (1 - label), used as denominators below */
            __totp=0;
            __totn=0;
            DO i = 1 TO &slsampsize;
                __totp = __totp + label[i];
                __totn = __totn + (1-label[i]);
            END;
            auc = 0;
            /* end points of the curve */
            sens[1]=1;
            fpr[1]=1;
            sens[%EVAL(2+&ncutoffs)]=0;
            fpr[%EVAL(2+&ncutoffs)]=0;
            /* interior points: one per cutoff (j-1)/ncutoffs */
            DO j = 2 TO %EVAL(1+&ncutoffs);
                sens[j]=0;
                fpr[j]=0;
                DO i = 1 TO &slsampsize;
                    sens[j] = sens[j] + label[i]*(__slp[i]>((j-1)/&ncutoffs))/__totp;
                    fpr[j] = fpr[j] + (1-label[i])*(__slp[i]>((j-1)/&ncutoffs))/__totn;
                END;
            END;
            /* trapezoidal sum over the filled points */
            DO j = 2 TO %EVAL(1+&ncutoffs);
                auc = auc + (sens[j-1] - sens[j])*(1-fpr[j])/2 + (sens[j] - sens[j+1])*(1-fpr[j])/2;
            END;
            RETURN(auc);
        ENDSUB;
        FUNCTION AUCL(idx, coef[*], label[*], preds[*,*]) VARARGS;
            /* stupid trick to please proc optmodel */
            /* note: allowing for IDs completely broke AUC methods */
            ARRAY __slp[&slsampsize] / NOSYMBOLS;
            ARRAY ncoef[1] / NOSYMBOLS;
            /* size the unit vector once, then set element idx to 1 and all others to 0 */
            ncf = DIM(coef);
            CALL DYNAMIC_ARRAY(ncoef, ncf);
            DO i = 1 TO ncf;
                IF i = idx THEN ncoef[i] = 1; ELSE ncoef[i] = 0;
            END;
            CALL MULT(preds, ncoef, __slp);
            ARRAY sens[%EVAL(2+&ncutoffs)] / NOSYMBOLS;
            ARRAY fpr[%EVAL(2+&ncutoffs)] / NOSYMBOLS;
            __totp=0;
            __totn=0;
            DO i = 1 TO &slsampsize;
                __totp = __totp + label[i];
                __totn = __totn + (1-label[i]);
            END;
            auc = 0;
            sens[1]=1;
            fpr[1]=1;
            sens[%EVAL(2+&ncutoffs)]=0;
            fpr[%EVAL(2+&ncutoffs)]=0;
            /* fill every interior point first ... */
            DO j = 2 TO %EVAL(1+&ncutoffs);
                sens[j]=0;
                fpr[j]=0;
                DO i = 1 TO &slsampsize;
                    sens[j] = sens[j] + label[i]*(__slp[i]>((j-1)/&ncutoffs))/__totp;
                    fpr[j] = fpr[j] + (1-label[i])*(__slp[i]>((j-1)/&ncutoffs))/__totn;
                END;
            END;
            /* ... then sum, so that sens[j+1] is always defined when it is read */
            DO j = 2 TO %EVAL(1+&ncutoffs);
                auc = auc + (sens[j-1] - sens[j])*(1-fpr[j])/2 + (sens[j] - sens[j+1])*(1-fpr[j])/2;
            END;
            RETURN(auc);
        ENDSUB;
    RUN;
%MEND _mkmethods;
/*
 __MKFUNCS: points CMPLIB at work.__funcs and compiles four numeric helpers into
 work.__funcs.LOGFUNCS:
   expit(mu)           1/(1+exp(-mu)), with mu clamped to [-700, 700]
   logit(p)            log(p/(1-p)), with p<=0 replaced by 1e-12 and p>=1 by 1-1e-12
   logitbound(p,l,h)   logit of p after clamping p to [l, h]
   logbound(p,l,h)     log of p after clamping p to [l, h]
*/
%MACRO __MKFUNCS();
    OPTIONS CMPLIB = work.__funcs;
    PROC FCMP OUTLIB = work.__funcs.LOGFUNCS;

        FUNCTION expit(mu);
            /* clamp the linear predictor before exponentiating */
            IF mu < -700 THEN eta = -700;
            ELSE IF mu > 700 THEN eta = 700;
            ELSE eta = mu;
            RETURN(1/(1+exp(-eta)));
        ENDSUB;

        FUNCTION logit(p);
            /* move probabilities at or beyond 0 and 1 just inside the unit interval */
            IF p <= 0 THEN prob = 1e-12;
            ELSE IF p >= 1 THEN prob = 1-(1e-12);
            ELSE prob = p;
            RETURN(LOG(prob/(1-prob)));
        ENDSUB;

        FUNCTION logitbound(p,l,h);
            clipped = MIN(MAX(p,l),h);
            RETURN(log(clipped/(1-clipped)));
        ENDSUB;

        FUNCTION logbound(p,l,h);
            clipped = MIN(MAX(p,l),h);
            RETURN(log(clipped));
        ENDSUB;

    RUN;
%MEND __MKFUNCS;
/* user-experience helper functions: error checking, custom messages, deprecation notices*/
/*
 __TrueCheck: expands to a logical expression for use inside a macro IF condition.
 The expression is true when the trimmed argument is one of
 TRUE, True, true, T, 1 or the double-quoted forms "TRUE", "True", "true", "T", "1".
*/
%MACRO __TrueCheck(mvar);
%TRIM(&mvar)=TRUE OR %TRIM(&mvar)=True OR %TRIM(&mvar)=true OR %TRIM(&mvar)=T OR %TRIM(&mvar)=1
OR %TRIM(&mvar)="TRUE" OR %TRIM(&mvar)="True" OR %TRIM(&mvar)="true" OR %TRIM(&mvar)="T" OR %TRIM(&mvar)="1"
%MEND __TrueCheck;
/*
 __FalseCheck: expands to a logical expression for use inside a macro IF condition.
 The expression is true when the trimmed argument is one of
 FALSE, False, false, F, 0 or the double-quoted forms "FALSE", "False", "false", "F", "0".

 The quoted mixed-case spelling "False" was previously not recognised although __TrueCheck
 accepts "True"; it is now included so that the two checks accept matching spellings.
*/
%MACRO __FalseCheck(mvar);
%TRIM(&mvar)=FALSE OR %TRIM(&mvar)=False OR %TRIM(&mvar)=false OR %TRIM(&mvar)=F OR %TRIM(&mvar)=0
OR %TRIM(&mvar)="FALSE" OR %TRIM(&mvar)="False" OR %TRIM(&mvar)="false" OR %TRIM(&mvar)="F" OR %TRIM(&mvar)="0"
%MEND __FalseCheck;
/*
 __SLwarning: writes one WARNING line to the log, made of a fixed super learner prefix
 followed by the trimmed message text.
   msg : message text (wrap it in a quoting function if it contains commas)
*/
%macro __SLwarning(msg);
    %put WARNING: Super learner non-fatal issue. %trim(&msg);
%mend __SLwarning;
/*
 __SLnote: writes one NOTE line to the log, made of the prefix "Super learner" followed by
 the trimmed message text.
   msg : message text (wrap it in a quoting function if it contains commas)
*/
%macro __SLnote(msg);
    %put NOTE: Super learner %trim(&msg);
%mend __SLnote;
/*
 __SLerror: writes one ERROR line to the log for a possibly recoverable problem.
   msg : message text (wrap it in a quoting function if it contains commas)
 The RETURN statement is the last statement of this helper, so it only ends the helper
 itself; the calling macro continues after the call.
*/
%macro __SLerror(msg);
    %put ERROR: Super learner, possibly recoverable problem happened in the call to super learner. %trim(&msg);
    %return;
%mend __SLerror;
/*
 __badSLerror: fail early on an unrecoverable problem.
 Writes one ERROR line to the log, switches ODS output selection and log notes back on,
 and then issues ABORT.
   msg : message text (wrap it in a quoting function if it contains commas)
*/
%macro __badSLerror(msg);
    /* fail early: error to abort completely */
    %put ERROR: Super learner, unrecoverable problem happened in the call to super learner. %trim(&msg);
    ods select all;
    options notes;
    %abort;
%mend __badSLerror;
/*
 __deprecations: translates deprecated macro parameters into their replacements and warns.

 The macro has no parameters of its own; it reads and assigns the macro variables
 predvar, Y, cv, gfstrat, trtstrat, risk, method and dist of the calling macro.

   predvar : deprecated name for Y. When Y is blank it is now set from predvar.
             (The previous code assigned Y to itself, so a caller that supplied only predvar
             ended up with a blank Y.) A Y value that was supplied explicitly is kept.
   cv      : deprecated, warning only.
   gfstrat : deprecated name for trtstrat; its value is copied to trtstrat.
   risk    : deprecated; aborts when method is also set, otherwise L2 maps to
             dist=GAUSSIAN/method=NNLS and ENTROPY maps to dist=BERNOULLI/method=CCLOGLIK.
*/
%MACRO __deprecations();
    %__SLnote(%str(__deprecations));
    %IF &predvar^= %THEN %DO;
        /* fill Y before the warning so that the message reports the value actually used */
        %IF &Y= %THEN %LET Y = &predvar;
        %__SLwarning(%STR('predvar' macro parameter (currently set to &predvar) will be deprecated in an upcoming version. Please use 'Y' instead. Setting macro parameter 'Y' to &Y.));;
    %END;
    %IF &cv^= %THEN
        %__SLwarning(%STR('cv' macro parameter (currently set to &cv) will be deprecated in an upcoming version, CV is assumed so this parameter will be dropped.));;
    %IF &gfstrat^= %THEN %DO;
        %__SLwarning(%STR('gfstrat' macro parameter (currently set to &gfstrat) will be deprecated in an upcoming version, please use "trtstrat=TRUE/FALSE" in macro call instead.));
        %LET trtstrat = &gfstrat;
    %END;
    %IF &risk^= %THEN %DO;
        /* risk and method cannot both be given */
        %IF &method^= %THEN %__badSLerror(%str('risk' is deprecated and is replaced by 'method' - both cannot be specified simultaneously));
        %__SLwarning(%STR('risk' macro parameter (currently set to &risk) will be deprecated in an upcoming version, please use "dist=gaussian" or "dist=bernoulli" in macro call instead.));
        %IF &risk=L2 %THEN %DO;
            %LET dist=GAUSSIAN;
            %LET method=NNLS;
        %END;
        %IF &risk=ENTROPY %THEN %DO;
            %LET dist=BERNOULLI;
            %LET method=CCLOGLIK;
        %END;
    %END;
%MEND __deprecations;
/*
 __commonerrors: basic validation of the super learner call.

 Reads macro variables of the calling macro (Y, predvar, indata, outdata, library, folds,
 trtstrat, intvars, predictors, SLPBinary, dist, method, risk) and
   - aborts through __badSLerror when Y/predvar, indata, outdata or library is blank
   - warns when the library has a single member
   - defaults folds to 10 when it is blank
   - when trtstrat is true, requires intvars and requires that at least one predictor is
     listed in intvars
   - forces dist to BERNOULLI when SLPBinary is true and to GAUSSIAN when SLPBinary is false
   - aborts when neither risk nor method/dist is usable
   - sets the global flags SLsas94, SLsas93, SLsas92 from the SAS version and rejects library
     members that are listed as unsupported for older versions

   dummy : not used in the body.

 Fixes relative to the previous version:
   - the version warning was written without the leading percent sign, so its text was emitted
     as SAS code instead of calling __SLwarning; it is now a macro call with the message quoted
   - messages containing commas are wrapped in STR so they are passed as one argument
   - the blank-indata test no longer uses SUBSTR, which errors on a blank value before the
     intended abort message can be issued
   - the dist warnings said "non-binary" / "non-continuous" in the branches where Y was judged
     binary / continuous; the wording now matches the branch
*/
%MACRO __commonerrors(dummy=1) / MINOPERATOR MINDELIMITER=' ';
    %LOCAL l_l j_j _ic book;
    %__SLnote(%str(Basic error checking));
    /* missing parameter values */
    %IF &Y = AND &predvar = %THEN %__badSLerror(must specify Y);
    /* SUPERQ keeps data set options in indata from being evaluated and tolerates a blank value */
    %IF %LENGTH(%SUPERQ(indata)) = 0 %THEN %__badSLerror(must specify existing indata data set);
    %IF &outdata = %THEN %__badSLerror(must specify outdata data set name to write out results);
    %IF &library = %THEN %__badSLerror(must specify at least two members of the library);
    /* not enough library members */
    %IF (%SCAN(&library, 2)=) %THEN %__SLwarning(Super learner has one library member.);
    %IF &folds= %THEN %DO;
        %__SLnote(Defaulting to 10-fold cross validation);
        %LET folds=10;
    %END;
    /* check that stratified models are specified correctly */
    %IF %__TrueCheck(&trtstrat) %THEN %DO;
        %IF (&INTVARS=) %THEN %__badSLerror(Intervention variable (macro variable 'intvars') must be specified if trtstrat is TRUE);;
        /* _ic becomes 1 as soon as one predictor is found in the intvars list */
        %LET j_j=1;
        %LET _ic = 0;
        %DO %WHILE (%SCAN(&PREDICTORS, &j_j)^=);
            %IF %SCAN(&PREDICTORS, &j_j) IN &INTVARS %THEN %LET _ic = 1;
            %LET j_j=%EVAL(&j_j+1);
        %END;
        %IF &_ic = 0 %THEN %__badSLerror(Intervention variables must also be specified as predictors);;
    %END;
    /* screw up binary/continuous specification */
    %IF %__TrueCheck(&SLPBinary) %THEN %DO;
        /* y is binary */
        %__SLnote(%STR(&Y is assumed to be binary));
        %IF %TRIM(&dist)^=BERNOULLI %THEN %DO;
            /* setting to bernoulli */
            %__SLwarning(%STR(dist function should be BERNOULLI for binary dependent variables, &Y appears to be binary. Setting dist to BERNOULLI.));
            %LET dist=BERNOULLI;
        %END;
        %__SLnote(%STR(assumed &dist distribution, using &method algorithm to estimate SL coefficients));
    %END;
    %IF %__FalseCheck(&SLPBinary) %THEN %DO;
        /* y is not binary */
        %__SLnote(%STR(&Y is assumed to be continuous));
        %IF &risk = ENTROPY %THEN %__SLerror(%STR(risk function should be L2 for non-binary dependent variables, &Y appears to be continuous));;
        %IF %TRIM(&dist)^=GAUSSIAN %THEN %DO;
            /* setting to gaussian */
            %__SLwarning(%STR(Dist function should be GAUSSIAN for continuous dependent variables, &Y appears to be continuous. Setting dist to GAUSSIAN.));
            %LET dist=GAUSSIAN;
        %END;
        %__SLnote(%STR(assumed &dist distribution, using &method algorithm to estimate SL coefficients));
    %END;
    /* unknown risk function */
    %IF (&risk ~=L2 AND &risk ~=ENTROPY AND &method= AND &dist=) %THEN %__badSLerror(Dist and Method must be set. Risk may also be set, but will soon be deprecated.);;
    /* sas version checking */
    %GLOBAL SLsas94 SLsas93 SLsas92;
    %__SLnote(%STR(SAS version is &sysver));
    %LET SLsas94 = %EVAL(&sysver>9.3);
    %LET SLsas93 = %EVAL(&sysver>9.2);
    %LET SLsas92 = %EVAL(&sysver>9.1);
    %IF &SLsas94=0 %THEN %DO;
        %__SLwarning(%STR(Using version of SAS < 9.4, be aware that some procs may not work as intended!));
        /* scan the library for members that are listed as unsupported in older versions */
        %LET l_l = 1;
        %DO %WHILE(%SCAN(&LIBRARY, &l_l)^=);
            %LET book=%SCAN(&LIBRARY, &l_l);
            %IF (&book=cart OR &book=cvcart OR &book=rf OR &book=rfoob OR &book=gampl OR &book=lasso) %THEN %DO;
                %IF &SLsas93=0 %THEN %__badSLerror(%STR(you are using one member of the library that does not work in this version of sas. Please respecify the library without rf, cart, or lasso.));
                %IF (&SLsas94=0 AND &book=lasso) %THEN %__badSLerror(Please respecify the library without lasso (it does not work in this version of sas).);
            %END;
            %LET l_l = %EVAL(&l_l + 1);
        %END;
    %END;
%MEND;
/*
 __Checklibrary: checks that a learner macro exists for every member of the library.
   library : list of library member names, scanned one at a time

 For each member the required macro name is the member name followed by _cn when the dist
 macro variable of the caller is GAUSSIAN, or by _in when it is BERNOULLI. When that macro is
 not defined, __badSLerror is called. For any other value of dist nothing is checked.
*/
%MACRO __Checklibrary(library);
    %LOCAL pos learner unavailable;
    %__SLnote(%str(Checking whether library (&library) is valid));
    %LET pos = 1;
    %LET learner = %SCAN(&library, &pos);
    %DO %WHILE(&learner ^= );
        /* decide which learner macro has to exist for the requested distribution */
        %LET unavailable = 0;
        %IF &dist = GAUSSIAN %THEN %LET unavailable = %EVAL(%SYSMACEXIST(&learner._cn) = 0);
        %ELSE %IF &dist = BERNOULLI %THEN %LET unavailable = %EVAL(%SYSMACEXIST(&learner._in) = 0);
        %IF &unavailable %THEN
            %__badSLerror(Library member &learner is not available for the &dist distribution. You might try "rf" "boost" "bagging" or "mars");
        %LET pos = %EVAL(&pos + 1);
        %LET learner = %SCAN(&library, &pos);
    %END;
%MEND __CheckLibrary;
/*
 __CheckMissing: aborts through __badSLerror when any predictor has missing values.
   predictors : list of variable names, scanned one at a time
   indata     : data set in which the variables are checked

 For each predictor a PROC SQL query counts the missing values and the count is tested right
 after the query.

 Fixes relative to the previous version:
   - the count used the comparison "value < .z", which does not count the special missing
     value .z and is a numeric-only comparison; MISSING() counts every numeric missing value
     and also blank character values
   - the counter is reset before each query and an empty table yields 0, so the test never
     sees a blank or stale count
*/
%MACRO __CheckMissing(predictors, indata);
    %LOCAL j_j _pred pmis;
    %LET j_j = 1;
    %DO %WHILE(%SCAN(&predictors, %EVAL(&j_j))^=);
        %LET _PRED = %SCAN(&predictors, &j_j);
        /* reset so that a failed query cannot leave the count of the previous predictor behind */
        %LET pmis = 0;
        PROC SQL NOPRINT;
        SELECT COALESCE(SUM(MISSING(&_PRED)), 0) INTO :pmis FROM &INDATA;
        %IF %SYSEVALF(%TRIM(&pmis)>0) %THEN %__badSLerror(Missing values detected in &_pred.. Missing values in predictors are not yet handled in this macro. Please ensure that input data are free of missing values.);;
        %LET j_j = %EVAL(&j_j+1);
    %END;
    QUIT;
%MEND __CheckMissing;
%MACRO __CheckSLPredMissing(Y, indata);
*check for missing values;
%GLOBAL SLPredMiss;