---
title: scorecal
subtitle: Empirical score calibration under the microscope
author: Ross W. Gayler
date: 2019-08-30 Credit Scoring & Credit Control XVI, Edinburgh, UK
institute: https://www.rossgayler.com --- https://orcid.org/0000-0003-4679-585X
output: binb::metropolis
classoption: "aspectratio=169"
fontsize: 12pt
---
```{r LICENSE, include=FALSE}
# "scorecal - Empirical score calibration under the microscope" (c) by Ross W. Gayler
#
# "scorecal - Empirical score calibration under the microscope" is licensed under a
# Creative Commons Attribution 4.0 International License.
#
# You should have received a copy of the license along with this
# work. If not, see http://creativecommons.org/licenses/by/4.0/.
```
```{r WARNING, include = FALSE}
##### WARNING #####
# When executing this notebook to create a new "version of record" of the rendered output
# you must first delete the file session_info.rds
# See the last code chunk in this notebook for the explanation.
```
```{r setup, include=FALSE}
knitr::opts_chunk$set(cache=FALSE)
set.seed(47) # for reproducibility
# This notebook requires the binb package to enable rendering the output as a PDF presentation.
# It uses the binb metropolis template.
# This requires a variety of LaTeX tools and fonts to be installed,
# in addition to the rmarkdown and knitr infrastructure.
# See http://github.com/eddelbuettel/binb for binb installation advice.
library(tidyverse) # data manipulation
library(grid) # for composing multiple plots
library(cowplot) # add marginal density to plots
library(patchwork) # arrange multiple plots
library(sn) # skew-normal distributions
library(binom) # for confidence intervals of proportions
library(mgcv) # for smoothed fits
library(fastDummies) # for dummy coding of individual scores for spike detection
library(glue) # for constructing model formulae in spike detection
library(glmnet) # for selection of scores as predictors of spikes
library(glmnetUtils) # for selection of scores as predictors of spikes
library(sessioninfo) # to get the software versions for reproducibility
library(here) # for safe specification of file paths
library(fs) # file handling utilities
```
```{r util_fns, include=FALSE}
# utility functions
logit2p <- function(
logit # numeric vector - logits
# value # numeric vector - probabilities corresponding to logits
) {
odds <- exp(logit)
odds / (1 + odds)
}
p2logit <- function(
p # numeric vector - probabilities
# value # numeric vector - logits corresponding to probabilities
) {
log(p / (1 - p))
}
```
# Introduction
## What is score calibration?
- _Calibration_: Answers the question "What do my scores mean?"
by empirically determining the function from score to the expected value of some outcome statistic
- Inherently about groups (cases with the same score)
- Case outcome is binary (e.g. Good, Bad)
- Outcome statistic is some function of binary outcomes of a group of cases
(e.g. $Pr( Bad | score)$ or $logit( Pr( Good | score))$)
- Result of calibration is a function from score to outcome statistic
- Fitting a function to the data (i.e. curve fitting)
- Typically, the function is approximately linear from score to log-odds
- _Scaling_: Transform the group outcome statistic to a desired scale (sketched after this list)
- e.g. 1:1 odds $\mapsto$ zero points; double odds $\mapsto$ $\Delta$ +100 points
- Think of converting temperature from Fahrenheit to Celsius
- Calibration is always on *some* scale,
maybe not the one you want
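A minimal sketch of the scaling step (not part of the deck's code; `logodds2points` and `pdo` are illustrative names), using the points-to-double-odds convention above:
```{r scaling_sketch, eval = FALSE}
# Map log-odds to points so that 1:1 odds -> 0 points and each
# doubling of the odds adds pdo points
logodds2points <- function(logit, pdo = 100) {
  pdo * logit / log(2) # logit / log(2) = log2(odds)
}
logodds2points(log(c(1, 2, 4))) # 0, 100, 200
```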
## Calibration parameters
Calibration depends on:
- Substantive parameters
- Score definition (function from case attributes to a number)
- Number is commonly integer, may be real
- Population of cases
- Outcome definition
- Technical parameters of calibration function estimation
- Curve fitting technique
- Fitting technique tuning parameters
## How is the calibration function used?
- Operational process management
- Set decision thresholds
- Make loss predictions
- Technical diagnosis of the scoring model (my focus)
- For a well-behaved scoring model,
the score to log-odds function is generally quite linear
(by definition)
- Nonlinearity indicates a possible problem
- What is the problem? (shape of nonlinearity - not absolutely diagnostic)
- Does the problem matter? (size of nonlinearity)
- How to fix the problem? ("fix" may be a work-around)
## Calibration function zoo
Some calibration function patterns that may be encountered:
```{r cal_zoo, echo = FALSE, out.width = "65%", fig.align = "center"}
# ```{r cal_zoo, echo = FALSE, fig.align = "center"}
# function to plot a single calibration sketch
cal_sketch <- function(
d, # data frame - with score and logit columns
t # character[1] - title for plot
# value # ggplot plot object
)
{
d %>%
ggplot(aes(x = score, y = logit)) +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) + # no grid
theme(axis.text.x = element_blank(), axis.text.y = element_blank(), axis.ticks = element_blank()) + # no axis marks
xlab("score") + ylab("log-odds(outcome)") + # axis labels
labs(title = t[1]) + # label the plot
geom_path(size = 1, lineend = "round", linejoin = "round") # add the calibration curve
}
### generate each of the calibration sketch plots ###
# linear
p_linear <-
tibble::tribble(
~score, ~logit,
-1, -1,
+1, +1
) %>%
cal_sketch("Linear")
# bent
p_bent <-
tibble::tribble(
~score, ~logit,
-1, -0.5,
0, 0,
+1, +1
) %>%
cal_sketch("Change of slope")
# extreme flat
p_ext_flat <-
tibble::tribble(
~score, ~logit,
-1.3, -1,
-1, -1,
+1, +1,
+1.3, +1
) %>%
cal_sketch("Extremes flat")
# extreme reversal
p_ext_rev <-
tibble::tribble(
~score, ~logit,
-1.3, -0.8,
-1, -1,
+1, +1,
+1.3, +0.8
) %>%
cal_sketch("Extremes reversed")
# mid flat
p_mid_flat <-
tibble::tribble(
~score, ~logit,
-1, -1,
-0.15, 0,
+0.15, 0,
+1, +1
) %>%
cal_sketch("Locally flat")
# mid reversal
p_mid_rev <-
tibble::tribble(
~score, ~logit,
-1, -1,
-0.15, +0.1,
+0.15, -0.1,
+1, +1
) %>%
cal_sketch("Locally reversed")
# spike
p_spike <-
tibble::tribble(
~score, ~logit,
-1, -1,
-0.01, -0.01,
0, +0.4,
+0.01, +0.01,
+1, +1
) %>%
cal_sketch("Spike")
### arrange all the sketches as a single plot ###
p_linear + p_bent + p_ext_flat + p_ext_rev +
patchwork::plot_spacer() + p_mid_flat + p_mid_rev + p_spike +
patchwork::plot_layout(ncol = 4)
```
## Typical approaches to calibration function estimation
- Logistic regression from score to outcome, over cases
- `glm(outcome == "Good" ~ score, family=binomial)`
- Estimated function *forced* to be linear
- Unless you use `poly(score)` - but there are better ways
- Blind to any nonlinearities
- Score bands
- Group scores into bands; calculate the outcome statistic for each band (sketched after this list)
- Calibration function is a step-function
- Doesn't assume *any* relationship between neighbouring bands
- Can model any relationship (*coarsely* - because of band widths)
- Local patterns may be hidden by bands (because of band widths)
- Doesn't make efficient use of data (doesn't use score ordering)
- Typically small number of observations per band
- Large variance of estimates obscures patterns
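A minimal sketch of the score band calculation (assuming a data frame `d` with `score` and `outcome` columns like `d_sim1`; the band count is illustrative):
```{r band_sketch, eval = FALSE}
d %>%
  dplyr::mutate(band = dplyr::ntile(score, 10)) %>% # equal-count score bands
  dplyr::group_by(band) %>%
  dplyr::summarise(
    score = mean(score), # band location
    logit = p2logit(mean(outcome == "Good")) # band outcome statistic
  )
```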
## Score band approach
```{r mk_sim_data_sb, echo = FALSE}
# make simulated data for score band plots
n_obs <- 2e3 # number of observations
# true calibration curve slope and intercept
sim_slope <- 1
sim_intercept <- 3
# divide data into score bands
n_sb <- 10 # number of score bands
# Function to calculate log-odds from score
# Assumes the score to log-odds relationship is linear
score2logit_linear <- function(
score, # numeric vector - scores
slope = 1, # numeric scalar - slope of relationship
intercept = 0 # numeric scalar - intercept of relationship
# value # numeric vector - logits corresponding to scores
) {
slope * score + intercept
}
# generate data frame of simulated cases
d_sim1 <-
tibble::tibble(
# make distribution of scores skew normal
score = sn::rsn(n_obs, dp = sn::cp2dp(c(mean = 0, sd = 1, gamma1 = -0.7), family = "SN"))
) %>%
dplyr::mutate(
logit = score2logit_linear(score, sim_slope, sim_intercept), # true outcome log-odds
prob = logit2p(logit), # true outcome probability
outcome_01 = as.integer(runif(length(prob)) <= prob), # randomly realise outcome from true probability
outcome = dplyr::if_else(outcome_01 == 1, "Good", "Bad") # also express outcome with pretty labels
)
```
Simulated data with linear score to log-odds relationship
(n = `r format(n_obs, big.mark = ",")`; `r round(100 * (1 - mean(d_sim1$outcome_01)))`% Bad; `r n_sb` bands)
```{r score_band_plot, echo=FALSE, out.width = "65%", fig.align = "center"}
### Plot step-function calibration function based on score bands
# calculate summary statistics per score band
d_sb <- d_sim1 %>%
dplyr::mutate(
score_band = dplyr::ntile(score, n_sb) # calculate score band
) %>%
# calculate score band stats
dplyr::group_by(score_band) %>%
dplyr::summarise(
n = n(),
n_good = sum(as.integer(outcome == "Good")),
score_min = min(score),
score_max = max(score),
score = mean(score),
p = n_good / n,
logit = p2logit(p)
) %>%
# calculate score band boundaries
dplyr::arrange(score_max) %>%
dplyr::mutate(
# Put band boundary midway between highest score below and lowest score above
# Upper bound of top score band is NA
upper_bound = (score_max + dplyr::lead(score_min)) / 2
)
# Calculate data frame for step-function
# Graft on appropriate values for the beginning and end of the step-function
d_step <- tibble::tibble(
score = c(min(d_sim1$score), na.omit(d_sb$upper_bound), max(d_sim1$score)),
logit = c(d_sb$logit, d_sb$logit[n_sb])
)
### plot the score band step-function ###
# construct the main scatterplot
p_main <-
d_sim1 %>% # individual case data
ggplot(aes(x = score, y = logit)) +
# general layout
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) + # no grid
theme(axis.text.x = element_blank(), axis.text.y = element_blank(), axis.ticks = element_blank()) + # no axis marks
xlab("score") + ylab("log-odds(outcome)") + # axis labels
# show true calibration function for individual cases
geom_point(aes(x = score, y = logit, colour = outcome), size = 0.1, alpha = 0.5) +
# score bands and step-function calibration
geom_vline(xintercept = d_sb$upper_bound, colour = "grey", na.rm = TRUE) + # score band boundaries
geom_point(aes(x = score, y = logit), data = d_sb, size = 3) + # score band estimate of calibration curve
geom_step(aes(x = score, y = logit), data = d_step, direction = "hv") + # add step-function calibration curve
# show individual case outcomes as rugs
geom_rug(aes(x = score, colour = outcome), data = subset(d_sim1, outcome == "Good"), sides = "t") + # show Good cases
geom_rug(aes(x = score, colour = outcome), data = subset(d_sim1, outcome == "Bad"), sides = "b") + # show Bad cases
guides(colour = guide_legend(reverse = TRUE)) # make legend order agree with rug order
# marginal density plot
p_xaxis <-
cowplot::axis_canvas(p_main, axis = "x", data = d_sim1) +
geom_density(aes(x = score, fill = outcome), alpha = 0.5, colour = "white", size = 0.2)
# compose the plots
cowplot::insert_xaxis_grob(p_main, p_xaxis, height = grid::unit(0.1, "null"), position = "top") %>%
cowplot::ggdraw()
```
# scorecal
## scorecal objectives
scorecal: An `R` package for score calibration
Be a better microscope for examining deviations from linearity
in calibration functions
Issues to be addressed:
- Use data efficiently (assume continuity and smoothness)
- Relative magnitude of linear and nonlinear components
- Common scores
- Sparsity of cases in extreme tails
- Spike deviations
## Use data efficiently - issue & approach
- Score band approach does not make efficient use of data
because it assumes:
- No relationship between neighbouring bands
- No significance to ordering of scores within bands
- Expect neighbouring scores to have similar outcome statistics
(continuity of scores and smoothness of calibration curve)
- Use smoothing spline or local regression models (see the sketch after this list)
- Cases "borrow strength" from their neighbours
(like having a moving-window estimator)
- The effective number of cases used per score value is higher,
giving narrower confidence intervals
- But, outcome statistic estimates at neighbouring point values are correlated
(which follows from assuming smoothness)
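A minimal sketch of the smooth alternative (same illustrative data frame `d` as above):
```{r smooth_sketch, eval = FALSE}
# Penalised regression spline: each fitted value borrows strength
# from neighbouring scores rather than from a fixed band
fit <- mgcv::gam(outcome == "Good" ~ s(score), family = binomial, data = d)
pred <- predict(fit, se.fit = TRUE) # fitted log-odds and standard errors
```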
## Use data efficiently - example
The same simulated data (95% confidence intervals)
```{r data_efficiency, echo=FALSE, out.width = "65%", fig.align = "center"}
ci_conf_level <- 0.95 # total probability covered by CI
ci_width <- qnorm(1 - (1 - ci_conf_level)/2) # half-width of confidence intervals - in standard errors
# calculate standard errors of scoreband estimates
d_sb <- d_sb %>%
with(binom::binom.confint(n_good, n, conf.level = ci_conf_level, methods = "lrt")) %>%
dplyr::select(
p_lo = lower,
p_hi = upper
) %>%
dplyr::mutate(
logit_lo = p2logit(p_lo),
logit_hi = p2logit(p_hi)
) %>%
dplyr::bind_cols(d_sb, .)
# fit a smooth GAM and get standard errors
fit <- mgcv::gam(outcome == "Good" ~ s(score), family = binomial, data = d_sim1)
d_sim1_extra <- predict(fit, se.fit = TRUE) %>%
tibble::as_tibble() %>%
dplyr::bind_cols(d_sim1, .)
# get the smooth fit at distinct scores
d_distinct <- d_sim1_extra %>%
dplyr::distinct(score, .keep_all = TRUE) %>%
dplyr::arrange(score)
# colours for smooth and step calibration curves
col_smooth <- "olivedrab4"
col_step <- "darkorange"
# graphically compare smooth and step-function calibrations
d_sim1_extra %>% # individual case data
# general layout
ggplot(aes(x = score, y = logit)) +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) + # no grid
theme(axis.text.x = element_blank(), axis.text.y = element_blank(), axis.ticks = element_blank()) + # no axis marks
xlab("score") + ylab("log-odds(outcome)") + # axis labels
# marginal rugs
geom_rug(aes(x = score, colour = outcome), data = subset(d_sim1_extra, outcome == "Good"), sides = "t") + # show Good cases
geom_rug(aes(x = score, colour = outcome), data = subset(d_sim1_extra, outcome == "Bad"), sides = "b") + # show Bad cases
guides(colour = guide_legend(reverse = TRUE)) + # make legend order agree with rug order
# smooth calibration curve confidence interval
geom_ribbon(aes(x = score, ymin = fit - ci_width*se.fit, ymax = fit + ci_width*se.fit),
data = d_distinct, fill = col_smooth, alpha = 0.4) +
# true calibration curve
geom_abline(slope = sim_slope, intercept = sim_intercept, linetype = "dotted", size = 1) + # data simulation parameters
# step-function calibration curve
geom_linerange(aes(x = score, ymin = logit_lo, ymax = logit_hi), data= d_sb, colour = col_step, size = 5, alpha = 0.5) +
geom_step(aes(x = score, y = logit), data = d_step, direction = "hv", colour = col_step, size = 1, alpha = 1) + # add step-function calibration curve
# smooth calibration curve
geom_line(aes(x = score, y = fit), data = d_distinct, colour = col_smooth, size = 1)
```
## Relative magnitude of linear and nonlinear components - issue & approach
- Global linear trend is expected pattern
- Global linear trend generally much stronger than nonlinearities
- Nonlinearities are harder to see when combined with the strong linear component
- Decompose calibration function into linear and nonlinear components
- Fit a linear model and use it as an offset in the nonlinear model (sketched after this list)
- Regularisation of nonlinear component
makes the linear component the default pattern
when data is sparse (similar effect to a Bayesian prior)
- Display nonlinear components separately
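A minimal sketch of the decomposition (illustrative `d` as before; `cal_lin` and `cal_smooth` are illustrative column names):
```{r decompose_sketch, eval = FALSE}
# Fit the linear component first ...
fit_lin <- glm(outcome == "Good" ~ score, family = binomial, data = d)
d$cal_lin <- predict(fit_lin)
# ... then fit the nonlinear component as a smooth with the linear fit
# held fixed as an offset. The total curve is cal_lin + cal_smooth.
fit_nonlin <- mgcv::gam(outcome == "Good" ~ s(score), offset = cal_lin,
                        family = binomial, data = d)
d$cal_smooth <- predict(fit_nonlin, newdata = d) # smooth component only (offset excluded)
```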
## Relative magnitude of linear and nonlinear components - example
```{r decompose_curve, fig.height = 3, echo = FALSE, out.width = "100%", fig.align = "center"}
# function to plot calibration sketch
cal_sketch <- function(
d, # data frame - with score and logit columns
t # character[1] - title for plot
# value # ggplot plot object
)
{
d %>%
ggplot(aes(x = score, y = logit)) +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) + # no grid
theme(axis.text.x = element_blank(), axis.text.y = element_blank(), axis.ticks = element_blank()) + # no axis marks
xlab("score") + ylab("log-odds(outcome)") + # axis labels
xlim(c(-1, +1)) + ylim(c(-1, +1)) +
coord_equal() +
labs(title = t[1]) + # label the plot
geom_path(size = 1, lineend = "round", linejoin = "round") # add the calibration curve
}
# mid reversal
p_mid_rev <-
tibble::tribble(
~score, ~logit,
-1, -1,
-0.15, +0.1,
+0.15, -0.1,
+1, +1
) %>%
cal_sketch("Calibration function")
# +
# geom_abline(slope = 1, intercept = 0, colour = "grey")
# linear
p_linear <-
tibble::tribble(
~score, ~logit,
-1, -1,
+1, +1
) %>%
cal_sketch("Linear component")
# residual of mid reversal relative to linear
p_resid <-
tibble::tribble(
~score, ~logit,
-1, 0,
-0.15, +0.25,
+0.15, -0.25,
+1, 0
) %>%
cal_sketch("Nonlinear component") +
geom_hline(yintercept = 0, colour = "grey")
p_equal <-
ggplot() +
theme_void() +
geom_blank() +
xlim(c(-1, +1)) + ylim(c(-1, +1)) +
coord_equal() +
annotate("text", x = 0, y = 0, label = "=", size = 20)
p_plus <-
ggplot() +
theme_void() +
geom_blank() +
xlim(c(-1, +1)) + ylim(c(-1, +1)) +
coord_equal() +
annotate("text", x = 0, y = 0, label = "+", size = 20)
p_mid_rev +
p_equal +
p_linear +
p_plus +
p_resid +
patchwork::plot_layout(ncol = 5, widths = c(2, 1, 2, 1, 2))
```
## Common scores - issue & approach
For discrete scores, some score values are very common
(occur on a large fraction of cases),
e.g. bureau scores for New-to-Bureau cases
- For moving-window estimators with window width set at fixed fraction of cases,
the fraction of cases on a common score may exceed the window width
- No variance of the predictor (score) within the window; regression fails
- For smoothing-spline estimators, common scores reduce the effective number of distinct score values
- Use jittering (add small random noise) to break tied scores (sketched after this list)
- Jittering magnitude chosen to preserve order of scores
- (Mostly) does no harm if using a smoothing-spline estimator
- Average the outcome estimates for all the jittered scores
derived from the same unjittered score
(i.e. transform the result back to the unjittered scores)
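A minimal sketch of the jitter-and-merge step (illustrative `d` with a discrete `score`):
```{r jitter_sketch, eval = FALSE}
# factor = 5/4 gives noise of 1/4 of the score spacing, preserving order
d$score_jit <- jitter(d$score, factor = 5/4)
fit <- mgcv::gam(outcome == "Good" ~ s(score_jit), family = binomial, data = d)
d$cal <- predict(fit, newdata = d)
d <- d %>% # average the fitted values back to the unjittered scores
  dplyr::group_by(score) %>%
  dplyr::mutate(cal = mean(cal)) %>%
  dplyr::ungroup()
```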
## Common scores - example
Histograms of unjittered and jittered simulated data for a small range of scores
```{r common_scores, echo = FALSE, out.width = "65%", fig.align = "center"}
# simulate a few scores
d_unjit <- tibble::tibble(
score = rep(498:502, times = c(6, 1, 27, 11, 3))
)
d_jit <- tibble::tibble(
score = jitter(d_unjit$score, factor = 5/4)
)
# plot the scores before and after jittering
ggplot() +
theme(panel.grid.minor = element_blank()) + # no minor grid lines
geom_bar(aes(x = score), data = d_unjit, fill = "black", width = 0.025, position = "identity") +
geom_bar(aes(x = score), data = d_jit, fill = "red", colour = "grey", width = 0.025, position = "identity")
```
## Sparsity of cases in extreme tails - issue
Distributions of scores tend to be skewed and heavy-tailed
- Cases are sparse in the extreme tails
- Confidence interval of fitted calibration curve may be *very* wide in tails
- May include positive *and* negative slopes
- Pattern is ill-defined in tails
(needs stronger assumptions to extract the pattern)
- Extreme tails have small fraction of cases
- Generally not practically important
- But, tend to be visually dominant
- Can cause technical problems
- May be *very* few cases between smoothing spline knots
- May be only one outcome class between smoothing spline knots
- Case density may vary strongly within local regression window
- Pattern at dense end of window may dominate pattern at sparse end
## Sparsity of cases in extreme tails - approach
- For the nonlinear smooth fit, first transform the jittered score to a *normal* density (sketched after this list)
- Compresses heavy tails; expands light tails
- Estimate calibration curve then inverse transform back to original score scale
- Transform is to normal density rather than uniform,
because uniform is too aggressive
- Effect of density transform is to increase smoothing where tails are heavy
and decrease smoothing where tails are light
- Smoothing is effectively low-pass filtering
- Compression of tails by transformation shifts frequencies of patterns up
- Higher frequencies are attenuated more by the smoothing
- Inverse transformation back to original scores shifts frequencies down again
- Expansion of tails by transformation does the converse
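A minimal sketch of the density transform (the same construction as the `trans_density` utility later in this notebook; the 0.999/0.0005 shift keeps the ECDF strictly inside (0, 1)):
```{r transform_sketch, eval = FALSE}
# Probability-integral transform to an approximately standard normal
# density: heavy tails are compressed, light tails are expanded
d$score_trans <- qnorm(0.999 * dplyr::cume_dist(d$score_jit) + 0.0005)
# Fit the nonlinear smooth on the transformed score; each fitted value
# maps one-to-one back to its original score
fit <- mgcv::gam(outcome == "Good" ~ s(score_trans), family = binomial, data = d)
```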
## Sparsity of cases in extreme tails - example - nonlinear smooth
The same simulated data (nonlinear components; 95% confidence intervals)
```{r transformed_nonlinear_fit, echo = FALSE, out.width = "65%", fig.align = "center"}
### jitter ####
# utility function to jitter score
cal_jitter <- function(
score # numeric - vector of scores
# value: numeric - vector of jittered scores
) {
jitter(score, factor = 5/4)
}
# # demonstrate jittering
# d_test <- d_sim1 %>%
# dplyr::mutate(
# score = round(score, 1), # simulate a discrete score
# score_jit = cal_jitter(score)
# )
# d_test %>%
# ggplot() +
# geom_point(aes(x = score, y = score_jit - score))
### linear ###
# utility function to fit linear calibration curve
cal_lin <- function(
score, # numeric - vector of scores
outcome # logical - vector of outcomes (TRUE := Good)
# value: numeric - vector of fitted linear smooth of outcome (natural logodds scale)
) {
d_local <- tibble::tibble(score, outcome) # combine the score and outcome into a data frame
fit_lin <- glm(outcome ~ score, family = binomial, data = d_local) # fit linear calibration
predict(fit_lin, newdata = d_local) # return fitted curve
}
# # demonstrate linear calibration
# d_test <- d_sim1 %>%
# dplyr::mutate(
# cal_lin = cal_lin(score, outcome == "Good") # use unjittered score for linear component
# )
# d_test %>%
# ggplot() +
# geom_point(aes(x = score, y = cal_lin))
### smooth ###
# utility function to fit smooth nonlinear calibration curve with offset
cal_smooth <- function(
score, # numeric - vector of scores
outcome, # logical - vector of outcomes (TRUE := Good)
offset # numeric - vector of offsets
# value: numeric - vector of fitted nonlinear smooth of outcome (natural logodds scale)
) {
d_local <- tibble::tibble(score, outcome) # combine the score and outcome into a data frame
fit_smooth <- mgcv::gam(outcome ~ s(score), offset = offset,
family = binomial, data = d_local) # fit nonlinear smooth calibration with offset
predict(fit_smooth, newdata = d_local) # return fitted curve
}
# # demonstrate nonlinear calibration on untransformed score
# d_test <- d_sim1 %>%
# dplyr::mutate(
# score_jit = cal_jitter(score),
# cal_lin = cal_lin(score, outcome == "Good"), # use unjittered score for linear component
# cal_smooth = cal_smooth(score_jit, outcome == "Good", cal_lin) # use unjittered score for linear component
# )
# d_test %>%
# ggplot() +
# geom_point(aes(x = score, y = cal_smooth + cal_lin)) +
# geom_point(aes(x = score, y = cal_smooth), colour = "red")
### transform ###
# utility function to transform the score to an approximately standard normal density
trans_density <- function(
score # numeric - vector of scores
# value: numeric - vector of scores transformed to approximately standard normal density
) {
qnorm(p = (0.999 * dplyr::cume_dist(score) + 0.0005)) # shift ECDF slightly to avoid probability = 1
}
# # demonstrate transformation of scores on simulated data
# d_test <- d_sim1 %>%
# dplyr::mutate(
# score = round(score, 1), # simulate an integer score
# score_jit = cal_jitter(score),
# score_trans = trans_density(score_jit), # use jittered score for transformation
# )
# d_test %>%
# ggplot() +
# geom_point(aes(x = score, y = score_trans))
# # demonstrate nonlinear smooth of transformed scores on simulated data
# d_test <- d_sim1 %>%
# dplyr::mutate(
# score = round(score, 1), # simulate a discrete score
# score_jit = cal_jitter(score),
# score_trans = trans_density(score_jit), # use jittered score for transformation
# cal_lin = cal_lin(score, outcome == "Good"), # use unjittered score for linear component
# cal_smooth = cal_smooth(score_trans, outcome == "Good", cal_lin) # use transformed score for smooth component
# ) %>%
# # reverse the jittering
# # take the mean smoothed outcome for each original discrete score
# dplyr::group_by(score) %>%
# dplyr::mutate(
# cal_smooth = mean(cal_smooth) # merge all jittered scores back to original
# ) %>%
# dplyr::ungroup()
# d_test %>%
# ggplot() +
# geom_point(aes(x = score, y = cal_smooth + cal_lin)) +
# geom_point(aes(x = score, y = cal_smooth), colour = "red")
### generate slide ###
# compare transformed and untransformed regressions with CIs
d_test <- d_sim1 %>%
dplyr::mutate(
score = round(score, 1), # simulate a discrete score
score_jit = cal_jitter(score),
score_trans = trans_density(score_jit), # use jittered score for transformation
cal_lin = cal_lin(score, outcome == "Good"), # use unjittered score for linear component
)
# fit nonlinear components manually to get SEs
# fit_untrans <- mgcv::gam(outcome == "Good" ~ s(score_jit, bs = "ad"), offset = cal_lin,
fit_untrans <- mgcv::gam(outcome == "Good" ~ s(score_jit), offset = cal_lin,
family = binomial, data = d_test)
d_test_untrans <- predict(fit_untrans, se.fit = TRUE) %>%
tibble::as_tibble() %>%
dplyr::bind_cols(d_test, .) %>%
dplyr::mutate(
type = "untransformed"
) %>%
dplyr::rename(
cal_smooth = fit,
se_smooth = se.fit
)
# fit_trans <- mgcv::gam(outcome == "Good" ~ s(score_trans, bs = "ad"), offset = cal_lin,
fit_trans <- mgcv::gam(outcome == "Good" ~ s(score_trans), offset = cal_lin,
family = binomial, data = d_test)
d_test_trans <- predict(fit_trans, se.fit = TRUE) %>%
tibble::as_tibble() %>%
dplyr::bind_cols(d_test, .) %>%
dplyr::mutate(
type = "transformed"
) %>%
dplyr::rename(
cal_smooth = fit,
se_smooth = se.fit
)
d_test <-
dplyr::bind_rows(d_test_untrans, d_test_trans) %>%
dplyr::mutate(
type = forcats::fct_relevel(type, "untransformed", "transformed") # get desired order
) %>%
# reverse the jittering
# take the mean smoothed outcome for each original discrete score
dplyr::group_by(type, score) %>%
dplyr::mutate(
cal_smooth = mean(cal_smooth),
se_smooth = mean(se_smooth)
) %>%
dplyr::ungroup() %>%
dplyr::distinct(type, score, .keep_all = TRUE) %>% # for plotting we only need 1 record per score x type
dplyr::arrange(type, score)
# graphically compare transformed and untransformed calibrations with CIs
d_test %>%
# general layout
ggplot() +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) + # no grid
theme(axis.text.x = element_blank(), axis.text.y = element_blank(), axis.ticks = element_blank()) + # no axis marks
xlab("score") + ylab("nonlinear component log-odds(outcome)") + # axis labels
# true calibration curve
geom_hline(yintercept = 0, linetype = "dotted", size = 1) +
# calibration curve confidence interval
geom_ribbon(
aes(x = score,
ymin = cal_smooth - ci_width*se_smooth,
ymax = cal_smooth + ci_width*se_smooth,
fill = type
),
alpha = 0.2) +
# smooth calibration curve
geom_line(aes(x = score, y = cal_smooth, colour = type), size = 1)
```
## Sparsity of cases in extreme tails - example - total curve
The same simulated data (combined components; 95% confidence intervals)
```{r transformed_combined_fit, echo = FALSE, out.width = "65%", fig.align = "center"}
# graphically compare transformed and untransformed calibrations with CIs
d_test %>%
# general layout
ggplot() +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) + # no grid
theme(axis.text.x = element_blank(), axis.text.y = element_blank(), axis.ticks = element_blank()) + # no axis marks
xlab("score") + ylab("combined components log-odds(outcome)") + # axis labels
# true calibration curve
geom_abline(slope = sim_slope, intercept = sim_intercept, linetype = "dotted", size = 1) + # data simulation parameters
# calibration curve confidence interval
geom_ribbon(
aes(x = score,
ymin = cal_lin + cal_smooth - ci_width*se_smooth,
ymax = cal_lin + cal_smooth + ci_width*se_smooth,
fill = type
),
alpha = 0.2) +
# smooth calibration curve
geom_line(aes(x = score, y = cal_lin + cal_smooth, colour = type), size = 1)
```
## Spike deviations - issue
- A specific score can have an outcome probability
very different from neighbours
- Interpretable as the cases in the spike having the wrong score
- Possibly due to score calculation error
- Possibly due to applying scorecard to a different population
- Difficult to detect unless the score is a common score
- Difficult to detect when the score is continuous, because the spike cases are spread across many score values
## Spike deviations - issue with smoothing approach
- Spike deviations break assumption of smoothness
- Analysis developed so far *hides* spikes
```{r spike_sketch, echo = FALSE, out.width = "55%", fig.align = "center"}
tibble::tribble( # define a spike
~score, ~logit,
-6, 0,
-0.2, 0,
-0.1, 1,
+0.1, 1,
+0.2, 0,
+6, 0
) %>%
ggplot(aes(x = score, y = logit)) +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) + # no grid
theme(axis.text.x = element_blank(), axis.text.y = element_blank(), axis.ticks = element_blank()) + # no axis marks
xlab("score") + ylab("log-odds(outcome)") + # axis labels
stat_function(fun = dnorm, colour = "grey4", size = 1) + # draw smoothed spike
geom_path(colour = "red", size = 1, lineend = "round", linejoin = "round") # draw spike
```
## Spike deviations - approach
Approach: Model spikes with an indicator variable for each spike score
- Issue: To find the spikes we need an indicator variable for each unique score
- Ideally, fit smooth and select spikes simultaneously with regularised regression
- This is possible, but I haven't done it yet
- Current approach:
- Pre-filter potential spikes by frequency (say > 1% of cases)
- Use lasso regression to select spikes, with the smooth as an offset (sketched below)
- Re-estimate smooth with selected spikes as added predictors
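A minimal sketch of the current approach (illustrative `d` with the fitted smooth in a `cal_smooth` column; names are illustrative, not the packaged implementation):
```{r spike_lasso_sketch, eval = FALSE}
# Pre-filter: only scores carrying > 1% of cases are spike candidates
common <- names(which(table(d$score) / nrow(d) > 0.01))
# Dummy-code the candidate scores
x <- fastDummies::dummy_cols(d["score"], select_columns = "score") %>%
  dplyr::select(dplyr::any_of(paste0("score_", common))) %>%
  as.matrix()
# Lasso selection of spike indicators, with the smooth held fixed as an offset
fit <- glmnet::cv.glmnet(x, as.integer(d$outcome == "Good"),
                         family = "binomial", offset = d$cal_smooth, alpha = 1)
coef(fit, s = "lambda.1se") # nonzero coefficients flag spike scores
```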
## Spike deviations - example data
```{r mk_sim_data_spike, echo = FALSE}
# make simulated data for spike deviation analysis
set.seed(45) # for reproducibility
### utility functions ###
# function to implement piecewise linear slopes
# multiple calls to this function are added
above <- function(
x, # numeric - vector of score values
knot, # numeric[1] - location of knot
delta_slope # numeric[1] - change in slope above knot
# value - numeric - vector of incremental log-odds values
) {
as.integer(x >= knot) * (x - knot) * delta_slope
}
# function to implement shape of true calibration curve for main distribution
# value is in log-odds units
cal_true <- function(
x, # numeric - vector of score values
lo_range, # numeric[1] - range of output in logits
p_bad # numeric[1] - target Bad rate (approximate)
# value - numeric - vector of log-odds values
) {
# calculate logits (arbitrary scale) as a function of score
logit <- 0.0 +
above(x, 0.3, +1) +
above(x, 0.6, +1) +
above(x, 0.9, -2)
logit_min <- min(logit)
logit_max <- max(logit)
logit_range <- logit_max - logit_min
logit_mean <- mean(logit)
logit_mean_target <- p2logit(1 - p_bad)
# rescale logits to have desired range and Bad rate
lo_range * (logit - logit_mean) / logit_range + logit_mean_target
}
n_scores <- 1e3 # number of discrete scores to simulate
n_obs <- 20e3 # total number of observations (including spike)
p_spike <- 0.05 # fraction of total observations in the spike
# number of observations in the main distribution and spike
n_spike <- round(n_obs * p_spike)
n_main <- n_obs - n_spike
# skew normal distribution parameters for main body of observations
main_mean <- 0.0
main_sd <- 1.0
main_gamma1 <- -0.7
lo_range <- 5.0 # range of true calibration curve in logits
p_bad <- 0.06 # *very approximate* target Bad rate for main distribution of simulated data
# this is manually adjusted to achieve the true target value
spike_score <- 0.75 # location of spike on [0,1] score range
delta_lo_spike <- -1.0 # logit increment of spike relative to main distribution
# make simulated data for main distribution
d_sim2_main <-
# make main distribution of scores skew normal
tibble::tibble(
score = sn::rsn(n_main,
dp = sn::cp2dp(c(mean = main_mean, sd = main_sd, gamma1 = main_gamma1),
family = "SN"))
) %>%
dplyr::mutate(
# rescale score to n_scores discrete levels over [0,1]
score = (cut(score, breaks = n_scores, labels = FALSE) - 1) / (n_scores - 1),
# calculate true logit as a function of score
logit = cal_true(score, lo_range = lo_range, p_bad = p_bad)
)
# turn the extensive definition of the calibration curve into a function
d_sim2_main_distinct <- d_sim2_main %>%
dplyr::distinct(score, .keep_all = TRUE)
cal_fun <- with(d_sim2_main_distinct, approxfun(score, logit, rule = 2))
# make simulated data for spike
d_sim2_spike <-
# make all spike scores equal to spike_score
tibble::tibble(
score = rep(spike_score, times = n_spike)
) %>%
dplyr::mutate(
# calculate true logit of spike at score
logit = cal_fun(score) + delta_lo_spike
)
# concatenate the cases from the main distribution and spike
d_sim2 <- dplyr::bind_rows(d_sim2_main, d_sim2_spike) %>%
# calculate the remaining variables
dplyr::mutate(
prob = logit2p(logit), # true outcome probability
outcome_01 = as.integer(runif(length(prob)) <= prob), # randomly realise outcome from true probability
outcome = dplyr::if_else(outcome_01 == 1, "Good", "Bad") # also express outcome with pretty labels