Skip to content

Commit

Permalink
Merge pull request #107 from rsquaredacademy/develop
Browse files Browse the repository at this point in the history
Develop
  • Loading branch information
aravindhebbali authored May 28, 2020
2 parents 442b1bf + 2e27512 commit 80b4190
Show file tree
Hide file tree
Showing 134 changed files with 2,284 additions and 1,594 deletions.
9 changes: 2 additions & 7 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,25 +1,22 @@
Package: blorr
Type: Package
Title: Tools for Developing Binary Logistic Regression Models
Version: 0.2.2.9000
Version: 0.3.0
Authors@R: person("Aravind", "Hebbali", email = "hebbali.aravind@gmail.com", role = c("aut", "cre"),
comment = c(ORCID = "0000-0001-9220-9669"))
Description: Tools designed to make it easier for beginner and intermediate users to build and validate
binary logistic regression models. Includes bivariate analysis, comprehensive regression output,
model fit statistics, variable selection procedures, model validation techniques and a 'shiny'
app for interactive model building.
Depends:
R(>= 3.3)
R(>= 3.5)
Imports:
car,
caret,
data.table,
e1071,
ggplot2,
gridExtra,
lest,
Rcpp,
scales,
stats,
utils
Suggests:
Expand All @@ -40,5 +37,3 @@ Encoding: UTF-8
LazyData: true
RoxygenNote: 7.1.0
LinkingTo: Rcpp
Remotes:
tidyverse/dplyr
4 changes: 2 additions & 2 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

S3method(blr_bivariate_analysis,default)
S3method(blr_coll_diag,default)
S3method(blr_confusion_matrix,default)
S3method(blr_gains_table,default)
S3method(blr_model_fit_stats,default)
S3method(blr_multi_model_fit_stats,default)
Expand Down Expand Up @@ -30,6 +31,7 @@ S3method(plot,blr_step_p_forward)
S3method(plot,blr_woe_iv)
S3method(print,blr_bivariate_analysis)
S3method(print,blr_coll_diag)
S3method(print,blr_confusion_matrix)
S3method(print,blr_gains_table)
S3method(print,blr_model_fit_stats)
S3method(print,blr_multi_model_fit_stats)
Expand Down Expand Up @@ -119,14 +121,12 @@ export(blr_woe_iv)
export(blr_woe_iv_stats)
importFrom(Rcpp,sourceCpp)
importFrom(car,Anova)
importFrom(caret,confusionMatrix)
importFrom(data.table,":=")
importFrom(data.table,.N)
importFrom(data.table,data.table)
importFrom(data.table,rbindlist)
importFrom(data.table,setDF)
importFrom(data.table,setorder)
importFrom(e1071,classAgreement)
importFrom(ggplot2,aes)
importFrom(ggplot2,annotate)
importFrom(ggplot2,element_blank)
Expand Down
4 changes: 4 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
# blorr 0.3.0

This is a minor release to reduce package dependencies and fix bugs.

# blorr 0.2.2

This is a patch release to fix CRAN errors.
Expand Down
15 changes: 10 additions & 5 deletions R/blr-bivariate-analysis.R
Original file line number Diff line number Diff line change
Expand Up @@ -331,13 +331,18 @@ plot.blr_segment_dist <- function(x, title = NA, xaxis_title = "Levels",
geom_col(aes(y = `n%`), fill = bar_color) +
geom_line(aes(y = `1s%`, group = 1), color = line_color) +
xlab(xaxis_title) + ggtitle(plot_title) + ylab(yaxis_title) +
scale_y_continuous(labels = scales::percent,
sec.axis = sec_axis(~. / sec_axis_scale, name = sec_yaxis_title,
labels = scales::percent))
scale_y_continuous(
breaks = seq(0, 1, by = 0.1),
labels = paste0(seq(0, 1, by = 0.1) * 100, '%'),
sec.axis = sec_axis(
trans = ~.,
breaks = seq(0, 1, by = 0.1),
labels = paste0(seq(0, 1, by = 0.1) * 100, '%'),
name = sec_yaxis_title))

if (print_plot) {
print(p)
}
}

invisible(p)
}
Expand All @@ -347,6 +352,6 @@ secondary_axis_scale_comp <- function(x) {

d <- x$dist_table
d$sec <- d$`n%` / d$`1s%`
min(d$sec)
max(d$sec)

}
8 changes: 4 additions & 4 deletions R/blr-gains-table.R
Original file line number Diff line number Diff line change
Expand Up @@ -100,8 +100,8 @@ plot.blr_gains_table <- function(x, title = "Lift Chart", xaxis_title = "% Popul
geom_line(aes(x = cum_total_per, y = cum_1s_per), color = lift_curve_col) +
geom_line(aes(x = cum_total_per, y = cum_total_y), color = diag_line_col) +
ggtitle(title) + xlab(xaxis_title) + ylab(yaxis_title) +
scale_x_continuous(labels = scales::percent) +
scale_y_continuous(labels = scales::percent) +
scale_x_continuous(labels = c('0%', '25%', '50%', '75%', '100%')) +
scale_y_continuous(labels = c('0%', '25%', '50%', '75%', '100%')) +
theme(plot.title = element_text(hjust = plot_title_justify))

if (print_plot) {
Expand Down Expand Up @@ -168,8 +168,8 @@ blr_ks_chart <- function(gains_table, title = "KS Chart", yaxis_title = " ",
annotate("text", x = annotate_x, y = annotate_y,
label = paste0("KS: ", ks_stat, "%")) +
ggtitle(title) + xlab(xaxis_title) + ylab(yaxis_title) +
scale_x_continuous(labels = scales::percent) +
scale_y_continuous(labels = scales::percent) +
scale_x_continuous(labels = c('0%', '25%', '50%', '75%', '100%')) +
scale_y_continuous(labels = c('0%', '25%', '50%', '75%', '100%')) +
theme(plot.title = element_text(hjust = 0.5),
legend.title = element_blank())

Expand Down
91 changes: 84 additions & 7 deletions R/blr-model-validation.R
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
#' Confusion matrix
#'
#' Wrapper for \code{confMatrix} from the caret package.
#' Confusion matrix and statistics.
#'
#' @param model An object of class \code{glm}.
#' @param data A \code{tibble} or a \code{data.frame}.
#' @param cutoff Cutoff for classification.
#' @param ... Other arguments.
#'
#' @return Confusion matix.
#'
Expand All @@ -14,20 +15,22 @@
#'
#' blr_confusion_matrix(model, cutoff = 0.4)
#'
#' @importFrom caret confusionMatrix
#' @importFrom e1071 classAgreement
#'
#' @family model validation techniques
#'
#' @export
#'
blr_confusion_matrix <- function(model, cutoff = 0.5, data = NULL) {
blr_confusion_matrix <- function(model, cutoff = 0.5, data = NULL, ...) UseMethod("blr_confusion_matrix")

#' @rdname blr_confusion_matrix
#' @export
#'
blr_confusion_matrix.default <- function(model, cutoff = 0.5, data = NULL, ...) {

blr_check_model(model)
blr_check_values(cutoff, 0, 1)

namu <- formula(model)[[2]]

if (is.null(data)) {
data <- model$model
response <- data[[1]]
Expand All @@ -38,7 +41,81 @@ blr_confusion_matrix <- function(model, cutoff = 0.5, data = NULL) {

p_data <- predict(model, newdata = data, type = "response")
c_data <- as.factor(as.numeric(p_data > cutoff))
out <- table(Prediction = c_data, Reference = response)

a <- out[4]
b <- out[2]
c <- out[3]
d <- out[1]

accuracy <- (a + d) / (a + b + c + d)
no_inf_rate <- table(response)[[1]] / sum(table(response))
sensitivity <- a / (a + c)
specificity <- d / (b + d)
prevalence <- (a + c) / (a + b + c + d)
detect_rate <- a / (a + b + c + d)
detect_prev <- (a + b) / (a + b + c + d)
bal_accuracy <- (sensitivity + specificity) / 2
precision <- a / (a + b)
recall <- a / (a + c)
kappa <- blr_kappa(out)
mcnemar_p <- stats::mcnemar.test(out)$p.value

ppv <- (sensitivity * prevalence) / ((sensitivity * prevalence) +
((1 - specificity) * (1 - prevalence)))

npv <- specificity * (1 - prevalence) / (((1 - sensitivity) * prevalence) +
(specificity * (1 - prevalence)))

confusionMatrix(data = c_data, reference = response, positive = '1')
result <- list(
accuracy = accuracy,
balanced_accuracy = bal_accuracy,
conf_matrix = out,
detection_prevalence = detect_prev,
detection_rate = detect_rate,
mcnemar_kappa = kappa,
mcnemar_test_p_val = mcnemar_p,
negative_predicted_value = npv,
no_information_rate = no_inf_rate,
positive_predicted_value = ppv,
precision = precision,
prevalence = prevalence,
recall = recall,
sensitivity = sensitivity,
specificity = specificity)

class(result) <- "blr_confusion_matrix"
return(result)

}

#' @export
#'
print.blr_confusion_matrix <- function(x, ...) {

cat('Confusion Matrix and Statistics', '\n\n')
print(x$conf_matrix)
cat('\n\n')
cat(' Accuracy :', format(round(x$accuracy, 4), nsmall = 4), '\n')
cat(' No Information Rate :', format(round(x$no_information_rate, 4), nsmall = 4), '\n\n')
cat(' Kappa :', format(round(x$mcnemar_kappa, 4), nsmall = 4), '\n\n')
cat("McNemars's Test P-Value :", format(round(x$mcnemar_test_p_val, 4), nsmall = 4), '\n\n')
cat(' Sensitivity :', format(round(x$sensitivity, 4), nsmall = 4), '\n')
cat(' Specificity :', format(round(x$specificity, 4), nsmall = 4), '\n')
cat(' Pos Pred Value :', format(round(x$positive_predicted_value, 4), nsmall = 4), '\n')
cat(' Neg Pred Value :', format(round(x$negative_predicted_value, 4), nsmall = 4), '\n')
cat(' Prevalence :', format(round(x$prevalence, 4), nsmall = 4), '\n')
cat(' Detection Rate :', format(round(x$detection_rate, 4), nsmall = 4), '\n')
cat(' Detection Prevalence :', format(round(x$detection_prevalence, 4), nsmall = 4), '\n')
cat(' Balanced Accuracy :', format(round(x$balanced_accuracy, 4), nsmall = 4), '\n')
cat(' Precision :', format(round(x$precision, 4), nsmall = 4), '\n')
cat(' Recall :', format(round(x$recall, 4), nsmall = 4), '\n\n')
cat(" 'Positive' Class : 1")

}

blr_kappa <- function(out) {
agreement <- sum(diag(out)) / sum(out)
expected <- sum(rowSums(out) * colSums(out)) / (sum(out) ^ 2)
(agreement - expected) / (1 - expected)
}
5 changes: 3 additions & 2 deletions R/blr-roc-curve.R
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,9 @@ blr_roc_curve <- function(gains_table, title = "ROC Curve",
ggplot(plot_data, aes(x = `1 - specificity`, y = sensitivity_per)) +
geom_point(shape = point_shape, fill = point_fill, color = point_color) +
geom_line(color = roc_curve_col) + ggtitle(title) +
scale_x_continuous(labels = scales::percent) + xlab(xaxis_title) +
scale_y_continuous(labels = scales::percent) + ylab(yaxis_title) +
scale_x_continuous(labels = c('0%', '25%', '50%', '75%', '100%')) +
scale_y_continuous(labels = c('0%', '25%', '50%', '75%', '100%')) +
xlab(xaxis_title) + ylab(yaxis_title) +
theme(plot.title = element_text(hjust = plot_title_justify)) +
geom_line(aes(x = `1 - specificity`, y = `1 - specificity`),
color = diag_line_col)
Expand Down
4 changes: 0 additions & 4 deletions R/blr-stepwise-backward-regression.R
Original file line number Diff line number Diff line change
Expand Up @@ -99,10 +99,6 @@ blr_step_p_backward.default <- function(model, prem = 0.3, details = FALSE, ...)
m <- glm(paste(response, "~", paste(preds, collapse = " + ")), l, family = binomial(link = 'logit'))
m_sum <- Anova(m, test.statistic = "Wald")
pvals <- m_sum$`Pr(>Chisq)`
# m_sum <- summary(m)
# pvals <- unname(m_sum$coefficients[, 4])[-1]
# m <- ols_regress(paste(response, "~", paste(preds, collapse = " + ")), l)
# pvals <- m$pvalues[-1]
maxp <- which(pvals == max(pvals))

suppressWarnings(
Expand Down
9 changes: 0 additions & 9 deletions R/blr-stepwise-forward-regression.R
Original file line number Diff line number Diff line change
Expand Up @@ -160,18 +160,9 @@ blr_step_p_forward.default <- function(model, penter = 0.3, details = FALSE, ...
m_sum <- Anova(m, test.statistic = "Wald")
pvals[i] <- m_sum$`Pr(>Chisq)`[ppos]
tvals[i] <- m_sum$Chisq[ppos]
# m_sum <- summary(m)
# pvals[i] <- unname(m_sum$coefficients[, 4])[ppos]
# tvals[i] <- unname(m_sum$coefficients[, 3])[ppos]
# m <- blr_regress(paste(response, "~",
# paste(predictors, collapse = " + ")), l)
# pvals[i] <- m$pval[ppos]
# tvals[i] <- m$zval[ppos]
}

minp <- which(pvals == min(pvals))
# tvals <- abs(tvals)
# maxt <- which(tvals == max(tvals))

if (pvals[minp] <= penter) {

Expand Down
53 changes: 0 additions & 53 deletions R/blr-stepwise-regression.R
Original file line number Diff line number Diff line change
Expand Up @@ -114,36 +114,16 @@ blr_step_p_both.default <- function(model, pent = 0.1, prem = 0.3, details = FAL
m_sum <- Anova(m, test.statistic = "Wald")
pvals[i] <- m_sum$`Pr(>Chisq)`[ppos]
tvals[i] <- m_sum$Chisq[ppos]
# m_sum <- summary(m)
# pvals[i] <- unname(m_sum$coefficients[, 4])[ppos]
# tvals[i] <- unname(m_sum$coefficients[, 3])[ppos]
# m <- ols_regress(paste(response, "~", paste(predictors, collapse = " + ")), l)
# pvals[i] <- m$pvalues[ppos]
# tvals[i] <- m$tvalues[ppos]
}

minp <- which(pvals == min(pvals))
# tvals <- abs(tvals)
# maxt <- which(tvals == max(tvals))
preds <- all_pred[minp]
lpreds <- length(preds)
fr <- glm(paste(response, "~", paste(preds, collapse = " + ")), l, family = binomial(link = 'logit'))
mfs <- blr_model_fit_stats(fr)
aic <- mfs$m_aic
bic <- mfs$m_bic
dev <- mfs$m_deviance
# fr <- ols_regress(paste(response, "~",
# paste(preds, collapse = " + ")), l)
# rsq <- fr$rsq
# adjrsq <- fr$adjr
# cp <- ols_mallows_cp(fr$model, model)
# aic <- ols_aic(fr$model)
# sbc <- ols_sbc(fr$model)
# sbic <- ols_sbic(fr$model, model)
# rmse <- sqrt(fr$ems)
# betas <- append(betas, fr$betas)
# lbetas <- append(lbetas, length(fr$betas))
# pvalues <- append(pvalues, fr$pvalues)

if (details) {
cat("\n")
Expand Down Expand Up @@ -186,18 +166,9 @@ blr_step_p_both.default <- function(model, pent = 0.1, prem = 0.3, details = FAL
m_sum <- Anova(m, test.statistic = "Wald")
pvals[i] <- m_sum$`Pr(>Chisq)`[ppos]
tvals[i] <- m_sum$Chisq[ppos]
# m_sum <- summary(m)
# pvals[i] <- unname(m_sum$coefficients[, 4])[ppos]
# tvals[i] <- unname(m_sum$coefficients[, 3])[ppos]
# m <- ols_regress(paste(response, "~",
# paste(predictors, collapse = " + ")), l)
# pvals[i] <- m$pvalues[ppos]
# tvals[i] <- m$tvalues[ppos]
}

minp <- which(pvals == min(pvals))
# tvals <- abs(tvals)
# maxt <- which(tvals == max(tvals))

if (pvals[minp] <= pent) {

Expand All @@ -211,18 +182,6 @@ blr_step_p_both.default <- function(model, pent = 0.1, prem = 0.3, details = FAL
aic <- c(aic, mfs$m_aic)
bic <- c(bic, mfs$m_bic)
dev <- c(dev, mfs$m_deviance)
# fr <- ols_regress(paste(response, "~",
# paste(preds, collapse = " + ")), l)
# rsq <- c(rsq, fr$rsq)
# adjrsq <- c(adjrsq, fr$adjr)
# aic <- c(aic, ols_aic(fr$model))
# sbc <- c(sbc, ols_sbc(fr$model))
# sbic <- c(sbic, ols_sbic(fr$model, model))
# cp <- c(cp, ols_mallows_cp(fr$model, model))
# rmse <- c(rmse, sqrt(fr$ems))
# betas <- append(betas, fr$betas)
# lbetas <- append(lbetas, length(fr$betas))
# pvalues <- append(pvalues, fr$pvalues)

if (details == TRUE) {
cat("\n")
Expand Down Expand Up @@ -255,8 +214,6 @@ blr_step_p_both.default <- function(model, pent = 0.1, prem = 0.3, details = FAL
family = binomial(link = 'logit'))
m_sum <- Anova(m2, test.statistic = "Wald")
pvals_r <- m_sum$`Pr(>Chisq)`
# tvals_r <- m_sum$Chisq[ppos]
# tvals_r <- abs(unname(m_sum$coefficients[, 3])[-1])
maxp <- which(pvals_r == max(pvals_r))
if (pvals_r[maxp] > prem) {

Expand All @@ -271,16 +228,6 @@ blr_step_p_both.default <- function(model, pent = 0.1, prem = 0.3, details = FAL
aic <- c(aic, mfs$m_aic)
bic <- c(bic, mfs$m_bic)
dev <- c(dev, mfs$m_deviance)
# rsq <- c(rsq, fr$rsq)
# adjrsq <- c(adjrsq, fr$adjr)
# aic <- c(aic, ols_aic(fr$model))
# sbc <- c(sbc, ols_sbc(fr$model))
# sbic <- c(sbic, ols_sbic(fr$model, model))
# cp <- c(cp, ols_mallows_cp(fr$model, model))
# rmse <- c(rmse, sqrt(fr$ems))
# betas <- append(betas, fr$betas)
# lbetas <- append(lbetas, length(fr$betas))
# pvalues <- append(pvalues, fr$pvalues)

if (details) {
cat("\n")
Expand Down
Loading

0 comments on commit 80b4190

Please sign in to comment.