Merge pull request #107 from rsquaredacademy/develop

Develop
rsquaredacademy · May 28, 2020 · 80b4190 · 80b4190
2 parents 442b1bf + 2e27512
commit 80b4190
Show file tree

Hide file tree

Showing 134 changed files with 2,284 additions and 1,594 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,25 +1,22 @@
 Package: blorr
 Type: Package
 Title: Tools for Developing Binary Logistic Regression Models
-Version: 0.2.2.9000
+Version: 0.3.0
 Authors@R: person("Aravind", "Hebbali", email = "hebbali.aravind@gmail.com", role = c("aut", "cre"), 
     comment = c(ORCID = "0000-0001-9220-9669"))
 Description: Tools designed to make it easier for beginner and intermediate users to build and validate 
     binary logistic regression models. Includes bivariate analysis, comprehensive regression output, 
     model fit statistics, variable selection procedures, model validation techniques and a 'shiny' 
     app for interactive model building.
 Depends:
-    R(>= 3.3)
+    R(>= 3.5)
 Imports: 
     car,
-    caret,
     data.table,
-    e1071,  
     ggplot2,
     gridExtra,
     lest,
     Rcpp,
-    scales,
     stats,
     utils
 Suggests: 
@@ -40,5 +37,3 @@ Encoding: UTF-8
 LazyData: true
 RoxygenNote: 7.1.0
 LinkingTo: Rcpp
-Remotes:
-    tidyverse/dplyr
diff --git a/NAMESPACE b/NAMESPACE
@@ -2,6 +2,7 @@
 
 S3method(blr_bivariate_analysis,default)
 S3method(blr_coll_diag,default)
+S3method(blr_confusion_matrix,default)
 S3method(blr_gains_table,default)
 S3method(blr_model_fit_stats,default)
 S3method(blr_multi_model_fit_stats,default)
@@ -30,6 +31,7 @@ S3method(plot,blr_step_p_forward)
 S3method(plot,blr_woe_iv)
 S3method(print,blr_bivariate_analysis)
 S3method(print,blr_coll_diag)
+S3method(print,blr_confusion_matrix)
 S3method(print,blr_gains_table)
 S3method(print,blr_model_fit_stats)
 S3method(print,blr_multi_model_fit_stats)
@@ -119,14 +121,12 @@ export(blr_woe_iv)
 export(blr_woe_iv_stats)
 importFrom(Rcpp,sourceCpp)
 importFrom(car,Anova)
-importFrom(caret,confusionMatrix)
 importFrom(data.table,":=")
 importFrom(data.table,.N)
 importFrom(data.table,data.table)
 importFrom(data.table,rbindlist)
 importFrom(data.table,setDF)
 importFrom(data.table,setorder)
-importFrom(e1071,classAgreement)
 importFrom(ggplot2,aes)
 importFrom(ggplot2,annotate)
 importFrom(ggplot2,element_blank)

diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,7 @@
+# blorr 0.3.0
+
+This is a minor release to reduce package dependencies and fix bugs.
+
 # blorr 0.2.2
 
 This is a patch release to fix CRAN errors.

diff --git a/R/blr-bivariate-analysis.R b/R/blr-bivariate-analysis.R
@@ -331,13 +331,18 @@ plot.blr_segment_dist <- function(x, title = NA, xaxis_title = "Levels",
     geom_col(aes(y = `n%`), fill = bar_color) +
     geom_line(aes(y = `1s%`, group = 1), color = line_color) +
     xlab(xaxis_title) + ggtitle(plot_title) + ylab(yaxis_title) +
-    scale_y_continuous(labels = scales::percent,
-      sec.axis = sec_axis(~. / sec_axis_scale, name = sec_yaxis_title,
-        labels = scales::percent))
+    scale_y_continuous(
+      breaks   = seq(0, 1, by = 0.1),
+      labels   = paste0(seq(0, 1, by = 0.1) * 100, '%'),
+      sec.axis = sec_axis(
+        trans  = ~.,
+        breaks = seq(0, 1, by = 0.1),
+        labels = paste0(seq(0, 1, by = 0.1) * 100, '%'),
+        name   = sec_yaxis_title))
 
   if (print_plot) {
     print(p)
-  } 
+  }
 
   invisible(p)
 }
@@ -347,6 +352,6 @@ secondary_axis_scale_comp <- function(x) {
 
   d <- x$dist_table
   d$sec <- d$`n%` / d$`1s%`
-  min(d$sec)
+  max(d$sec)
 
 }
diff --git a/R/blr-gains-table.R b/R/blr-gains-table.R
@@ -100,8 +100,8 @@ plot.blr_gains_table <- function(x, title = "Lift Chart", xaxis_title = "% Popul
     geom_line(aes(x = cum_total_per, y = cum_1s_per), color = lift_curve_col) +
     geom_line(aes(x = cum_total_per, y = cum_total_y), color = diag_line_col) +
     ggtitle(title) + xlab(xaxis_title) + ylab(yaxis_title) +
-    scale_x_continuous(labels = scales::percent) +
-    scale_y_continuous(labels = scales::percent) +
+    scale_x_continuous(labels = c('0%', '25%', '50%', '75%', '100%')) +
+    scale_y_continuous(labels = c('0%', '25%', '50%', '75%', '100%')) +
     theme(plot.title = element_text(hjust = plot_title_justify))
 
   if (print_plot) {
@@ -168,8 +168,8 @@ blr_ks_chart <- function(gains_table, title = "KS Chart", yaxis_title = " ",
     annotate("text", x = annotate_x, y = annotate_y,
              label = paste0("KS: ", ks_stat, "%")) +
     ggtitle(title) + xlab(xaxis_title) + ylab(yaxis_title) +
-    scale_x_continuous(labels = scales::percent) +
-    scale_y_continuous(labels = scales::percent) +
+    scale_x_continuous(labels = c('0%', '25%', '50%', '75%', '100%')) +
+    scale_y_continuous(labels = c('0%', '25%', '50%', '75%', '100%')) +
     theme(plot.title = element_text(hjust = 0.5),
           legend.title = element_blank())
 

diff --git a/R/blr-model-validation.R b/R/blr-model-validation.R
@@ -1,10 +1,11 @@
 #' Confusion matrix
 #'
-#' Wrapper for \code{confMatrix} from the caret package.
+#' Confusion matrix and statistics.
 #'
 #' @param model An object of class \code{glm}.
 #' @param data A \code{tibble} or a \code{data.frame}.
 #' @param cutoff Cutoff for classification.
+#' @param ... Other arguments.
 #'
 #' @return Confusion matix.
 #'
@@ -14,20 +15,22 @@
 #'
 #' blr_confusion_matrix(model, cutoff = 0.4)
 #'
-#' @importFrom caret confusionMatrix
-#' @importFrom e1071 classAgreement
-#'
 #' @family model validation techniques
 #'
 #' @export
 #'
-blr_confusion_matrix <- function(model, cutoff = 0.5, data = NULL) {
+blr_confusion_matrix <- function(model, cutoff = 0.5, data = NULL, ...) UseMethod("blr_confusion_matrix")
+
+#' @rdname blr_confusion_matrix
+#' @export
+#'
+blr_confusion_matrix.default <- function(model, cutoff = 0.5, data = NULL, ...) {
 
   blr_check_model(model)
   blr_check_values(cutoff, 0, 1)
 
   namu <- formula(model)[[2]]
-  
+
   if (is.null(data)) {
     data     <- model$model
   	response <- data[[1]]
@@ -38,7 +41,81 @@ blr_confusion_matrix <- function(model, cutoff = 0.5, data = NULL) {
 
   p_data <- predict(model, newdata = data, type = "response")
   c_data <- as.factor(as.numeric(p_data > cutoff))
+  out    <- table(Prediction = c_data, Reference = response)
+
+  a <- out[4]
+  b <- out[2]
+  c <- out[3]
+  d <- out[1]
+
+  accuracy     <- (a + d) / (a + b + c + d)
+  no_inf_rate  <- table(response)[[1]] / sum(table(response))
+  sensitivity  <- a / (a +  c)
+  specificity  <- d / (b + d)
+  prevalence   <- (a +  c) / (a + b + c + d)
+  detect_rate  <- a / (a + b + c + d)
+  detect_prev  <- (a +  b) / (a + b + c + d)
+  bal_accuracy <- (sensitivity + specificity) / 2
+  precision    <- a / (a + b)
+  recall       <- a / (a + c)
+  kappa        <- blr_kappa(out)
+  mcnemar_p    <- stats::mcnemar.test(out)$p.value
+
+  ppv <- (sensitivity * prevalence) / ((sensitivity * prevalence) +
+                                         ((1 - specificity) * (1 - prevalence)))
+
+  npv <- specificity * (1 - prevalence) / (((1 - sensitivity) * prevalence) +
+                                             (specificity * (1 - prevalence)))
 
-  confusionMatrix(data = c_data, reference = response, positive = '1')
+  result <- list(
+    accuracy                 = accuracy,
+    balanced_accuracy        = bal_accuracy,
+    conf_matrix              = out,
+    detection_prevalence     = detect_prev,
+    detection_rate           = detect_rate,
+    mcnemar_kappa            = kappa,
+    mcnemar_test_p_val       = mcnemar_p,
+    negative_predicted_value = npv,
+    no_information_rate      = no_inf_rate,
+    positive_predicted_value = ppv,
+    precision                = precision,
+    prevalence               = prevalence,
+    recall                   = recall,
+    sensitivity              = sensitivity,
+    specificity              = specificity)
+
+  class(result) <- "blr_confusion_matrix"
+  return(result)
+
+}
+
+#' @export
+#'
+print.blr_confusion_matrix <- function(x, ...) {
+
+  cat('Confusion Matrix and Statistics', '\n\n')
+  print(x$conf_matrix)
+  cat('\n\n')
+  cat('                Accuracy :', format(round(x$accuracy, 4), nsmall = 4), '\n')
+  cat('     No Information Rate :', format(round(x$no_information_rate, 4), nsmall = 4), '\n\n')
+  cat('                   Kappa :', format(round(x$mcnemar_kappa, 4), nsmall = 4), '\n\n')
+  cat("McNemars's Test P-Value  :", format(round(x$mcnemar_test_p_val, 4), nsmall = 4), '\n\n')
+  cat('             Sensitivity :', format(round(x$sensitivity, 4), nsmall = 4), '\n')
+  cat('             Specificity :', format(round(x$specificity, 4), nsmall = 4), '\n')
+  cat('          Pos Pred Value :', format(round(x$positive_predicted_value, 4), nsmall = 4), '\n')
+  cat('          Neg Pred Value :', format(round(x$negative_predicted_value, 4), nsmall = 4), '\n')
+  cat('              Prevalence :', format(round(x$prevalence, 4), nsmall = 4), '\n')
+  cat('          Detection Rate :', format(round(x$detection_rate, 4), nsmall = 4), '\n')
+  cat('    Detection Prevalence :', format(round(x$detection_prevalence, 4), nsmall = 4), '\n')
+  cat('       Balanced Accuracy :', format(round(x$balanced_accuracy, 4), nsmall = 4), '\n')
+  cat('               Precision :', format(round(x$precision, 4), nsmall = 4), '\n')
+  cat('                  Recall :', format(round(x$recall, 4), nsmall = 4), '\n\n')
+  cat("        'Positive' Class : 1")
+
+}
 
+blr_kappa <- function(out) {
+  agreement <- sum(diag(out)) / sum(out)
+  expected  <- sum(rowSums(out) * colSums(out)) / (sum(out) ^ 2)
+  (agreement - expected) / (1 - expected)
 }
diff --git a/R/blr-roc-curve.R b/R/blr-roc-curve.R
@@ -53,8 +53,9 @@ blr_roc_curve <- function(gains_table, title = "ROC Curve",
     ggplot(plot_data, aes(x = `1 - specificity`, y = sensitivity_per)) +
     geom_point(shape = point_shape, fill = point_fill, color = point_color) +
     geom_line(color = roc_curve_col) + ggtitle(title) +
-    scale_x_continuous(labels = scales::percent) + xlab(xaxis_title) +
-    scale_y_continuous(labels = scales::percent) + ylab(yaxis_title) +
+    scale_x_continuous(labels = c('0%', '25%', '50%', '75%', '100%')) + 
+    scale_y_continuous(labels = c('0%', '25%', '50%', '75%', '100%')) + 
+    xlab(xaxis_title) + ylab(yaxis_title) +
     theme(plot.title = element_text(hjust = plot_title_justify)) +
     geom_line(aes(x = `1 - specificity`, y = `1 - specificity`),
               color = diag_line_col)

diff --git a/R/blr-stepwise-backward-regression.R b/R/blr-stepwise-backward-regression.R
@@ -99,10 +99,6 @@ blr_step_p_backward.default <- function(model, prem = 0.3, details = FALSE, ...)
     m <- glm(paste(response, "~", paste(preds, collapse = " + ")), l, family = binomial(link = 'logit'))
     m_sum <- Anova(m, test.statistic = "Wald")
     pvals <- m_sum$`Pr(>Chisq)`
-    # m_sum <- summary(m)
-    # pvals <- unname(m_sum$coefficients[, 4])[-1]
-    # m <- ols_regress(paste(response, "~", paste(preds, collapse = " + ")), l)
-    # pvals <- m$pvalues[-1]
     maxp  <- which(pvals == max(pvals))
 
     suppressWarnings(

diff --git a/R/blr-stepwise-forward-regression.R b/R/blr-stepwise-forward-regression.R
@@ -160,18 +160,9 @@ blr_step_p_forward.default <- function(model, penter = 0.3, details = FALSE, ...
       m_sum <- Anova(m, test.statistic = "Wald")
       pvals[i] <- m_sum$`Pr(>Chisq)`[ppos]
       tvals[i] <- m_sum$Chisq[ppos]
-      # m_sum <- summary(m)
-      # pvals[i] <- unname(m_sum$coefficients[, 4])[ppos]
-      # tvals[i] <- unname(m_sum$coefficients[, 3])[ppos]
-      # m <- blr_regress(paste(response, "~",
-      #                        paste(predictors, collapse = " + ")), l)
-      # pvals[i] <- m$pval[ppos]
-      # tvals[i] <- m$zval[ppos]
     }
 
     minp  <- which(pvals == min(pvals))
-    # tvals <- abs(tvals)
-    # maxt  <- which(tvals == max(tvals))
 
     if (pvals[minp] <= penter) {
 

diff --git a/R/blr-stepwise-regression.R b/R/blr-stepwise-regression.R
@@ -114,36 +114,16 @@ blr_step_p_both.default <- function(model, pent = 0.1, prem = 0.3, details = FAL
     m_sum <- Anova(m, test.statistic = "Wald")
     pvals[i] <- m_sum$`Pr(>Chisq)`[ppos]
     tvals[i] <- m_sum$Chisq[ppos]
-    # m_sum <- summary(m)
-    # pvals[i] <- unname(m_sum$coefficients[, 4])[ppos]
-    # tvals[i] <- unname(m_sum$coefficients[, 3])[ppos]
-    # m <- ols_regress(paste(response, "~", paste(predictors, collapse = " + ")), l)
-    # pvals[i] <- m$pvalues[ppos]
-    # tvals[i] <- m$tvalues[ppos]
   }
 
   minp    <- which(pvals == min(pvals))
-  # tvals   <- abs(tvals)
-  # maxt    <- which(tvals == max(tvals))
   preds   <- all_pred[minp]
   lpreds  <- length(preds)
   fr      <- glm(paste(response, "~", paste(preds, collapse = " + ")), l, family = binomial(link = 'logit'))
   mfs     <- blr_model_fit_stats(fr)
   aic     <- mfs$m_aic
   bic     <- mfs$m_bic
   dev     <- mfs$m_deviance
-  # fr      <- ols_regress(paste(response, "~",
-  #                              paste(preds, collapse = " + ")), l)
-  # rsq     <- fr$rsq
-  # adjrsq  <- fr$adjr
-  # cp      <- ols_mallows_cp(fr$model, model)
-  # aic     <- ols_aic(fr$model)
-  # sbc     <- ols_sbc(fr$model)
-  # sbic    <- ols_sbic(fr$model, model)
-  # rmse    <- sqrt(fr$ems)
-  # betas   <- append(betas, fr$betas)
-  # lbetas  <- append(lbetas, length(fr$betas))
-  # pvalues <- append(pvalues, fr$pvalues)
 
   if (details) {
     cat("\n")
@@ -186,18 +166,9 @@ blr_step_p_both.default <- function(model, pent = 0.1, prem = 0.3, details = FAL
       m_sum <- Anova(m, test.statistic = "Wald")
       pvals[i] <- m_sum$`Pr(>Chisq)`[ppos]
       tvals[i] <- m_sum$Chisq[ppos]
-      # m_sum <- summary(m)
-      # pvals[i] <- unname(m_sum$coefficients[, 4])[ppos]
-      # tvals[i] <- unname(m_sum$coefficients[, 3])[ppos]
-      # m          <- ols_regress(paste(response, "~",
-      #                                 paste(predictors, collapse = " + ")), l)
-      # pvals[i]   <- m$pvalues[ppos]
-      # tvals[i]   <- m$tvalues[ppos]
     }
 
     minp  <- which(pvals == min(pvals))
-    # tvals <- abs(tvals)
-    # maxt  <- which(tvals == max(tvals))
 
     if (pvals[minp] <= pent) {
 
@@ -211,18 +182,6 @@ blr_step_p_both.default <- function(model, pent = 0.1, prem = 0.3, details = FAL
       aic    <- c(aic, mfs$m_aic)
       bic    <- c(bic, mfs$m_bic)
       dev    <- c(dev, mfs$m_deviance)
-      # fr        <- ols_regress(paste(response, "~",
-      #                                paste(preds, collapse = " + ")), l)
-      # rsq       <- c(rsq, fr$rsq)
-      # adjrsq    <- c(adjrsq, fr$adjr)
-      # aic       <- c(aic, ols_aic(fr$model))
-      # sbc       <- c(sbc, ols_sbc(fr$model))
-      # sbic      <- c(sbic, ols_sbic(fr$model, model))
-      # cp        <- c(cp, ols_mallows_cp(fr$model, model))
-      # rmse      <- c(rmse, sqrt(fr$ems))
-      # betas     <- append(betas, fr$betas)
-      # lbetas    <- append(lbetas, length(fr$betas))
-      # pvalues   <- append(pvalues, fr$pvalues)
 
       if (details == TRUE) {
         cat("\n")
@@ -255,8 +214,6 @@ blr_step_p_both.default <- function(model, pent = 0.1, prem = 0.3, details = FAL
                   family = binomial(link = 'logit'))
       m_sum <- Anova(m2, test.statistic = "Wald")
       pvals_r <- m_sum$`Pr(>Chisq)`
-      # tvals_r <- m_sum$Chisq[ppos]
-      # tvals_r <- abs(unname(m_sum$coefficients[, 3])[-1])
       maxp    <- which(pvals_r == max(pvals_r))
       if (pvals_r[maxp] > prem) {
 
@@ -271,16 +228,6 @@ blr_step_p_both.default <- function(model, pent = 0.1, prem = 0.3, details = FAL
         aic       <- c(aic, mfs$m_aic)
         bic       <- c(bic, mfs$m_bic)
         dev       <- c(dev, mfs$m_deviance)
-        # rsq       <- c(rsq, fr$rsq)
-        # adjrsq    <- c(adjrsq, fr$adjr)
-        # aic       <- c(aic, ols_aic(fr$model))
-        # sbc       <- c(sbc, ols_sbc(fr$model))
-        # sbic      <- c(sbic, ols_sbic(fr$model, model))
-        # cp        <- c(cp, ols_mallows_cp(fr$model, model))
-        # rmse      <- c(rmse, sqrt(fr$ems))
-        # betas     <- append(betas, fr$betas)
-        # lbetas    <- append(lbetas, length(fr$betas))
-        # pvalues   <- append(pvalues, fr$pvalues)
 
         if (details) {
           cat("\n")