Merge pull request #5 from gorkang/development

Development
gorkang · Sep 2, 2018 · 1aa2397 · 1aa2397
2 parents f982897 + 9bcabdb
commit 1aa2397
Show file tree

Hide file tree

Showing 11 changed files with 188 additions and 148 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,5 +1,5 @@
 Package: BayesianReasoning
-Title: Plot and help understand Positive Predictive Values, and their relationship with Sensitivity, Specificity and Prevalence.  
+Title: Plot and help understand Positive and Negative Predictive Values, and their relationship with Sensitivity, Specificity and Prevalence.  
 Version: 0.1
 Authors@R: person("Gorka", "Navarrete", email = "gorkang@gmail.com",
                   role = c("aut", "cre"))

diff --git a/NEWS.md b/NEWS.md
@@ -0,0 +1,9 @@
+# README
+
+### v0.1 It barely works  
+
+Initial release, includes 3 functions:  
+
+* **PPV_heatmap.R**: Plot PPV heatmaps  
+* **PPV_diagnostic_vs_screening.R**: Plot PPV for a diagnostic and a screening group  
+* **min_possible_prevalence.R**: Show minimum possible prevalence given the test characteristics  
diff --git a/R/.createPPVmatrix.R b/R/.createPPVmatrix.R
@@ -0,0 +1,73 @@
+.createPPVmatrix <- function(Max_Prevalence, Sensitivity, Max_FP) {
+
+  # Libraries ---------------------------------------------------------------
+  # if (!require('dplyr')) install.packages('dplyr'); library('dplyr')
+  if (!require('reshape2')) install.packages('reshape2'); library('reshape2')
+
+
+    # DEBUG -------------------------------------------------------------------
+
+    # Max_Prevalence = 4
+    # Sensitivity = 90
+    # Max_FP = 10
+    # PPV_NPV = "PPV" # NPV/PPV
+
+    #TEST Parameters **************
+
+    # False Positives (x axis) 
+    Steps_FP <<- 100
+    Step_size_FP <<- Max_FP/Steps_FP
+    Min_FP <<- 0 
+    FP = seq(Min_FP, Max_FP, Step_size_FP) #With (Max_FP-Step_size_FP) we get 100 FPs. If we use Max_FP instead we have 101 (because we start at 0!)
+
+    #CONDITION Parameters ***********
+
+    #Prevalence_y - x out of y
+    Prevalence_x <<- 1
+    Min_Prevalence <<- 1
+    Steps_Prevalence <<- 100
+    Step_size_Prevalence <<- Max_Prevalence/Steps_Prevalence
+    Prevalence = seq(Min_Prevalence, (1 + Max_Prevalence), Step_size_Prevalence) #With (1 + Max_Prevalence) we get 101. If we use Max_Prevalence we get 100
+
+
+    # PPV Calculation -------------------------------------------------------------
+
+        # We calculate the 100x100 PPV matrix using %o% (outer)
+        PPV <<- (Sensitivity * Prevalence_x) / ((Sensitivity * Prevalence_x) + ((Prevalence - 1) %o% FP) )
+
+        #Label columns and rows of matrix
+        colnames(PPV) = FP
+        rownames(PPV) = Prevalence
+
+        # Long format para ggplot Heatmap
+        PPV_melted = melt(PPV)
+
+        # Give names to variables
+        names(PPV_melted) = c("melted_Prevalence", "melted_FP", "melted_PPV") 
+
+        PPV_melted <<- PPV_melted
+
+
+    # NPV Calculation -------------------------------------------------------------
+
+        # As NPV is more dependent on Sensitivity, the x axis should show Sensitivity, and FP should be fixed
+        NPV <<- ((Prevalence - 1) %o% (100 - FP)) / (((Prevalence - 1) %o% (100 - FP)) + (Prevalence_x * (100 - Sensitivity)))
+
+        #Label columns and rows of matrix
+        colnames(NPV) = FP
+        rownames(NPV) = Prevalence
+
+        # Long format para ggplot Heatmap
+        NPV_melted = melt(NPV)
+
+        # Give names to variables
+        names(NPV_melted) = c("melted_Prevalence", "melted_FP", "melted_NPV") 
+
+        NPV_melted <<- NPV_melted
+
+        # HACK to easily switch betweeen NPV and PPV
+        if (PPV_NPV == "NPV") {
+          PPV_melted = NPV_melted
+        }
+
+}
diff --git a/R/PPV_diagnostic_vs_screening.R b/R/PPV_diagnostic_vs_screening.R
@@ -1,6 +1,6 @@
 #' Plot PPV for a diagnostic and a screening group
 #'
-#'Plot PPV associated to different levels of FP and a specific Sensitivity, for two different Prevalence groups. 
+#' Plot PPV associated to different levels of FP and a specific Sensitivity, for two different Prevalence groups. 
 #'
 #' @param Max_FP False positive rate (1-Specificity) [0-100].
 #' @param Sensitivity Sensitivity of the test [0-100].
@@ -23,13 +23,17 @@
 #' PPV_diagnostic_vs_screening(Max_FP = 10, Sensitivity = 100, 
 #'                             prevalence_screening_group = 1667, 
 #'                             prevalence_diagnostic_group = 44, 
-# '                            labels_prevalence = c("20 y.o.", "50 y.o."))
+#'                            labels_prevalence = c("20 y.o.", "50 y.o."))
 PPV_diagnostic_vs_screening <- function(Max_FP = 10, Sensitivity = 100, prevalence_screening_group = 100,  prevalence_diagnostic_group = 2, 
                                         labels_prevalence = c("Screening", "Diagnostic"),
-                                        save_plot = TRUE) {
-  # Prevalences (1 out of x)  
+                                        save_plot = FALSE) {
+
+  # Libraries ---------------------------------------------------------------
+  if (!require('dplyr')) install.packages('dplyr'); library('dplyr')
+  if (!require('tidyr')) install.packages('tidyr'); library('tidyr')
+  if (!require('ggplot2')) install.packages('ggplot2'); library('ggplot2')
+  if (!require('reshape2')) install.packages('reshape2'); library('reshape2')
 
-  library(tidyverse)
 
   # PARAMETERS --------------------------------------------------------------
   # Max_FP = 10
@@ -69,42 +73,38 @@ PPV_diagnostic_vs_screening <- function(Max_FP = 10, Sensitivity = 100, prevalen
 
   # Plot --------------------------------------------------------------------
 
-  # Labels_plot = c(paste0("Real: ", (1/prevalence_screening_group) * 100, "%"), paste0("Study: ", (1/(prevalence_diagnostic_group) * 100), "%"))
-  Labels_plot = c(paste0(labels_prevalence[1], " prevalence: 1 out of ", prevalence_screening_group), paste0(labels_prevalence[2], " prevalence: 1 out of ", prevalence_diagnostic_group))
-
-  p = ggplot(data = FINAL, aes(x = FP, y = PPV, colour = Prevalence)) + 
-    geom_line(size = 1.5) + 
-    scale_colour_hue(l = 50, labels = Labels_plot) +
-    theme_minimal() +
-    theme(text = element_text(size = 20)) +
-    scale_x_continuous(labels = function(x) paste0(x, "%")) +
-    scale_y_continuous(name = "Positive Predictive Value", limits = c(0, 100), labels = function(x) paste0(x, "%")) +
-    theme(legend.position = "bottom") +  
-    labs(title = "",
-         subtitle = paste0("Sensitivity = ", Sensitivity, "%" ), 
-         x = "False Positive rate", 
-         color = "") 
-  # labs(caption = "(based on data from ...)") + 
-  # theme(plot.caption = element_text(size = 10))
-  # guides(color=guide_legend("Prevalence of the "))
-
-  print(p)
-
-  # Parameters = paste0("FP", Max_FP, "_Sens", Sensitivity, "_PReal", prevalence_screening_group, "_PStudy", prevalence_diagnostic_group)
-  # ggsave(paste0("outputs/diagnostic_vs_screening/", Parameters, ".svg"), Plot_PPV, dpi = 300, width = 14, height = 10)
-  # ggsave(paste0("outputs/diagnostic_vs_screening/", Parameters, ".png"), Plot_PPV, dpi = 300, width = 14, height = 10)
-
-  if (save_plot == TRUE) {
-
-    print(p)
-    plot_name = paste0("outputs/diagnostic_vs_screening/", "FP_", Max_FP, "_sens_", Sensitivity, "_screening_", prevalence_screening_group, "_diagnostic_", prevalence_diagnostic_group, ".png")
-    ggsave(plot_name, p, dpi = 300, width = 14, height = 10)
-    cat("\n Plot created in: ", plot_name, "\n")
+    Labels_plot = c(paste0(labels_prevalence[1], " prevalence: 1 out of ", prevalence_screening_group), paste0(labels_prevalence[2], " prevalence: 1 out of ", prevalence_diagnostic_group))
 
-  } else {
+    p = ggplot(data = FINAL, aes(x = FP, y = PPV, colour = Prevalence)) + 
+      geom_line(size = 1.5) + 
+      scale_colour_hue(l = 50, labels = Labels_plot) +
+      theme_minimal() +
+      theme(text = element_text(size = 20)) +
+      scale_x_continuous(labels = function(x) paste0(x, "%")) +
+      scale_y_continuous(name = "Positive Predictive Value", limits = c(0, 100), labels = function(x) paste0(x, "%")) +
+      theme(legend.position = "bottom") +  
+      labs(title = "",
+           subtitle = paste0("Sensitivity = ", Sensitivity, "%" ), 
+           x = "False Positive rate", 
+           color = "") 
+    # labs(caption = "(based on data from ...)") + 
+    # theme(plot.caption = element_text(size = 10))
+    # guides(color=guide_legend("Prevalence of the "))
 
     print(p)
 
-  }
 
-}
+    if (save_plot == TRUE) {
+
+      print(p)
+      plot_name = paste0("outputs/diagnostic_vs_screening/", "FP_", Max_FP, "_sens_", Sensitivity, "_screening_", prevalence_screening_group, "_diagnostic_", prevalence_diagnostic_group, ".png")
+      ggsave(plot_name, p, dpi = 300, width = 14, height = 10)
+      cat("\n Plot created in: ", plot_name, "\n")
+
+    } else {
+
+      print(p)
+
+    }
+
+}
diff --git a/R/PPV_heatmap.R b/R/PPV_heatmap.R
@@ -39,24 +39,26 @@
 #'             overlay_labels = c("40 y.o.", "35 y.o.", "30 y.o.", "25 y.o.", "20 y.o."),
 #'             overlay_position_FP = c(4.8, 4.8, 4.8, 4.8, 4.8),
 #'             overlay_position_Prevalence = c(68, 249, 626, 946, 1068))
-PPV_heatmap <- function(Max_Prevalence, Sensitivity, Max_FP, 
+PPV_heatmap <- function(Max_Prevalence, Sensitivity, Max_FP,
                             overlay = FALSE, overlay_labels, overlay_position_FP, overlay_position_Prevalence, 
                             label_title = "", label_subtitle = "",
-                            Language = "en", save_plot = TRUE) {
+                            Language = "en", save_plot = FALSE,
+                            PPV_NPV = "PPV") {
 
   # Libraries ---------------------------------------------------------------
-
-  if (!require('pacman')) install.packages('pacman'); library('pacman')
-  p_load(tidyverse, reshape2)
-
+  # if (!require('dplyr')) install.packages('dplyr'); library('dplyr')
+  if (!require('ggplot2')) install.packages('ggplot2'); library('ggplot2')
+  if (!require('reshape2')) install.packages('reshape2'); library('reshape2')
+  source("R/.createPPVmatrix.R", local = TRUE)
 
+
   # DEBUG -------------------------------------------------
 
     # overlay = TRUE # TRUE / FALSE
     # Language = "en" # "sp" / "en"
-    # Sensitivity = 99 # [0-100]
-    # Max_FP = 10 # FP (1-Specificity): [0-100]
-    # Max_Prevalence = 1667.5 # Prevalence (1 out of X): [1-Inf?]
+    # Max_Prevalence = 4 # Prevalence (1 out of X): [1-Inf?]
+    # Sensitivity = 90 # [0-100]
+    # Max_FP = 5 # FP (1-Specificity): [0-100]
     # label_subtitle = "PPV of Mammogram for Breast Cancer by Age"
     # overlay = TRUE
     # overlay_labels = c("80", "70", "60", "50", "40", "30", "20  y.o.")
@@ -112,42 +114,11 @@ PPV_heatmap <- function(Max_Prevalence, Sensitivity, Max_FP,
         }
       }
 
-
-      #TEST Parameters **************
 
-          # False Positives (x axis) 
-          Steps_FP = 100
-          Step_size_FP = Max_FP/Steps_FP
-          Min_FP = 0 
-          FP = seq(Min_FP, Max_FP, Step_size_FP) #With (Max_FP-Step_size_FP) we get 100 FPs. If we use Max_FP instead we have 101 (because we start at 0!)
-
-      #CONDITION Parameters ***********
-
-          #Prevalence_y - x out of y
-          Prevalence_x = 1
-          Min_Prevalence = 1
-          Steps_Prevalence = 100
-          Step_size_Prevalence = Max_Prevalence/Steps_Prevalence
-          Prevalence = seq(Min_Prevalence, (1 + Max_Prevalence), Step_size_Prevalence) #With (1 + Max_Prevalence) we get 101. If we use Max_Prevalence we get 100
-
 
-  # Calculation -------------------------------------------------------------
+  # Create PPV matrix -------------------------------------------------------
+    .createPPVmatrix(Max_Prevalence, Sensitivity, Max_FP)
 
-      # We calculate the 100x100 PPV matrix using %o% (outer)
-      PPV = (Sensitivity * Prevalence_x) / ((Sensitivity * Prevalence_x) + ((Prevalence - 1) %o% FP) )
-      # NPV = (Sensitivity * Prevalence_x) / ((Sensitivity * Prevalence_x) + ((Prevalence - 1) %o% FP) )
-
-      #Label columns and rows of matrix
-      colnames(PPV) = FP
-      rownames(PPV) = Prevalence
-
-      # Long format para ggplot Heatmap
-      PPV_melted = melt(PPV)
-
-      # Give names to variables
-      names(PPV_melted) = c("melted_Prevalence", "melted_FP", "melted_PPV") 
-
-
 
   # PLOT --------------------------------------------------------------------
 
@@ -160,7 +131,7 @@ PPV_heatmap <- function(Max_Prevalence, Sensitivity, Max_FP,
       breaks_x = seq(0, Max_FP, Step_size_FP * 10)
       labels_x = paste0(seq(Min_FP, Max_FP, Step_size_FP * 10), "%")
       breaks_y = seq(0, Max_Prevalence, Step_size_Prevalence * 10)
-      labels_y = paste(prevalence_label, round(seq(Min_Prevalence - 1, Max_Prevalence, Step_size_Prevalence * 10), 0))[-1]
+      labels_y = paste(prevalence_label, round(seq(Min_Prevalence - 1, Max_Prevalence, Step_size_Prevalence * 10),0))[-1]
       labels_y = c(paste(prevalence_label, "1"), labels_y) #We want the legend to start on 1 out of 1
 
 

diff --git a/R/min_possible_prevalence.R b/R/min_possible_prevalence.R
@@ -1,28 +1,29 @@
-#' Show minimum possible prevalence
+#' Show minimum possible prevalence given the test characteristics
 #'
 #'Given a FP and a desired PPV, what is the Minimum Prevalence of a Condition 
 #'
-#' @param Sensitivity Sensitivity of the test.
-#' @param FP_test False positive rate (1-Specificity) [0-100].
-#' @param min_PPV_desired Which PPV is what you consider the minimum to trust a positive result in the test.
+#' @param Sensitivity Sensitivity of the test: [0-100]
+#' @param FP_test False positive rate (1-Specificity): [0-100]
+#' @param min_PPV_desired Which PPV is what you consider the minimum to trust a positive result in the test: [0-100]
 #'
 #' @return A description showing the minimum necessary prevalence.
 #' @export
 #'
 #' @examples
 #' 
 #' # Example 1
-#' min_possible_prevalence(Sensitivity = 99.9, FP_test = 0.2, min_PPV_desired = 0.9)
-#' > To get a PPV of 0.9 with a test with 99.9 % Sensitivity and 0.2 % False Positive Rate, you need a prevalence of at least 1 out of 56
+#' > min_possible_prevalence(Sensitivity = 99.9, FP_test = .1, min_PPV_desired = 70)
+#' To reach a PPV of 70 when using a test with 99.9 % Sensitivity and 0.1 % False Positive Rate, you need a prevalence of at least 1 out of 429
 #' 
 #' # Example 2
-#' min_possible_prevalence(100, 0.1, 0.98)
-#' > To get a PPV of 0.98 with a test with 100 % Sensitivity and 0.1 % False Positive Rate, you need a prevalence of at least 1 out of 21
+#' > min_possible_prevalence(100, 0.1, 98)
+#' To reach a PPV of 98 when using a test with 100 % Sensitivity and 0.1 % False Positive Rate, you need a prevalence of at least 1 out of 21
 min_possible_prevalence <- function(Sensitivity, FP_test, min_PPV_desired) {
 
-  if (!require('pacman')) install.packages('pacman'); library('pacman')
-  p_load(tidyverse, reshape2)
-
+  # Libraries ---------------------------------------------------------------
+  # if (!require('dplyr')) install.packages('dplyr'); library('dplyr')
+  if (!require('reshape2')) install.packages('reshape2'); library('reshape2')
+
   #TEST Parameters **************
     #FP
     Max_FP = 100 
@@ -35,12 +36,11 @@ min_possible_prevalence <- function(Sensitivity, FP_test, min_PPV_desired) {
 
     #Prevalence_y - x out of y
     Prevalence_x = 1
-
     Min_Prevalence = 1
     Max_Prevalence = 10000 # CHANGE ME
     Steps_Prevalence = 10000
     Step_size_Prevalence = Max_Prevalence/Steps_Prevalence
-    Prevalence = seq(Min_Prevalence, (1 + Max_Prevalence), Step_size_Prevalence) #With (1 + Max_Prevalence) we get 101. If we use Max_Prevalence we get 100
+    Prevalence = seq(Min_Prevalence, (1 + Max_Prevalence), Step_size_Prevalence)
 
     # ****************************************************************************************
 
@@ -55,7 +55,10 @@ min_possible_prevalence <- function(Sensitivity, FP_test, min_PPV_desired) {
     #Por algun motivo, melt a veces cambia nombres de variables. De este modo los fijamos
     names(PPV_melted) = c("melted_Prevalence", "melted_FP", "melted_PPV") 
 
+
+    output_prevalence = max(PPV_melted$melted_Prevalence[PPV_melted$melted_PPV > (min_PPV_desired/100) & PPV_melted$melted_FP == FP_test])
+
+
     # Function output!
-    cat("To get a PPV of", min_PPV_desired, "with a test with", Sensitivity, "% Sensitivity and", FP_test, "% False Positive Rate, you need a prevalence of at least 1 out of", max(PPV_melted$melted_Prevalence[PPV_melted$melted_PPV > min_PPV_desired & PPV_melted$melted_FP == FP_test]))
-
+    cat("To reach a PPV of", min_PPV_desired, "when using a test with", Sensitivity, "% Sensitivity and", FP_test, "% False Positive Rate, you need a prevalence of at least 1 out of", output_prevalence)
 }
diff --git a/README.md b/README.md
@@ -82,3 +82,27 @@ PPV_diagnostic_vs_screening(Max_FP = 10,
 ![](outputs/diagnostic_vs_screening/FP_10_sens_100_screening_1000_diagnostic_3.png)
 
 
+---   
+
+
+## min_possible_prevalence
+
+Imagine you would like to use a test in a population and have a 98% PPV, that is, *if* a positive result comes out in the test, you would like a 98% certainty that it is a true positive. How high should be the prevalence of the disease in that group?  
+
+``` r 
+min_possible_prevalence(100, 0.1, 98)
+```
+
+`To reach a PPV of 98 when using a test with 100 % Sensitivity and 0.1 % False Positive Rate, you need a prevalence of at least 1 out of 21`
+
+--- 
+
+Another example, with a very good test, and lower expectations:  
+
+``` r 
+min_possible_prevalence(Sensitivity = 99.9, FP_test = .1, min_PPV_desired = 70)
+```
+
+$ To reach a PPV of 70 when using a test with 99.9 % Sensitivity and 0.1 % False Positive Rate, you need a prevalence of at least 1 out of 429  
+
+---