Merge pull request #62 from FredHutch/handle_lm_issue

add filtering
FredHutch · Sep 30, 2024 · 68c0f98 · 68c0f98
2 parents 588b580 + fa738ba
commit 68c0f98
Show file tree

Hide file tree

Showing 9 changed files with 51 additions and 27 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -7,8 +7,15 @@ export(get_example_data)
 export(gimap_annotate)
 export(gimap_filter)
 export(gimap_normalize)
+export(qc_cdf)
+export(qc_constructs_countzero_bar)
+export(qc_cor_heatmap)
+export(qc_plasmid_histogram)
+export(qc_sample_hist)
+export(qc_variance_hist)
 export(run_qc)
 export(setup_data)
+export(supported_cell_lines)
 import(dplyr)
 import(ggplot2)
 import(kableExtra)

diff --git a/R/03-annotate.R b/R/03-annotate.R
@@ -136,12 +136,7 @@ gimap_annotate <- function(.data = NULL,
       )
     )
 
-  if (gimap_dataset$filtered_data$filter_step_run) {
-    keep_for_annotdf <- annotation_df$pgRNA_id %in% unlist(gimap_dataset$filtered_data$metadata_pg_ids)
-    annotation_df <- annotation_df[keep_for_annotdf, ]
-  }
-
-  ################################ STORE IT ####################################
+################################ STORE IT ####################################
 
   if (gimap_dataset$filtered_data$filter_step_run) {
     keep_for_annotdf <- annotation_df$pgRNA_id %in% unlist(gimap_dataset$filtered_data$metadata_pg_ids)

diff --git a/R/04-normalize.R b/R/04-normalize.R
@@ -9,6 +9,9 @@
 #' @param replicates Specifies the column name of the metadata set up in `$metadata$sample_metadata`
 #' that has a factor that represents column that specifies replicates. These replicates will be kept separate
 #' for the late but the early and plasmid others will be averaged and used for normalization.
+#' @param num_ids_wo_annot default is 20; the number of pgRNA IDs to display to console if they don't have corresponding annotation data;
+#' ff there are more IDs without annotation data than this number, the output will be sent to a file rather than the console.
+#' @param rm_ids_wo_annot default is TRUE; whether or not to filter out pgRNA IDs from the input dataset that don't have corresponding annotation data available
 #' @export
 #' @examples \dontrun{
 #'
@@ -31,7 +34,9 @@
 gimap_normalize <- function(.data = NULL,
                             gimap_dataset,
                             timepoints = NULL,
-                            replicates = NULL) {
+                            replicates = NULL,
+                            num_ids_wo_annot = 20,
+                            rm_ids_wo_annot = TRUE) {
   # Code adapted from
   # https://github.com/FredHutch/GI_mapping/blob/main/workflow/scripts/03-filter_and_calculate_LFC.Rmd
 
@@ -83,6 +88,7 @@ gimap_normalize <- function(.data = NULL,
     dataset <- gimap_dataset$transformed_data$log2_cpm
     pg_ids <- gimap_dataset$metadata$pg_ids
   }
+
   # Doing some reshaping to get one handy data frame
   lfc_df <- dataset %>%
     as.data.frame() %>%
@@ -94,18 +100,34 @@ gimap_normalize <- function(.data = NULL,
     tidyr::pivot_wider(values_from = "log2_cpm",
                        names_from = c(timepoints, replicates))
 
-
+
+  missing_ids <- data.frame(
+    missing_ids = setdiff(lfc_df$pg_ids, gimap_dataset$annotation$pgRNA_id)
+  )
+
+  if ((nrow(missing_ids) > 0) & (nrow(missing_ids) < num_ids_wo_annot)){
+    message("The following ", nrow(missing_ids), " IDs were not found in the annotation data: \n", paste0(missing_ids, collapse = ", "))
+  } else {
+    missing_ids_file <- file.path("missing_ids_file.csv")
+    readr::write_csv(missing_ids, missing_ids_file)
+  }
+
+  if ((nrow(missing_ids) > 0) & (rm_ids_wo_annot == TRUE)){
+    lfc_df <- lfc_df %>%
+      filter(!pg_ids %in% missing_ids)
+    message("The input data for the IDs which were not found in the annotation data has been filtered out and will not be included in the analysis output.")
+  } else{
+    message("The input data for the IDs which were not found in the annotation data will be kept throughout the analysis, but any data from the annotation won't be available for them.")
+  }
 
   # Calculate the means for each construct across the groups
   plasmid_df <- apply(dplyr::select(lfc_df, starts_with("plasmid")), 1, mean)
 
-
   late_vs_plasmid_df <-  lfc_df %>%
     dplyr::mutate_at(dplyr::vars(dplyr::starts_with("late")), ~.x - plasmid_df) %>%
     dplyr::select(pg_ids, dplyr::starts_with("late"))  %>%
     dplyr::left_join(gimap_dataset$annotation, by = c("pg_ids" = "pgRNA_id"))
 
-
   ########################### Perform adjustments #############################
 
   ### Calculate medians
@@ -128,7 +150,7 @@ gimap_normalize <- function(.data = NULL,
       # Then, divide by the median of negative controls (double non-targeting) minus
       # median of positive controls (targeting 1 essential gene).
       # This will effectively set the median of the positive controls (essential genes) to -1.
-      lfc_adj = lfc_adj1 / (median(lfc_adj1[norm_ctrl_flag == "negative_control"]) - median(lfc_adj1[norm_ctrl_flag == "positive_control"]))
+      lfc_adj = lfc_adj1 / (median(lfc_adj1[norm_ctrl_flag == "negative_control"], na.rm = TRUE) - median(lfc_adj1[norm_ctrl_flag == "positive_control"]))
     ) %>%
     ungroup()
 

diff --git a/R/06-calculate_gi.R b/R/06-calculate_gi.R
@@ -30,7 +30,6 @@ calc_gi <- function(.data = NULL,
 
   if (!("gimap_dataset" %in% class(gimap_dataset))) stop("This function only works with gimap_dataset objects which can be made with the setup_data() function.")
 
-
   ## calculate expected double-targeting GI score by summing the two mean single-targeting
   ## CRISPR scores for that paralog pair
   gi_calc_df <- gimap_dataset$crispr_score %>%
@@ -39,6 +38,7 @@ calc_gi <- function(.data = NULL,
       target_type == "gene_ctrl" ~ mean_single_target_crispr_1 + mean_double_control_crispr_2,
       target_type == "ctrl_gene" ~ mean_double_control_crispr_1 + mean_single_target_crispr_2
     ))
+
   # Calculating mean crisprs
   overall_results <- gi_calc_df %>%
     dplyr::group_by(rep, pgRNA_target_double) %>%

diff --git a/man/calc_crispr.Rd b/man/calc_crispr.Rd
diff --git a/man/calc_gi.Rd b/man/calc_gi.Rd
diff --git a/man/gimap_annotate.Rd b/man/gimap_annotate.Rd
diff --git a/man/gimap_normalize.Rd b/man/gimap_normalize.Rd
diff --git a/man/setup_data.Rd b/man/setup_data.Rd