moving to github

abcwcm · Mar 3, 2022 · 3fb6775 · 3fb6775
1 parent 67da8fd
commit 3fb6775
Show file tree

Hide file tree

Showing 62 changed files with 7,450 additions and 2 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -0,0 +1,28 @@
+Package: KlebanoffMR4050
+Type: Package
+Title: Single-Cell-Sequencing Data Generated by The Klebanoff Lab for Sample MR4050
+Version: 0.1.1
+Author: c(person("Friederike", "Dündar", email = "frd2007@med.cornell.edu", role = c("aut","cre")),
+    person("Paul","Zumbo", email="paz2005@med.cornell.edu", role = c("aut","ctb")))
+Maintainer: Friederike Dündar <frd2007@med.cornell.edu>
+Description: SingleCellExperiment object and list of differentially expressed genes
+    as determined using 5'-DGE and V(D)J sequencing of tumor-antigen-specific T cells
+    and corresponding control cells; all obtained from donor MR4050 (breast
+    cancer patient MK-02). The tumor-specific antigen is a mutant form of PIK3CA.
+Depends:
+    R (>= 3.5.0)
+Imports:
+    BiocFileCache,
+    data.table,
+    EnsDb.Hsapiens.v86,
+    ggplot2,
+    magrittr,
+    scater,
+    SingleCellExperiment
+Suggests:
+    stringr,
+    usethis
+License: MIT
+Encoding: UTF-8
+LazyData: true
+RoxygenNote: 7.1.1
diff --git a/NAMESPACE b/NAMESPACE
@@ -0,0 +1,20 @@
+# Generated by roxygen2: do not edit by hand
+
+export(check_columns)
+export(extract_markers)
+export(load_DE_results)
+export(load_MR4050filt)
+export(load_MR4050merged)
+export(load_MR4050shared)
+export(load_RDSdata_from_Box)
+export(load_data_from_Box)
+export(load_sce)
+export(make_long_dt)
+export(my_table)
+export(plot_tgram)
+export(prep_data4tgram)
+export(run_DE)
+import(data.table)
+import(kableExtra)
+import(magrittr)
+import(scater)
diff --git a/R/data_DEresults.R b/R/data_DEresults.R
@@ -0,0 +1,45 @@
+#' Load the list of DE test results (any direction) for all clonotypes present
+#' in both conditions ('shared' clonotypes)
+#'
+#' @details The p- and q-values here represent a two-tailed test for any direction
+#' of the logFC.
+#' For details on how the DE analysis was done, see the vignette "DE_genes"
+#' and the wrapper function \code{\link{run_DE}}.
+#'
+#' @format Nested list where names correspond to the abbreviated clonotype IDs.
+#' For every clonotype, there's a list that contains:
+#' \describe{
+#' \item{findMarkers_results:}{A SimpleDataFrameList, i.e. the original result
+#' of \code{scran::findMarkers()}, but only for the MUT comparison}
+#' \item{marker_IDs:}{a data.table with the genes that passed the FDR threshold;
+#' if that is NULL, this implies that there were no DEG for that particular
+#' clonotype comparing MUT vs WT}
+#' }
+#' @usage load_DE_results("MR4050")
+#' @examples \dontrun{
+#' library(KlebanoffMR4050)
+#'
+#' sce.shared <- load_MR4050shared()
+#' sce.shared$antigen <-  factor(gsub("\\..*","",sce.shared$Sample),
+#'     levels = c("WT", "MUT"), ordered = TRUE)
+#'
+#' delist.both <- lapply( unique(sce.shared$id), function(x){
+#' run_DE(
+#'    sce.shared[, sce.shared$id == x],
+#'    group_identifier = "antigen",
+#'    direction = "any",
+#'    FDR = 0.05, rank = Inf,
+#'    comp_name = paste0(x, "_"))
+#'    })
+#' names(delist.both) <- unique(sce.shared$id)
+#' }
+#'
+#'@export
+#'
+load_DE_results <- function(sample = "MR4050"){
+    fn <- system.file("extdata", "delist.both",
+        package = paste0("Klebanoff", sample), mustWork = TRUE)
+
+    fin <- read.table(fn, stringsAsFactor = FALSE)[[1]]
+    load_data_from_Box(fin, load_rds = FALSE)
+}
diff --git a/R/data_sce.R b/R/data_sce.R
@@ -0,0 +1,177 @@
+#' Load the SCE for the batch-corrected merged data set
+#'
+#' @description Load the SingleCellExperiment object that holds the batch-corrected
+#' reduced dimensionality results (MUT/WT = batch).
+#'
+#' @usage sce.filt <- load_MR4050filt()
+#'
+#' @seealso \code{\link{load_MR4050shared}}, \code{\link{load_MR4050merged}}
+#' @return an SCE object that needs to be assigned to an object in the environment
+#'
+#' @export
+#'
+load_MR4050filt <- function(){
+    out <- load_sce(which_assays = "all", sample = "MR4050")
+
+}
+
+#' Load the SCE for the batch-corrected merged data set
+#'
+#' @description Load the SingleCellExperiment object that holds the batch-corrected
+#' reduced dimensionality results (MUT/WT = batch).
+#'
+#' @usage sce.merged <- load_MR4050merged()
+#' @return an SCE object that needs to be assigned to an object in the environment
+#'
+#' @export
+#'
+load_MR4050merged <- function(){
+    out <- load_sce(which_assays = "reconstructed", sample = "MR4050Merged")
+    return(out)
+}
+
+#' Load the SCE for the batch-corrected merged data set
+#'
+#' @description Load the SingleCellExperiment object that holds the SCE
+#' representing cells with clonotypes that are present in both conditions.
+#' The UMAP coordinates were re-calculated on the reduced subset after removal
+#' of suspected doublets (see the vignette about the filtering).
+#'
+#' @return an SCE object that needs to be assigned to an object in the environment
+#'
+#' @usage sce.shared <- load_MR4050shared()
+#'
+#' @export
+load_MR4050shared <- function(){
+    out <- load_sce(which_assays = "logcounts", sample = "MR4050Shared")
+    return(out)
+}
+
+
+#' Filtered cells ' ' @format Named list of cell numbers following the filtering steps
+#described in this vignette.
+"cell_filt"
+
+#' Filtered genes ' ' @format Named list of gene numbers following the filtering steps
+#described in this vignette.
+"gene_filt"
+
+
+#' Load the filtered and processed SingleCellExperiment object
+#'
+#' @description Use this function to load the processed and filtered gene expression
+#' data plus the clonotype information stored within one SingleCellExperiment
+#' object.
+#'
+#'
+#' @param which_assays can be "all" (default) or individual assays, e.g. c("logcounts",
+#' "counts") etc. If space and memory are problematic, definitely limit the selection here!
+#' assays that are available are: "counts", "logcounts"
+#' @param ... Additional parameters passed on to \code{load_RDSdata_from_Box}, e.g.
+#' \code{check_for_updates = TRUE}
+#'
+#' @details
+#' For the entire code of the filtering and processing, see the vignette
+#' \code{01_processing.Rmd}.
+#'
+#' The resulting SCE object contains the usual content: colData with information
+#' about individidual cells, rowData with info about individual genes, reducedDims,
+#' etc.
+#'
+#' The \code{colData} includes:
+#'
+#' \describe{
+#' \item{Barcode:}{Each cell's barcode used for keeping track of its identity during sequencing.}
+#' \item{Sample:}{'WT' or 'MUT'}
+#' \item{raw_clonotype_id:}{e.g. 'clonotype94'}
+#' \item{cdr3s_aa:}{The amino acid sequence of the CDR3 portion, e.g. "TRA:CIARGGGGADGLTF;TRA:CGADRNGNEKLTF;TRB:CASSLTTDREPYEQYF"}
+#' \item{multiTRA:}{TRUE/FALSE entries based on whether \code{cdr3s_aa} contained more than one entry for TRA}
+#' \item{multiTRB:}{TRUE/FALSE entries based on whether \code{cdr3s_aa} contained more than one entry for TRB}
+#' \item{numTRA:}{Number of TRA sequences within \code{cdr3s_aa}}
+#' \item{numTRB:}{Number of TRB sequences within \code{cdr3s_aa}}
+#' \item{cluster:}{clustering results of all cells}
+#' }
+#'
+#' The object also containes the coordinates from dimensionality reductions (see
+#' examples for more details).
+#'
+#'
+#' @usage sce.filt <- load_sce(which_assays = "logcounts", sample = "MR4050")
+#'
+#' @return A SingleCellExperiment object with cells from 'mutant' samples
+#'  (stimulation with tumor antigen) and from the 'wt' sample (stimulation
+#'  with an irrelevant antigen)).
+#'
+#'
+#' @examples \dontrun{
+#'
+#' > library(SingleCellExperiment)
+#' > sce.MR4050 <- load_sce(which_assays = "all", sample = "MR4050")
+#'
+#' > reducedDimNames(sce.MR4050)
+#' "corrected" "TSNE"      "UMAP"
+#'
+#' > assayNames(sce.MR4050)
+#' [1] "counts"                "logcounts"
+#' }
+#'
+#' @return SCE object
+#'
+#'
+#' @export
+#'
+load_sce <- function(which_assays = "all", sample = "Sample", ...){
+
+    ## the Box links are noted in the text file
+    fl <- system.file("extdata", paste0("sce_storage_", sample, ".txt"),
+        package = "KlebanoffMR4050")
+    if(fl == ""){stop(paste("sce_storage_", sample, ".txt does not exist in package 'KlebanoffMR4050'."))}
+
+    inf <- read.table(fl,stringsAsFactors = FALSE)
+
+    if(unique(inf$V3) != sample){stop("The sce_storage.txt file must contain a third column holding the sample name. Which should be the same as the one specified via sample = .")}
+
+    ## DOWNLOAD AND CACHE THE FILES FROM THE BOX ==============================
+    ## note: using the default cache of BioC here, we may want to change that
+    ## to something more specific via the `cache_path` option of `load_RDSdata_fromBox()`
+
+    ## load colData
+    cold <- load_RDSdata_from_Box(
+        shared_link = inf[inf$V1 == "colData",]$V2, data_name = paste0("KlebColData",sample), ...)
+
+    ## load rowData
+    rowd <- load_RDSdata_from_Box(
+        shared_link = inf[inf$V1 == "rowData",]$V2, data_name = paste0("KlebRowData", sample) , ...)
+
+    ## get reducedDims
+    rdms <- load_RDSdata_from_Box(
+        shared_link = inf[inf$V1 == "reducedDims",]$V2, data_name = paste0("KlebRedDims", sample), ... )
+
+    ## metadata
+    metd <- load_RDSdata_from_Box(
+        shared_link = inf[inf$V1 == "metadata",]$V2, data_name = paste0("KlebMetadata", sample), ... )
+
+    ## get assayData
+    if(which_assays == "all"){
+        ## extract corresponding assay entry from the text file
+        asss <- grep("^assay:", unique(inf$V1), value = TRUE)
+    }else{
+        asss <- unlist(lapply(which_assays, function(x) grep(paste0(":",x,"$"), unique(inf$V1), ignore.case = TRUE, value=TRUE)))
+        if(length(which_assays) == 0){
+            warning("None of the assays you specified are part of the file stored in inst/extdata, i.e. we can't find the links.")
+        }
+    }
+
+    assl <- list()
+    for(i in asss){
+        j <- gsub("^assay:","", i)
+        assl[[j]] <-  load_RDSdata_from_Box(
+            shared_link = inf[inf$V1 == i,]$V2, data_name = paste0("Kleb",j, sample), ...)
+    }
+
+    ## construct the SCE object =============================================
+    return(SingleCellExperiment::SingleCellExperiment(assays = assl,
+        colData = cold, rowData = rowd,
+        metadata = metd, reducedDims = rdms))
+}
+
diff --git a/R/data_sharedClonotypes.R b/R/data_sharedClonotypes.R
@@ -0,0 +1,46 @@
+#' Shared clonotypes
+#'
+#' @description \code{data.table} of the clonotypes that are found in both
+#' conditions MUT and WT in the MR4050 data set.
+#'
+#' @usage data(shared_clonotypes)
+#'
+#' @examples \dontrun{
+#'
+#' ## count TRA/TRB
+#' clono_freq <- colData(sce.filt)[, c("Sample" ,"cdr3s_aa")] %>%
+#'    as.data.frame %>% data.table(., keep.rownames = TRUE) %>%
+#'     .[!is.na(cdr3s_aa), .N, c("cdr3s_aa","Sample")]
+#' setorder(clono_freq, N)
+#'
+#' ## formatting the TRA/TRB notations
+#' ## will only work if there's just one TRA
+#' ct <- dcast(clono_freq, cdr3s_aa ~ Sample, value.var = "N") %>%
+#'  .[!is.na(MUT.MR4050) & !is.na(WT.MR4050)]
+#'
+#' ct[, TRA := gsub(";*TRB:[A-Z]+", "", cdr3s_aa)]
+#' ct[, TRA := ifelse(TRA == "", NA, TRA)]
+#' ct[, TRB := gsub(".*(TRB:[A-Z]+)", "\\1", cdr3s_aa)]
+#' ct[, TRB := ifelse(grepl("^TRA", TRB), NA, TRB)] # if only TRB was present,
+#'  I need to fill in the NA
+#'  setorder(ct, -MUT.MR4050, -WT.MR4050 )
+#'  shared_clonotypes <- copy(ct)
+#' }
+#'
+#' @seealso \code{clonotype_ids}
+"shared_clonotypes"
+
+
+
+#' Table of customized clonotype IDs for sample MR4050
+#'
+#' @description \code{data.table} with our customized clonotype IDs for ease of
+#' visualization and comparison. I.e., the CDR3s amino acid sequences are re-
+#' placed with arbitrary IDs. Note thate these clonotypes are those that are
+#' found in both conditions of patient MR4050, i.e. MUT and WT.
+#'
+#' @details See section "Adding the clonotype IDs" in the vignette "Filtering and Processing"
+#' about how the consolidation and clean up of the TRA/TRB sequences was done.
+#'
+#' @seealso \code{shared_clonotypes}
+"clonotype_ids"
diff --git a/R/reading_clonotypes.R b/R/reading_clonotypes.R
@@ -0,0 +1,64 @@
+#' Reading in clonotype info per barcode
+#'
+#' @details This function expects the files "filtered_contig_annotations.csv" and
+#' "clonotypes.csv" to be found in the "outs" folder after following \code{cellRanger_result_dir}.
+#' These files are used to match the cell barcodes with unique clonotype IDs.
+#'
+#' @param cellRanger_result_dir e.g. "/scratchLocal/frd2007/2018-11_Smita_Tcells/data/wildtype/".
+#'  This should be the path to the un-tar'ed directory from cellRanger.
+#'
+#'  @export
+#'
+reading_in_clonotypes <- function(cellRanger_result_dir){
+
+  ## reading barcodes (including empty ones)
+  bcs <- fread(paste0(cellRanger_result_dir,"outs/filtered_contig_annotations.csv"), sep = ",")
+
+  #> head(barcode_csv)
+  #barcode is_cell                   contig_id high_confidence length
+  #1 AAACCTGAGTGACATA-1    True AAACCTGAGTGACATA-1_contig_1            True    568
+  #2 AAACCTGAGTGACATA-1    True AAACCTGAGTGACATA-1_contig_3            True    567
+  #3 AAACCTGAGTGACATA-1    True AAACCTGAGTGACATA-1_contig_4            True    712
+  #4 AAACCTGCACCGCTAG-1    True AAACCTGCACCGCTAG-1_contig_1            True    564
+  #5 AAACCTGCACCGCTAG-1    True AAACCTGCACCGCTAG-1_contig_2            True    584
+  #6 AAACCTGCACTTCTGC-1    True AAACCTGCACTTCTGC-1_contig_1            True    569
+  #chain   v_gene d_gene  j_gene c_gene full_length productive
+  #1   TRA   TRAV19   None  TRAJ50   TRAC        True       True
+  #2   TRA TRAV13-1   None  TRAJ45   TRAC        True       None
+  #3   TRB   TRBV19  TRBD2 TRBJ2-7  TRBC2        True       True
+  #4   TRB    TRBV9  TRBD2 TRBJ2-6  TRBC2        True       True
+  #5   TRA TRAV12-2   None  TRAJ44   TRAC        True       True
+  #6   TRA   TRAV25   None  TRAJ54   TRAC        True       True
+  #cdr3                                                   cdr3_nt
+  #1     CALSSMKTSYDKVIF             TGTGCTCTGAGTAGCATGAAAACCTCCTACGACAAGGTGATATTT
+  #2                None                                                      None
+  #3      CATSLALAAYEQYF                TGTGCCACGAGTCTGGCGCTAGCGGCGTACGAGCAGTACTTC
+  #4 CASSGLAGGPVSGANVLTF TGTGCCAGCAGCGGACTAGCGGGAGGGCCCGTTTCTGGGGCCAACGTCCTGACTTTC
+  #5       CAGNTGTASKLTF                   TGTGCCGGAAATACCGGCACTGCCAGTAAACTCACCTTT
+  #6        CAGLQGAQKLVF                      TGTGCAGGCCTTCAGGGAGCCCAGAAGCTGGTATTT
+  #reads umis raw_clonotype_id         raw_consensus_id
+  #1  2511    7     clonotype146 clonotype146_consensus_1
+  #2   679    2     clonotype146                     None
+  #3   850    4     clonotype146 clonotype146_consensus_2
+  #4 10237   36       clonotype1   clonotype1_consensus_1
+  #5  3470   10       clonotype1   clonotype1_consensus_2
+  #6  1780    7     clonotype147 clonotype147_consensus_1
+
+  ## extract those with a clonotype
+  bcs.sub <- bcs[ raw_clonotype_id != "None", c("raw_clonotype_id","barcode"), with=FALSE] %>% unique
+
+  ## reading clonotypes
+  clotp <- fread(paste0(cellRanger_result_dir, "outs/clonotypes.csv"), sep = ",")
+  setnames(clotp, "clonotype_id", "raw_clonotype_id")
+
+  ## join the informationbiocon
+  jnd <- merge(clotp, bcs.sub, by = "raw_clonotype_id", all = TRUE)
+
+  ## sanity check
+  tst <- jnd[raw_clonotype_id == "clonotype1", .N] ==  unique(jnd[raw_clonotype_id == "clonotype1"]$frequency)
+  if(isFALSE(tst)){
+    warning("The number of rows for clonotype1 does not match the frequency taken from the clonotypes.csv file.")
+  }
+
+  return(jnd)
+}