Skip to content

Commit

Permalink
moving to github
Browse files Browse the repository at this point in the history
  • Loading branch information
friedue committed Mar 3, 2022
1 parent 67da8fd commit 3fb6775
Show file tree
Hide file tree
Showing 62 changed files with 7,450 additions and 2 deletions.
28 changes: 28 additions & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
Package: KlebanoffMR4050
Type: Package
Title: Single-Cell-Sequencing Data Generated by The Klebanoff Lab for Sample MR4050
Version: 0.1.1
Author: c(person("Friederike", "Dündar", email = "frd2007@med.cornell.edu", role = c("aut","cre")),
person("Paul","Zumbo", email="paz2005@med.cornell.edu", role = c("aut","ctb")))
Maintainer: Friederike Dündar <frd2007@med.cornell.edu>
Description: SingleCellExperiment object and list of differentially expressed genes
as determined using 5'-DGE and V(D)J sequencing of tumor-antigen-specific T cells
and corresponding control cells; all obtained from donor MR4050 (breast
cancer patient MK-02). The tumor-specific antigen is a mutant form of PIK3CA.
Depends:
R (>= 3.5.0)
Imports:
BiocFileCache,
data.table,
EnsDb.Hsapiens.v86,
ggplot2,
magrittr,
scater,
SingleCellExperiment
Suggests:
stringr,
usethis
License: MIT
Encoding: UTF-8
LazyData: true
RoxygenNote: 7.1.1
20 changes: 20 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Generated by roxygen2: do not edit by hand

export(check_columns)
export(extract_markers)
export(load_DE_results)
export(load_MR4050filt)
export(load_MR4050merged)
export(load_MR4050shared)
export(load_RDSdata_from_Box)
export(load_data_from_Box)
export(load_sce)
export(make_long_dt)
export(my_table)
export(plot_tgram)
export(prep_data4tgram)
export(run_DE)
import(data.table)
import(kableExtra)
import(magrittr)
import(scater)
45 changes: 45 additions & 0 deletions R/data_DEresults.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
#' Load the list of DE test results (any direction) for all clonotypes present
#' in both conditions ('shared' clonotypes)
#'
#' @details The p- and q-values here represent a two-tailed test for any direction
#' of the logFC.
#' For details on how the DE analysis was done, see the vignette "DE_genes"
#' and the wrapper function \code{\link{run_DE}}.
#'
#' @format Nested list where names correspond to the abbreviated clonotype IDs.
#' For every clonotype, there's a list that contains:
#' \describe{
#' \item{findMarkers_results:}{A SimpleDataFrameList, i.e. the original result
#' of \code{scran::findMarkers()}, but only for the MUT comparison}
#' \item{marker_IDs:}{a data.table with the genes that passed the FDR threshold;
#' if that is NULL, this implies that there were no DEG for that particular
#' clonotype comparing MUT vs WT}
#' }
#' @usage load_DE_results("MR4050")
#' @examples \dontrun{
#' library(KlebanoffMR4050)
#'
#' sce.shared <- load_MR4050shared()
#' sce.shared$antigen <- factor(gsub("\\..*","",sce.shared$Sample),
#' levels = c("WT", "MUT"), ordered = TRUE)
#'
#' delist.both <- lapply( unique(sce.shared$id), function(x){
#' run_DE(
#' sce.shared[, sce.shared$id == x],
#' group_identifier = "antigen",
#' direction = "any",
#' FDR = 0.05, rank = Inf,
#' comp_name = paste0(x, "_"))
#' })
#' names(delist.both) <- unique(sce.shared$id)
#' }
#'
#'@export
#'
load_DE_results <- function(sample = "MR4050"){
fn <- system.file("extdata", "delist.both",
package = paste0("Klebanoff", sample), mustWork = TRUE)

fin <- read.table(fn, stringsAsFactor = FALSE)[[1]]
load_data_from_Box(fin, load_rds = FALSE)
}
177 changes: 177 additions & 0 deletions R/data_sce.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,177 @@
#' Load the SCE for the batch-corrected merged data set
#'
#' @description Load the SingleCellExperiment object that holds the batch-corrected
#' reduced dimensionality results (MUT/WT = batch).
#'
#' @usage sce.filt <- load_MR4050filt()
#'
#' @seealso \code{\link{load_MR4050shared}}, \code{\link{load_MR4050merged}}
#' @return an SCE object that needs to be assigned to an object in the environment
#'
#' @export
#'
load_MR4050filt <- function(){
out <- load_sce(which_assays = "all", sample = "MR4050")

}

#' Load the SCE for the batch-corrected merged data set
#'
#' @description Load the SingleCellExperiment object that holds the batch-corrected
#' reduced dimensionality results (MUT/WT = batch).
#'
#' @usage sce.merged <- load_MR4050merged()
#' @return an SCE object that needs to be assigned to an object in the environment
#'
#' @export
#'
load_MR4050merged <- function(){
out <- load_sce(which_assays = "reconstructed", sample = "MR4050Merged")
return(out)
}

#' Load the SCE for the batch-corrected merged data set
#'
#' @description Load the SingleCellExperiment object that holds the SCE
#' representing cells with clonotypes that are present in both conditions.
#' The UMAP coordinates were re-calculated on the reduced subset after removal
#' of suspected doublets (see the vignette about the filtering).
#'
#' @return an SCE object that needs to be assigned to an object in the environment
#'
#' @usage sce.shared <- load_MR4050shared()
#'
#' @export
load_MR4050shared <- function(){
out <- load_sce(which_assays = "logcounts", sample = "MR4050Shared")
return(out)
}


#' Filtered cells ' ' @format Named list of cell numbers following the filtering steps
#described in this vignette.
"cell_filt"

#' Filtered genes ' ' @format Named list of gene numbers following the filtering steps
#described in this vignette.
"gene_filt"


#' Load the filtered and processed SingleCellExperiment object
#'
#' @description Use this function to load the processed and filtered gene expression
#' data plus the clonotype information stored within one SingleCellExperiment
#' object.
#'
#'
#' @param which_assays can be "all" (default) or individual assays, e.g. c("logcounts",
#' "counts") etc. If space and memory are problematic, definitely limit the selection here!
#' assays that are available are: "counts", "logcounts"
#' @param ... Additional parameters passed on to \code{load_RDSdata_from_Box}, e.g.
#' \code{check_for_updates = TRUE}
#'
#' @details
#' For the entire code of the filtering and processing, see the vignette
#' \code{01_processing.Rmd}.
#'
#' The resulting SCE object contains the usual content: colData with information
#' about individidual cells, rowData with info about individual genes, reducedDims,
#' etc.
#'
#' The \code{colData} includes:
#'
#' \describe{
#' \item{Barcode:}{Each cell's barcode used for keeping track of its identity during sequencing.}
#' \item{Sample:}{'WT' or 'MUT'}
#' \item{raw_clonotype_id:}{e.g. 'clonotype94'}
#' \item{cdr3s_aa:}{The amino acid sequence of the CDR3 portion, e.g. "TRA:CIARGGGGADGLTF;TRA:CGADRNGNEKLTF;TRB:CASSLTTDREPYEQYF"}
#' \item{multiTRA:}{TRUE/FALSE entries based on whether \code{cdr3s_aa} contained more than one entry for TRA}
#' \item{multiTRB:}{TRUE/FALSE entries based on whether \code{cdr3s_aa} contained more than one entry for TRB}
#' \item{numTRA:}{Number of TRA sequences within \code{cdr3s_aa}}
#' \item{numTRB:}{Number of TRB sequences within \code{cdr3s_aa}}
#' \item{cluster:}{clustering results of all cells}
#' }
#'
#' The object also containes the coordinates from dimensionality reductions (see
#' examples for more details).
#'
#'
#' @usage sce.filt <- load_sce(which_assays = "logcounts", sample = "MR4050")
#'
#' @return A SingleCellExperiment object with cells from 'mutant' samples
#' (stimulation with tumor antigen) and from the 'wt' sample (stimulation
#' with an irrelevant antigen)).
#'
#'
#' @examples \dontrun{
#'
#' > library(SingleCellExperiment)
#' > sce.MR4050 <- load_sce(which_assays = "all", sample = "MR4050")
#'
#' > reducedDimNames(sce.MR4050)
#' "corrected" "TSNE" "UMAP"
#'
#' > assayNames(sce.MR4050)
#' [1] "counts" "logcounts"
#' }
#'
#' @return SCE object
#'
#'
#' @export
#'
load_sce <- function(which_assays = "all", sample = "Sample", ...){

## the Box links are noted in the text file
fl <- system.file("extdata", paste0("sce_storage_", sample, ".txt"),
package = "KlebanoffMR4050")
if(fl == ""){stop(paste("sce_storage_", sample, ".txt does not exist in package 'KlebanoffMR4050'."))}

inf <- read.table(fl,stringsAsFactors = FALSE)

if(unique(inf$V3) != sample){stop("The sce_storage.txt file must contain a third column holding the sample name. Which should be the same as the one specified via sample = .")}

## DOWNLOAD AND CACHE THE FILES FROM THE BOX ==============================
## note: using the default cache of BioC here, we may want to change that
## to something more specific via the `cache_path` option of `load_RDSdata_fromBox()`

## load colData
cold <- load_RDSdata_from_Box(
shared_link = inf[inf$V1 == "colData",]$V2, data_name = paste0("KlebColData",sample), ...)

## load rowData
rowd <- load_RDSdata_from_Box(
shared_link = inf[inf$V1 == "rowData",]$V2, data_name = paste0("KlebRowData", sample) , ...)

## get reducedDims
rdms <- load_RDSdata_from_Box(
shared_link = inf[inf$V1 == "reducedDims",]$V2, data_name = paste0("KlebRedDims", sample), ... )

## metadata
metd <- load_RDSdata_from_Box(
shared_link = inf[inf$V1 == "metadata",]$V2, data_name = paste0("KlebMetadata", sample), ... )

## get assayData
if(which_assays == "all"){
## extract corresponding assay entry from the text file
asss <- grep("^assay:", unique(inf$V1), value = TRUE)
}else{
asss <- unlist(lapply(which_assays, function(x) grep(paste0(":",x,"$"), unique(inf$V1), ignore.case = TRUE, value=TRUE)))
if(length(which_assays) == 0){
warning("None of the assays you specified are part of the file stored in inst/extdata, i.e. we can't find the links.")
}
}

assl <- list()
for(i in asss){
j <- gsub("^assay:","", i)
assl[[j]] <- load_RDSdata_from_Box(
shared_link = inf[inf$V1 == i,]$V2, data_name = paste0("Kleb",j, sample), ...)
}

## construct the SCE object =============================================
return(SingleCellExperiment::SingleCellExperiment(assays = assl,
colData = cold, rowData = rowd,
metadata = metd, reducedDims = rdms))
}

46 changes: 46 additions & 0 deletions R/data_sharedClonotypes.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
#' Shared clonotypes
#'
#' @description \code{data.table} of the clonotypes that are found in both
#' conditions MUT and WT in the MR4050 data set.
#'
#' @usage data(shared_clonotypes)
#'
#' @examples \dontrun{
#'
#' ## count TRA/TRB
#' clono_freq <- colData(sce.filt)[, c("Sample" ,"cdr3s_aa")] %>%
#' as.data.frame %>% data.table(., keep.rownames = TRUE) %>%
#' .[!is.na(cdr3s_aa), .N, c("cdr3s_aa","Sample")]
#' setorder(clono_freq, N)
#'
#' ## formatting the TRA/TRB notations
#' ## will only work if there's just one TRA
#' ct <- dcast(clono_freq, cdr3s_aa ~ Sample, value.var = "N") %>%
#' .[!is.na(MUT.MR4050) & !is.na(WT.MR4050)]
#'
#' ct[, TRA := gsub(";*TRB:[A-Z]+", "", cdr3s_aa)]
#' ct[, TRA := ifelse(TRA == "", NA, TRA)]
#' ct[, TRB := gsub(".*(TRB:[A-Z]+)", "\\1", cdr3s_aa)]
#' ct[, TRB := ifelse(grepl("^TRA", TRB), NA, TRB)] # if only TRB was present,
#' I need to fill in the NA
#' setorder(ct, -MUT.MR4050, -WT.MR4050 )
#' shared_clonotypes <- copy(ct)
#' }
#'
#' @seealso \code{clonotype_ids}
"shared_clonotypes"



#' Table of customized clonotype IDs for sample MR4050
#'
#' @description \code{data.table} with our customized clonotype IDs for ease of
#' visualization and comparison. I.e., the CDR3s amino acid sequences are re-
#' placed with arbitrary IDs. Note thate these clonotypes are those that are
#' found in both conditions of patient MR4050, i.e. MUT and WT.
#'
#' @details See section "Adding the clonotype IDs" in the vignette "Filtering and Processing"
#' about how the consolidation and clean up of the TRA/TRB sequences was done.
#'
#' @seealso \code{shared_clonotypes}
"clonotype_ids"
64 changes: 64 additions & 0 deletions R/reading_clonotypes.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
#' Reading in clonotype info per barcode
#'
#' @details This function expects the files "filtered_contig_annotations.csv" and
#' "clonotypes.csv" to be found in the "outs" folder after following \code{cellRanger_result_dir}.
#' These files are used to match the cell barcodes with unique clonotype IDs.
#'
#' @param cellRanger_result_dir e.g. "/scratchLocal/frd2007/2018-11_Smita_Tcells/data/wildtype/".
#' This should be the path to the un-tar'ed directory from cellRanger.
#'
#' @export
#'
reading_in_clonotypes <- function(cellRanger_result_dir){

## reading barcodes (including empty ones)
bcs <- fread(paste0(cellRanger_result_dir,"outs/filtered_contig_annotations.csv"), sep = ",")

#> head(barcode_csv)
#barcode is_cell contig_id high_confidence length
#1 AAACCTGAGTGACATA-1 True AAACCTGAGTGACATA-1_contig_1 True 568
#2 AAACCTGAGTGACATA-1 True AAACCTGAGTGACATA-1_contig_3 True 567
#3 AAACCTGAGTGACATA-1 True AAACCTGAGTGACATA-1_contig_4 True 712
#4 AAACCTGCACCGCTAG-1 True AAACCTGCACCGCTAG-1_contig_1 True 564
#5 AAACCTGCACCGCTAG-1 True AAACCTGCACCGCTAG-1_contig_2 True 584
#6 AAACCTGCACTTCTGC-1 True AAACCTGCACTTCTGC-1_contig_1 True 569
#chain v_gene d_gene j_gene c_gene full_length productive
#1 TRA TRAV19 None TRAJ50 TRAC True True
#2 TRA TRAV13-1 None TRAJ45 TRAC True None
#3 TRB TRBV19 TRBD2 TRBJ2-7 TRBC2 True True
#4 TRB TRBV9 TRBD2 TRBJ2-6 TRBC2 True True
#5 TRA TRAV12-2 None TRAJ44 TRAC True True
#6 TRA TRAV25 None TRAJ54 TRAC True True
#cdr3 cdr3_nt
#1 CALSSMKTSYDKVIF TGTGCTCTGAGTAGCATGAAAACCTCCTACGACAAGGTGATATTT
#2 None None
#3 CATSLALAAYEQYF TGTGCCACGAGTCTGGCGCTAGCGGCGTACGAGCAGTACTTC
#4 CASSGLAGGPVSGANVLTF TGTGCCAGCAGCGGACTAGCGGGAGGGCCCGTTTCTGGGGCCAACGTCCTGACTTTC
#5 CAGNTGTASKLTF TGTGCCGGAAATACCGGCACTGCCAGTAAACTCACCTTT
#6 CAGLQGAQKLVF TGTGCAGGCCTTCAGGGAGCCCAGAAGCTGGTATTT
#reads umis raw_clonotype_id raw_consensus_id
#1 2511 7 clonotype146 clonotype146_consensus_1
#2 679 2 clonotype146 None
#3 850 4 clonotype146 clonotype146_consensus_2
#4 10237 36 clonotype1 clonotype1_consensus_1
#5 3470 10 clonotype1 clonotype1_consensus_2
#6 1780 7 clonotype147 clonotype147_consensus_1

## extract those with a clonotype
bcs.sub <- bcs[ raw_clonotype_id != "None", c("raw_clonotype_id","barcode"), with=FALSE] %>% unique

## reading clonotypes
clotp <- fread(paste0(cellRanger_result_dir, "outs/clonotypes.csv"), sep = ",")
setnames(clotp, "clonotype_id", "raw_clonotype_id")

## join the informationbiocon
jnd <- merge(clotp, bcs.sub, by = "raw_clonotype_id", all = TRUE)

## sanity check
tst <- jnd[raw_clonotype_id == "clonotype1", .N] == unique(jnd[raw_clonotype_id == "clonotype1"]$frequency)
if(isFALSE(tst)){
warning("The number of rows for clonotype1 does not match the frequency taken from the clonotypes.csv file.")
}

return(jnd)
}
Loading

0 comments on commit 3fb6775

Please sign in to comment.