From 7c03904223efe73e54cf0bd084d2991957d61aeb Mon Sep 17 00:00:00 2001 From: Charles Plessy Date: Mon, 26 Jun 2023 10:45:39 +0900 Subject: [PATCH 01/17] Re-activate importPublicData --- R/ImportMethods.R | 678 +++++++++++++++++++++++----------------------- 1 file changed, 338 insertions(+), 340 deletions(-) diff --git a/R/ImportMethods.R b/R/ImportMethods.R index 0ace8a9..2dbcf72 100644 --- a/R/ImportMethods.R +++ b/R/ImportMethods.R @@ -650,343 +650,341 @@ setMethod( "getCTSS", "CAGEexp" "FANTOM5mouseSamples" -#' #' importPublicData -#' #' @noRd -#' #' @importFrom utils data -#' #' @export -#' -#' setGeneric( -#' name="importPublicData", -#' def=function(source, dataset, group, sample){ -#' standardGeneric("importPublicData") -#' } -#' ) -#' -#' setMethod("importPublicData", -#' signature(source = "character", dataset = "character", sample = "character"), -#' function (source, dataset, group, sample){ -#' -#' if(source == "ENCODE"){ -#' -#' if("ENCODEprojectCAGE" %in% rownames(installed.packages()) == FALSE){ -#' stop("Requested CAGE data package is not installed! Please install and load the ENCODEprojectCAGE package, which is available for download from http://promshift.genereg.net/CAGEr/PackageSource/.") -#' }else if(!("package:ENCODEprojectCAGE" %in% search())){ -#' stop("Requested CAGE data package is not loaded! Please load the data package by calling 'library(ENCODEprojectCAGE)'") -#' } -#' -#' if(dataset[1] == "ENCODEtissueCAGEfly"){ -#' genome.name <- "BSgenome.Dmelanogaster.UCSC.dm3" -#' ENCODEtissueCAGEfly <- NULL -#' data("ENCODEtissueCAGEfly", envir = environment()) -#' if(group == "embryo"){ -#' if(sample == "mixed_embryos_0-24hr"){ -#' -#' ctssTable <- ENCODEtissueCAGEfly[["embryo"]] -#' -#' }else{ -#' stop("Specified sample not valid! The dataset 'ENCODEtissueCAGEfly' containes only one sample named 'mixed_embryos_0-24hr'!") -#' } -#' }else{ -#' stop("Specified group not valid! The dataset 'ENCODEtissueCAGEfly' containes only one group named 'embryo'!") -#' } -#' -#' }else{ -#' genome.name <- "BSgenome.Hsapiens.UCSC.hg19" -#' ENCODEhumanCellLinesSamples <- NULL -#' data("ENCODEhumanCellLinesSamples", envir = environment()) -#' info.df <- ENCODEhumanCellLinesSamples -#' -#' if(!(all(dataset %in% info.df$dataset))){ -#' stop("Specified dataset(s) not found! Call data(ENCODEhumanCellLinesSamples) and check 'dataset' column for available ENCODE datasets!") -#' } -#' if(length(dataset) == 1){ -#' -#' if(!(all(group %in% info.df[info.df$dataset == dataset,"group"]))){ -#' stop("Some of the provided groups cannot be found in the specified dataset!") -#' } -#' if(length(group) == 1){ -#' if(!(all(sample %in% info.df[info.df$group == group,"sample"]))){ -#' stop("Some of the provided samples cannot be found in the specified group!") -#' } -#' }else if(length(group) == length(sample)){ -#' if(!all(sapply(c(1:length(group)), function(x) {sample[x] %in% info.df[info.df$group == group[x],"sample"]}))){ -#' stop("Provided 'group' and 'sample' do not match! Some of the provided samples cannot be found in the corresponding groups!") -#' } -#' }else{ -#' stop("Number of elements in the 'group' must be either 1 or must match the number of elements in the 'sample'!") -#' } -#' -#' -#' }else if(length(dataset) == length(group)){ -#' if(!all(sapply(c(1:length(dataset)), function(x) {group[x] %in% info.df[info.df$dataset == dataset[x],"group"]}))){ -#' stop("Provided 'dataset' and 'group' do not match! Some of the provided groups cannot be found in the corresponding datasets!") -#' } -#' if(length(group) == length(sample)){ -#' if(!all(sapply(c(1:length(group)), function(x) {sample[x] %in% info.df[info.df$group == group[x],"sample"]}))){ -#' stop("Provided 'group' and 'sample' do not match! Some of the provided samples cannot be found in the corresponding groups!") -#' } -#' }else{ -#' stop("Number of elements in the 'group' must match the number of elements in the 'sample'!") -#' } -#' -#' -#' }else{ -#' stop("Number of elements in the 'dataset' must be either 1 or must match the number of elements in the 'group' and in the 'sample'!") -#' } -#' -#' data(list = dataset, envir = environment()) -#' -#' if(length(unique(dataset))>1){ -#' -#' for(i in 1:length(unique(dataset))){ -#' dset <- get(unique(dataset)[i]) -#' for(j in 1:length(unique(group[which(dataset == unique(dataset)[i])]))){ -#' g <- unique(group[which(dataset == unique(dataset)[i])])[j] -#' ctss <- dset[[g]][, c("chr", "pos", "strand", sample[which((group == g) & (dataset == unique(dataset)[i]))])] -#' discard <- apply(ctss[, c(4:ncol(ctss)), drop = F], 1, function(x) {sum(x > 0)}) >= 1 -#' ctss <- data.table(ctss[discard,]) -#' setkeyv(ctss, cols = c("chr", "pos", "strand")) -#' if(i == 1 & j == 1){ -#' ctssTable <- ctss -#' }else{ -#' ctssTable <- merge(ctssTable, ctss, all.x = T, all.y = T) -#' } -#' } -#' } -#' ctssTable[is.na(ctssTable)] <- 0 -#' -#' }else{ -#' -#' selected.dataset <- get(dataset) -#' if(length(unique(group))>1){ -#' -#' for(i in 1:length(unique(group))){ -#' g <- unique(group)[i] -#' ctss <- selected.dataset[[g]][, c("chr", "pos", "strand", sample[which(group == g)])] -#' discard <- apply(ctss[, c(4:ncol(ctss)), drop = F], 1, function(x) {sum(x > 0)}) >= 1 -#' ctss <- data.table(ctss[discard,]) -#' setkeyv(ctss, cols = c("chr", "pos", "strand")) -#' if(i == 1){ -#' ctssTable <- ctss -#' }else{ -#' ctssTable <- merge(ctssTable, ctss, all.x = T, all.y = T) -#' } -#' } -#' -#' ctssTable[is.na(ctssTable)] <- 0 -#' -#' }else{ -#' ctssTable <- selected.dataset[[group]][,c("chr", "pos", "strand", sample)] -#' } -#' } -#' -#' ctssTable <- data.frame(ctssTable, stringsAsFactors = F, check.names = F) -#' } -#' -#' -#' }else if(source == "FANTOM3and4"){ -#' -#' if("FANTOM3and4CAGE" %in% rownames(installed.packages()) == FALSE){ -#' stop("Requested CAGE data package is not installed! Please install and load the FANTOM3and4CAGE package available from Bioconductor.") -#' }else if(!("package:FANTOM3and4CAGE" %in% search())){ -#' stop("Requested CAGE data package is not loaded! Please load the data package by calling 'library(FANTOM3and4CAGE)'") -#' } -#' FANTOMhumanSamples <- FANTOMmouseSamples <- NULL -#' data("FANTOMhumanSamples", envir = environment()) -#' info.df1 <- FANTOMhumanSamples -#' data("FANTOMmouseSamples", envir = environment()) -#' info.df2 <- FANTOMmouseSamples -#' -#' if(!(all(dataset %in% info.df1$dataset) | all(dataset %in% info.df2$dataset))){ -#' stop("Specified dataset(s) not found! Call data(FANTOMhumanSamples) and data(FANTOMmouseSamples) and check 'dataset' column for available ENCODE datasets!") -#' } -#' if(length(grep("human", dataset))>0){ -#' genome.name <- "BSgenome.Hsapiens.UCSC.hg18" -#' info.df <- info.df1 -#' }else if(length(grep("mouse", dataset))>0){ -#' genome.name <- "BSgenome.Mmusculus.UCSC.mm9" -#' info.df <- info.df2 -#' } -#' -#' if(length(dataset) == 1){ -#' -#' if(!(all(group %in% info.df[info.df$dataset == dataset,"group"]))){ -#' stop("Some of the provided groups cannot be found in the specified dataset!") -#' } -#' if(length(group) == 1){ -#' if(!(all(sample %in% info.df[info.df$group == group,"sample"]))){ -#' stop("Some of the provided samples cannot be found in the specified group!") -#' } -#' }else if(length(group) == length(sample)){ -#' if(!all(sapply(c(1:length(group)), function(x) {sample[x] %in% info.df[info.df$group == group[x],"sample"]}))){ -#' stop("Provided 'group' and 'sample' do not match! Some of the provided samples cannot be found in the corresponding groups!") -#' } -#' }else{ -#' stop("Number of elements in the 'group' must be either 1 or must match the number of elements in the 'sample'!") -#' } -#' -#' -#' }else if(length(dataset) == length(group)){ -#' if(!all(sapply(c(1:length(dataset)), function(x) {group[x] %in% info.df[info.df$dataset == dataset[x],"group"]}))){ -#' stop("Provided 'dataset' and 'group' do not match! Some of the provided groups cannot be found in the corresponding datasets!") -#' } -#' if(length(group) == length(sample)){ -#' if(!all(sapply(c(1:length(group)), function(x) {sample[x] %in% info.df[info.df$group == group[x],"sample"]}))){ -#' stop("Provided 'group' and 'sample' do not match! Some of the provided samples cannot be found in the corresponding groups!") -#' } -#' }else{ -#' stop("Number of elements in the 'group' must match the number of elements in the 'sample'!") -#' } -#' -#' -#' }else{ -#' stop("Number of elements in the 'dataset' must be either 1 or must match the number of elements in the 'group' and in the 'sample'!") -#' } -#' -#' data(list = dataset, envir = environment()) -#' -#' if(length(unique(dataset))>1){ -#' -#' for(i in 1:length(unique(dataset))){ -#' dset <- get(unique(dataset)[i]) -#' for(j in 1:length(unique(group[which(dataset == unique(dataset)[i])]))){ -#' g <- unique(group[which(dataset == unique(dataset)[i])])[j] -#' ctss <- dset[[g]][, c("chr", "pos", "strand", sample[which(group == g)])] -#' discard <- apply(ctss[, c(4:ncol(ctss)), drop = F], 1, function(x) {sum(x > 0)}) >= 1 -#' ctss <- data.table(ctss[discard,]) -#' setnames(ctss, c("chr", "pos", "strand", paste(g, sample[which(group == g)], sep = "__"))) -#' setkeyv(ctss, cols = c("chr", "pos", "strand")) -#' if(i == 1 & j == 1){ -#' ctssTable <- ctss -#' }else{ -#' ctssTable <- merge(ctssTable, ctss, all.x = T, all.y = T) -#' } -#' } -#' } -#' -#' ctssTable[is.na(ctssTable)] <- 0 -#' -#' }else{ -#' -#' selected.dataset <- get(dataset) -#' if(length(unique(group))>1){ -#' -#' for(i in 1:length(unique(group))){ -#' g <- unique(group)[i] -#' ctss <- selected.dataset[[g]][, c("chr", "pos", "strand", sample[which(group == g)])] -#' discard <- apply(ctss[, c(4:ncol(ctss)), drop = F], 1, function(x) {sum(x > 0)}) >= 1 -#' ctss <- data.table(ctss[discard,]) -#' setnames(ctss, c("chr", "pos", "strand", paste(g, sample[which(group == g)], sep = "__"))) -#' setkeyv(ctss, cols = c("chr", "pos", "strand")) -#' if(i == 1){ -#' ctssTable <- ctss -#' }else{ -#' ctssTable <- merge(ctssTable, ctss, all.x = T, all.y = T) -#' } -#' } -#' -#' ctssTable[is.na(ctssTable)] <- 0 -#' -#' }else{ -#' ctssTable <- selected.dataset[[group]][,c("chr", "pos", "strand", sample)] -#' setnames(ctssTable, c("chr", "pos", "strand", paste(group, sample, sep = "__"))) -#' } -#' } -#' -#' ctssTable <- data.frame(ctssTable, stringsAsFactors = F, check.names = F) -#' -#' -#' -#' }else if (source == "FANTOM5"){ -#' -#' if(length(dataset) != 1){ -#' stop("For FANTOM5 only one dataset can be specified and it can be either 'human' or 'mouse'!") -#' }else if(!(dataset %in% c("human", "mouse"))){ -#' stop("For FANTOM5, dataset can be either 'human' or 'mouse'!") -#' } -#' if(dataset == "human"){ -#' FANTOM5humanSamples <- NULL -#' data("FANTOM5humanSamples", envir = environment()) -#' samples.info <- FANTOM5humanSamples -#' genome.name <- "BSgenome.Hsapiens.UCSC.hg19" -#' }else if(dataset == "mouse"){ -#' FANTOM5mouseSamples <- NULL -#' data("FANTOM5mouseSamples", envir = environment()) -#' samples.info <- FANTOM5mouseSamples -#' genome.name <- "BSgenome.Mmusculus.UCSC.mm9" -#' } -#' -#' if(!(all(sample %in% samples.info$sample))){ -#' stop(paste("Some sample names cannot be found for the specified dataset! Call data(FANTOM5", dataset, "Samples) and check the 'sample' column for valid sample names!", sep = "")) -#' } -#' -#' -#' -#' for(i in c(1:length(sample))){ -#' -#' message("Fetching sample: ", sample[i], "...") -#' sample.url <- samples.info[samples.info$sample == sample[i], "data_url"] -#' con <- gzcon(url(paste(sample.url))) -#' ctss <- scan(con, what = list(character(), NULL, integer(), NULL, integer(), character())) -#' ctss.df <- data.table(chr = ctss[[1]], pos = ctss[[3]], strand = ctss[[6]], tagCount = ctss[[5]]) -#' setnames(ctss.df, c("chr", "pos", "strand", sample[i])) -#' setkeyv(ctss.df, cols = c("chr", "pos", "strand")) -#' if(i == 1){ -#' ctss.table <- ctss.df -#' }else{ -#' message("Adding sample to CTSS table...\n") -#' ctss.table <- merge(ctss.table, ctss.df, all.x = T, all.y = T) -#' ctss.table[is.na(ctss.table)] <- 0 -#' } -#' -#' } -#' -#' ctssTable <- data.frame(ctss.table, stringsAsFactors = F, check.names = F) -#' -#' -#' }else if (source == "ZebrafishDevelopment"){ -#' -#' if("ZebrafishDevelopmentalCAGE" %in% rownames(installed.packages()) == FALSE){ -#' stop("Requested CAGE data package is not installed! Please install and load the ZebrafishDevelopmentalCAGE package, which is available for download from http://promshift.genereg.net/CAGEr/PackageSource/.") -#' }else if(!("package:ZebrafishDevelopmentalCAGE" %in% search())){ -#' stop("Requested CAGE data package is not loaded! Please load the data package by calling 'library(ZebrafishDevelopmentalCAGE)'") -#' } -#' -#' ZebrafishSamples <- NULL -#' data("ZebrafishSamples", envir = environment()) -#' if(dataset == "ZebrafishCAGE"){ -#' if(group == "development"){ -#' if(!(all(sample %in% ZebrafishSamples$sample))){ -#' stop("Some sample names cannot be found for the specified dataset! Call data(ZebrafishSamples) and check the 'sample' column for valid sample names!") -#' }else{ -#' genome.name <- "BSgenome.Drerio.UCSC.danRer7" -#' ZebrafishCAGE <- NULL -#' data("ZebrafishCAGE", envir = environment()) -#' ctssTable <- ZebrafishCAGE[["development"]][,c("chr", "pos", "strand", sample)] -#' ctssTable <- ctssTable[apply(ctssTable[,4:ncol(ctssTable),drop=FALSE], 1, function(x) {any(x>0)}),] -#' } -#' }else{ -#' stop("Invalid group name! There is only one group in this dataset named 'development'.") -#' } -#' }else{ -#' stop("Invalid dataset name! There is only one available dataset named 'ZebrafishCAGE'.") -#' } -#' -#' -#' }else{ -#' stop("Currently only the following public CAGE data resources are supported: 'FANTOM5', 'FANTOM3and4', 'ENCODE', 'ZebrafishDevelopment'. Refer to CAGEr vignette on how to use those resources!") -#' } -#' -#' rownames(ctssTable) <- c(1:nrow(ctssTable)) -#' -#' sample.labels <- colnames(ctssTable)[4:ncol(ctssTable)] -#' names(sample.labels) <- rainbow(n = length(sample.labels)) -#' myCAGEset <- new("CAGEset", genomeName = genome.name, inputFiles = paste(source, sample.labels, sep = "__"), inputFilesType = source, sampleLabels = sample.labels) -#' myCAGEset@librarySizes <- as.integer(colSums(ctssTable[,4:ncol(ctssTable),drop=FALSE])) -#' myCAGEset@CTSScoordinates <- ctssTable[, c("chr", "pos", "strand")] -#' myCAGEset@tagCountMatrix <- ctssTable[,4:ncol(ctssTable),drop=FALSE] -#' -#' return(myCAGEset) -#' -#' } -#' ) +#' importPublicData +#' @noRd +#' @importFrom utils data +#' @export + +setGeneric("importPublicData", + function(source, dataset, group, sample) + standardGeneric("importPublicData")) + +setMethod("importPublicData", +signature(source = "character", dataset = "character", sample = "character"), +.importPublicData (source, dataset, group, sample)) + +.importPublicData <- function (source, dataset, group, sample){ + + if(source == "ENCODE"){ + + if("ENCODEprojectCAGE" %in% rownames(installed.packages()) == FALSE){ + stop("Requested CAGE data package is not installed! Please install and load the ENCODEprojectCAGE package, which is available for download from http://promshift.genereg.net/CAGEr/PackageSource/.") + }else if(!("package:ENCODEprojectCAGE" %in% search())){ + stop("Requested CAGE data package is not loaded! Please load the data package by calling 'library(ENCODEprojectCAGE)'") + } + + if(dataset[1] == "ENCODEtissueCAGEfly"){ + genome.name <- "BSgenome.Dmelanogaster.UCSC.dm3" + ENCODEtissueCAGEfly <- NULL + data("ENCODEtissueCAGEfly", envir = environment()) + if(group == "embryo"){ + if(sample == "mixed_embryos_0-24hr"){ + + ctssTable <- ENCODEtissueCAGEfly[["embryo"]] + + }else{ + stop("Specified sample not valid! The dataset 'ENCODEtissueCAGEfly' containes only one sample named 'mixed_embryos_0-24hr'!") + } + }else{ + stop("Specified group not valid! The dataset 'ENCODEtissueCAGEfly' containes only one group named 'embryo'!") + } + + }else{ + genome.name <- "BSgenome.Hsapiens.UCSC.hg19" + ENCODEhumanCellLinesSamples <- NULL + data("ENCODEhumanCellLinesSamples", envir = environment()) + info.df <- ENCODEhumanCellLinesSamples + + if(!(all(dataset %in% info.df$dataset))){ + stop("Specified dataset(s) not found! Call data(ENCODEhumanCellLinesSamples) and check 'dataset' column for available ENCODE datasets!") + } + if(length(dataset) == 1){ + + if(!(all(group %in% info.df[info.df$dataset == dataset,"group"]))){ + stop("Some of the provided groups cannot be found in the specified dataset!") + } + if(length(group) == 1){ + if(!(all(sample %in% info.df[info.df$group == group,"sample"]))){ + stop("Some of the provided samples cannot be found in the specified group!") + } + }else if(length(group) == length(sample)){ + if(!all(sapply(c(1:length(group)), function(x) {sample[x] %in% info.df[info.df$group == group[x],"sample"]}))){ + stop("Provided 'group' and 'sample' do not match! Some of the provided samples cannot be found in the corresponding groups!") + } + }else{ + stop("Number of elements in the 'group' must be either 1 or must match the number of elements in the 'sample'!") + } + + + }else if(length(dataset) == length(group)){ + if(!all(sapply(c(1:length(dataset)), function(x) {group[x] %in% info.df[info.df$dataset == dataset[x],"group"]}))){ + stop("Provided 'dataset' and 'group' do not match! Some of the provided groups cannot be found in the corresponding datasets!") + } + if(length(group) == length(sample)){ + if(!all(sapply(c(1:length(group)), function(x) {sample[x] %in% info.df[info.df$group == group[x],"sample"]}))){ + stop("Provided 'group' and 'sample' do not match! Some of the provided samples cannot be found in the corresponding groups!") + } + }else{ + stop("Number of elements in the 'group' must match the number of elements in the 'sample'!") + } + + + }else{ + stop("Number of elements in the 'dataset' must be either 1 or must match the number of elements in the 'group' and in the 'sample'!") + } + + data(list = dataset, envir = environment()) + + if(length(unique(dataset))>1){ + + for(i in 1:length(unique(dataset))){ + dset <- get(unique(dataset)[i]) + for(j in 1:length(unique(group[which(dataset == unique(dataset)[i])]))){ + g <- unique(group[which(dataset == unique(dataset)[i])])[j] + ctss <- dset[[g]][, c("chr", "pos", "strand", sample[which((group == g) & (dataset == unique(dataset)[i]))])] + discard <- apply(ctss[, c(4:ncol(ctss)), drop = F], 1, function(x) {sum(x > 0)}) >= 1 + ctss <- data.table(ctss[discard,]) + setkeyv(ctss, cols = c("chr", "pos", "strand")) + if(i == 1 & j == 1){ + ctssTable <- ctss + }else{ + ctssTable <- merge(ctssTable, ctss, all.x = T, all.y = T) + } + } + } + ctssTable[is.na(ctssTable)] <- 0 + + }else{ + + selected.dataset <- get(dataset) + if(length(unique(group))>1){ + + for(i in 1:length(unique(group))){ + g <- unique(group)[i] + ctss <- selected.dataset[[g]][, c("chr", "pos", "strand", sample[which(group == g)])] + discard <- apply(ctss[, c(4:ncol(ctss)), drop = F], 1, function(x) {sum(x > 0)}) >= 1 + ctss <- data.table(ctss[discard,]) + setkeyv(ctss, cols = c("chr", "pos", "strand")) + if(i == 1){ + ctssTable <- ctss + }else{ + ctssTable <- merge(ctssTable, ctss, all.x = T, all.y = T) + } + } + + ctssTable[is.na(ctssTable)] <- 0 + + }else{ + ctssTable <- selected.dataset[[group]][,c("chr", "pos", "strand", sample)] + } + } + + ctssTable <- data.frame(ctssTable, stringsAsFactors = F, check.names = F) + } + + + }else if(source == "FANTOM3and4"){ + + if("FANTOM3and4CAGE" %in% rownames(installed.packages()) == FALSE){ + stop("Requested CAGE data package is not installed! Please install and load the FANTOM3and4CAGE package available from Bioconductor.") + }else if(!("package:FANTOM3and4CAGE" %in% search())){ + stop("Requested CAGE data package is not loaded! Please load the data package by calling 'library(FANTOM3and4CAGE)'") + } + FANTOMhumanSamples <- FANTOMmouseSamples <- NULL + data("FANTOMhumanSamples", envir = environment()) + info.df1 <- FANTOMhumanSamples + data("FANTOMmouseSamples", envir = environment()) + info.df2 <- FANTOMmouseSamples + + if(!(all(dataset %in% info.df1$dataset) | all(dataset %in% info.df2$dataset))){ + stop("Specified dataset(s) not found! Call data(FANTOMhumanSamples) and data(FANTOMmouseSamples) and check 'dataset' column for available ENCODE datasets!") + } + if(length(grep("human", dataset))>0){ + genome.name <- "BSgenome.Hsapiens.UCSC.hg18" + info.df <- info.df1 + }else if(length(grep("mouse", dataset))>0){ + genome.name <- "BSgenome.Mmusculus.UCSC.mm9" + info.df <- info.df2 + } + + if(length(dataset) == 1){ + + if(!(all(group %in% info.df[info.df$dataset == dataset,"group"]))){ + stop("Some of the provided groups cannot be found in the specified dataset!") + } + if(length(group) == 1){ + if(!(all(sample %in% info.df[info.df$group == group,"sample"]))){ + stop("Some of the provided samples cannot be found in the specified group!") + } + }else if(length(group) == length(sample)){ + if(!all(sapply(c(1:length(group)), function(x) {sample[x] %in% info.df[info.df$group == group[x],"sample"]}))){ + stop("Provided 'group' and 'sample' do not match! Some of the provided samples cannot be found in the corresponding groups!") + } + }else{ + stop("Number of elements in the 'group' must be either 1 or must match the number of elements in the 'sample'!") + } + + + }else if(length(dataset) == length(group)){ + if(!all(sapply(c(1:length(dataset)), function(x) {group[x] %in% info.df[info.df$dataset == dataset[x],"group"]}))){ + stop("Provided 'dataset' and 'group' do not match! Some of the provided groups cannot be found in the corresponding datasets!") + } + if(length(group) == length(sample)){ + if(!all(sapply(c(1:length(group)), function(x) {sample[x] %in% info.df[info.df$group == group[x],"sample"]}))){ + stop("Provided 'group' and 'sample' do not match! Some of the provided samples cannot be found in the corresponding groups!") + } + }else{ + stop("Number of elements in the 'group' must match the number of elements in the 'sample'!") + } + + + }else{ + stop("Number of elements in the 'dataset' must be either 1 or must match the number of elements in the 'group' and in the 'sample'!") + } + + data(list = dataset, envir = environment()) + + if(length(unique(dataset))>1){ + + for(i in 1:length(unique(dataset))){ + dset <- get(unique(dataset)[i]) + for(j in 1:length(unique(group[which(dataset == unique(dataset)[i])]))){ + g <- unique(group[which(dataset == unique(dataset)[i])])[j] + ctss <- dset[[g]][, c("chr", "pos", "strand", sample[which(group == g)])] + discard <- apply(ctss[, c(4:ncol(ctss)), drop = F], 1, function(x) {sum(x > 0)}) >= 1 + ctss <- data.table(ctss[discard,]) + setnames(ctss, c("chr", "pos", "strand", paste(g, sample[which(group == g)], sep = "__"))) + setkeyv(ctss, cols = c("chr", "pos", "strand")) + if(i == 1 & j == 1){ + ctssTable <- ctss + }else{ + ctssTable <- merge(ctssTable, ctss, all.x = T, all.y = T) + } + } + } + + ctssTable[is.na(ctssTable)] <- 0 + + }else{ + + selected.dataset <- get(dataset) + if(length(unique(group))>1){ + + for(i in 1:length(unique(group))){ + g <- unique(group)[i] + ctss <- selected.dataset[[g]][, c("chr", "pos", "strand", sample[which(group == g)])] + discard <- apply(ctss[, c(4:ncol(ctss)), drop = F], 1, function(x) {sum(x > 0)}) >= 1 + ctss <- data.table(ctss[discard,]) + setnames(ctss, c("chr", "pos", "strand", paste(g, sample[which(group == g)], sep = "__"))) + setkeyv(ctss, cols = c("chr", "pos", "strand")) + if(i == 1){ + ctssTable <- ctss + }else{ + ctssTable <- merge(ctssTable, ctss, all.x = T, all.y = T) + } + } + + ctssTable[is.na(ctssTable)] <- 0 + + }else{ + ctssTable <- selected.dataset[[group]][,c("chr", "pos", "strand", sample)] + setnames(ctssTable, c("chr", "pos", "strand", paste(group, sample, sep = "__"))) + } + } + + ctssTable <- data.frame(ctssTable, stringsAsFactors = F, check.names = F) + + + + }else if (source == "FANTOM5"){ + + if(length(dataset) != 1){ + stop("For FANTOM5 only one dataset can be specified and it can be either 'human' or 'mouse'!") + }else if(!(dataset %in% c("human", "mouse"))){ + stop("For FANTOM5, dataset can be either 'human' or 'mouse'!") + } + if(dataset == "human"){ + FANTOM5humanSamples <- NULL + data("FANTOM5humanSamples", envir = environment()) + samples.info <- FANTOM5humanSamples + genome.name <- "BSgenome.Hsapiens.UCSC.hg19" + }else if(dataset == "mouse"){ + FANTOM5mouseSamples <- NULL + data("FANTOM5mouseSamples", envir = environment()) + samples.info <- FANTOM5mouseSamples + genome.name <- "BSgenome.Mmusculus.UCSC.mm9" + } + + if(!(all(sample %in% samples.info$sample))){ + stop(paste("Some sample names cannot be found for the specified dataset! Call data(FANTOM5", dataset, "Samples) and check the 'sample' column for valid sample names!", sep = "")) + } + + + + for(i in c(1:length(sample))){ + + message("Fetching sample: ", sample[i], "...") + sample.url <- samples.info[samples.info$sample == sample[i], "data_url"] + con <- gzcon(url(paste(sample.url))) + ctss <- scan(con, what = list(character(), NULL, integer(), NULL, integer(), character())) + ctss.df <- data.table(chr = ctss[[1]], pos = ctss[[3]], strand = ctss[[6]], tagCount = ctss[[5]]) + setnames(ctss.df, c("chr", "pos", "strand", sample[i])) + setkeyv(ctss.df, cols = c("chr", "pos", "strand")) + if(i == 1){ + ctss.table <- ctss.df + }else{ + message("Adding sample to CTSS table...\n") + ctss.table <- merge(ctss.table, ctss.df, all.x = T, all.y = T) + ctss.table[is.na(ctss.table)] <- 0 + } + + } + + ctssTable <- data.frame(ctss.table, stringsAsFactors = F, check.names = F) + + + }else if (source == "ZebrafishDevelopment"){ + + if("ZebrafishDevelopmentalCAGE" %in% rownames(installed.packages()) == FALSE){ + stop("Requested CAGE data package is not installed! Please install and load the ZebrafishDevelopmentalCAGE package, which is available for download from http://promshift.genereg.net/CAGEr/PackageSource/.") + }else if(!("package:ZebrafishDevelopmentalCAGE" %in% search())){ + stop("Requested CAGE data package is not loaded! Please load the data package by calling 'library(ZebrafishDevelopmentalCAGE)'") + } + + ZebrafishSamples <- NULL + data("ZebrafishSamples", envir = environment()) + if(dataset == "ZebrafishCAGE"){ + if(group == "development"){ + if(!(all(sample %in% ZebrafishSamples$sample))){ + stop("Some sample names cannot be found for the specified dataset! Call data(ZebrafishSamples) and check the 'sample' column for valid sample names!") + }else{ + genome.name <- "BSgenome.Drerio.UCSC.danRer7" + ZebrafishCAGE <- NULL + data("ZebrafishCAGE", envir = environment()) + ctssTable <- ZebrafishCAGE[["development"]][,c("chr", "pos", "strand", sample)] + ctssTable <- ctssTable[apply(ctssTable[,4:ncol(ctssTable),drop=FALSE], 1, function(x) {any(x>0)}),] + } + }else{ + stop("Invalid group name! There is only one group in this dataset named 'development'.") + } + }else{ + stop("Invalid dataset name! There is only one available dataset named 'ZebrafishCAGE'.") + } + + + }else{ + stop("Currently only the following public CAGE data resources are supported: 'FANTOM5', 'FANTOM3and4', 'ENCODE', 'ZebrafishDevelopment'. Refer to CAGEr vignette on how to use those resources!") + } + + rownames(ctssTable) <- c(1:nrow(ctssTable)) + + sample.labels <- colnames(ctssTable)[4:ncol(ctssTable)] + names(sample.labels) <- rainbow(n = length(sample.labels)) + myCAGEset <- new("CAGEset", genomeName = genome.name, inputFiles = paste(source, sample.labels, sep = "__"), inputFilesType = source, sampleLabels = sample.labels) + myCAGEset@librarySizes <- as.integer(colSums(ctssTable[,4:ncol(ctssTable),drop=FALSE])) + myCAGEset@CTSScoordinates <- ctssTable[, c("chr", "pos", "strand")] + myCAGEset@tagCountMatrix <- ctssTable[,4:ncol(ctssTable),drop=FALSE] + + return(myCAGEset) + +} \ No newline at end of file From c9033910d990b826fc2389a26466b8f4708aece9 Mon Sep 17 00:00:00 2001 From: Charles Plessy Date: Mon, 26 Jun 2023 10:50:54 +0900 Subject: [PATCH 02/17] Use match.arg to ensure valid source. --- R/ImportMethods.R | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/R/ImportMethods.R b/R/ImportMethods.R index 2dbcf72..17a86d7 100644 --- a/R/ImportMethods.R +++ b/R/ImportMethods.R @@ -656,14 +656,17 @@ setMethod( "getCTSS", "CAGEexp" #' @export setGeneric("importPublicData", - function(source, dataset, group, sample) + function(source = c("FANTOM5", "FANTOM3and4", "ENCODE", "ZebrafishDevelopment"), + dataset, + group, + sample) standardGeneric("importPublicData")) -setMethod("importPublicData", -signature(source = "character", dataset = "character", sample = "character"), -.importPublicData (source, dataset, group, sample)) +setMethod("importPublicData", signature(source = "character", dataset = "character", sample = "character"), + .importPublicData(source, dataset, group, sample)) -.importPublicData <- function (source, dataset, group, sample){ +.importPublicData <- function(source = c("FANTOM5", "FANTOM3and4", "ENCODE", "ZebrafishDevelopment"), dataset, group, sample) { + source <- match.arg(source) if(source == "ENCODE"){ @@ -972,10 +975,7 @@ signature(source = "character", dataset = "character", sample = "character"), } - }else{ - stop("Currently only the following public CAGE data resources are supported: 'FANTOM5', 'FANTOM3and4', 'ENCODE', 'ZebrafishDevelopment'. Refer to CAGEr vignette on how to use those resources!") } - rownames(ctssTable) <- c(1:nrow(ctssTable)) sample.labels <- colnames(ctssTable)[4:ncol(ctssTable)] From 6bffcac0bfffaf131faad3ddec0dbdc91245925e Mon Sep 17 00:00:00 2001 From: Charles Plessy Date: Mon, 26 Jun 2023 13:03:36 +0900 Subject: [PATCH 03/17] Import zebrafish CAGE data as CAGEexp (see #78) --- R/ImportMethods.R | 74 +++++++++++++++++++++++++++-------------------- 1 file changed, 43 insertions(+), 31 deletions(-) diff --git a/R/ImportMethods.R b/R/ImportMethods.R index 17a86d7..879b2cd 100644 --- a/R/ImportMethods.R +++ b/R/ImportMethods.R @@ -946,36 +946,8 @@ setMethod("importPublicData", signature(source = "character", dataset = "charact ctssTable <- data.frame(ctss.table, stringsAsFactors = F, check.names = F) - }else if (source == "ZebrafishDevelopment"){ - - if("ZebrafishDevelopmentalCAGE" %in% rownames(installed.packages()) == FALSE){ - stop("Requested CAGE data package is not installed! Please install and load the ZebrafishDevelopmentalCAGE package, which is available for download from http://promshift.genereg.net/CAGEr/PackageSource/.") - }else if(!("package:ZebrafishDevelopmentalCAGE" %in% search())){ - stop("Requested CAGE data package is not loaded! Please load the data package by calling 'library(ZebrafishDevelopmentalCAGE)'") - } - - ZebrafishSamples <- NULL - data("ZebrafishSamples", envir = environment()) - if(dataset == "ZebrafishCAGE"){ - if(group == "development"){ - if(!(all(sample %in% ZebrafishSamples$sample))){ - stop("Some sample names cannot be found for the specified dataset! Call data(ZebrafishSamples) and check the 'sample' column for valid sample names!") - }else{ - genome.name <- "BSgenome.Drerio.UCSC.danRer7" - ZebrafishCAGE <- NULL - data("ZebrafishCAGE", envir = environment()) - ctssTable <- ZebrafishCAGE[["development"]][,c("chr", "pos", "strand", sample)] - ctssTable <- ctssTable[apply(ctssTable[,4:ncol(ctssTable),drop=FALSE], 1, function(x) {any(x>0)}),] - } - }else{ - stop("Invalid group name! There is only one group in this dataset named 'development'.") - } - }else{ - stop("Invalid dataset name! There is only one available dataset named 'ZebrafishCAGE'.") - } - - - } + } else if (source == "ZebrafishDevelopment") .importPublicData_ZF (group = group, sample = sample) + rownames(ctssTable) <- c(1:nrow(ctssTable)) sample.labels <- colnames(ctssTable)[4:ncol(ctssTable)] @@ -987,4 +959,44 @@ setMethod("importPublicData", signature(source = "character", dataset = "charact return(myCAGEset) -} \ No newline at end of file +} + +.importPublicData_ZF <- function(group = "development", sample = NULL) { + if (group != "development") + stop("Invalid group name! There is only one group in this dataset named 'development'.") + if (! requireNamespace("ZebrafishDevelopmentalCAGE")) + stop ("This function requires the ", dQuote("ZebrafishDevelopmentalCAGE"), + " package; please install it from http://promshift.genereg.net/CAGEr/PackageSource/.") + if (! requireNamespace("BSgenome.Drerio.UCSC.danRer7")) + stop ("This function requires the ", dQuote("BSgenome.Drerio.UCSC.danRer7"), " package.") + + ZebrafishSamples <- NULL + data("ZebrafishSamples", package = "ZebrafishDevelopmentalCAGE", envir = environment()) + + validSampleNames <- levels(ZebrafishSamples$sample) + if (is.null(sample )) sample <- validSampleNames + if ( ! all(sample %in% validSampleNames)) + stop("At least one sample name is not valid. ", + "Valid sample names are: ", paste(validSampleNames, collapse = ", "), ".") + + genome.name <- "BSgenome.Drerio.UCSC.danRer7" + ZebrafishCAGE <- NULL + data("ZebrafishCAGE", package = "ZebrafishDevelopmentalCAGE", envir = environment()) + # The vignette of ZebrafishDevelopmentalCAGE states that the provided coordinates are 1-based. + ctssRanges <- CTSS(ZebrafishCAGE[["development"]]$chr, + ZebrafishCAGE[["development"]]$pos, + ZebrafishCAGE[["development"]]$strand, + seqinfo = seqinfo(BSgenome.Drerio.UCSC.danRer7:::BSgenome.Drerio.UCSC.danRer7)) + ctssDF <- lapply(ZebrafishCAGE[["development"]][ , sample], Rle) |> DataFrame() + ctssSE <- SummarizedExperiment(c(counts = ctssDF), ctssRanges) + ctssSE <- ctssSE[rowSums(ZebrafishCAGE[["development"]][ , sample]) > 0,] + ctssSE <- sort(ctssSE) + ce <- CAGEexp(genomeName = "BSgenome.Drerio.UCSC.danRer7", + colData = DataFrame( sampleLabels = sample, + inputFiles = NA, + inputFilesType = 'ctss', + librarySizes = sapply(assay(ctssSE), sum), + row.names = sample)) + CTSStagCountSE(ce) <- ctssSE + ce +} From 14a2396057a022762acaf6f68945d00ad689dae0 Mon Sep 17 00:00:00 2001 From: Charles Plessy Date: Tue, 27 Jun 2023 10:45:32 +0900 Subject: [PATCH 04/17] Support loading data from URLs. Closes #50 See also #52 and #78 --- R/ImportMethods.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/R/ImportMethods.R b/R/ImportMethods.R index 879b2cd..8d5f930 100644 --- a/R/ImportMethods.R +++ b/R/ImportMethods.R @@ -121,7 +121,8 @@ setGeneric( "getCTSS" checkFilesExist <- function(paths) { for (f in paths) - if (! file.exists(f)) stop("Could not locate input file ", f) + if (isFALSE(grepl("^http", f))) + if (! file.exists(f)) stop("Could not locate input file ", f) } #' toCTSSdt From 2afbddc8691a08395f63b8a0180aa864e80c24f9 Mon Sep 17 00:00:00 2001 From: Charles Plessy Date: Tue, 27 Jun 2023 10:46:30 +0900 Subject: [PATCH 05/17] Correct the ZF loaded to track BSgenome name. --- R/ImportMethods.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/ImportMethods.R b/R/ImportMethods.R index 8d5f930..95f7186 100644 --- a/R/ImportMethods.R +++ b/R/ImportMethods.R @@ -980,14 +980,14 @@ setMethod("importPublicData", signature(source = "character", dataset = "charact stop("At least one sample name is not valid. ", "Valid sample names are: ", paste(validSampleNames, collapse = ", "), ".") - genome.name <- "BSgenome.Drerio.UCSC.danRer7" ZebrafishCAGE <- NULL data("ZebrafishCAGE", package = "ZebrafishDevelopmentalCAGE", envir = environment()) # The vignette of ZebrafishDevelopmentalCAGE states that the provided coordinates are 1-based. ctssRanges <- CTSS(ZebrafishCAGE[["development"]]$chr, ZebrafishCAGE[["development"]]$pos, ZebrafishCAGE[["development"]]$strand, - seqinfo = seqinfo(BSgenome.Drerio.UCSC.danRer7:::BSgenome.Drerio.UCSC.danRer7)) + seqinfo = seqinfo(BSgenome.Drerio.UCSC.danRer7:::BSgenome.Drerio.UCSC.danRer7), + bsgenomeName = "BSgenome.Drerio.UCSC.danRer7") ctssDF <- lapply(ZebrafishCAGE[["development"]][ , sample], Rle) |> DataFrame() ctssSE <- SummarizedExperiment(c(counts = ctssDF), ctssRanges) ctssSE <- ctssSE[rowSums(ZebrafishCAGE[["development"]][ , sample]) > 0,] From 8dcec188623e55b74fec0ad9e7a07fd3c861c6af Mon Sep 17 00:00:00 2001 From: Charles Plessy Date: Tue, 27 Jun 2023 10:48:33 +0900 Subject: [PATCH 06/17] Load FANTOM5 data using getCTSS(). Also rename function parameter, because "source" is already a function name. See also #78 --- R/ImportMethods.R | 95 +++++++++++++++++++---------------------------- 1 file changed, 38 insertions(+), 57 deletions(-) diff --git a/R/ImportMethods.R b/R/ImportMethods.R index 95f7186..0fd1216 100644 --- a/R/ImportMethods.R +++ b/R/ImportMethods.R @@ -657,19 +657,16 @@ setMethod( "getCTSS", "CAGEexp" #' @export setGeneric("importPublicData", - function(source = c("FANTOM5", "FANTOM3and4", "ENCODE", "ZebrafishDevelopment"), + function(origin = c("FANTOM5", "FANTOM3and4", "ENCODE", "ZebrafishDevelopment"), dataset, group, sample) standardGeneric("importPublicData")) -setMethod("importPublicData", signature(source = "character", dataset = "character", sample = "character"), - .importPublicData(source, dataset, group, sample)) +.importPublicData <- function(origin = c("FANTOM5", "FANTOM3and4", "ENCODE", "ZebrafishDevelopment"), dataset, group, sample) { + # origin <- match.arg(origin) -.importPublicData <- function(source = c("FANTOM5", "FANTOM3and4", "ENCODE", "ZebrafishDevelopment"), dataset, group, sample) { - source <- match.arg(source) - - if(source == "ENCODE"){ + if(origin == "ENCODE"){ if("ENCODEprojectCAGE" %in% rownames(installed.packages()) == FALSE){ stop("Requested CAGE data package is not installed! Please install and load the ENCODEprojectCAGE package, which is available for download from http://promshift.genereg.net/CAGEr/PackageSource/.") @@ -787,7 +784,7 @@ setMethod("importPublicData", signature(source = "character", dataset = "charact } - }else if(source == "FANTOM3and4"){ + }else if(origin == "FANTOM3and4"){ if("FANTOM3and4CAGE" %in% rownames(installed.packages()) == FALSE){ stop("Requested CAGE data package is not installed! Please install and load the FANTOM3and4CAGE package available from Bioconductor.") @@ -900,60 +897,15 @@ setMethod("importPublicData", signature(source = "character", dataset = "charact - }else if (source == "FANTOM5"){ - - if(length(dataset) != 1){ - stop("For FANTOM5 only one dataset can be specified and it can be either 'human' or 'mouse'!") - }else if(!(dataset %in% c("human", "mouse"))){ - stop("For FANTOM5, dataset can be either 'human' or 'mouse'!") - } - if(dataset == "human"){ - FANTOM5humanSamples <- NULL - data("FANTOM5humanSamples", envir = environment()) - samples.info <- FANTOM5humanSamples - genome.name <- "BSgenome.Hsapiens.UCSC.hg19" - }else if(dataset == "mouse"){ - FANTOM5mouseSamples <- NULL - data("FANTOM5mouseSamples", envir = environment()) - samples.info <- FANTOM5mouseSamples - genome.name <- "BSgenome.Mmusculus.UCSC.mm9" - } - - if(!(all(sample %in% samples.info$sample))){ - stop(paste("Some sample names cannot be found for the specified dataset! Call data(FANTOM5", dataset, "Samples) and check the 'sample' column for valid sample names!", sep = "")) - } - - - - for(i in c(1:length(sample))){ - - message("Fetching sample: ", sample[i], "...") - sample.url <- samples.info[samples.info$sample == sample[i], "data_url"] - con <- gzcon(url(paste(sample.url))) - ctss <- scan(con, what = list(character(), NULL, integer(), NULL, integer(), character())) - ctss.df <- data.table(chr = ctss[[1]], pos = ctss[[3]], strand = ctss[[6]], tagCount = ctss[[5]]) - setnames(ctss.df, c("chr", "pos", "strand", sample[i])) - setkeyv(ctss.df, cols = c("chr", "pos", "strand")) - if(i == 1){ - ctss.table <- ctss.df - }else{ - message("Adding sample to CTSS table...\n") - ctss.table <- merge(ctss.table, ctss.df, all.x = T, all.y = T) - ctss.table[is.na(ctss.table)] <- 0 - } - - } - - ctssTable <- data.frame(ctss.table, stringsAsFactors = F, check.names = F) - - - } else if (source == "ZebrafishDevelopment") .importPublicData_ZF (group = group, sample = sample) + }else if (origin == "FANTOM5") { + .importPublicData_F5 (dataset = dataset, group = group, sample = sample) + }else if (origin == "ZebrafishDevelopment") .importPublicData_ZF (group = group, sample = sample) rownames(ctssTable) <- c(1:nrow(ctssTable)) sample.labels <- colnames(ctssTable)[4:ncol(ctssTable)] names(sample.labels) <- rainbow(n = length(sample.labels)) - myCAGEset <- new("CAGEset", genomeName = genome.name, inputFiles = paste(source, sample.labels, sep = "__"), inputFilesType = source, sampleLabels = sample.labels) + myCAGEset <- new("CAGEset", genomeName = genome.name, inputFiles = paste(origin, sample.labels, sep = "__"), inputFilesType = origin, sampleLabels = sample.labels) myCAGEset@librarySizes <- as.integer(colSums(ctssTable[,4:ncol(ctssTable),drop=FALSE])) myCAGEset@CTSScoordinates <- ctssTable[, c("chr", "pos", "strand")] myCAGEset@tagCountMatrix <- ctssTable[,4:ncol(ctssTable),drop=FALSE] @@ -962,6 +914,32 @@ setMethod("importPublicData", signature(source = "character", dataset = "charact } +.importPublicData_F5 <- function(dataset = c("human", "mouse"), group = NULL, sample = NULL) { + dataset <- match.arg(dataset) + if (dataset == "human") { + if (! requireNamespace("BSgenome.Hsapiens.UCSC.hg19")) + stop ("This function requires the ", dQuote("BSgenome.Hsapiens.UCSC.hg19"), " package.") + FANTOM5humanSamples <- NULL + data("FANTOM5humanSamples", package = "CAGEr", envir = environment()) + samples.info <- FANTOM5humanSamples + genome.name <- "BSgenome.Hsapiens.UCSC.hg19" + genome.obj <- BSgenome.Hsapiens.UCSC.hg19::BSgenome.Hsapiens.UCSC.hg19 + } else { + if (! requireNamespace("BSgenome.Mmusculus.UCSC.mm9")) + stop ("This function requires the ", dQuote("BSgenome.Mmusculus.UCSC.mm9"), " package.") + FANTOM5mouseSamples <- NULL + data("FANTOM5mouseSamples", envir = environment()) + samples.info <- FANTOM5mouseSamples + genome.name <- "BSgenome.Mmusculus.UCSC.mm9" + genome.obj <- BSgenome.Mmusculus.UCSC.mm9::BSgenome.Mmusculus.UCSC.mm9 + } + ce <- CAGEexp(genomeName = genome.name, + inputFiles = samples.info[samples.info$sample %in% sample,"data_url"], + inputFilesType = "bedScore", + sampleLabels = sample) + getCTSS(ce) +} + .importPublicData_ZF <- function(group = "development", sample = NULL) { if (group != "development") stop("Invalid group name! There is only one group in this dataset named 'development'.") @@ -1001,3 +979,6 @@ setMethod("importPublicData", signature(source = "character", dataset = "charact CTSStagCountSE(ce) <- ctssSE ce } + +setMethod("importPublicData", signature(origin = "character", dataset = "character", sample = "character"), + .importPublicData) \ No newline at end of file From d2cdd4c28bc7ac521769d20ab0a67459c61b0fa4 Mon Sep 17 00:00:00 2001 From: Charles Plessy Date: Tue, 27 Jun 2023 10:50:40 +0900 Subject: [PATCH 07/17] Return a CAGEexp object. --- R/ImportMethods.R | 25 +++++++------------------ 1 file changed, 7 insertions(+), 18 deletions(-) diff --git a/R/ImportMethods.R b/R/ImportMethods.R index 0fd1216..60146f4 100644 --- a/R/ImportMethods.R +++ b/R/ImportMethods.R @@ -895,23 +895,12 @@ setGeneric("importPublicData", ctssTable <- data.frame(ctssTable, stringsAsFactors = F, check.names = F) - - - }else if (origin == "FANTOM5") { - .importPublicData_F5 (dataset = dataset, group = group, sample = sample) - }else if (origin == "ZebrafishDevelopment") .importPublicData_ZF (group = group, sample = sample) - - rownames(ctssTable) <- c(1:nrow(ctssTable)) - - sample.labels <- colnames(ctssTable)[4:ncol(ctssTable)] - names(sample.labels) <- rainbow(n = length(sample.labels)) - myCAGEset <- new("CAGEset", genomeName = genome.name, inputFiles = paste(origin, sample.labels, sep = "__"), inputFilesType = origin, sampleLabels = sample.labels) - myCAGEset@librarySizes <- as.integer(colSums(ctssTable[,4:ncol(ctssTable),drop=FALSE])) - myCAGEset@CTSScoordinates <- ctssTable[, c("chr", "pos", "strand")] - myCAGEset@tagCountMatrix <- ctssTable[,4:ncol(ctssTable),drop=FALSE] - - return(myCAGEset) - + } else if (origin == "FANTOM5") { + ce <- .importPublicData_F5 (dataset = dataset, group = group, sample = sample) + } else if (origin == "ZebrafishDevelopment") { + ce <- .importPublicData_ZF (group = group, sample = sample) + } + ce } .importPublicData_F5 <- function(dataset = c("human", "mouse"), group = NULL, sample = NULL) { @@ -928,7 +917,7 @@ setGeneric("importPublicData", if (! requireNamespace("BSgenome.Mmusculus.UCSC.mm9")) stop ("This function requires the ", dQuote("BSgenome.Mmusculus.UCSC.mm9"), " package.") FANTOM5mouseSamples <- NULL - data("FANTOM5mouseSamples", envir = environment()) + data("FANTOM5mouseSamples", package = "CAGEr", envir = environment()) samples.info <- FANTOM5mouseSamples genome.name <- "BSgenome.Mmusculus.UCSC.mm9" genome.obj <- BSgenome.Mmusculus.UCSC.mm9::BSgenome.Mmusculus.UCSC.mm9 From dceee1efda78e5a0c5e8a3b10641303b11646581 Mon Sep 17 00:00:00 2001 From: Charles Plessy Date: Tue, 27 Jun 2023 11:08:13 +0900 Subject: [PATCH 08/17] Validate sample names. --- R/ImportMethods.R | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/R/ImportMethods.R b/R/ImportMethods.R index 60146f4..733e94e 100644 --- a/R/ImportMethods.R +++ b/R/ImportMethods.R @@ -922,6 +922,14 @@ setGeneric("importPublicData", genome.name <- "BSgenome.Mmusculus.UCSC.mm9" genome.obj <- BSgenome.Mmusculus.UCSC.mm9::BSgenome.Mmusculus.UCSC.mm9 } + + validSampleNames <- samples.info$sample + if (is.null(sample)) sample <- validSampleNames + if ( ! all(sample %in% validSampleNames)) + stop("At least one sample name is not valid. ", + "Call data('FANTOM5humanSamples', package='CAGEr') or ", + "data('FANTOM5mouseSamples', package='CAGEr') to check valid names.") + ce <- CAGEexp(genomeName = genome.name, inputFiles = samples.info[samples.info$sample %in% sample,"data_url"], inputFilesType = "bedScore", From 56f1cd5f9ea5217d483fc48038eb664640085b21 Mon Sep 17 00:00:00 2001 From: Charles Plessy Date: Tue, 27 Jun 2023 11:38:09 +0900 Subject: [PATCH 09/17] Factorise code out of .importPublicData_ZF --- R/ImportMethods.R | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/R/ImportMethods.R b/R/ImportMethods.R index 733e94e..153f110 100644 --- a/R/ImportMethods.R +++ b/R/ImportMethods.R @@ -663,6 +663,18 @@ setGeneric("importPublicData", sample) standardGeneric("importPublicData")) +.df2SE <- function(df, sample, genome.name) { + ctssRanges <- CTSS(df$chr, + df$pos, + df$strand, + seqinfo = seqinfo(CAGEr:::getRefGenome(genome.name)), + bsgenomeName = genome.name) + ctssDF <- lapply(df[ , sample], Rle) |> DataFrame() + ctssSE <- SummarizedExperiment(c(counts = ctssDF), ctssRanges) + ctssSE <- ctssSE[rowSums(df[ , sample]) > 0,] + sort(ctssSE) +} + .importPublicData <- function(origin = c("FANTOM5", "FANTOM3and4", "ENCODE", "ZebrafishDevelopment"), dataset, group, sample) { # origin <- match.arg(origin) @@ -958,15 +970,7 @@ setGeneric("importPublicData", ZebrafishCAGE <- NULL data("ZebrafishCAGE", package = "ZebrafishDevelopmentalCAGE", envir = environment()) # The vignette of ZebrafishDevelopmentalCAGE states that the provided coordinates are 1-based. - ctssRanges <- CTSS(ZebrafishCAGE[["development"]]$chr, - ZebrafishCAGE[["development"]]$pos, - ZebrafishCAGE[["development"]]$strand, - seqinfo = seqinfo(BSgenome.Drerio.UCSC.danRer7:::BSgenome.Drerio.UCSC.danRer7), - bsgenomeName = "BSgenome.Drerio.UCSC.danRer7") - ctssDF <- lapply(ZebrafishCAGE[["development"]][ , sample], Rle) |> DataFrame() - ctssSE <- SummarizedExperiment(c(counts = ctssDF), ctssRanges) - ctssSE <- ctssSE[rowSums(ZebrafishCAGE[["development"]][ , sample]) > 0,] - ctssSE <- sort(ctssSE) + ctssSE <- .df2SE(ZebrafishCAGE$development, sample, "BSgenome.Drerio.UCSC.danRer7") ce <- CAGEexp(genomeName = "BSgenome.Drerio.UCSC.danRer7", colData = DataFrame( sampleLabels = sample, inputFiles = NA, From 6413eddb83fff8b908161e50f2c385e15daa19f8 Mon Sep 17 00:00:00 2001 From: Charles Plessy Date: Tue, 27 Jun 2023 13:48:48 +0900 Subject: [PATCH 10/17] Draft import function for FANTOM 3/4 See also #78 --- R/ImportMethods.R | 164 +++++++++++++++------------------------------- 1 file changed, 53 insertions(+), 111 deletions(-) diff --git a/R/ImportMethods.R b/R/ImportMethods.R index 153f110..4504a52 100644 --- a/R/ImportMethods.R +++ b/R/ImportMethods.R @@ -670,6 +670,7 @@ setGeneric("importPublicData", seqinfo = seqinfo(CAGEr:::getRefGenome(genome.name)), bsgenomeName = genome.name) ctssDF <- lapply(df[ , sample], Rle) |> DataFrame() + colnames(ctssDF) <- make.names(colnames(ctssDF)) ctssSE <- SummarizedExperiment(c(counts = ctssDF), ctssRanges) ctssSE <- ctssSE[rowSums(df[ , sample]) > 0,] sort(ctssSE) @@ -796,117 +797,8 @@ setGeneric("importPublicData", } - }else if(origin == "FANTOM3and4"){ - - if("FANTOM3and4CAGE" %in% rownames(installed.packages()) == FALSE){ - stop("Requested CAGE data package is not installed! Please install and load the FANTOM3and4CAGE package available from Bioconductor.") - }else if(!("package:FANTOM3and4CAGE" %in% search())){ - stop("Requested CAGE data package is not loaded! Please load the data package by calling 'library(FANTOM3and4CAGE)'") - } - FANTOMhumanSamples <- FANTOMmouseSamples <- NULL - data("FANTOMhumanSamples", envir = environment()) - info.df1 <- FANTOMhumanSamples - data("FANTOMmouseSamples", envir = environment()) - info.df2 <- FANTOMmouseSamples - - if(!(all(dataset %in% info.df1$dataset) | all(dataset %in% info.df2$dataset))){ - stop("Specified dataset(s) not found! Call data(FANTOMhumanSamples) and data(FANTOMmouseSamples) and check 'dataset' column for available ENCODE datasets!") - } - if(length(grep("human", dataset))>0){ - genome.name <- "BSgenome.Hsapiens.UCSC.hg18" - info.df <- info.df1 - }else if(length(grep("mouse", dataset))>0){ - genome.name <- "BSgenome.Mmusculus.UCSC.mm9" - info.df <- info.df2 - } - - if(length(dataset) == 1){ - - if(!(all(group %in% info.df[info.df$dataset == dataset,"group"]))){ - stop("Some of the provided groups cannot be found in the specified dataset!") - } - if(length(group) == 1){ - if(!(all(sample %in% info.df[info.df$group == group,"sample"]))){ - stop("Some of the provided samples cannot be found in the specified group!") - } - }else if(length(group) == length(sample)){ - if(!all(sapply(c(1:length(group)), function(x) {sample[x] %in% info.df[info.df$group == group[x],"sample"]}))){ - stop("Provided 'group' and 'sample' do not match! Some of the provided samples cannot be found in the corresponding groups!") - } - }else{ - stop("Number of elements in the 'group' must be either 1 or must match the number of elements in the 'sample'!") - } - - - }else if(length(dataset) == length(group)){ - if(!all(sapply(c(1:length(dataset)), function(x) {group[x] %in% info.df[info.df$dataset == dataset[x],"group"]}))){ - stop("Provided 'dataset' and 'group' do not match! Some of the provided groups cannot be found in the corresponding datasets!") - } - if(length(group) == length(sample)){ - if(!all(sapply(c(1:length(group)), function(x) {sample[x] %in% info.df[info.df$group == group[x],"sample"]}))){ - stop("Provided 'group' and 'sample' do not match! Some of the provided samples cannot be found in the corresponding groups!") - } - }else{ - stop("Number of elements in the 'group' must match the number of elements in the 'sample'!") - } - - - }else{ - stop("Number of elements in the 'dataset' must be either 1 or must match the number of elements in the 'group' and in the 'sample'!") - } - - data(list = dataset, envir = environment()) - - if(length(unique(dataset))>1){ - - for(i in 1:length(unique(dataset))){ - dset <- get(unique(dataset)[i]) - for(j in 1:length(unique(group[which(dataset == unique(dataset)[i])]))){ - g <- unique(group[which(dataset == unique(dataset)[i])])[j] - ctss <- dset[[g]][, c("chr", "pos", "strand", sample[which(group == g)])] - discard <- apply(ctss[, c(4:ncol(ctss)), drop = F], 1, function(x) {sum(x > 0)}) >= 1 - ctss <- data.table(ctss[discard,]) - setnames(ctss, c("chr", "pos", "strand", paste(g, sample[which(group == g)], sep = "__"))) - setkeyv(ctss, cols = c("chr", "pos", "strand")) - if(i == 1 & j == 1){ - ctssTable <- ctss - }else{ - ctssTable <- merge(ctssTable, ctss, all.x = T, all.y = T) - } - } - } - - ctssTable[is.na(ctssTable)] <- 0 - - }else{ - - selected.dataset <- get(dataset) - if(length(unique(group))>1){ - - for(i in 1:length(unique(group))){ - g <- unique(group)[i] - ctss <- selected.dataset[[g]][, c("chr", "pos", "strand", sample[which(group == g)])] - discard <- apply(ctss[, c(4:ncol(ctss)), drop = F], 1, function(x) {sum(x > 0)}) >= 1 - ctss <- data.table(ctss[discard,]) - setnames(ctss, c("chr", "pos", "strand", paste(g, sample[which(group == g)], sep = "__"))) - setkeyv(ctss, cols = c("chr", "pos", "strand")) - if(i == 1){ - ctssTable <- ctss - }else{ - ctssTable <- merge(ctssTable, ctss, all.x = T, all.y = T) - } - } - - ctssTable[is.na(ctssTable)] <- 0 - - }else{ - ctssTable <- selected.dataset[[group]][,c("chr", "pos", "strand", sample)] - setnames(ctssTable, c("chr", "pos", "strand", paste(group, sample, sep = "__"))) - } - } - - ctssTable <- data.frame(ctssTable, stringsAsFactors = F, check.names = F) - + } else if (origin == "FANTOM3and4") { + ce <- .importPublicData_F34 (dataset = dataset, group = group, sample = sample) } else if (origin == "FANTOM5") { ce <- .importPublicData_F5 (dataset = dataset, group = group, sample = sample) } else if (origin == "ZebrafishDevelopment") { @@ -915,6 +807,56 @@ setGeneric("importPublicData", ce } +.importPublicData_F34 <- function (dataset, group = NULL, sample = NULL) { + if (length(unique(dataset))>1) stop("merging datasets not supported yet.") + if (! requireNamespace("FANTOM3and4CAGE")) + stop ("This function requires the ", dQuote("FANTOM3and4CAGE"), " package.") + if (all(dataset %in% c("FANTOMtissueCAGEhuman" , "FANTOMtimecourseCAGEhuman"))) { + if (! requireNamespace("BSgenome.Hsapiens.UCSC.hg18")) + stop ("This function requires the ", dQuote("BSgenome.Hsapiens.UCSC.hg18"), " package.") + FANTOMhumanSamples <- FANTOMtimecourseCAGEhuman <- FANTOMtissueCAGEhuman <- NULL + data("FANTOMhumanSamples", package = "FANTOM3and4CAGE", envir = environment()) + data("FANTOMtimecourseCAGEhuman", package = "FANTOM3and4CAGE", envir = environment()) + data("FANTOMtissueCAGEhuman", package = "FANTOM3and4CAGE", envir = environment()) + samples.info <- FANTOMhumanSamples + genome.name <- "BSgenome.Hsapiens.UCSC.hg18" + } else if (all(dataset %in% c("FANTOMtissueCAGEmouse", "FANTOMtimecourseCAGEmouse"))) { + if (! requireNamespace("BSgenome.Mmusculus.UCSC.mm9")) + stop ("This function requires the ", dQuote("BSgenome.Mmusculus.UCSC.mm9"), " package.") + FANTOMmouseSamples <- NULL + data("FANTOMmouseSamples", package = "FANTOM3and4CAGE", envir = environment()) + data("FANTOMtimecourseCAGEmouse", package = "FANTOM3and4CAGE", envir = environment()) + data("FANTOMtissueCAGEmouse", package = "FANTOM3and4CAGE", envir = environment()) + samples.info <- FANTOMmouseSamples + genome.name <- "BSgenome.Mmusculus.UCSC.mm9" + } else { + stop("At least one dataset name is not valid") + } + + validSampleNames <- samples.info$sample + if (is.null(sample)) sample <- validSampleNames + if ( ! all(sample %in% validSampleNames)) + stop("At least one sample name is not valid. ", + "Call data('FANTOMhumanSamples', package='FANTOM3and4CAGE') or ", + "data('FANTOMmouseSamples', package='FANTOM3and4CAGE') to check valid names.") + validGroupNames <- unique(samples.info$group) + if ( ! all(group %in% validGroupNames)) + stop("At least one group name is not valid. ", + "Call data('FANTOMhumanSamples', package='FANTOM3and4CAGE') or ", + "data('FANTOMmouseSamples', package='FANTOM3and4CAGE') to check valid names.") + + df <- get(dataset)[[group]] + se <- .df2SE(df, sample, genome.name) + ce <- CAGEexp(genomeName = genome.name, + colData = DataFrame( sampleLabels = make.names(sample), + inputFiles = NA, + inputFilesType = 'ctss', + librarySizes = sapply(assay(se), sum), + row.names = make.names(sample))) + CTSStagCountSE(ce) <- se + ce +} + .importPublicData_F5 <- function(dataset = c("human", "mouse"), group = NULL, sample = NULL) { dataset <- match.arg(dataset) if (dataset == "human") { From 794225c7ee64b7cf39293d50a1baf8d328a4a4b6 Mon Sep 17 00:00:00 2001 From: Charles Plessy Date: Tue, 27 Jun 2023 15:28:52 +0900 Subject: [PATCH 11/17] Document import of CAGE resources to CAGEexp --- vignettes/CAGE_Resources.Rmd | 50 ++++++++++++++++++++---------------- 1 file changed, 28 insertions(+), 22 deletions(-) diff --git a/vignettes/CAGE_Resources.Rmd b/vignettes/CAGE_Resources.Rmd index 3f31812..f6ee747 100644 --- a/vignettes/CAGE_Resources.Rmd +++ b/vignettes/CAGE_Resources.Rmd @@ -28,9 +28,6 @@ set.seed(0xdada) Available CAGE data resources ============================= -Note: this still uses the `CAGEset` class. - - There are several large collections of CAGE data available that provide single base-pair resolution TSSs for numerous human and mouse primary cells, cell lines and tissues. Together with several minor datasets for other model organisms @@ -55,12 +52,12 @@ each TSS detected by CAGE. The list of all available samples for both human and mouse (as presented in the Supplementary Table 1 of the publication) has been included in _CAGEr_ to facilitate browsing, searching and selecting samples of interest. TSSs for selected samples are then fetched directly form the web -resource and imported into a `CAGEset` object enabling their further +resource and imported into a `CAGEexp` object enabling their further manipulation with _CAGEr_. ## FANTOM3 and 4 -Previous FANTOM projects (3 and 4) [@Consortium:2005kp,@Faulkner:2009fw,@Suzuki:2009gy]) +Previous FANTOM projects (3 and 4) [@Consortium:2005kp] [@Faulkner:2009fw] [@Suzuki:2009gy] produced CAGE datasets for multiple human and mouse tissues as well as several timecourses, including differentiation of a THP-1 human myeloid leukemia cell line. All this TSS data has been grouped into datasets by the organism and @@ -68,7 +65,7 @@ tissue of origin and has been collected into an R data package named _FANTOM3and4CAGE_, which is available from Bioconductor . The vignette accompanying the package provides information on available datasets and lists of samples. When the data package is installed, _CAGEr_ can import the -TSSs for selected samples directly into a `CAGEset` object for further +TSSs for selected samples directly into a `CAGEexp` object for further manipulation. ## ENCODE cell lines @@ -100,14 +97,12 @@ data has been collected into an R data package named _ZebrafishDevelopmentalCAGE which is available for download from CAGEr web site at . As with other data packages mentioned above, once the package is installed _CAGEr_ can use it to import stage-specific single base pair TSSs into a -`CAGEset` object. +`CAGEexp` object. Importing public TSS data for manipulation in _CAGEr_ ===================================================== -Note: this still uses the `CAGEset` class. - -The data from above mentioned resources can be imported into a `CAGEset` object +The data from above mentioned resources can be imported into a `CAGEexp` object using the `importPublicData()` function. It function has four arguments: `source`, `dataset`, `group` and `sample`. Argument `source` accepts one of the following values: `"FANTOM5"`, `"FANTOM3and4"`, `"ENCODE"`, or `"ZebrafishDevelopment"`, @@ -146,8 +141,8 @@ Provided information facilitates searching for samples of interest, _e.g._ we can search for astrocyte samples: ```{r} -astrocyteSamples <- FANTOM5humanSamples[grep("Astrocyte", - FANTOM5humanSamples[,"description"]),] +astrocyteSamples <- + FANTOM5humanSamples[grep("Astrocyte", FANTOM5humanSamples[,"description"]),] astrocyteSamples ``` @@ -170,11 +165,11 @@ astrocyteSamples[,"sample"] and to import first three samples type: ``` -astrocyteCAGEset <- importPublicData(source = "FANTOM5", dataset = "human", +astrocyteCAGEexp <- importPublicData(origin = "FANTOM5", dataset = "human", sample = astrocyteSamples[1:3,"sample"]) ``` -The resulting `astrocyteCAGEset` is a `CAGEset` object that can be included in +The resulting `astrocyteCAGEexp` is a `CAGEexp` object that can be included in the _CAGEr_ workflow described above to perform normalisation, clustering, visualisation, etc. @@ -205,22 +200,27 @@ values that should be passed to corresponding arguments in `importPublicData()` function. For example to import human kidney normal and malignancy samples call: ``` -kidneyCAGEset <- importPublicData(source = "FANTOM3and4", +kidneyCAGEexp <- importPublicData(origin = "FANTOM3and4", dataset = "FANTOMtissueCAGEhuman", group = "kidney", sample = c("kidney", "malignancy")) ``` -When the samples belong to different groups or different datasets, it is necessary to provide the dataset and group assignment for each sample separately: +When the samples belong to different groups or different datasets, it is +necessary to provide the dataset and group assignment for each sample separately: + +Note: this functionality is disable for the moment. Please open an issue if you +need it. ``` -mixedCAGEset <- importPublicData(source = "FANTOM3and4", +mixedCAGEexp <- importPublicData(origin = "FANTOM3and4", dataset = c("FANTOMtissueCAGEmouse", "FANTOMtissueCAGEmouse", "FANTOMtimecourseCAGEmouse"), group = c("liver", "liver", "liver_under_constant_darkness"), sample = c("cloned_mouse", "control_mouse", "4_hr")) ``` -For more details about datasets available in the \emph{FANTOM3and4CAGE} data package please refer to the vignette accompanying the package. +For more details about datasets available in the _FANTOM3and4CAGE_ data package +please refer to the vignette accompanying the package. ## _ENCODEprojectCAGE_ data package @@ -236,10 +236,16 @@ library(ENCODEprojectCAGE) data(ENCODEhumanCellLinesSamples) ``` -The information provided in this data frame is analogous to the one in previously discussed data package and provides values to be used with `importPublicData()` function. The command to import whole cell CAGE samples for three different cell lines would look like this: +The information provided in this data frame is analogous to the one in +previously discussed data package and provides values to be used with +`importPublicData()` function. The command to import whole cell CAGE samples for +three different cell lines would look like this: + +Note: this functionality is disable for the moment. Please open an issue if you +need it. ``` -ENCODEset <- importPublicData(source = "ENCODE", +ENCODEset <- importPublicData(origin = "ENCODE", dataset = c("A549", "H1-hESC", "IMR90"), group = c("cell", "cell", "cell"), sample = c("A549_cell_rep1", "H1-hESC_cell_rep1", "IMR90_cell_rep1")) @@ -266,7 +272,7 @@ belong to the same group called `development`. To import selected samples from this dataset type: ``` -zebrafishCAGEset <- importPublicData(source = "ZebrafishDevelopment", +zebrafishCAGEexp <- importPublicData(origin = "ZebrafishDevelopment", dataset = "ZebrafishCAGE", group = "development", sample = c("zf_64cells", "zf_prim6")) ``` @@ -275,7 +281,7 @@ For more details please refer to the vignette accompanying the data package. Importing TSS data from any of the four above explained resources results in the -`CAGEset` object that can be directly included into the workflow provided by +`CAGEexp` object that can be directly included into the workflow provided by _CAGEr_ to perform normalisation, clustering, promoter width analysis, visualisation, _etc_. This high-resolution TSS data can then easily be integrated with other genomic data types to perform various promoter-centred From 3edefb78589c2d81e7ea3ac33b68de747dc89157 Mon Sep 17 00:00:00 2001 From: Charles Plessy Date: Tue, 27 Jun 2023 15:29:42 +0900 Subject: [PATCH 12/17] Clean up :: and ::: calls. --- R/ImportMethods.R | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/R/ImportMethods.R b/R/ImportMethods.R index 4504a52..ca5f340 100644 --- a/R/ImportMethods.R +++ b/R/ImportMethods.R @@ -667,7 +667,7 @@ setGeneric("importPublicData", ctssRanges <- CTSS(df$chr, df$pos, df$strand, - seqinfo = seqinfo(CAGEr:::getRefGenome(genome.name)), + seqinfo = seqinfo(getRefGenome(genome.name)), bsgenomeName = genome.name) ctssDF <- lapply(df[ , sample], Rle) |> DataFrame() colnames(ctssDF) <- make.names(colnames(ctssDF)) @@ -866,7 +866,6 @@ setGeneric("importPublicData", data("FANTOM5humanSamples", package = "CAGEr", envir = environment()) samples.info <- FANTOM5humanSamples genome.name <- "BSgenome.Hsapiens.UCSC.hg19" - genome.obj <- BSgenome.Hsapiens.UCSC.hg19::BSgenome.Hsapiens.UCSC.hg19 } else { if (! requireNamespace("BSgenome.Mmusculus.UCSC.mm9")) stop ("This function requires the ", dQuote("BSgenome.Mmusculus.UCSC.mm9"), " package.") @@ -874,7 +873,6 @@ setGeneric("importPublicData", data("FANTOM5mouseSamples", package = "CAGEr", envir = environment()) samples.info <- FANTOM5mouseSamples genome.name <- "BSgenome.Mmusculus.UCSC.mm9" - genome.obj <- BSgenome.Mmusculus.UCSC.mm9::BSgenome.Mmusculus.UCSC.mm9 } validSampleNames <- samples.info$sample From dbcab6c2989dc6547cc8a89402e50cb36177c26b Mon Sep 17 00:00:00 2001 From: Charles Plessy Date: Tue, 27 Jun 2023 15:30:07 +0900 Subject: [PATCH 13/17] Enforce sanitised names. --- R/ImportMethods.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/ImportMethods.R b/R/ImportMethods.R index ca5f340..be466a3 100644 --- a/R/ImportMethods.R +++ b/R/ImportMethods.R @@ -885,7 +885,7 @@ setGeneric("importPublicData", ce <- CAGEexp(genomeName = genome.name, inputFiles = samples.info[samples.info$sample %in% sample,"data_url"], inputFilesType = "bedScore", - sampleLabels = sample) + sampleLabels = make.names(sample)) getCTSS(ce) } From 35fa520283792cdc3bd1875a5feb496abbbe0e79 Mon Sep 17 00:00:00 2001 From: Charles Plessy Date: Tue, 27 Jun 2023 15:30:27 +0900 Subject: [PATCH 14/17] Limited ENCODE import (no merging). --- R/ImportMethods.R | 164 +++++++++++++--------------------------------- 1 file changed, 44 insertions(+), 120 deletions(-) diff --git a/R/ImportMethods.R b/R/ImportMethods.R index be466a3..e88f3a1 100644 --- a/R/ImportMethods.R +++ b/R/ImportMethods.R @@ -678,126 +678,9 @@ setGeneric("importPublicData", .importPublicData <- function(origin = c("FANTOM5", "FANTOM3and4", "ENCODE", "ZebrafishDevelopment"), dataset, group, sample) { # origin <- match.arg(origin) - - if(origin == "ENCODE"){ - - if("ENCODEprojectCAGE" %in% rownames(installed.packages()) == FALSE){ - stop("Requested CAGE data package is not installed! Please install and load the ENCODEprojectCAGE package, which is available for download from http://promshift.genereg.net/CAGEr/PackageSource/.") - }else if(!("package:ENCODEprojectCAGE" %in% search())){ - stop("Requested CAGE data package is not loaded! Please load the data package by calling 'library(ENCODEprojectCAGE)'") - } - - if(dataset[1] == "ENCODEtissueCAGEfly"){ - genome.name <- "BSgenome.Dmelanogaster.UCSC.dm3" - ENCODEtissueCAGEfly <- NULL - data("ENCODEtissueCAGEfly", envir = environment()) - if(group == "embryo"){ - if(sample == "mixed_embryos_0-24hr"){ - - ctssTable <- ENCODEtissueCAGEfly[["embryo"]] - - }else{ - stop("Specified sample not valid! The dataset 'ENCODEtissueCAGEfly' containes only one sample named 'mixed_embryos_0-24hr'!") - } - }else{ - stop("Specified group not valid! The dataset 'ENCODEtissueCAGEfly' containes only one group named 'embryo'!") - } - - }else{ - genome.name <- "BSgenome.Hsapiens.UCSC.hg19" - ENCODEhumanCellLinesSamples <- NULL - data("ENCODEhumanCellLinesSamples", envir = environment()) - info.df <- ENCODEhumanCellLinesSamples - - if(!(all(dataset %in% info.df$dataset))){ - stop("Specified dataset(s) not found! Call data(ENCODEhumanCellLinesSamples) and check 'dataset' column for available ENCODE datasets!") - } - if(length(dataset) == 1){ - - if(!(all(group %in% info.df[info.df$dataset == dataset,"group"]))){ - stop("Some of the provided groups cannot be found in the specified dataset!") - } - if(length(group) == 1){ - if(!(all(sample %in% info.df[info.df$group == group,"sample"]))){ - stop("Some of the provided samples cannot be found in the specified group!") - } - }else if(length(group) == length(sample)){ - if(!all(sapply(c(1:length(group)), function(x) {sample[x] %in% info.df[info.df$group == group[x],"sample"]}))){ - stop("Provided 'group' and 'sample' do not match! Some of the provided samples cannot be found in the corresponding groups!") - } - }else{ - stop("Number of elements in the 'group' must be either 1 or must match the number of elements in the 'sample'!") - } - - - }else if(length(dataset) == length(group)){ - if(!all(sapply(c(1:length(dataset)), function(x) {group[x] %in% info.df[info.df$dataset == dataset[x],"group"]}))){ - stop("Provided 'dataset' and 'group' do not match! Some of the provided groups cannot be found in the corresponding datasets!") - } - if(length(group) == length(sample)){ - if(!all(sapply(c(1:length(group)), function(x) {sample[x] %in% info.df[info.df$group == group[x],"sample"]}))){ - stop("Provided 'group' and 'sample' do not match! Some of the provided samples cannot be found in the corresponding groups!") - } - }else{ - stop("Number of elements in the 'group' must match the number of elements in the 'sample'!") - } - - - }else{ - stop("Number of elements in the 'dataset' must be either 1 or must match the number of elements in the 'group' and in the 'sample'!") - } - - data(list = dataset, envir = environment()) - - if(length(unique(dataset))>1){ - - for(i in 1:length(unique(dataset))){ - dset <- get(unique(dataset)[i]) - for(j in 1:length(unique(group[which(dataset == unique(dataset)[i])]))){ - g <- unique(group[which(dataset == unique(dataset)[i])])[j] - ctss <- dset[[g]][, c("chr", "pos", "strand", sample[which((group == g) & (dataset == unique(dataset)[i]))])] - discard <- apply(ctss[, c(4:ncol(ctss)), drop = F], 1, function(x) {sum(x > 0)}) >= 1 - ctss <- data.table(ctss[discard,]) - setkeyv(ctss, cols = c("chr", "pos", "strand")) - if(i == 1 & j == 1){ - ctssTable <- ctss - }else{ - ctssTable <- merge(ctssTable, ctss, all.x = T, all.y = T) - } - } - } - ctssTable[is.na(ctssTable)] <- 0 - - }else{ - - selected.dataset <- get(dataset) - if(length(unique(group))>1){ - - for(i in 1:length(unique(group))){ - g <- unique(group)[i] - ctss <- selected.dataset[[g]][, c("chr", "pos", "strand", sample[which(group == g)])] - discard <- apply(ctss[, c(4:ncol(ctss)), drop = F], 1, function(x) {sum(x > 0)}) >= 1 - ctss <- data.table(ctss[discard,]) - setkeyv(ctss, cols = c("chr", "pos", "strand")) - if(i == 1){ - ctssTable <- ctss - }else{ - ctssTable <- merge(ctssTable, ctss, all.x = T, all.y = T) - } - } - - ctssTable[is.na(ctssTable)] <- 0 - - }else{ - ctssTable <- selected.dataset[[group]][,c("chr", "pos", "strand", sample)] - } - } - - ctssTable <- data.frame(ctssTable, stringsAsFactors = F, check.names = F) - } - - - } else if (origin == "FANTOM3and4") { + if (origin == "ENCODE") { + ce <- .importPublicData_ENCODE (dataset = dataset, group = group, sample = sample) + } else if (origin == "FANTOM3and4") { ce <- .importPublicData_F34 (dataset = dataset, group = group, sample = sample) } else if (origin == "FANTOM5") { ce <- .importPublicData_F5 (dataset = dataset, group = group, sample = sample) @@ -807,6 +690,47 @@ setGeneric("importPublicData", ce } +.importPublicData_ENCODE <- function (dataset, group = NULL, sample = NULL) { + if (length(unique(dataset))>1) stop ("Merging datasets not supported yet.") + if (! requireNamespace("ENCODEprojectCAGE")) + stop ("This function requires the ", dQuote("ENCODEprojectCAGE"), " package.", + " package; please install it from http://promshift.genereg.net/CAGEr/PackageSource/.") + if (dataset == "ENCODEtissueCAGEfly") { + if (group != "embryo") stop("Only 'embryo' is allowed as group for dataset 'ENCODEtissueCAGEfly'.") + if (sample != "mixed_embryos_0-24hr") stop("Only 'mixed_embryos_0-24hr' is allowed as sample for dataset 'ENCODEtissueCAGEfly'.") + if (! requireNamespace("BSgenome.Dmelanogaster.UCSC.dm3")) + stop ("This function requires the ", dQuote("BSgenome.Dmelanogaster.UCSC.dm3"), " package.") + genome.name <- "BSgenome.Dmelanogaster.UCSC.dm3" + ENCODEtissueCAGEfly <- NULL + data("ENCODEtissueCAGEfly", package = "ENCODEprojectCAGE", envir = environment()) + df <- ENCODEtissueCAGEfly$embryo + } else { + if (! requireNamespace("BSgenome.Hsapiens.UCSC.hg19")) + stop ("This function requires the ", dQuote("BSgenome.Hsapiens.UCSC.hg19"), " package.") + genome.name <- "BSgenome.Hsapiens.UCSC.hg19" + ENCODEhumanCellLinesSamples <- NULL + data("ENCODEhumanCellLinesSamples", package = "ENCODEprojectCAGE", envir = environment()) + info.df <- ENCODEhumanCellLinesSamples + if(!(all(dataset %in% info.df$dataset))) + stop("Specified dataset(s) not found! Call data(ENCODEhumanCellLinesSamples) and check 'dataset' column for available ENCODE datasets!") + if(!(all(group %in% info.df[info.df$dataset == dataset,"group"]))) + stop("Some of the provided groups cannot be found in the specified dataset!") + if(!(all(sample %in% info.df[,"sample"]))) + stop("Some of the provided samples cannot be found!") + data(list= dataset, package = "ENCODEprojectCAGE", envir = environment()) + df <- get(dataset)[[group]] + } + se <- .df2SE(df, sample, genome.name) + ce <- CAGEexp(genomeName = genome.name, + colData = DataFrame( sampleLabels = make.names(sample), + inputFiles = NA, + inputFilesType = 'ctss', + librarySizes = sapply(assay(se), sum), + row.names = make.names(sample))) + CTSStagCountSE(ce) <- se + ce +} + .importPublicData_F34 <- function (dataset, group = NULL, sample = NULL) { if (length(unique(dataset))>1) stop("merging datasets not supported yet.") if (! requireNamespace("FANTOM3and4CAGE")) From cfb88eb918465466784f8455dfb75790f4585103 Mon Sep 17 00:00:00 2001 From: Charles Plessy Date: Tue, 27 Jun 2023 18:33:23 +0900 Subject: [PATCH 15/17] Restore the importPublicData function. See: #78 --- NAMESPACE | 2 ++ 1 file changed, 2 insertions(+) diff --git a/NAMESPACE b/NAMESPACE index acb0ee7..3070776 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -45,6 +45,7 @@ export(getExpressionProfiles) export(getShiftingPromoters) export(hanabi) export(hanabiPlot) +export(importPublicData) export(inputFiles) export(inputFilesType) export(librarySizes) @@ -191,6 +192,7 @@ importFrom(stats,median) importFrom(stats,p.adjust) importFrom(stringdist,stringdist) importFrom(stringi,stri_sub) +importFrom(utils,data) importFrom(utils,head) importFrom(utils,installed.packages) importFrom(utils,read.table) From 8b70db0948b2b0d100efcd6fb187fc2b7b07232e Mon Sep 17 00:00:00 2001 From: Charles Plessy Date: Wed, 28 Jun 2023 08:55:56 +0900 Subject: [PATCH 16/17] Restore documentation (removing some redundancy in the text). --- R/ImportMethods.R | 131 ++++++++++++++++++++++++++++++- man/FANTOM5humanSamples.Rd | 3 +- man/FANTOM5mouseSamples.Rd | 3 +- man/importPublicData.Rd | 154 +++++++++++++++++++++++++++++++++++++ 4 files changed, 288 insertions(+), 3 deletions(-) create mode 100644 man/importPublicData.Rd diff --git a/R/ImportMethods.R b/R/ImportMethods.R index e88f3a1..4bd2824 100644 --- a/R/ImportMethods.R +++ b/R/ImportMethods.R @@ -652,7 +652,134 @@ setMethod( "getCTSS", "CAGEexp" #' importPublicData -#' @noRd +#' +#' Imports CAGE data from different sources into a [`CAGEexp`] object. After +#' the object has been created the data can be further manipulated and +#' visualized using other functions available in the _CAGEr_ package and +#' integrated with other analyses in R. Available resources include: +#' +#' - FANTOM5 datasets (Forrest _et al.,_ Nature 2014) for numerous human and +#' mouse samples (primary cells, cell lines and tissues), which are fetched +#' directly from FANTOM5 online resource at +#' \href{https://fantom.gsc.riken.jp/5/data}{https://fantom.gsc.riken.jp/5/data}. +#' +#' - FANTOM3 and 4 datasets (Carninci _et al., _ Science 2005, Faulkner +#' _et al.,_ Nature Genetics 2009, Suzuki _et al._ Nature Genetics 2009) from +#' _FANTOM3and4CAGE_ data package available from Bioconductor. +#' +#' - ENCODE datasets (Djebali _et al._ Nature 2012) for numerous human cell +#' lines from _ENCODEprojectCAGE_ data package, which is available for +#' download from \href{http://promshift.genereg.net/CAGEr/}{http://promshift.genereg.net/CAGEr/}. +#' +#' - Zebrafish (_Danio rerio_) developmental timecourse datasets (Nepal _et al._ Genome +#' Research 2013) from \emph{ZebrafishDevelopmentalCAGE} data package, which +#' is available for download from +#' \href{http://promshift.genereg.net/CAGEr/}{http://promshift.genereg.net/CAGEr/}. +#' +#' @param origin Character vector specifying one of the available resources for +#' CAGE data (`"FANTOM5"`, `"FANTOM3and4"`, `"ENCODE"` or `"ZebrafishDevelopment"`). +#' +#' @param dataset Character vector specifying one or more of the datasets +#' available in the selected resource. For FANTOM5 it can be either +#' `"human"` or `"mouse"`, and only one of them can be specified at a +#' time. For other resources please refer to the vignette of the +#' corresponding data package for the list of available datasets. +#' Multiple datasets mapped to the same genome can be specified to +#' combine selected samples from each. +#' +#' @param group Character string specifying one or more groups within specified +#' dataset(s), from which the samples should be selected. The `group` +#' argument is used only when importing TSSs from data packages and +#' ignored for "FANTOM5". For available groups in each dataset please +#' refer to the vignette of the corresponding data package. Either only +#' one group has to be specified (if all selected samples belong to the +#' same group) or one group per sample (if samples belong to different +#' groups). In the latter case, the number of elements in `group` must +#' match the number of elements in `sample`. +#' +#' @param sample Character string specifying one or more CAGE samples. Check +#' the corresponding data package for available samples within each group +#' and their labels. For FANTOM5 resource, list of all human (~1000) and +#' mouse (~) samples can be obtained in _CAGEr_ by loading +#' `data(FANTOM5humanSamples)` and `data(FANTOM5mouseSamples)`, +#' respectively. Use the names from the \code{sample} column to specify +#' which samples should be imported. +#' +#' @return A [`CAGEexp`] object is returned, containing information on library +#' size, CTSS coordinates and tag count matrix. The object is ready for _CAGEr_ +#' analysis (normalisation, tag clustering, …). +#' +#' @references +#' +#' - Carninci _et al.,_ (2005). _The Transcriptional Landscape of the Mammalian +#' Genome_. Science **309**(5740):1559-1563. +#' +#' - Djebali _et al.,_ (2012). _Landscape of transcription in human cells._ +#' Nature **488**(7414):101-108. +#' +#' - Faulkner _et al.,_ (2009). _The regulated retrotransposon transcriptome of +#' mammalian cells._, Nature Genetics **41**:563-571. +#' +#' - Forrest _et al.,_ (2014). _A promoter-level mammalian expression atlas._ +#' Nature **507**(7493):462-470. +#' +#' - Nepal _et al.,_ (2013). _Dynamic regulation of the transcription +#' initiation landscape at single nucleotide resolution during vertebrate +#' embryogenesis_. Genome Research **23**(11):1938-1950. +#' +#' - Suzuki_et al.,_ (2009). The transcriptional network that controls growth +#' arrest and differentiation in a human myeloid leukemia cell line_. Nature +#' Genetics **41**:553-562. +#' +#' @author Vanja Haberle +#' @author Charles Plessy +#' +#' @family FANTOM data +#' +#' @examples +#' \dontrun{ +#' ### importing FANTOM5 data +#' +#' # list of FANTOM5 human tissue samples +#' +#' data(FANTOM5humanSamples) +#' head(subset(FANTOM5humanSamples, type == "tissue")) +#' +#' # import selected samples +#' f5 <- importPublicData( +#' origin="FANTOM5", dataset = "human", +#' sample = c("adipose_tissue__adult__pool1", "adrenal_gland__adult__pool1", +#' "aorta__adult__pool1")) +#' +#' CTSScoordinatesGR(f5) +#' +#' ### importing FANTOM3/4 data from a data package +#' +#' library(FANTOM3and4CAGE) +#' +#' # list of mouse datasets available in this package +#' +#' data(FANTOMmouseSamples) +#' unique(FANTOMmouseSamples$dataset) +#' head(subset(FANTOMmouseSamples, dataset == "FANTOMtissueCAGEmouse")) +#' head(subset(FANTOMmouseSamples, dataset == "FANTOMtimecourseCAGEmouse")) +#' +#' # import selected samples from two different mouse datasets +#' +#' f34 <- importPublicData( +#' origin="FANTOM3and4", dataset = c("FANTOMtissueCAGEmouse", "FANTOMtimecourseCAGEmouse"), +#' group = c("brain", "adipogenic_induction"), +#' sample = c("CCL-131_Neuro-2a_treatment_for_6hr_with_MPP+", "DFAT-D1_preadipocytes_2days")) +#' +#' f34 <- importPublicData( +#' origin="FANTOM3and4", dataset = c("FANTOMtissueCAGEmouse"), +#' group = c("brain"), +#' sample = c("CCL-131_Neuro-2a_treatment_for_6hr_with_MPP+")) +#' +#' CTSScoordinatesGR(f34) +#' +#' } +#' #' @importFrom utils data #' @export @@ -845,5 +972,7 @@ setGeneric("importPublicData", ce } +#' @rdname importPublicData + setMethod("importPublicData", signature(origin = "character", dataset = "character", sample = "character"), .importPublicData) \ No newline at end of file diff --git a/man/FANTOM5humanSamples.Rd b/man/FANTOM5humanSamples.Rd index 4840553..f29bedc 100644 --- a/man/FANTOM5humanSamples.Rd +++ b/man/FANTOM5humanSamples.Rd @@ -18,7 +18,8 @@ libraries. Its use is described in more details in the vignette } \seealso{ Other FANTOM data: -\code{\link{FANTOM5mouseSamples}} +\code{\link{FANTOM5mouseSamples}}, +\code{\link{importPublicData}()} } \concept{FANTOM data} \keyword{datasets} diff --git a/man/FANTOM5mouseSamples.Rd b/man/FANTOM5mouseSamples.Rd index 179cd51..011f352 100644 --- a/man/FANTOM5mouseSamples.Rd +++ b/man/FANTOM5mouseSamples.Rd @@ -18,7 +18,8 @@ libraries. Its use is described in more details in the vignette } \seealso{ Other FANTOM data: -\code{\link{FANTOM5humanSamples}} +\code{\link{FANTOM5humanSamples}}, +\code{\link{importPublicData}()} } \concept{FANTOM data} \keyword{datasets} diff --git a/man/importPublicData.Rd b/man/importPublicData.Rd new file mode 100644 index 0000000..5e2cb52 --- /dev/null +++ b/man/importPublicData.Rd @@ -0,0 +1,154 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ImportMethods.R +\name{importPublicData} +\alias{importPublicData} +\alias{importPublicData,character,character,ANY,character-method} +\title{importPublicData} +\usage{ +importPublicData( + origin = c("FANTOM5", "FANTOM3and4", "ENCODE", "ZebrafishDevelopment"), + dataset, + group, + sample +) + +\S4method{importPublicData}{character,character,ANY,character}( + origin = c("FANTOM5", "FANTOM3and4", "ENCODE", "ZebrafishDevelopment"), + dataset, + group, + sample +) +} +\arguments{ +\item{origin}{Character vector specifying one of the available resources for +CAGE data (\code{"FANTOM5"}, \code{"FANTOM3and4"}, \code{"ENCODE"} or \code{"ZebrafishDevelopment"}).} + +\item{dataset}{Character vector specifying one or more of the datasets +available in the selected resource. For FANTOM5 it can be either +\code{"human"} or \code{"mouse"}, and only one of them can be specified at a +time. For other resources please refer to the vignette of the +corresponding data package for the list of available datasets. +Multiple datasets mapped to the same genome can be specified to +combine selected samples from each.} + +\item{group}{Character string specifying one or more groups within specified +dataset(s), from which the samples should be selected. The \code{group} +argument is used only when importing TSSs from data packages and +ignored for "FANTOM5". For available groups in each dataset please +refer to the vignette of the corresponding data package. Either only +one group has to be specified (if all selected samples belong to the +same group) or one group per sample (if samples belong to different +groups). In the latter case, the number of elements in \code{group} must +match the number of elements in \code{sample}.} + +\item{sample}{Character string specifying one or more CAGE samples. Check +the corresponding data package for available samples within each group +and their labels. For FANTOM5 resource, list of all human (~1000) and +mouse (~) samples can be obtained in \emph{CAGEr} by loading +\code{data(FANTOM5humanSamples)} and \code{data(FANTOM5mouseSamples)}, +respectively. Use the names from the \code{sample} column to specify +which samples should be imported.} +} +\value{ +A \code{\link{CAGEexp}} object is returned, containing information on library +size, CTSS coordinates and tag count matrix. The object is ready for \emph{CAGEr} +analysis (normalisation, tag clustering, …). +} +\description{ +Imports CAGE data from different sources into a \code{\link{CAGEexp}} object. After +the object has been created the data can be further manipulated and +visualized using other functions available in the \emph{CAGEr} package and +integrated with other analyses in R. Available resources include: +} +\details{ +\itemize{ +\item FANTOM5 datasets (Forrest \emph{et al.,} Nature 2014) for numerous human and +mouse samples (primary cells, cell lines and tissues), which are fetched +directly from FANTOM5 online resource at +\href{https://fantom.gsc.riken.jp/5/data}{https://fantom.gsc.riken.jp/5/data}. +\item FANTOM3 and 4 datasets (Carninci _et al., _ Science 2005, Faulkner +\emph{et al.,} Nature Genetics 2009, Suzuki \emph{et al.} Nature Genetics 2009) from +\emph{FANTOM3and4CAGE} data package available from Bioconductor. +\item ENCODE datasets (Djebali \emph{et al.} Nature 2012) for numerous human cell +lines from \emph{ENCODEprojectCAGE} data package, which is available for +download from \href{http://promshift.genereg.net/CAGEr/}{http://promshift.genereg.net/CAGEr/}. +\item Zebrafish (\emph{Danio rerio}) developmental timecourse datasets (Nepal \emph{et al.} Genome +Research 2013) from \emph{ZebrafishDevelopmentalCAGE} data package, which +is available for download from +\href{http://promshift.genereg.net/CAGEr/}{http://promshift.genereg.net/CAGEr/}. +} +} +\examples{ +\dontrun{ +### importing FANTOM5 data + +# list of FANTOM5 human tissue samples + +data(FANTOM5humanSamples) +head(subset(FANTOM5humanSamples, type == "tissue")) + +# import selected samples +f5 <- importPublicData( + origin="FANTOM5", dataset = "human", + sample = c("adipose_tissue__adult__pool1", "adrenal_gland__adult__pool1", + "aorta__adult__pool1")) + +CTSScoordinatesGR(f5) + +### importing FANTOM3/4 data from a data package + +library(FANTOM3and4CAGE) + +# list of mouse datasets available in this package + +data(FANTOMmouseSamples) +unique(FANTOMmouseSamples$dataset) +head(subset(FANTOMmouseSamples, dataset == "FANTOMtissueCAGEmouse")) +head(subset(FANTOMmouseSamples, dataset == "FANTOMtimecourseCAGEmouse")) + +# import selected samples from two different mouse datasets + +f34 <- importPublicData( + origin="FANTOM3and4", dataset = c("FANTOMtissueCAGEmouse", "FANTOMtimecourseCAGEmouse"), + group = c("brain", "adipogenic_induction"), + sample = c("CCL-131_Neuro-2a_treatment_for_6hr_with_MPP+", "DFAT-D1_preadipocytes_2days")) + +f34 <- importPublicData( + origin="FANTOM3and4", dataset = c("FANTOMtissueCAGEmouse"), + group = c("brain"), + sample = c("CCL-131_Neuro-2a_treatment_for_6hr_with_MPP+")) + +CTSScoordinatesGR(f34) + +} + +} +\references{ +\itemize{ +\item Carninci \emph{et al.,} (2005). \emph{The Transcriptional Landscape of the Mammalian +Genome}. Science \strong{309}(5740):1559-1563. +\item Djebali \emph{et al.,} (2012). \emph{Landscape of transcription in human cells.} +Nature \strong{488}(7414):101-108. +\item Faulkner \emph{et al.,} (2009). \emph{The regulated retrotransposon transcriptome of +mammalian cells.}, Nature Genetics \strong{41}:563-571. +\item Forrest \emph{et al.,} (2014). \emph{A promoter-level mammalian expression atlas.} +Nature \strong{507}(7493):462-470. +\item Nepal \emph{et al.,} (2013). \emph{Dynamic regulation of the transcription +initiation landscape at single nucleotide resolution during vertebrate +embryogenesis}. Genome Research \strong{23}(11):1938-1950. +\item Suzuki_et al.,_ (2009). The transcriptional network that controls growth +arrest and differentiation in a human myeloid leukemia cell line_. Nature +Genetics \strong{41}:553-562. +} +} +\seealso{ +Other FANTOM data: +\code{\link{FANTOM5humanSamples}}, +\code{\link{FANTOM5mouseSamples}} +} +\author{ +Vanja Haberle + +Charles Plessy +} +\concept{FANTOM data} From a0191418d5b73978bf4313352772a388cc986102 Mon Sep 17 00:00:00 2001 From: Charles Plessy Date: Wed, 28 Jun 2023 09:37:27 +0900 Subject: [PATCH 17/17] Fix function for one-sample cases. --- R/ImportMethods.R | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/R/ImportMethods.R b/R/ImportMethods.R index 4bd2824..4089ef4 100644 --- a/R/ImportMethods.R +++ b/R/ImportMethods.R @@ -796,10 +796,14 @@ setGeneric("importPublicData", df$strand, seqinfo = seqinfo(getRefGenome(genome.name)), bsgenomeName = genome.name) - ctssDF <- lapply(df[ , sample], Rle) |> DataFrame() + ctssDF <- lapply(df[ , sample, drop = FALSE], Rle) |> DataFrame() colnames(ctssDF) <- make.names(colnames(ctssDF)) ctssSE <- SummarizedExperiment(c(counts = ctssDF), ctssRanges) - ctssSE <- ctssSE[rowSums(df[ , sample]) > 0,] + if (ncol(ctssDF) == 1) { + ctssSE <- ctssSE[ df[ , sample] > 0,] + } else { + ctssSE <- ctssSE[rowSums(df[ , sample]) > 0,] + } sort(ctssSE) }