From 30165e93ffd373828618ef6e83f175347f27935e Mon Sep 17 00:00:00 2001 From: Julian de Ruiter Date: Mon, 6 Feb 2017 11:51:11 +0100 Subject: [PATCH 1/7] Added threads and verbose arguments to cimpl. --- R/cimpl.R | 269 ++++++++++++++++++++++++++--------------------- R/plot-methods.R | 4 + 2 files changed, 151 insertions(+), 122 deletions(-) diff --git a/R/cimpl.R b/R/cimpl.R index 84fa420..cbf3cb2 100644 --- a/R/cimpl.R +++ b/R/cimpl.R @@ -9,9 +9,16 @@ doCimplAnalysis <- function( system = c('MMTV', 'MuLV', 'SB', 'PB'), specificity.pattern, lhc.method = c('none', 'exclude'), # local hopping correction method - verbose=TRUE + verbose=TRUE, + threads=1 ) { + if (any(!chromosomes %in% data$chr)) { + warning(sprintf('Dropped chromosomes not present in data: %s', + paste(chromosomes[!chromosomes %in% data$chr], collapse=', '))) + chromosomes = chromosomes[chromosomes %in% data$chr] + } + # processing arguments if (!missing(system)) { system <- match.arg(system) @@ -29,8 +36,8 @@ doCimplAnalysis <- function( } lhc.method <- match.arg(lhc.method) - - print(match.call(expand.dots=TRUE)) + + #print(match.call(expand.dots=TRUE)) # starting the analysis if (verbose) cat('Starting CIMPL analysis...\n') @@ -38,8 +45,13 @@ doCimplAnalysis <- function( if (verbose) cat(paste('>>>> chromosome', chr, '<<<<\n')) if ( specificity.pattern != '' ) { + strReverse <- function(x) sapply(lapply(strsplit(x, NULL), rev), paste, collapse="") - background <- c(matchPattern(specificity.pattern, BSgenome[[chr]])@start, matchPattern(strReverse(specificity.pattern), BSgenome[[chr]])@start) + + background <- c(matchPattern(specificity.pattern, BSgenome[[chr]])@ranges@start, matchPattern(strReverse(specificity.pattern), BSgenome[[chr]])@ranges@start) + # for R 2.11.1 + #background <- c(matchPattern(specificity.pattern, BSgenome[[chr]])@start, matchPattern(strReverse(specificity.pattern), BSgenome[[chr]])@start) + } else { background <- NULL } @@ -53,7 +65,7 @@ doCimplAnalysis <- function( # if (verbose) cat(paste('Warning: undersampling for h =', h , '!! Sample rate set to', max_sample_points, '. ')) n_sample_points <- max_sample_points } - + chr_data <- data[data$chr == chr, ] if (lhc.method != 'none') { chr_data$hop <- .isHop(chr_data, h * 3) @@ -71,10 +83,10 @@ doCimplAnalysis <- function( #if (verbose) cat('null-peaks...') #null_peaks <- .getPeakDistribution(null_densities) - + if (verbose) cat('null-peaks...') - null_peaks <- .generatePeakDistribution(chr_length, h, n_insertions, n_iterations, n_sample_points, verbose=verbose) - + null_peaks <- .generatePeakDistribution(chr_length, h, n_insertions, n_iterations, n_sample_points, verbose=verbose, threads=threads) + if (verbose) cat('cimpl_object...') cimplObject <- .getCimplObject(chr_data, chr, null_peaks, h=h, n_sample_points=n_sample_points, verbose=verbose) @@ -90,8 +102,8 @@ doCimplAnalysis <- function( #null_peaks <- .getPeakDistribution(null_densities) if (verbose) cat('null-peaks...') - null_peaks <- .generatePeakDistributionFromBackground(background, h, n_insertions, n_iterations, n_sample_points, verbose=verbose) - + null_peaks <- .generatePeakDistributionFromBackground(background, h, n_insertions, n_iterations, n_sample_points, verbose=verbose, threads=threads) + if (verbose) cat('at-density...') bg_density <- density(background, bw=h, n=n_sample_points) @@ -103,7 +115,7 @@ doCimplAnalysis <- function( } }) }) - + new('cimplAnalysis', cimplObjects = cimplObjects, chromosomes = chromosomes, @@ -119,54 +131,58 @@ doCimplAnalysis <- function( ) } -.generatePeakDistribution <- function(chr_length, h, n_insertions, n_iterations, n_sample_points, verbose=TRUE) { - local_maxima_list <- lapply(1:n_iterations, function(i) { - if (verbose) cat('.') +.generatePeakDistribution <- function(chr_length, h, n_insertions, n_iterations, n_sample_points, verbose=TRUE, threads=1) { + apply_func <- function(i) { r_insertions <- sort(floor(runif(n_insertions, min=1, max=chr_length))) - - # calculate density - d <- density(r_insertions, bw=h, n=n_sample_points) - - # get peaks - lms <- .getLocalMaxima(d$y) + d <- density(r_insertions, bw=h, n=n_sample_points) # calculate density + lms <- .getLocalMaxima(d$y) # get peaks list(max_pos=d$x[lms$max_pos], max_val=lms$max_val) - }) - + } + + if (threads > 1) { + suppressMessages(library(parallel)) + local_maxima_list <- mclapply(1:n_iterations, apply_func, mc.cores=threads) + } else { + local_maxima_list <- lapply(1:n_iterations, apply_func) + } + x <- do.call('c', lapply(local_maxima_list, function(lm) lm$max_pos)) y <- do.call('c', lapply(local_maxima_list, function(lm) lm$max_val)) - + # sort ascending srt <- sort.int(x, index.return=TRUE) x <- x[srt$ix] y <- y[srt$ix] - + return( list( x=x, y=y ) ) } -.generatePeakDistributionFromBackground <- function(background, h, n_insertions, n_iterations, n_sample_points, verbose=TRUE) { - local_maxima_list <- lapply(1:n_iterations, function(i) { - if (verbose) cat('.') - # draw from TA/AT sites background +.generatePeakDistributionFromBackground <- function(background, h, n_insertions, n_iterations, n_sample_points, verbose=TRUE, threads=1) { + + apply_func <- function(i) { r_insertions <- sort(sample(background, n_insertions, replace=TRUE)) - - # calculate density - d <- density(r_insertions, bw=h, n=n_sample_points) - - # get peaks - lms <- .getLocalMaxima(d$y) + d <- density(r_insertions, bw=h, n=n_sample_points) # calculate density + lms <- .getLocalMaxima(d$y) # get peaks list(max_pos=d$x[lms$max_pos], max_val=lms$max_val) - }) + } + + if (threads > 1) { + suppressMessages(library(parallel)) + local_maxima_list <- mclapply(1:n_iterations, apply_func, mc.cores=threads) + } else { + local_maxima_list <- lapply(1:n_iterations, apply_func) + } x <- do.call('c', lapply(local_maxima_list, function(lm) lm$max_pos)) y <- do.call('c', lapply(local_maxima_list, function(lm) lm$max_val)) - + # sort ascending srt <- sort.int(x, index.return=TRUE) x <- x[srt$ix] y <- y[srt$ix] - + return( list( x=x, y=y ) ) } @@ -193,16 +209,20 @@ doCimplAnalysis <- function( } .getHopIndices <- function(location, contig_depth, max.hopping.distance) { + if (is.null(contig_depth)) { + stop('Missing contig_depth column for hop exclusion') + } + # compute distance matrix between locations dist_m <- as.matrix(dist(location)) - + # which insertions (pairs) are within hopping range pairs <- which(dist_m <= max.hopping.distance, arr.ind=TRUE) - + # remove diagonal and duplicates pairs <- pairs[pairs[, 1] < pairs[, 2], , drop=FALSE] #distances <- dist_m[pairs] - + # for each pair, the insertion with the smallest contig depth is the hopped insertion idx1 <- contig_depth[pairs[, 1]] < contig_depth[pairs[, 2]] unique ( c ( pairs[idx1, 1], pairs[!idx1, 2] ) ) @@ -222,14 +242,14 @@ doCimplAnalysis <- function( .getHopWeights <- function(location, contig_depth, max.hopping.distance) { # compute distance matrix between locations dist_m <- as.matrix(dist(location)) - + # which insertions (pairs) are within hopping range pairs <- which(dist_m <= max.hopping.distance, arr.ind=TRUE) - + # remove diagonal and duplicates (upper part) pairs <- pairs[pairs[, 1] < pairs[, 2], , drop=FALSE] #distances <- dist_m[pairs] - + weights <- rep(1, length(location)) for (i in 1:dim(pairs)[1]) { i1 <- pairs[i, 1] @@ -253,8 +273,8 @@ doCimplAnalysis <- function( if (verbose) cat('.') # draw from uniform background r_insertions <- sort(floor(runif(n_insertions, min=1, max=chr_length))) - - # calculate density + + # calculate density density(r_insertions, bw=h, n=n_sample_points) }) @@ -268,11 +288,11 @@ doCimplAnalysis <- function( if (verbose) cat('.') # draw from TA/AT sites background r_insertions <- sort(sample(background, n_insertions, replace=TRUE)) - + # calculate density density(r_insertions, bw=h, n=n_sample_points) }) - + return( null_densities ) } @@ -289,7 +309,7 @@ doCimplAnalysis <- function( srt <- sort.int(x, index.return=TRUE) x <- x[srt$ix] y <- y[srt$ix] - + return( list( x=x, y=y ) ) } @@ -311,7 +331,7 @@ doCimplAnalysis <- function( insertions <- chr_data$location[!chr_data$hop] } d <- density(insertions, bw=h, n=n_sample_points) - + # locate peaks lms <- .getLocalMaxima(d$y) peaks <- list( x=d$x[lms$max_pos], y=lms$max_val ) @@ -337,11 +357,11 @@ doCimplAnalysis <- function( est <- kde2d(null_peaks$priors, null_peaks$y, n=100) # calculate prior-ph density } est$z <- t(apply( est$z, 1, function(x) cumsum(x)/sum(x) )) # make it a CDF-surface - + # calculate p-vals peaks$p_vals <- 1 - .biapprox( est, cbind(peaks$priors, peaks$y) ) # do a bi-linear approximation for the observed ph's - priors. peaks$p_vals[is.na(peaks$p_vals)] <- as.numeric( peaks$y[is.na(peaks$p_vals)] < max(est$y) ) # fix peaks that fall outside the density boundaries - + # somehow, numerical aberrations occur resulting in negative p-vals peaks$p_vals[peaks$p_vals < 0] <- 0 } else { @@ -350,7 +370,7 @@ doCimplAnalysis <- function( sum(null_peaks$y >= height) / length(null_peaks$y) }) } - + n_peaks <- length(peaks$x) if (!missing(bg_density)) { @@ -403,7 +423,7 @@ doCimplAnalysis <- function( .biapprox <- function (obj, loc) { # obj is a surface object like the list for contour or image. - # loc is a matrix of (x, y) locations + # loc is a matrix of (x, y) locations x <- obj$x y <- obj$y x.new <- loc[,1] @@ -445,10 +465,10 @@ doCimplAnalysis <- function( } .getTumorDensities <- function(cimplObject, cumulative=FALSE) { - + ids <- unique(cimplObject@data[[.getSampleIDColName(cimplObject@data)]]) tds <- matrix(0, length(cimplObject@kse$x), length(ids)) - + for(i in 1:length(ids)) { if (is.null(cimplObject@data$hop)) { insertions <- cimplObject@data$location[cimplObject@data[[.getSampleIDColName(cimplObject@data)]] == ids[i]] @@ -461,22 +481,22 @@ doCimplAnalysis <- function( } } colnames(tds) <- ids - + if (cumulative) { tds[is.na(tds)] <- 0 for(i in 1:dim(tds)[1]) { tds[i,] <- cumsum(tds[i,]) } } - + tds } getEnsemblGenes <- function(cimplAnalysis, geneIdentifiers=c('ensembl_gene_id', 'external_gene_id'), mart=useMart("ensembl", dataset = "mmusculus_gene_ensembl")) { genes <- getBM(attributes=c('ensembl_gene_id', 'chromosome_name', 'start_position', 'end_position'), filters='chromosome_name', values=substring(cimplAnalysis@chromosomes, 4), mart=mart) - + # , 'unigene' - # filter out unknown + # filter out unknown if (!all(i <- geneIdentifiers %in% listAttributes(mart)[,1])) { stop(paste(geneIdentifiers[!i], ' is/are no valid biomaRt attributes for this mart.')) } @@ -484,7 +504,7 @@ getEnsemblGenes <- function(cimplAnalysis, geneIdentifiers=c('ensembl_gene_id', if (length(geneIdentifiers) > 0) { if ( !(length(geneIdentifiers) == 1 & geneIdentifiers[1] == 'ensembl_gene_id' )) { annot <- getBM(attributes=unique(c('ensembl_gene_id', geneIdentifiers)), filters='ensembl_gene_id', values=genes$ensembl_gene_id, mart=mart) - + extra_annot <- sapply(genes$ensembl_gene_id, function(ens) { idx <- annot$ensembl_gene_id == ens sapply(2:ncol(annot), function(col) { @@ -496,7 +516,7 @@ getEnsemblGenes <- function(cimplAnalysis, geneIdentifiers=c('ensembl_gene_id', if (!is.null(dim(extra_annot))) extra_annot <- t(extra_annot) - + extra_annot <- data.frame(extra_annot, stringsAsFactors=FALSE) colnames(extra_annot) <- colnames(annot)[-1] genes <- cbind(genes, extra_annot) @@ -505,20 +525,20 @@ getEnsemblGenes <- function(cimplAnalysis, geneIdentifiers=c('ensembl_gene_id', if (! 'ensembl_gene_id' %in% geneIdentifiers) { genes$ensembl_gene_id <- NULL } - + attr(genes, 'geneIdentifiers') <- geneIdentifiers - + genes } # @deprecated: use getCISs .getPeaks <- function(cimplAnalysis, alpha=0.05, mul.test=TRUE, mart=useMart("ensembl", dataset = "mmusculus_gene_ensembl"), genes = getEnsemblGenes(cimplAnalysis, mart), order.by='p_value') { - + df <- do.call('rbind', lapply(cimplAnalysis@chromosomes, function(chr) { chr.idx <- which(cimplAnalysis@chromosomes == chr) - + chr_genes <- genes[genes$chromosome_name == substring(chr, 4), ,drop=FALSE] - + do.call('rbind', lapply(cimplAnalysis@scales, function(kw) { kw.idx <- which(cimplAnalysis@scales == kw) @@ -538,12 +558,12 @@ getEnsemblGenes <- function(cimplAnalysis, geneIdentifiers=c('ensembl_gene_id', } else if ( sum(idx) == 1 ) { as.character(chr_genes[which(idx) ,c(1, 2)]) } else { - # this peak has hit multiple genes! - c(paste(chr_genes[which(idx) , 1], collapse='|'), + # this peak has hit multiple genes! + c(paste(chr_genes[which(idx) , 1], collapse='|'), paste(chr_genes[which(idx) , 2], collapse='|') ) } }) - + data.frame( location = round(cimpl@peaks$x), chromosome = rep(chr, cimpl@n_peaks), @@ -561,7 +581,7 @@ getEnsemblGenes <- function(cimplAnalysis, geneIdentifiers=c('ensembl_gene_id', } .CISThresholdLine <- function(cimplObject, alpha) { - + if ( length(cimplObject@null_cdf) == 0 ) { list(x=cimplObject@kse$x, y=rep( quantile(cimplObject@null_peaks$y, prob=1-alpha) , length(cimplObject@kse$x))) } else { @@ -597,9 +617,11 @@ getEnsemblGenes <- function(cimplAnalysis, geneIdentifiers=c('ensembl_gene_id', list(start_pos=start_pos, end_pos=end_pos) } +## Genes can be supplied from Ensembl as: +## mart=useMart("ensembl", dataset = "mmusculus_gene_ensembl") +## genes = getEnsemblGenes(cimplAnalysis, mart) +getCISs <- function(cimplAnalysis, alpha=0.05, chromosomes=cimplAnalysis@chromosomes, scales=cimplAnalysis@scales, mul.test=TRUE, genes=NULL, order.by=c('p_value', 'n_insertions'), decreasing=c(FALSE, TRUE)) { -getCISs <- function(cimplAnalysis, alpha=0.05, chromosomes=cimplAnalysis@chromosomes, scales=cimplAnalysis@scales, mul.test=TRUE, mart=useMart("ensembl", dataset = "mmusculus_gene_ensembl"), genes = getEnsemblGenes(cimplAnalysis, mart), order.by=c('p_value', 'n_insertions'), decreasing=c(FALSE, TRUE)) { - df <- do.call('rbind', lapply(chromosomes, function(chr) { chr.idx <- which(cimplAnalysis@chromosomes == chr) do.call('rbind', lapply(scales, function(kw) { @@ -610,14 +632,14 @@ getCISs <- function(cimplAnalysis, alpha=0.05, chromosomes=cimplAnalysis@chromos } else { n_tests <- 1 } - + cimplObject <- cimplAnalysis@cimplObjects[[chr.idx]][[kw.idx]] - + regions <- .getRegionsAboveThreshold(cimplObject, alpha / n_tests) n_regions <- length(regions$start_pos) if (n_regions == 0) { - return(NULL) + return(NULL) } else if (n_regions == 1) { annDf <- .annotateRegion(chr, kw, regions$start_pos, regions$end_pos, cimplObject, genes) } else { @@ -625,19 +647,19 @@ getCISs <- function(cimplAnalysis, alpha=0.05, chromosomes=cimplAnalysis@chromos .annotateRegion(chr, kw, regions$start_pos[i], regions$end_pos[i], cimplObject, genes) })) } - + return(annDf) })) })) - if (is.null(df)) { + if (is.null(df)) { # no CISs found! NULL } else if(dim(df)[1] == 0) { # no CISs found! NULL } else { - + # make some nicely formatted CIS ids and set them as rownames rownames(df) <- .makeCISID(df$chromosome, df$peak_location, df$scale) @@ -652,14 +674,16 @@ getCISs <- function(cimplAnalysis, alpha=0.05, chromosomes=cimplAnalysis@chromos .makeCISID <- function(chr, loc, scale) { paste('CIS', substring(chr, 4), ':', loc, '_', .formatScales(scale), sep='') } - -.annotateRegion <- function(chr, scale, bpStart, bpEnd, cimplObject, genes) { - - chr_genes <- genes[genes$chromosome_name == substring(chr, 4), ,drop=FALSE] - + +.annotateRegion <- function(chr, scale, bpStart, bpEnd, cimplObject, genes=NULL) { + if (!is.null(genes)) + chr_genes <- genes[genes$chromosome_name == substring(chr, 4), ,drop=FALSE] + else + chr_genes <- NULL + peak.idx <- which(round(cimplObject@peaks$x) >= bpStart & round(cimplObject@peaks$x) <= bpEnd) n_peaks <- length(peak.idx) - + if (n_peaks == 0) { # CIS contains no peaks return(NULL) @@ -674,9 +698,9 @@ getCISs <- function(cimplAnalysis, alpha=0.05, chromosomes=cimplAnalysis@chromos } } -.annotatePeak <- function(chr, scale, cisStart, cisEnd, peak.idx, cimplObject, chr_genes) { +.annotatePeak <- function(chr, scale, cisStart, cisEnd, peak.idx, cimplObject, chr_genes=NULL) { peakLoc <- round(cimplObject@peaks$x[peak.idx]) - + ann <- data.frame(row.names='') @@ -686,26 +710,28 @@ getCISs <- function(cimplAnalysis, alpha=0.05, chromosomes=cimplAnalysis@chromos ann$start <- cisStart ann$end <- cisEnd ann$width <- cisEnd - cisStart + 1 - + snappedLocs <- .snapToPeaks(cimplObject@data$location, round(cimplObject@peaks$x)) ann$n_insertions <- sum(snappedLocs >= cisStart & snappedLocs <= cisEnd) ann$p_value <- cimplObject@peaks$p_vals[peak.idx] ann$scale <- scale - ags <- .associateGenes(chr, cisStart, cisEnd, peakLoc, chr_genes) - for(id in attr(chr_genes, 'geneIdentifiers')) { - - a <- chr_genes[ags$associated, id] - a <- a[a!=''] - - o <- chr_genes[ags$other, id] - o <- o[o!=''] - - ann[paste('associated_', id, sep='')] <- paste(a, collapse='|') - ann[paste('other_', id, sep='')] <- paste(o, collapse='|') + if (!is.null(chr_genes)) { + ags <- .associateGenes(chr, cisStart, cisEnd, peakLoc, chr_genes) + for(id in attr(chr_genes, 'geneIdentifiers')) { + + a <- chr_genes[ags$associated, id] + a <- a[a!=''] + + o <- chr_genes[ags$other, id] + o <- o[o!=''] + + ann[paste('associated_', id, sep='')] <- paste(a, collapse='|') + ann[paste('other_', id, sep='')] <- paste(o, collapse='|') + } } - + return(ann) } @@ -714,7 +740,7 @@ getCISs <- function(cimplAnalysis, alpha=0.05, chromosomes=cimplAnalysis@chromos margin <- 100e3 # the genes within the region + margin gene.idx <- which(chr_genes$start_position < (cisEnd + margin) & chr_genes$end_position > (cisStart - margin)) - + if (length(gene.idx) == 0) { list( associated = numeric(0), @@ -728,7 +754,7 @@ getCISs <- function(cimplAnalysis, alpha=0.05, chromosomes=cimplAnalysis@chromos } else { # 2. calculate the distances dists <- pmin(abs(chr_genes$start_position[gene.idx] - peakLoc), abs(chr_genes$end_position[gene.idx] - peakLoc)) - + # genes which contain the peak get distance 0 dists[chr_genes$start_position[gene.idx] <= peakLoc & chr_genes$end_position[gene.idx] >= peakLoc] <- 0 @@ -739,10 +765,10 @@ getCISs <- function(cimplAnalysis, alpha=0.05, chromosomes=cimplAnalysis@chromos # dists[curated.idx] <- dists[curated.idx] - 20 * max.dist # dists[automatic.idx] <- dists[automatic.idx] - 10 * max.dist - gene.order <- order(dists, decreasing=FALSE) - + gene.order <- order(dists, decreasing=FALSE) + min.ties <- length(which(dists == min(dists))) - + list( associated = gene.idx[gene.order][1:min.ties], other = gene.idx[gene.order][-(1:min.ties)] @@ -753,7 +779,7 @@ getCISs <- function(cimplAnalysis, alpha=0.05, chromosomes=cimplAnalysis@chromos # ) } } - + .formatScales <- function(scales) { str <- as.character(scales) k.idx <- scales %% 1e3 == 0 @@ -770,29 +796,28 @@ getCISs <- function(cimplAnalysis, alpha=0.05, chromosomes=cimplAnalysis@chromos getCISMatrix <- function(cimplAnalysis, ciss) { df <- do.call('rbind', lapply(cimplAnalysis@chromosomes, function(chr) { chr.idx <- which(cimplAnalysis@chromosomes == chr) - - + chr_data <- cimplAnalysis@cimplObjects[[chr.idx]][[1]]@data - + cisids <- do.call('cbind', lapply(cimplAnalysis@scales, function(kw) { - + kw.idx <- which(cimplAnalysis@scales == kw) cimplObject <- cimplAnalysis@cimplObjects[[chr.idx]][[kw.idx]] - + # snap insertions to peaks (see http://bioinformatics.nki.nl/forum/viewtopic.php?f=4&t=19) snappedLocs <- .snapToPeaks(chr_data$location, cimplObject@peaks$x) - + insertion2cis <- rep('', dim(chr_data)[1]) - + ciss.idx <- which(ciss$chromosome == chr & ciss$scale == kw) for (i in ciss.idx) { # insertion2cis[snappedLocs >= ciss$start[i] & snappedLocs <= ciss$end[i]] <- rownames(ciss)[i] # insertion2cis[snappedLocs == ciss$peak_location[i]] <- rownames(ciss)[i] locs.idx <- snappedLocs >= ciss$start[i] & snappedLocs <= ciss$end[i] - + insertion2cis[locs.idx] <- paste(insertion2cis[locs.idx], rownames(ciss)[i], sep='|') } - + substring(insertion2cis, 2) })) colnames(cisids) <- cimplAnalysis@scales @@ -805,24 +830,24 @@ getInsertions <- function(cimplAnalysis, chr, scale, bpLim) { scale.idx <- which(cimplAnalysis@scales == scale) chr_data <- cimplAnalysis@cimplObjects[[chr.idx]][[scale.idx]]@data #chr_data[chr_data$location >= bpLim[1] & chr_data$location <= bpLim[2], ] - + # snap insertions to peaks (see http://bioinformatics.nki.nl/forum/viewtopic.php?f=4&t=19) snappedLocs <- .snapToPeaks(chr_data$location, cimplAnalysis@cimplObjects[[chr.idx]][[scale.idx]]@peaks$x) chr_data[snappedLocs >= bpLim[1] & snappedLocs <= bpLim[2], ] } -.nToString <- function(n, nDigits, prefix='', postfix='') { - out <- NULL +.nToString <- function(n, nDigits, prefix='', postfix='') { + out <- NULL sapply(n, function(n) { - while(n > 0) { - out <- c(n%%10 , out) + while(n > 0) { + out <- c(n%%10 , out) n <- n%/%10 - } - + } + if (length(out) < nDigits) { out <- c( rep(0, nDigits - length(out)) , out) } - + paste(prefix, paste(out, collapse=''), postfix, sep='') }) } diff --git a/R/plot-methods.R b/R/plot-methods.R index b6e8db0..1b48a26 100644 --- a/R/plot-methods.R +++ b/R/plot-methods.R @@ -1,3 +1,7 @@ + +setGeneric(name='plot', + def=function(x, y, ...) { standardGeneric('plot') }) + setMethod("plot", signature(x="cimplAnalysis", y="missing"), function(x, y, type=c('kse', 'null.cdf', 'scale.space'), chr=x@chromosomes[1], scale=x@scales[1], alpha=0.05, mul.test=TRUE, bpLim, plot.tumor.densities=FALSE, interactive=TRUE, ...) { chr.idx <- which(x@chromosomes == chr) kw.idx <- which(x@scales == scale) From 46c1b034092c80d5e3483e632669a659c96b2c3c Mon Sep 17 00:00:00 2001 From: Julian de Ruiter Date: Mon, 6 Feb 2017 11:51:24 +0100 Subject: [PATCH 2/7] Adjusted formatting, dependencies. --- DESCRIPTION | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index a32abfd..c074d41 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -7,9 +7,12 @@ Author: Jelle ten Hoeve and Jeroen de Ridder Maintainer: Jelle ten Hoeve Description: Analysis package for multi-sample retroviral and transposon insertional mutagenesis data using Gaussian kernel convolution to identify common insertion sites. License: GPL-3 -Depends: methods, Biostrings, KernSmooth, MASS, biomaRt, xtable +Depends: R (>= 2.10), methods, Biostrings, KernSmooth, MASS, biomaRt, + xtable Suggests: BSgenome.Mmusculus.UCSC.mm9 Enhances: -Collate: 'AllClasses.R' 'cimpl.R' 'plot-methods.R' 'export-methods.R' 'show-methods.R' +Collate: 'AllClasses.R' 'cimpl.R' 'plot-methods.R' 'export-methods.R' + 'show-methods.R' LazyLoad: yes biocViews: Genetics, Visualization +Packaged: 2014-02-12 17:27:55 UTC; ar12 From 13c711a5f6636257dc47a78862f8690127742950 Mon Sep 17 00:00:00 2001 From: Julian de Ruiter Date: Mon, 6 Feb 2017 11:51:31 +0100 Subject: [PATCH 3/7] Added datalist file. --- data/datalist | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 data/datalist diff --git a/data/datalist b/data/datalist new file mode 100644 index 0000000..03a53ae --- /dev/null +++ b/data/datalist @@ -0,0 +1,2 @@ +colorectal +sampleCa From 67e35fd9b4ca0bade9b0bf164908d8087876f08f Mon Sep 17 00:00:00 2001 From: Julian de Ruiter Date: Mon, 6 Feb 2017 11:52:04 +0100 Subject: [PATCH 4/7] Added conda recipe. --- conda/build.sh | 16 ++++++++++++++++ conda/meta.yaml | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+) create mode 100644 conda/build.sh create mode 100644 conda/meta.yaml diff --git a/conda/build.sh b/conda/build.sh new file mode 100644 index 0000000..1ceedcf --- /dev/null +++ b/conda/build.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +# R refuses to build packages that mark themselves as +# "Priority: Recommended" +mv DESCRIPTION DESCRIPTION.old +grep -v '^Priority: ' DESCRIPTION.old > DESCRIPTION +# +$R CMD INSTALL --build . +# +# # Add more build steps here, if they are necessary. +# +# See +# http://docs.continuum.io/conda/build.html +# for a list of environment variables that are set during the build +# process. +# \ No newline at end of file diff --git a/conda/meta.yaml b/conda/meta.yaml new file mode 100644 index 0000000..23822cc --- /dev/null +++ b/conda/meta.yaml @@ -0,0 +1,34 @@ +package: + name: r-cimpl + version: 1.0 +source: + git_url: https://github.com/jrderuiter/cimpl.git +build: + number: 0 + rpaths: + - lib/R/lib/ + - lib/ +requirements: + build: + - r + - 'r-kernsmooth' + - 'r-mass' + - 'r-xtable' + - 'bioconductor-biomart' + - 'bioconductor-biostrings' + run: + - r + - 'r-kernsmooth' + - 'r-mass' + - 'r-xtable' + - 'bioconductor-biomart' + - 'bioconductor-biostrings' +test: + commands: + - '$R -e "library(''cimpl'')"' +about: + home: http://ccb.nki.nl/software/ + license: GPL-3 + summary: 'An analysis package for multi sample insertional mutagenesis data + (including viral- and transposon-based systems) using Gaussian kernel + convolution to identify common insertion sites.' \ No newline at end of file From f2607a8fba4aa6fa64558b5d66173ef0393eccf7 Mon Sep 17 00:00:00 2001 From: Julian de Ruiter Date: Mon, 6 Feb 2017 12:41:02 +0100 Subject: [PATCH 5/7] Rename threads -> cores, added parameter documentation. --- DESCRIPTION | 4 ++-- R/cimpl.R | 18 +++++++++--------- conda/meta.yaml | 2 +- man/doCimplAnalysis.Rd | 1 + 4 files changed, 13 insertions(+), 12 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index c074d41..d1f51f7 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Package: cimpl Type: Package Title: Common Insertion site Mapping PLatform -Version: 0.99.22 -Date: 2010-03-23 +Version: 1.0.1 +Date: 2017-02-06 Author: Jelle ten Hoeve and Jeroen de Ridder Maintainer: Jelle ten Hoeve Description: Analysis package for multi-sample retroviral and transposon insertional mutagenesis data using Gaussian kernel convolution to identify common insertion sites. diff --git a/R/cimpl.R b/R/cimpl.R index cbf3cb2..94cced3 100644 --- a/R/cimpl.R +++ b/R/cimpl.R @@ -10,7 +10,7 @@ doCimplAnalysis <- function( specificity.pattern, lhc.method = c('none', 'exclude'), # local hopping correction method verbose=TRUE, - threads=1 + cores=1 ) { if (any(!chromosomes %in% data$chr)) { @@ -85,7 +85,7 @@ doCimplAnalysis <- function( #null_peaks <- .getPeakDistribution(null_densities) if (verbose) cat('null-peaks...') - null_peaks <- .generatePeakDistribution(chr_length, h, n_insertions, n_iterations, n_sample_points, verbose=verbose, threads=threads) + null_peaks <- .generatePeakDistribution(chr_length, h, n_insertions, n_iterations, n_sample_points, verbose=verbose, cores=cores) if (verbose) cat('cimpl_object...') cimplObject <- .getCimplObject(chr_data, chr, null_peaks, h=h, n_sample_points=n_sample_points, verbose=verbose) @@ -102,7 +102,7 @@ doCimplAnalysis <- function( #null_peaks <- .getPeakDistribution(null_densities) if (verbose) cat('null-peaks...') - null_peaks <- .generatePeakDistributionFromBackground(background, h, n_insertions, n_iterations, n_sample_points, verbose=verbose, threads=threads) + null_peaks <- .generatePeakDistributionFromBackground(background, h, n_insertions, n_iterations, n_sample_points, verbose=verbose, cores=cores) if (verbose) cat('at-density...') bg_density <- density(background, bw=h, n=n_sample_points) @@ -131,7 +131,7 @@ doCimplAnalysis <- function( ) } -.generatePeakDistribution <- function(chr_length, h, n_insertions, n_iterations, n_sample_points, verbose=TRUE, threads=1) { +.generatePeakDistribution <- function(chr_length, h, n_insertions, n_iterations, n_sample_points, verbose=TRUE, cores=1) { apply_func <- function(i) { r_insertions <- sort(floor(runif(n_insertions, min=1, max=chr_length))) @@ -140,9 +140,9 @@ doCimplAnalysis <- function( list(max_pos=d$x[lms$max_pos], max_val=lms$max_val) } - if (threads > 1) { + if (cores > 1) { suppressMessages(library(parallel)) - local_maxima_list <- mclapply(1:n_iterations, apply_func, mc.cores=threads) + local_maxima_list <- mclapply(1:n_iterations, apply_func, mc.cores=cores) } else { local_maxima_list <- lapply(1:n_iterations, apply_func) } @@ -159,7 +159,7 @@ doCimplAnalysis <- function( } -.generatePeakDistributionFromBackground <- function(background, h, n_insertions, n_iterations, n_sample_points, verbose=TRUE, threads=1) { +.generatePeakDistributionFromBackground <- function(background, h, n_insertions, n_iterations, n_sample_points, verbose=TRUE, cores=1) { apply_func <- function(i) { r_insertions <- sort(sample(background, n_insertions, replace=TRUE)) @@ -168,9 +168,9 @@ doCimplAnalysis <- function( list(max_pos=d$x[lms$max_pos], max_val=lms$max_val) } - if (threads > 1) { + if (cores > 1) { suppressMessages(library(parallel)) - local_maxima_list <- mclapply(1:n_iterations, apply_func, mc.cores=threads) + local_maxima_list <- mclapply(1:n_iterations, apply_func, mc.cores=cores) } else { local_maxima_list <- lapply(1:n_iterations, apply_func) } diff --git a/conda/meta.yaml b/conda/meta.yaml index 23822cc..64c274d 100644 --- a/conda/meta.yaml +++ b/conda/meta.yaml @@ -1,6 +1,6 @@ package: name: r-cimpl - version: 1.0 + version: 1.0.1 source: git_url: https://github.com/jrderuiter/cimpl.git build: diff --git a/man/doCimplAnalysis.Rd b/man/doCimplAnalysis.Rd index f09cb51..70a90c7 100644 --- a/man/doCimplAnalysis.Rd +++ b/man/doCimplAnalysis.Rd @@ -22,6 +22,7 @@ doCimplAnalysis(data, scales = (1:15) * 10000, n_iterations = 100, sample_period \item{specificity.pattern}{The specificity pattern at which the insertion locations are restricted. If no system is set, the specificity pattern can be user-defined.} \item{lhc.method}{The local hopping correction method. Possible values are 'none' or 'exclude'. When the distance between two neighboring insertions is less than three kernel widths, the insertion with the smallest 'contig_depth' is the hopped insertion. If \code{lhc.method == 'exclude'}, hopped insertions are excluded from the analysis.} \item{verbose}{Echo progress indicators?} + \item{threads}{Number of cores to use when calculating the null model.} } \details{ From 06b039c888b5d1ecb2c4a72f9bba9cf7d071c30f Mon Sep 17 00:00:00 2001 From: Julian de Ruiter Date: Mon, 6 Feb 2017 12:48:28 +0100 Subject: [PATCH 6/7] Changed conda source url, downgraded version. --- DESCRIPTION | 4 ++-- conda/meta.yaml | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index d1f51f7..c51bb04 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Package: cimpl Type: Package Title: Common Insertion site Mapping PLatform -Version: 1.0.1 -Date: 2017-02-06 +Version: 1.0 +Date: 2010-03-23 Author: Jelle ten Hoeve and Jeroen de Ridder Maintainer: Jelle ten Hoeve Description: Analysis package for multi-sample retroviral and transposon insertional mutagenesis data using Gaussian kernel convolution to identify common insertion sites. diff --git a/conda/meta.yaml b/conda/meta.yaml index 64c274d..290d62f 100644 --- a/conda/meta.yaml +++ b/conda/meta.yaml @@ -1,8 +1,8 @@ package: name: r-cimpl - version: 1.0.1 + version: 1.0 source: - git_url: https://github.com/jrderuiter/cimpl.git + git_url: https://github.com/NKI-CCB/cimpl.git build: number: 0 rpaths: From 1d5742a308573c86159ce3efdb374b4233025259 Mon Sep 17 00:00:00 2001 From: Julian de Ruiter Date: Tue, 21 Mar 2017 12:30:54 +0100 Subject: [PATCH 7/7] Update docs + version strings. --- DESCRIPTION | 4 ++-- conda/meta.yaml | 6 +++--- man/doCimplAnalysis.Rd | 6 +++--- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index c51bb04..29e3260 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Package: cimpl Type: Package Title: Common Insertion site Mapping PLatform -Version: 1.0 -Date: 2010-03-23 +Version: 1.1 +Date: 2017-03-21 Author: Jelle ten Hoeve and Jeroen de Ridder Maintainer: Jelle ten Hoeve Description: Analysis package for multi-sample retroviral and transposon insertional mutagenesis data using Gaussian kernel convolution to identify common insertion sites. diff --git a/conda/meta.yaml b/conda/meta.yaml index 290d62f..1c4479c 100644 --- a/conda/meta.yaml +++ b/conda/meta.yaml @@ -1,8 +1,8 @@ package: name: r-cimpl - version: 1.0 + version: 1.1 source: - git_url: https://github.com/NKI-CCB/cimpl.git + path: ../ build: number: 0 rpaths: @@ -31,4 +31,4 @@ about: license: GPL-3 summary: 'An analysis package for multi sample insertional mutagenesis data (including viral- and transposon-based systems) using Gaussian kernel - convolution to identify common insertion sites.' \ No newline at end of file + convolution to identify common insertion sites.' diff --git a/man/doCimplAnalysis.Rd b/man/doCimplAnalysis.Rd index 70a90c7..f459261 100644 --- a/man/doCimplAnalysis.Rd +++ b/man/doCimplAnalysis.Rd @@ -4,7 +4,7 @@ Do "cimpl" analysis } \description{ -Calculate a \linkS4class{cimplAnalysis} object using insertion data. +Calculate a \linkS4class{cimplAnalysis} object using insertion data. } \usage{ doCimplAnalysis(data, scales = (1:15) * 10000, n_iterations = 100, sample_period = 0.1, max_sample_points = 2^19, chromosomes = unique(data$chr), BSgenome, system = c("MMTV", "MuLV", "SB", "PB"), specificity.pattern, lhc.method = c("none", "exclude"), verbose = TRUE) @@ -22,7 +22,7 @@ doCimplAnalysis(data, scales = (1:15) * 10000, n_iterations = 100, sample_period \item{specificity.pattern}{The specificity pattern at which the insertion locations are restricted. If no system is set, the specificity pattern can be user-defined.} \item{lhc.method}{The local hopping correction method. Possible values are 'none' or 'exclude'. When the distance between two neighboring insertions is less than three kernel widths, the insertion with the smallest 'contig_depth' is the hopped insertion. If \code{lhc.method == 'exclude'}, hopped insertions are excluded from the analysis.} \item{verbose}{Echo progress indicators?} - \item{threads}{Number of cores to use when calculating the null model.} + \item{cores}{Number of cores to use when calculating the null model.} } \details{ @@ -32,7 +32,7 @@ Then, for each chromosome and each scale: 1. The data is randomized, i.e each insertion (if \code{lhc.method=='exclude'} hops are excluded) is given a random location on the chromosome. If \code{specificity.pattern != ''} possible locations are restricted accordingly. -2. Gaussian kernel density estimation is applied on the randomized data and the peaks are located. For each peak, the density (or peak height) and the background density at the peak location are added to the (joint) null-distribution. +2. Gaussian kernel density estimation is applied on the randomized data and the peaks are located. For each peak, the density (or peak height) and the background density at the peak location are added to the (joint) null-distribution. 3. Steps 1 and 2 are repeated \code{n_iteration} times.