From 30165e93ffd373828618ef6e83f175347f27935e Mon Sep 17 00:00:00 2001
From: Julian de Ruiter <julianderuiter@gmail.com>
Date: Mon, 6 Feb 2017 11:51:11 +0100
Subject: [PATCH 1/7] Added threads and verbose arguments to cimpl.

---
 R/cimpl.R        | 269 ++++++++++++++++++++++++++---------------------
 R/plot-methods.R |   4 +
 2 files changed, 151 insertions(+), 122 deletions(-)

diff --git a/R/cimpl.R b/R/cimpl.R
index 84fa420..cbf3cb2 100644
--- a/R/cimpl.R
+++ b/R/cimpl.R
@@ -9,9 +9,16 @@ doCimplAnalysis <- function(
 	system = c('MMTV', 'MuLV', 'SB', 'PB'),
 	specificity.pattern,
 	lhc.method = c('none', 'exclude'),  # local hopping correction method
-	verbose=TRUE
+	verbose=TRUE,
+	threads=1
 	) {
 
+	if (any(!chromosomes %in% data$chr)) {
+		warning(sprintf('Dropped chromosomes not present in data: %s',
+						 paste(chromosomes[!chromosomes %in% data$chr], collapse=', ')))
+		chromosomes = chromosomes[chromosomes %in% data$chr]
+	}
+
 	# processing arguments
 	if (!missing(system)) {
 		system <- match.arg(system)
@@ -29,8 +36,8 @@ doCimplAnalysis <- function(
 	}
 
 	lhc.method <- match.arg(lhc.method)
-	
-	print(match.call(expand.dots=TRUE))
+
+	#print(match.call(expand.dots=TRUE))
 	# starting the analysis
 	if (verbose) cat('Starting CIMPL analysis...\n')
 
@@ -38,8 +45,13 @@ doCimplAnalysis <- function(
 		if (verbose) cat(paste('>>>> chromosome', chr, '<<<<\n'))
 
 		if ( specificity.pattern != '' ) {
+
 			strReverse <- function(x) sapply(lapply(strsplit(x, NULL), rev), paste, collapse="")
-			background <- c(matchPattern(specificity.pattern, BSgenome[[chr]])@start, matchPattern(strReverse(specificity.pattern), BSgenome[[chr]])@start)
+
+			background <- c(matchPattern(specificity.pattern, BSgenome[[chr]])@ranges@start, matchPattern(strReverse(specificity.pattern), BSgenome[[chr]])@ranges@start)
+                        # for R 2.11.1
+                        #background <- c(matchPattern(specificity.pattern, BSgenome[[chr]])@start, matchPattern(strReverse(specificity.pattern), BSgenome[[chr]])@start)
+
 		} else {
 			background <- NULL
 		}
@@ -53,7 +65,7 @@ doCimplAnalysis <- function(
 #				if (verbose) cat(paste('Warning: undersampling for h =', h , '!! Sample rate set to', max_sample_points, '. '))
 				n_sample_points <- max_sample_points
 			}
-			
+
 			chr_data <- data[data$chr == chr, ]
 			if (lhc.method != 'none') {
 				chr_data$hop <- .isHop(chr_data, h * 3)
@@ -71,10 +83,10 @@ doCimplAnalysis <- function(
 
 				#if (verbose) cat('null-peaks...')
 				#null_peaks <- .getPeakDistribution(null_densities)
-								
+
 				if (verbose) cat('null-peaks...')
-				null_peaks <- .generatePeakDistribution(chr_length, h, n_insertions, n_iterations, n_sample_points, verbose=verbose)
-				
+				null_peaks <- .generatePeakDistribution(chr_length, h, n_insertions, n_iterations, n_sample_points, verbose=verbose, threads=threads)
+
 				if (verbose) cat('cimpl_object...')
 				cimplObject <- .getCimplObject(chr_data, chr, null_peaks, h=h, n_sample_points=n_sample_points, verbose=verbose)
 
@@ -90,8 +102,8 @@ doCimplAnalysis <- function(
 				#null_peaks <- .getPeakDistribution(null_densities)
 
 				if (verbose) cat('null-peaks...')
-				null_peaks <- .generatePeakDistributionFromBackground(background, h, n_insertions, n_iterations, n_sample_points, verbose=verbose)
-				
+				null_peaks <- .generatePeakDistributionFromBackground(background, h, n_insertions, n_iterations, n_sample_points, verbose=verbose, threads=threads)
+
 				if (verbose) cat('at-density...')
 				bg_density <- density(background, bw=h, n=n_sample_points)
 
@@ -103,7 +115,7 @@ doCimplAnalysis <- function(
 			}
 		})
 	})
-	
+
 	new('cimplAnalysis',
 		cimplObjects = cimplObjects,
 		chromosomes = chromosomes,
@@ -119,54 +131,58 @@ doCimplAnalysis <- function(
 	)
 }
 
-.generatePeakDistribution <- function(chr_length, h, n_insertions, n_iterations, n_sample_points, verbose=TRUE) {
-	local_maxima_list <- lapply(1:n_iterations, function(i) {
-		if (verbose) cat('.')
+.generatePeakDistribution <- function(chr_length, h, n_insertions, n_iterations, n_sample_points, verbose=TRUE, threads=1) {
 
+	apply_func <- function(i) {
 		r_insertions <- sort(floor(runif(n_insertions, min=1, max=chr_length)))
-				
-		# calculate density
-		d <- density(r_insertions, bw=h, n=n_sample_points)
-		
-		# get peaks
-		lms <- .getLocalMaxima(d$y)
+		d <- density(r_insertions, bw=h, n=n_sample_points)  # calculate density
+		lms <- .getLocalMaxima(d$y)                          # get peaks
 		list(max_pos=d$x[lms$max_pos], max_val=lms$max_val)
-	})
-	
+	}
+
+	if (threads > 1) {
+		suppressMessages(library(parallel))
+		local_maxima_list <- mclapply(1:n_iterations, apply_func, mc.cores=threads)
+	} else {
+		local_maxima_list <- lapply(1:n_iterations, apply_func)
+	}
+
 	x <- do.call('c', lapply(local_maxima_list, function(lm) lm$max_pos))
 	y <- do.call('c', lapply(local_maxima_list, function(lm) lm$max_val))
-	
+
 	# sort ascending
 	srt <- sort.int(x, index.return=TRUE)
 	x <- x[srt$ix]
 	y <- y[srt$ix]
-	
+
 	return( list( x=x, y=y ) )
 }
 
 
-.generatePeakDistributionFromBackground <- function(background, h, n_insertions, n_iterations, n_sample_points, verbose=TRUE) {
-	local_maxima_list <- lapply(1:n_iterations, function(i) {
-		if (verbose) cat('.')
-		# draw from TA/AT sites background
+.generatePeakDistributionFromBackground <- function(background, h, n_insertions, n_iterations, n_sample_points, verbose=TRUE, threads=1) {
+
+	apply_func <- function(i) {
 		r_insertions <- sort(sample(background, n_insertions, replace=TRUE))
-		
-		# calculate density
-		d <- density(r_insertions, bw=h, n=n_sample_points)
-		
-		# get peaks
-		lms <- .getLocalMaxima(d$y)
+		d <- density(r_insertions, bw=h, n=n_sample_points)  # calculate density
+		lms <- .getLocalMaxima(d$y)                          # get peaks
 		list(max_pos=d$x[lms$max_pos], max_val=lms$max_val)
-	})
+	}
+
+	if (threads > 1) {
+		suppressMessages(library(parallel))
+		local_maxima_list <- mclapply(1:n_iterations, apply_func, mc.cores=threads)
+	} else {
+		local_maxima_list <- lapply(1:n_iterations, apply_func)
+	}
 
 	x <- do.call('c', lapply(local_maxima_list, function(lm) lm$max_pos))
 	y <- do.call('c', lapply(local_maxima_list, function(lm) lm$max_val))
-	
+
 	# sort ascending
 	srt <- sort.int(x, index.return=TRUE)
 	x <- x[srt$ix]
 	y <- y[srt$ix]
-	
+
 	return( list( x=x, y=y ) )
 }
 
@@ -193,16 +209,20 @@ doCimplAnalysis <- function(
 }
 
 .getHopIndices <- function(location, contig_depth, max.hopping.distance) {
+    if (is.null(contig_depth)) {
+        stop('Missing contig_depth column for hop exclusion')
+    }
+
 	# compute distance matrix between locations
 	dist_m <- as.matrix(dist(location))
-	
+
 	# which insertions (pairs) are within hopping range
 	pairs <- which(dist_m <= max.hopping.distance, arr.ind=TRUE)
-	
+
 	# remove diagonal and duplicates
 	pairs <- pairs[pairs[, 1] < pairs[, 2], , drop=FALSE]
 	#distances <- dist_m[pairs]
-	
+
 	# for each pair, the insertion with the smallest contig depth is the hopped insertion
 	idx1 <- contig_depth[pairs[, 1]] < contig_depth[pairs[, 2]]
 	unique ( c ( pairs[idx1, 1], pairs[!idx1, 2] ) )
@@ -222,14 +242,14 @@ doCimplAnalysis <- function(
 .getHopWeights <- function(location, contig_depth, max.hopping.distance) {
 	# compute distance matrix between locations
 	dist_m <- as.matrix(dist(location))
-	
+
 	# which insertions (pairs) are within hopping range
 	pairs <- which(dist_m <= max.hopping.distance, arr.ind=TRUE)
-	
+
 	# remove diagonal and duplicates (upper part)
 	pairs <- pairs[pairs[, 1] < pairs[, 2], , drop=FALSE]
 	#distances <- dist_m[pairs]
-		
+
 	weights <- rep(1, length(location))
 	for (i in 1:dim(pairs)[1]) {
 		i1 <- pairs[i, 1]
@@ -253,8 +273,8 @@ doCimplAnalysis <- function(
 		if (verbose) cat('.')
 		# draw from uniform background
 		r_insertions <- sort(floor(runif(n_insertions, min=1, max=chr_length)))
-		
-		# calculate density 
+
+		# calculate density
 		density(r_insertions, bw=h, n=n_sample_points)
 	})
 
@@ -268,11 +288,11 @@ doCimplAnalysis <- function(
 		if (verbose) cat('.')
 		# draw from TA/AT sites background
 		r_insertions <- sort(sample(background, n_insertions, replace=TRUE))
-		
+
 		# calculate density
 		density(r_insertions, bw=h, n=n_sample_points)
 	})
-	
+
 	return( null_densities )
 }
 
@@ -289,7 +309,7 @@ doCimplAnalysis <- function(
 	srt <- sort.int(x, index.return=TRUE)
 	x <- x[srt$ix]
 	y <- y[srt$ix]
-	
+
 	return( list( x=x, y=y ) )
 }
 
@@ -311,7 +331,7 @@ doCimplAnalysis <- function(
 		insertions <- chr_data$location[!chr_data$hop]
 	}
 	d <- density(insertions, bw=h, n=n_sample_points)
-	
+
 	# locate peaks
 	lms <- .getLocalMaxima(d$y)
 	peaks <- list( x=d$x[lms$max_pos], y=lms$max_val )
@@ -337,11 +357,11 @@ doCimplAnalysis <- function(
 			est <- kde2d(null_peaks$priors, null_peaks$y, n=100)        # calculate prior-ph density
 		}
 		est$z <- t(apply( est$z, 1, function(x) cumsum(x)/sum(x) )) # make it a CDF-surface
-		
+
 		# calculate p-vals
 		peaks$p_vals <- 1 - .biapprox( est, cbind(peaks$priors, peaks$y) )    # do a bi-linear approximation for the observed ph's - priors.
 		peaks$p_vals[is.na(peaks$p_vals)] <- as.numeric( peaks$y[is.na(peaks$p_vals)] < max(est$y) )  # fix peaks that fall outside the density boundaries
-		
+
 		# somehow, numerical aberrations occur resulting in negative p-vals
 		peaks$p_vals[peaks$p_vals < 0] <- 0
 	} else {
@@ -350,7 +370,7 @@ doCimplAnalysis <- function(
 			sum(null_peaks$y >= height) / length(null_peaks$y)
 		})
 	}
-	
+
 	n_peaks <- length(peaks$x)
 
 	if (!missing(bg_density)) {
@@ -403,7 +423,7 @@ doCimplAnalysis <- function(
 
 .biapprox <- function (obj, loc) {
 	# obj is a surface object like the list for contour or image.
-	# loc is a matrix of (x, y) locations 
+	# loc is a matrix of (x, y) locations
 	x <- obj$x
 	y <- obj$y
 	x.new <- loc[,1]
@@ -445,10 +465,10 @@ doCimplAnalysis <- function(
 }
 
 .getTumorDensities <- function(cimplObject, cumulative=FALSE) {
-	
+
 	ids <- unique(cimplObject@data[[.getSampleIDColName(cimplObject@data)]])
 	tds <- matrix(0, length(cimplObject@kse$x), length(ids))
-	
+
 	for(i in 1:length(ids)) {
 		if (is.null(cimplObject@data$hop)) {
 			insertions <- cimplObject@data$location[cimplObject@data[[.getSampleIDColName(cimplObject@data)]] == ids[i]]
@@ -461,22 +481,22 @@ doCimplAnalysis <- function(
 		}
 	}
 	colnames(tds) <- ids
-	
+
 	if (cumulative) {
 		tds[is.na(tds)] <- 0
 		for(i in 1:dim(tds)[1]) {
 			tds[i,] <- cumsum(tds[i,])
 		}
 	}
-	
+
 	tds
 }
 
 getEnsemblGenes <- function(cimplAnalysis, geneIdentifiers=c('ensembl_gene_id', 'external_gene_id'), mart=useMart("ensembl", dataset = "mmusculus_gene_ensembl")) {
 	genes <- getBM(attributes=c('ensembl_gene_id', 'chromosome_name', 'start_position', 'end_position'), filters='chromosome_name', values=substring(cimplAnalysis@chromosomes, 4), mart=mart)
-	
+
 	# , 'unigene'
-	# filter out unknown 
+	# filter out unknown
 	if (!all(i <- geneIdentifiers %in% listAttributes(mart)[,1])) {
 		stop(paste(geneIdentifiers[!i], ' is/are no valid biomaRt attributes for this mart.'))
 	}
@@ -484,7 +504,7 @@ getEnsemblGenes <- function(cimplAnalysis, geneIdentifiers=c('ensembl_gene_id',
 	if (length(geneIdentifiers) > 0) {
 		if ( !(length(geneIdentifiers) == 1 & geneIdentifiers[1] == 'ensembl_gene_id' )) {
 			annot <- getBM(attributes=unique(c('ensembl_gene_id', geneIdentifiers)), filters='ensembl_gene_id', values=genes$ensembl_gene_id, mart=mart)
-			
+
 			extra_annot <- sapply(genes$ensembl_gene_id, function(ens) {
 				idx <- annot$ensembl_gene_id == ens
 				sapply(2:ncol(annot), function(col) {
@@ -496,7 +516,7 @@ getEnsemblGenes <- function(cimplAnalysis, geneIdentifiers=c('ensembl_gene_id',
 
 			if (!is.null(dim(extra_annot)))
 				extra_annot <- t(extra_annot)
-			
+
 			extra_annot <- data.frame(extra_annot, stringsAsFactors=FALSE)
 			colnames(extra_annot) <- colnames(annot)[-1]
 			genes <- cbind(genes, extra_annot)
@@ -505,20 +525,20 @@ getEnsemblGenes <- function(cimplAnalysis, geneIdentifiers=c('ensembl_gene_id',
 	if (! 'ensembl_gene_id' %in% geneIdentifiers) {
 		genes$ensembl_gene_id <- NULL
 	}
-	
+
 	attr(genes, 'geneIdentifiers') <- geneIdentifiers
-	
+
 	genes
 }
 
 # @deprecated: use getCISs
 .getPeaks <- function(cimplAnalysis, alpha=0.05, mul.test=TRUE, mart=useMart("ensembl", dataset = "mmusculus_gene_ensembl"), genes = getEnsemblGenes(cimplAnalysis, mart), order.by='p_value') {
-		
+
 	df <- do.call('rbind', lapply(cimplAnalysis@chromosomes, function(chr) {
 		chr.idx <- which(cimplAnalysis@chromosomes == chr)
-		
+
 		chr_genes <- genes[genes$chromosome_name == substring(chr, 4), ,drop=FALSE]
-		
+
 		do.call('rbind', lapply(cimplAnalysis@scales, function(kw) {
 
 			kw.idx <- which(cimplAnalysis@scales == kw)
@@ -538,12 +558,12 @@ getEnsemblGenes <- function(cimplAnalysis, geneIdentifiers=c('ensembl_gene_id',
 				} else if ( sum(idx) == 1 ) {
 					as.character(chr_genes[which(idx) ,c(1, 2)])
 				} else {
-					# this peak has hit multiple genes! 
-					c(paste(chr_genes[which(idx) , 1], collapse='|'), 
+					# this peak has hit multiple genes!
+					c(paste(chr_genes[which(idx) , 1], collapse='|'),
 					  paste(chr_genes[which(idx) , 2], collapse='|')  )
 				}
 			})
-	
+
 			data.frame(
 				location         = round(cimpl@peaks$x),
 				chromosome       = rep(chr, cimpl@n_peaks),
@@ -561,7 +581,7 @@ getEnsemblGenes <- function(cimplAnalysis, geneIdentifiers=c('ensembl_gene_id',
 }
 
 .CISThresholdLine <- function(cimplObject, alpha) {
-	
+
 	if ( length(cimplObject@null_cdf) == 0 ) {
 		list(x=cimplObject@kse$x, y=rep( quantile(cimplObject@null_peaks$y, prob=1-alpha) , length(cimplObject@kse$x)))
 	} else {
@@ -597,9 +617,11 @@ getEnsemblGenes <- function(cimplAnalysis, geneIdentifiers=c('ensembl_gene_id',
 	list(start_pos=start_pos, end_pos=end_pos)
 }
 
+## Genes can be supplied from Ensembl as:
+## 	 mart=useMart("ensembl", dataset = "mmusculus_gene_ensembl")
+##   genes = getEnsemblGenes(cimplAnalysis, mart)
+getCISs <- function(cimplAnalysis, alpha=0.05, chromosomes=cimplAnalysis@chromosomes, scales=cimplAnalysis@scales, mul.test=TRUE, genes=NULL, order.by=c('p_value', 'n_insertions'), decreasing=c(FALSE, TRUE)) {
 
-getCISs <- function(cimplAnalysis, alpha=0.05, chromosomes=cimplAnalysis@chromosomes, scales=cimplAnalysis@scales, mul.test=TRUE, mart=useMart("ensembl", dataset = "mmusculus_gene_ensembl"), genes = getEnsemblGenes(cimplAnalysis, mart), order.by=c('p_value', 'n_insertions'), decreasing=c(FALSE, TRUE)) {	
-	
 	df <- do.call('rbind', lapply(chromosomes, function(chr) {
 		chr.idx <- which(cimplAnalysis@chromosomes == chr)
 		do.call('rbind', lapply(scales, function(kw) {
@@ -610,14 +632,14 @@ getCISs <- function(cimplAnalysis, alpha=0.05, chromosomes=cimplAnalysis@chromos
 			} else {
 				n_tests <- 1
 			}
-			
+
 			cimplObject <- cimplAnalysis@cimplObjects[[chr.idx]][[kw.idx]]
-			
+
 			regions <- .getRegionsAboveThreshold(cimplObject, alpha / n_tests)
 			n_regions <- length(regions$start_pos)
 
 			if (n_regions == 0) {
-				return(NULL)	
+				return(NULL)
 			} else if (n_regions == 1) {
 				annDf <- .annotateRegion(chr, kw, regions$start_pos, regions$end_pos, cimplObject, genes)
 			} else {
@@ -625,19 +647,19 @@ getCISs <- function(cimplAnalysis, alpha=0.05, chromosomes=cimplAnalysis@chromos
 					.annotateRegion(chr, kw, regions$start_pos[i], regions$end_pos[i], cimplObject, genes)
 				}))
 			}
-						
+
 			return(annDf)
 		}))
 	}))
 
-	if (is.null(df)) { 
+	if (is.null(df)) {
 		# no CISs found!
 		NULL
 	} else if(dim(df)[1] == 0) {
 		# no CISs found!
 		NULL
 	} else {
-		
+
 		# make some nicely formatted CIS ids and set them as rownames
 		rownames(df) <- .makeCISID(df$chromosome, df$peak_location, df$scale)
 
@@ -652,14 +674,16 @@ getCISs <- function(cimplAnalysis, alpha=0.05, chromosomes=cimplAnalysis@chromos
 .makeCISID <- function(chr, loc, scale) {
 	paste('CIS', substring(chr, 4), ':', loc, '_', .formatScales(scale), sep='')
 }
-	
-.annotateRegion <- function(chr, scale, bpStart, bpEnd, cimplObject, genes) {
-	
-	chr_genes <- genes[genes$chromosome_name == substring(chr, 4), ,drop=FALSE]
-	
+
+.annotateRegion <- function(chr, scale, bpStart, bpEnd, cimplObject, genes=NULL) {
+	if (!is.null(genes))
+		chr_genes <- genes[genes$chromosome_name == substring(chr, 4), ,drop=FALSE]
+	else
+		chr_genes <- NULL
+
 	peak.idx <- which(round(cimplObject@peaks$x) >= bpStart & round(cimplObject@peaks$x) <= bpEnd)
 	n_peaks <- length(peak.idx)
-	
+
 	if (n_peaks == 0) {
 		# CIS contains no peaks
 		return(NULL)
@@ -674,9 +698,9 @@ getCISs <- function(cimplAnalysis, alpha=0.05, chromosomes=cimplAnalysis@chromos
 	}
 }
 
-.annotatePeak <- function(chr, scale, cisStart, cisEnd, peak.idx, cimplObject, chr_genes) {
+.annotatePeak <- function(chr, scale, cisStart, cisEnd, peak.idx, cimplObject, chr_genes=NULL) {
 	peakLoc <- round(cimplObject@peaks$x[peak.idx])
-	
+
 
 	ann <- data.frame(row.names='')
 
@@ -686,26 +710,28 @@ getCISs <- function(cimplAnalysis, alpha=0.05, chromosomes=cimplAnalysis@chromos
 	ann$start <- cisStart
 	ann$end <- cisEnd
 	ann$width <- cisEnd - cisStart + 1
-	
+
 	snappedLocs <- .snapToPeaks(cimplObject@data$location, round(cimplObject@peaks$x))
 	ann$n_insertions <- sum(snappedLocs >= cisStart & snappedLocs <= cisEnd)
 
 	ann$p_value <- cimplObject@peaks$p_vals[peak.idx]
 	ann$scale <- scale
 
-	ags <- .associateGenes(chr, cisStart, cisEnd, peakLoc, chr_genes)
-	for(id in attr(chr_genes, 'geneIdentifiers')) {
-		
-		a <- chr_genes[ags$associated, id]
-		a <- a[a!='']
-		
-		o <- chr_genes[ags$other, id]
-		o <- o[o!='']
-
-		ann[paste('associated_', id, sep='')] <- paste(a, collapse='|')
-		ann[paste('other_', id, sep='')] <- paste(o, collapse='|')
+	if (!is.null(chr_genes)) {
+		ags <- .associateGenes(chr, cisStart, cisEnd, peakLoc, chr_genes)
+		for(id in attr(chr_genes, 'geneIdentifiers')) {
+
+			a <- chr_genes[ags$associated, id]
+			a <- a[a!='']
+
+			o <- chr_genes[ags$other, id]
+			o <- o[o!='']
+
+			ann[paste('associated_', id, sep='')] <- paste(a, collapse='|')
+			ann[paste('other_', id, sep='')] <- paste(o, collapse='|')
+		}
 	}
-	
+
 	return(ann)
 }
 
@@ -714,7 +740,7 @@ getCISs <- function(cimplAnalysis, alpha=0.05, chromosomes=cimplAnalysis@chromos
 	margin <- 100e3
 	# the genes within the region + margin
 	gene.idx <- which(chr_genes$start_position < (cisEnd + margin) & chr_genes$end_position > (cisStart - margin))
-	
+
 	if (length(gene.idx) == 0) {
 		list(
 			associated = numeric(0),
@@ -728,7 +754,7 @@ getCISs <- function(cimplAnalysis, alpha=0.05, chromosomes=cimplAnalysis@chromos
 	} else {
 		# 2. calculate the distances
 		dists <- pmin(abs(chr_genes$start_position[gene.idx] - peakLoc), abs(chr_genes$end_position[gene.idx] - peakLoc))
-		
+
 		# genes which contain the peak get distance 0
 		dists[chr_genes$start_position[gene.idx] <= peakLoc & chr_genes$end_position[gene.idx] >= peakLoc] <- 0
 
@@ -739,10 +765,10 @@ getCISs <- function(cimplAnalysis, alpha=0.05, chromosomes=cimplAnalysis@chromos
 #		dists[curated.idx] <- dists[curated.idx] - 20 * max.dist
 #		dists[automatic.idx] <- dists[automatic.idx] - 10 * max.dist
 
-		gene.order <- order(dists, decreasing=FALSE)	
-		
+		gene.order <- order(dists, decreasing=FALSE)
+
 		min.ties <- length(which(dists == min(dists)))
-		
+
 		list(
 			associated = gene.idx[gene.order][1:min.ties],
 			other      = gene.idx[gene.order][-(1:min.ties)]
@@ -753,7 +779,7 @@ getCISs <- function(cimplAnalysis, alpha=0.05, chromosomes=cimplAnalysis@chromos
 #		)
 	}
 }
-		
+
 .formatScales <- function(scales) {
 	str <- as.character(scales)
 	k.idx <- scales %% 1e3 == 0
@@ -770,29 +796,28 @@ getCISs <- function(cimplAnalysis, alpha=0.05, chromosomes=cimplAnalysis@chromos
 getCISMatrix <- function(cimplAnalysis, ciss) {
 	df <- do.call('rbind', lapply(cimplAnalysis@chromosomes, function(chr) {
 		chr.idx <- which(cimplAnalysis@chromosomes == chr)
-		
-		
+
 		chr_data <- cimplAnalysis@cimplObjects[[chr.idx]][[1]]@data
-		
+
 		cisids <- do.call('cbind', lapply(cimplAnalysis@scales, function(kw) {
-							
+
 			kw.idx <- which(cimplAnalysis@scales == kw)
 			cimplObject <- cimplAnalysis@cimplObjects[[chr.idx]][[kw.idx]]
-			
+
 			# snap insertions to peaks (see http://bioinformatics.nki.nl/forum/viewtopic.php?f=4&t=19)
 			snappedLocs <- .snapToPeaks(chr_data$location, cimplObject@peaks$x)
-			
+
 			insertion2cis <- rep('', dim(chr_data)[1])
-			
+
 			ciss.idx <- which(ciss$chromosome == chr & ciss$scale == kw)
 			for (i in ciss.idx) {
 #				insertion2cis[snappedLocs >= ciss$start[i] & snappedLocs <= ciss$end[i]] <- rownames(ciss)[i]
 #				insertion2cis[snappedLocs == ciss$peak_location[i]] <- rownames(ciss)[i]
 				locs.idx <- snappedLocs >= ciss$start[i] & snappedLocs <= ciss$end[i]
-				
+
 				insertion2cis[locs.idx] <- paste(insertion2cis[locs.idx], rownames(ciss)[i], sep='|')
 			}
-			
+
 			substring(insertion2cis, 2)
 		}))
 		colnames(cisids) <- cimplAnalysis@scales
@@ -805,24 +830,24 @@ getInsertions <- function(cimplAnalysis, chr, scale, bpLim) {
 	scale.idx <- which(cimplAnalysis@scales == scale)
 	chr_data <- cimplAnalysis@cimplObjects[[chr.idx]][[scale.idx]]@data
 	#chr_data[chr_data$location >= bpLim[1] & chr_data$location <= bpLim[2], ]
-	
+
 	# snap insertions to peaks (see http://bioinformatics.nki.nl/forum/viewtopic.php?f=4&t=19)
 	snappedLocs <- .snapToPeaks(chr_data$location, cimplAnalysis@cimplObjects[[chr.idx]][[scale.idx]]@peaks$x)
 	chr_data[snappedLocs >= bpLim[1] & snappedLocs <= bpLim[2], ]
 }
 
-.nToString <- function(n, nDigits, prefix='', postfix='') { 
-	out <- NULL 
+.nToString <- function(n, nDigits, prefix='', postfix='') {
+	out <- NULL
 	sapply(n, function(n) {
-		while(n > 0) { 
-			out <- c(n%%10 , out) 
+		while(n > 0) {
+			out <- c(n%%10 , out)
 			n <- n%/%10
-		} 
-		
+		}
+
 		if (length(out) < nDigits) {
 			out <- c( rep(0, nDigits - length(out)) , out)
 		}
-		
+
 		paste(prefix, paste(out, collapse=''), postfix, sep='')
 	})
 }
diff --git a/R/plot-methods.R b/R/plot-methods.R
index b6e8db0..1b48a26 100644
--- a/R/plot-methods.R
+++ b/R/plot-methods.R
@@ -1,3 +1,7 @@
+
+setGeneric(name='plot', 
+           def=function(x, y, ...) { standardGeneric('plot') })
+
 setMethod("plot", signature(x="cimplAnalysis", y="missing"), function(x, y, type=c('kse', 'null.cdf', 'scale.space'), chr=x@chromosomes[1], scale=x@scales[1], alpha=0.05, mul.test=TRUE, bpLim, plot.tumor.densities=FALSE, interactive=TRUE, ...) {
 	chr.idx <- which(x@chromosomes == chr)
 	kw.idx <- which(x@scales == scale)

From 46c1b034092c80d5e3483e632669a659c96b2c3c Mon Sep 17 00:00:00 2001
From: Julian de Ruiter <julianderuiter@gmail.com>
Date: Mon, 6 Feb 2017 11:51:24 +0100
Subject: [PATCH 2/7] Adjusted formatting, dependencies.

---
 DESCRIPTION | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index a32abfd..c074d41 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -7,9 +7,12 @@ Author: Jelle ten Hoeve and Jeroen de Ridder
 Maintainer: Jelle ten Hoeve <j.t.hoeve@nki.nl>
 Description: Analysis package for multi-sample retroviral and transposon insertional mutagenesis data using Gaussian kernel convolution to identify common insertion sites.
 License: GPL-3
-Depends: methods, Biostrings, KernSmooth, MASS, biomaRt, xtable
+Depends: R (>= 2.10), methods, Biostrings, KernSmooth, MASS, biomaRt,
+        xtable
 Suggests: BSgenome.Mmusculus.UCSC.mm9
 Enhances:
-Collate: 'AllClasses.R' 'cimpl.R' 'plot-methods.R' 'export-methods.R' 'show-methods.R'
+Collate: 'AllClasses.R' 'cimpl.R' 'plot-methods.R' 'export-methods.R'
+        'show-methods.R'
 LazyLoad: yes
 biocViews: Genetics, Visualization
+Packaged: 2014-02-12 17:27:55 UTC; ar12

From 13c711a5f6636257dc47a78862f8690127742950 Mon Sep 17 00:00:00 2001
From: Julian de Ruiter <julianderuiter@gmail.com>
Date: Mon, 6 Feb 2017 11:51:31 +0100
Subject: [PATCH 3/7] Added datalist file.

---
 data/datalist | 2 ++
 1 file changed, 2 insertions(+)
 create mode 100644 data/datalist

diff --git a/data/datalist b/data/datalist
new file mode 100644
index 0000000..03a53ae
--- /dev/null
+++ b/data/datalist
@@ -0,0 +1,2 @@
+colorectal
+sampleCa

From 67e35fd9b4ca0bade9b0bf164908d8087876f08f Mon Sep 17 00:00:00 2001
From: Julian de Ruiter <julianderuiter@gmail.com>
Date: Mon, 6 Feb 2017 11:52:04 +0100
Subject: [PATCH 4/7] Added conda recipe.

---
 conda/build.sh  | 16 ++++++++++++++++
 conda/meta.yaml | 34 ++++++++++++++++++++++++++++++++++
 2 files changed, 50 insertions(+)
 create mode 100644 conda/build.sh
 create mode 100644 conda/meta.yaml

diff --git a/conda/build.sh b/conda/build.sh
new file mode 100644
index 0000000..1ceedcf
--- /dev/null
+++ b/conda/build.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+
+# R refuses to build packages that mark themselves as
+# "Priority: Recommended"
+mv DESCRIPTION DESCRIPTION.old
+grep -v '^Priority: ' DESCRIPTION.old > DESCRIPTION
+#
+$R CMD INSTALL --build .
+#
+# # Add more build steps here, if they are necessary.
+#
+# See
+# http://docs.continuum.io/conda/build.html
+# for a list of environment variables that are set during the build
+# process.
+#
\ No newline at end of file
diff --git a/conda/meta.yaml b/conda/meta.yaml
new file mode 100644
index 0000000..23822cc
--- /dev/null
+++ b/conda/meta.yaml
@@ -0,0 +1,34 @@
+package:
+  name: r-cimpl
+  version: 1.0
+source:
+  git_url: https://github.com/jrderuiter/cimpl.git
+build:
+  number: 0
+  rpaths:
+    - lib/R/lib/
+    - lib/
+requirements:
+  build:
+    - r
+    - 'r-kernsmooth'
+    - 'r-mass'
+    - 'r-xtable'
+    - 'bioconductor-biomart'
+    - 'bioconductor-biostrings'
+  run:
+    - r
+    - 'r-kernsmooth'
+    - 'r-mass'
+    - 'r-xtable'
+    - 'bioconductor-biomart'
+    - 'bioconductor-biostrings'
+test:
+  commands:
+    - '$R -e "library(''cimpl'')"'
+about:
+  home: http://ccb.nki.nl/software/
+  license: GPL-3
+  summary: 'An analysis package for multi sample insertional mutagenesis data
+    (including viral- and transposon-based systems) using Gaussian kernel
+    convolution to identify common insertion sites.'
\ No newline at end of file

From f2607a8fba4aa6fa64558b5d66173ef0393eccf7 Mon Sep 17 00:00:00 2001
From: Julian de Ruiter <julianderuiter@gmail.com>
Date: Mon, 6 Feb 2017 12:41:02 +0100
Subject: [PATCH 5/7] Rename threads -> cores, added parameter documentation.

---
 DESCRIPTION            |  4 ++--
 R/cimpl.R              | 18 +++++++++---------
 conda/meta.yaml        |  2 +-
 man/doCimplAnalysis.Rd |  1 +
 4 files changed, 13 insertions(+), 12 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index c074d41..d1f51f7 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,8 +1,8 @@
 Package: cimpl
 Type: Package
 Title: Common Insertion site Mapping PLatform
-Version: 0.99.22
-Date: 2010-03-23
+Version: 1.0.1
+Date: 2017-02-06
 Author: Jelle ten Hoeve and Jeroen de Ridder
 Maintainer: Jelle ten Hoeve <j.t.hoeve@nki.nl>
 Description: Analysis package for multi-sample retroviral and transposon insertional mutagenesis data using Gaussian kernel convolution to identify common insertion sites.
diff --git a/R/cimpl.R b/R/cimpl.R
index cbf3cb2..94cced3 100644
--- a/R/cimpl.R
+++ b/R/cimpl.R
@@ -10,7 +10,7 @@ doCimplAnalysis <- function(
 	specificity.pattern,
 	lhc.method = c('none', 'exclude'),  # local hopping correction method
 	verbose=TRUE,
-	threads=1
+	cores=1
 	) {
 
 	if (any(!chromosomes %in% data$chr)) {
@@ -85,7 +85,7 @@ doCimplAnalysis <- function(
 				#null_peaks <- .getPeakDistribution(null_densities)
 
 				if (verbose) cat('null-peaks...')
-				null_peaks <- .generatePeakDistribution(chr_length, h, n_insertions, n_iterations, n_sample_points, verbose=verbose, threads=threads)
+				null_peaks <- .generatePeakDistribution(chr_length, h, n_insertions, n_iterations, n_sample_points, verbose=verbose, cores=cores)
 
 				if (verbose) cat('cimpl_object...')
 				cimplObject <- .getCimplObject(chr_data, chr, null_peaks, h=h, n_sample_points=n_sample_points, verbose=verbose)
@@ -102,7 +102,7 @@ doCimplAnalysis <- function(
 				#null_peaks <- .getPeakDistribution(null_densities)
 
 				if (verbose) cat('null-peaks...')
-				null_peaks <- .generatePeakDistributionFromBackground(background, h, n_insertions, n_iterations, n_sample_points, verbose=verbose, threads=threads)
+				null_peaks <- .generatePeakDistributionFromBackground(background, h, n_insertions, n_iterations, n_sample_points, verbose=verbose, cores=cores)
 
 				if (verbose) cat('at-density...')
 				bg_density <- density(background, bw=h, n=n_sample_points)
@@ -131,7 +131,7 @@ doCimplAnalysis <- function(
 	)
 }
 
-.generatePeakDistribution <- function(chr_length, h, n_insertions, n_iterations, n_sample_points, verbose=TRUE, threads=1) {
+.generatePeakDistribution <- function(chr_length, h, n_insertions, n_iterations, n_sample_points, verbose=TRUE, cores=1) {
 
 	apply_func <- function(i) {
 		r_insertions <- sort(floor(runif(n_insertions, min=1, max=chr_length)))
@@ -140,9 +140,9 @@ doCimplAnalysis <- function(
 		list(max_pos=d$x[lms$max_pos], max_val=lms$max_val)
 	}
 
-	if (threads > 1) {
+	if (cores > 1) {
 		suppressMessages(library(parallel))
-		local_maxima_list <- mclapply(1:n_iterations, apply_func, mc.cores=threads)
+		local_maxima_list <- mclapply(1:n_iterations, apply_func, mc.cores=cores)
 	} else {
 		local_maxima_list <- lapply(1:n_iterations, apply_func)
 	}
@@ -159,7 +159,7 @@ doCimplAnalysis <- function(
 }
 
 
-.generatePeakDistributionFromBackground <- function(background, h, n_insertions, n_iterations, n_sample_points, verbose=TRUE, threads=1) {
+.generatePeakDistributionFromBackground <- function(background, h, n_insertions, n_iterations, n_sample_points, verbose=TRUE, cores=1) {
 
 	apply_func <- function(i) {
 		r_insertions <- sort(sample(background, n_insertions, replace=TRUE))
@@ -168,9 +168,9 @@ doCimplAnalysis <- function(
 		list(max_pos=d$x[lms$max_pos], max_val=lms$max_val)
 	}
 
-	if (threads > 1) {
+	if (cores > 1) {
 		suppressMessages(library(parallel))
-		local_maxima_list <- mclapply(1:n_iterations, apply_func, mc.cores=threads)
+		local_maxima_list <- mclapply(1:n_iterations, apply_func, mc.cores=cores)
 	} else {
 		local_maxima_list <- lapply(1:n_iterations, apply_func)
 	}
diff --git a/conda/meta.yaml b/conda/meta.yaml
index 23822cc..64c274d 100644
--- a/conda/meta.yaml
+++ b/conda/meta.yaml
@@ -1,6 +1,6 @@
 package:
   name: r-cimpl
-  version: 1.0
+  version: 1.0.1
 source:
   git_url: https://github.com/jrderuiter/cimpl.git
 build:
diff --git a/man/doCimplAnalysis.Rd b/man/doCimplAnalysis.Rd
index f09cb51..70a90c7 100644
--- a/man/doCimplAnalysis.Rd
+++ b/man/doCimplAnalysis.Rd
@@ -22,6 +22,7 @@ doCimplAnalysis(data, scales = (1:15) * 10000, n_iterations = 100, sample_period
   \item{specificity.pattern}{The specificity pattern at which the insertion locations are restricted. If no system is set, the specificity pattern can be user-defined.}
   \item{lhc.method}{The local hopping correction method. Possible values are 'none' or 'exclude'. When the distance between two neighboring insertions is less than three kernel widths, the insertion with the smallest 'contig_depth' is the hopped insertion. If \code{lhc.method == 'exclude'}, hopped insertions are excluded from the analysis.}
   \item{verbose}{Echo progress indicators?}
+  \item{threads}{Number of cores to use when calculating the null model.}
 }
 \details{
 

From 06b039c888b5d1ecb2c4a72f9bba9cf7d071c30f Mon Sep 17 00:00:00 2001
From: Julian de Ruiter <julianderuiter@gmail.com>
Date: Mon, 6 Feb 2017 12:48:28 +0100
Subject: [PATCH 6/7] Changed conda source url, downgraded version.

---
 DESCRIPTION     | 4 ++--
 conda/meta.yaml | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index d1f51f7..c51bb04 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,8 +1,8 @@
 Package: cimpl
 Type: Package
 Title: Common Insertion site Mapping PLatform
-Version: 1.0.1
-Date: 2017-02-06
+Version: 1.0
+Date: 2010-03-23
 Author: Jelle ten Hoeve and Jeroen de Ridder
 Maintainer: Jelle ten Hoeve <j.t.hoeve@nki.nl>
 Description: Analysis package for multi-sample retroviral and transposon insertional mutagenesis data using Gaussian kernel convolution to identify common insertion sites.
diff --git a/conda/meta.yaml b/conda/meta.yaml
index 64c274d..290d62f 100644
--- a/conda/meta.yaml
+++ b/conda/meta.yaml
@@ -1,8 +1,8 @@
 package:
   name: r-cimpl
-  version: 1.0.1
+  version: 1.0
 source:
-  git_url: https://github.com/jrderuiter/cimpl.git
+  git_url: https://github.com/NKI-CCB/cimpl.git
 build:
   number: 0
   rpaths:

From 1d5742a308573c86159ce3efdb374b4233025259 Mon Sep 17 00:00:00 2001
From: Julian de Ruiter <julianderuiter@gmail.com>
Date: Tue, 21 Mar 2017 12:30:54 +0100
Subject: [PATCH 7/7] Update docs + version strings.

---
 DESCRIPTION            | 4 ++--
 conda/meta.yaml        | 6 +++---
 man/doCimplAnalysis.Rd | 6 +++---
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index c51bb04..29e3260 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,8 +1,8 @@
 Package: cimpl
 Type: Package
 Title: Common Insertion site Mapping PLatform
-Version: 1.0
-Date: 2010-03-23
+Version: 1.1
+Date: 2017-03-21
 Author: Jelle ten Hoeve and Jeroen de Ridder
 Maintainer: Jelle ten Hoeve <j.t.hoeve@nki.nl>
 Description: Analysis package for multi-sample retroviral and transposon insertional mutagenesis data using Gaussian kernel convolution to identify common insertion sites.
diff --git a/conda/meta.yaml b/conda/meta.yaml
index 290d62f..1c4479c 100644
--- a/conda/meta.yaml
+++ b/conda/meta.yaml
@@ -1,8 +1,8 @@
 package:
   name: r-cimpl
-  version: 1.0
+  version: 1.1
 source:
-  git_url: https://github.com/NKI-CCB/cimpl.git
+  path: ../
 build:
   number: 0
   rpaths:
@@ -31,4 +31,4 @@ about:
   license: GPL-3
   summary: 'An analysis package for multi sample insertional mutagenesis data
     (including viral- and transposon-based systems) using Gaussian kernel
-    convolution to identify common insertion sites.'
\ No newline at end of file
+    convolution to identify common insertion sites.'
diff --git a/man/doCimplAnalysis.Rd b/man/doCimplAnalysis.Rd
index 70a90c7..f459261 100644
--- a/man/doCimplAnalysis.Rd
+++ b/man/doCimplAnalysis.Rd
@@ -4,7 +4,7 @@
 Do "cimpl" analysis
 }
 \description{
-Calculate a \linkS4class{cimplAnalysis} object using insertion data. 
+Calculate a \linkS4class{cimplAnalysis} object using insertion data.
 }
 \usage{
 doCimplAnalysis(data, scales = (1:15) * 10000, n_iterations = 100, sample_period = 0.1, max_sample_points = 2^19, chromosomes = unique(data$chr), BSgenome, system = c("MMTV", "MuLV", "SB", "PB"), specificity.pattern, lhc.method = c("none", "exclude"), verbose = TRUE)
@@ -22,7 +22,7 @@ doCimplAnalysis(data, scales = (1:15) * 10000, n_iterations = 100, sample_period
   \item{specificity.pattern}{The specificity pattern at which the insertion locations are restricted. If no system is set, the specificity pattern can be user-defined.}
   \item{lhc.method}{The local hopping correction method. Possible values are 'none' or 'exclude'. When the distance between two neighboring insertions is less than three kernel widths, the insertion with the smallest 'contig_depth' is the hopped insertion. If \code{lhc.method == 'exclude'}, hopped insertions are excluded from the analysis.}
   \item{verbose}{Echo progress indicators?}
-  \item{threads}{Number of cores to use when calculating the null model.}
+  \item{cores}{Number of cores to use when calculating the null model.}
 }
 \details{
 
@@ -32,7 +32,7 @@ Then, for each chromosome and each scale:
 
 1. The data is randomized, i.e each insertion (if \code{lhc.method=='exclude'} hops are excluded) is given a random location on the chromosome. If \code{specificity.pattern != ''} possible locations are restricted accordingly.
 
-2. Gaussian kernel density estimation is applied on the randomized data and the peaks are located. For each peak, the density (or peak height) and the background density at the peak location are added to the (joint) null-distribution. 
+2. Gaussian kernel density estimation is applied on the randomized data and the peaks are located. For each peak, the density (or peak height) and the background density at the peak location are added to the (joint) null-distribution.
 
 3. Steps 1 and 2 are repeated \code{n_iteration} times.