diff --git a/CHANGES.txt b/CHANGES.txt index fff34e99c..4a02ee074 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -4,6 +4,7 @@ * Added the `std` plot type to plotProfile in Galaxy (issue #782) * `bamCompare` now has a `--skipZeroOverZero` option to allow skipping bins where both input files lack coverage (issue #785) * `bamCompare` and `bigwigCompare` can now take two pseudocounts, in case you want a different value for the numerator and the denominator (issue #784) + * `multiBamSummary` now has a `--scaleFactors` option, which computes scale factors in the same manner as DESeq2 to a file. Note that the produced scaling factors are meant to be used with `bamCoverage`. If you want to use them directly in DESeq2 (or a similar package) you will need to invert them (take 1/scale factor). (issue #800) * Fixed an issue with large numbers of samples and small genome sizes sometimes causing nothing to be processed. (issue #801) 3.1.3 diff --git a/deeptools/countReadsPerBin.py b/deeptools/countReadsPerBin.py index 925fbf890..178dad250 100644 --- a/deeptools/countReadsPerBin.py +++ b/deeptools/countReadsPerBin.py @@ -949,6 +949,29 @@ def remove_row_of_zeros(matrix): return matrix[to_keep, :] +def estimateSizeFactors(m): + """ + Compute size factors in the same way as DESeq2. + The inverse of that is returned, as it's then compatible with bamCoverage. + + m : a numpy ndarray + + >>> m = np.array([[0, 1, 2], [3, 4, 5], [6, 7, 8], [0, 10, 0], [10, 5, 100]]) + >>> sf = estimateSizeFactors(m) + >>> assert(np.all(np.abs(sf - [1.305, 0.9932, 0.783]) < 1e-4)) + >>> m = np.array([[0, 0], [0, 1], [1, 1], [1, 2]]) + >>> sf = estimateSizeFactors(m) + >>> assert(np.all(np.abs(sf - [1.1892, 0.8409]) < 1e-4)) + """ + loggeomeans = np.sum(np.log(m), axis=1) / m.shape[1] + # Mask after computing the geometric mean + m = np.ma.masked_where(m <= 0, m) + loggeomeans = np.ma.masked_where(np.isinf(loggeomeans), loggeomeans) + # DESeq2 ratio-based size factor + sf = np.exp(np.ma.median((np.log(m).T - loggeomeans).T, axis=0)) + return 1. / sf + + class Tester(object): def __init__(self): diff --git a/deeptools/multiBamSummary.py b/deeptools/multiBamSummary.py index 059849133..5d0ca554d 100644 --- a/deeptools/multiBamSummary.py +++ b/deeptools/multiBamSummary.py @@ -105,8 +105,7 @@ def bamcorrelate_args(case='bins'): help='File name to save the coverage matrix. This matrix ' 'can be subsequently plotted using plotCorrelation or ' 'or plotPCA.', - type=parserCommon.writableFile, - required=True) + type=parserCommon.writableFile) optional = parser.add_argument_group('Optional arguments') @@ -172,6 +171,14 @@ def bamcorrelate_args(case='bins'): type=parserCommon.writableFile, metavar='FILE') + group.add_argument('--scalingFactors', + help='Compute scaling factors (in the DESeq2 manner) ' + 'compatible for use with bamCoverage and write them to a ' + 'file. The file has tab-separated columns "sample" and ' + '"scalingFactor".', + type=parserCommon.writableFile, + metavar='FILE') + return parser @@ -205,9 +212,9 @@ def main(args=None): else: bed_regions = None - if len(args.bamfiles) == 1 and not args.outRawCounts: + if len(args.bamfiles) == 1 and not (args.outRawCounts or args.scalingFactors): sys.stderr.write("You've input a single BAM file and not specified " - "--outRawCounts. The resulting output will NOT be " + "--outRawCounts or --scalingFactors. The resulting output will NOT be " "useful with any deepTools program!\n") stepsize = args.binSize + args.distanceBetweenBins @@ -243,11 +250,20 @@ def main(args=None): "region is covered by reads.\n") # numpy will append .npz to the file name if we don't do this... - f = open(args.outFileName, "wb") - np.savez_compressed(f, - matrix=num_reads_per_bin, - labels=args.labels) - f.close() + if args.outFileName: + f = open(args.outFileName, "wb") + np.savez_compressed(f, + matrix=num_reads_per_bin, + labels=args.labels) + f.close() + + if args.scalingFactors: + f = open(args.scalingFactors, 'w') + f.write("sample\tscalingFactor\n") + scalingFactors = countR.estimateSizeFactors(num_reads_per_bin) + for sample, scalingFactor in zip(args.labels, scalingFactors): + f.write("{}\t{:6.4f}\n".format(sample, scalingFactor)) + f.close() if args.outRawCounts: # append to the generated file the diff --git a/deeptools/test/test_multiBamSummary.py b/deeptools/test/test_multiBamSummary.py index b6fc5fdc0..c1716352e 100644 --- a/deeptools/test/test_multiBamSummary.py +++ b/deeptools/test/test_multiBamSummary.py @@ -9,6 +9,8 @@ BAM = ROOT + "test1.bam" CRAM = ROOT + "test1.cram" GTF = ROOT + "test.gtf" +BAMA = ROOT + "testA.bam" +BAMB = ROOT + "testB.bam" def test_multiBamSummary_gtf(): @@ -43,3 +45,12 @@ def test_multiBamSummary_metagene(): nt.assert_allclose(matrix, np.array([[25.0, 25.0], [31.0, 31.0]])) unlink(outfile) + + +def test_multiBamSummary_scalingFactors(): + outfile = '/tmp/test.scalingFactors.txt' + args = 'bins --binSize 50 -b {} {} --scalingFactors {}'.format(BAMA, BAMB, outfile).split() + mbs.main(args) + resp = open(outfile).read().strip().split('\n') + nt.assert_equal(resp, ["sample\tscalingFactor", "testA.bam\t1.1892", "testB.bam\t0.8409"]) + unlink(outfile) diff --git a/galaxy/wrapper/multiBamSummary.xml b/galaxy/wrapper/multiBamSummary.xml index f3c88d9c1..808d43a6d 100644 --- a/galaxy/wrapper/multiBamSummary.xml +++ b/galaxy/wrapper/multiBamSummary.xml @@ -23,6 +23,10 @@ --outRawCounts '$outFileRawCounts' #end if + #if $scalingFactors: + --scalingFactors '$scalingFactorsFile' + #end if + #if $mode.modeOpt == "bins": --binSize '$mode.binSize' --distanceBetweenBins '$mode.distanceBetweenBins' @@ -80,6 +84,7 @@ + @@ -87,6 +92,9 @@ outRawCounts is True + + scalingFactors is True +