diff --git a/CHANGES.txt b/CHANGES.txt
index fff34e99c..4a02ee074 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -4,6 +4,7 @@
* Added the `std` plot type to plotProfile in Galaxy (issue #782)
* `bamCompare` now has a `--skipZeroOverZero` option to allow skipping bins where both input files lack coverage (issue #785)
* `bamCompare` and `bigwigCompare` can now take two pseudocounts, in case you want a different value for the numerator and the denominator (issue #784)
+ * `multiBamSummary` now has a `--scaleFactors` option, which computes scale factors in the same manner as DESeq2 to a file. Note that the produced scaling factors are meant to be used with `bamCoverage`. If you want to use them directly in DESeq2 (or a similar package) you will need to invert them (take 1/scale factor). (issue #800)
* Fixed an issue with large numbers of samples and small genome sizes sometimes causing nothing to be processed. (issue #801)
3.1.3
diff --git a/deeptools/countReadsPerBin.py b/deeptools/countReadsPerBin.py
index 925fbf890..178dad250 100644
--- a/deeptools/countReadsPerBin.py
+++ b/deeptools/countReadsPerBin.py
@@ -949,6 +949,29 @@ def remove_row_of_zeros(matrix):
return matrix[to_keep, :]
+def estimateSizeFactors(m):
+ """
+ Compute size factors in the same way as DESeq2.
+ The inverse of that is returned, as it's then compatible with bamCoverage.
+
+ m : a numpy ndarray
+
+ >>> m = np.array([[0, 1, 2], [3, 4, 5], [6, 7, 8], [0, 10, 0], [10, 5, 100]])
+ >>> sf = estimateSizeFactors(m)
+ >>> assert(np.all(np.abs(sf - [1.305, 0.9932, 0.783]) < 1e-4))
+ >>> m = np.array([[0, 0], [0, 1], [1, 1], [1, 2]])
+ >>> sf = estimateSizeFactors(m)
+ >>> assert(np.all(np.abs(sf - [1.1892, 0.8409]) < 1e-4))
+ """
+ loggeomeans = np.sum(np.log(m), axis=1) / m.shape[1]
+ # Mask after computing the geometric mean
+ m = np.ma.masked_where(m <= 0, m)
+ loggeomeans = np.ma.masked_where(np.isinf(loggeomeans), loggeomeans)
+ # DESeq2 ratio-based size factor
+ sf = np.exp(np.ma.median((np.log(m).T - loggeomeans).T, axis=0))
+ return 1. / sf
+
+
class Tester(object):
def __init__(self):
diff --git a/deeptools/multiBamSummary.py b/deeptools/multiBamSummary.py
index 059849133..5d0ca554d 100644
--- a/deeptools/multiBamSummary.py
+++ b/deeptools/multiBamSummary.py
@@ -105,8 +105,7 @@ def bamcorrelate_args(case='bins'):
help='File name to save the coverage matrix. This matrix '
'can be subsequently plotted using plotCorrelation or '
'or plotPCA.',
- type=parserCommon.writableFile,
- required=True)
+ type=parserCommon.writableFile)
optional = parser.add_argument_group('Optional arguments')
@@ -172,6 +171,14 @@ def bamcorrelate_args(case='bins'):
type=parserCommon.writableFile,
metavar='FILE')
+ group.add_argument('--scalingFactors',
+ help='Compute scaling factors (in the DESeq2 manner) '
+ 'compatible for use with bamCoverage and write them to a '
+ 'file. The file has tab-separated columns "sample" and '
+ '"scalingFactor".',
+ type=parserCommon.writableFile,
+ metavar='FILE')
+
return parser
@@ -205,9 +212,9 @@ def main(args=None):
else:
bed_regions = None
- if len(args.bamfiles) == 1 and not args.outRawCounts:
+ if len(args.bamfiles) == 1 and not (args.outRawCounts or args.scalingFactors):
sys.stderr.write("You've input a single BAM file and not specified "
- "--outRawCounts. The resulting output will NOT be "
+ "--outRawCounts or --scalingFactors. The resulting output will NOT be "
"useful with any deepTools program!\n")
stepsize = args.binSize + args.distanceBetweenBins
@@ -243,11 +250,20 @@ def main(args=None):
"region is covered by reads.\n")
# numpy will append .npz to the file name if we don't do this...
- f = open(args.outFileName, "wb")
- np.savez_compressed(f,
- matrix=num_reads_per_bin,
- labels=args.labels)
- f.close()
+ if args.outFileName:
+ f = open(args.outFileName, "wb")
+ np.savez_compressed(f,
+ matrix=num_reads_per_bin,
+ labels=args.labels)
+ f.close()
+
+ if args.scalingFactors:
+ f = open(args.scalingFactors, 'w')
+ f.write("sample\tscalingFactor\n")
+ scalingFactors = countR.estimateSizeFactors(num_reads_per_bin)
+ for sample, scalingFactor in zip(args.labels, scalingFactors):
+ f.write("{}\t{:6.4f}\n".format(sample, scalingFactor))
+ f.close()
if args.outRawCounts:
# append to the generated file the
diff --git a/deeptools/test/test_multiBamSummary.py b/deeptools/test/test_multiBamSummary.py
index b6fc5fdc0..c1716352e 100644
--- a/deeptools/test/test_multiBamSummary.py
+++ b/deeptools/test/test_multiBamSummary.py
@@ -9,6 +9,8 @@
BAM = ROOT + "test1.bam"
CRAM = ROOT + "test1.cram"
GTF = ROOT + "test.gtf"
+BAMA = ROOT + "testA.bam"
+BAMB = ROOT + "testB.bam"
def test_multiBamSummary_gtf():
@@ -43,3 +45,12 @@ def test_multiBamSummary_metagene():
nt.assert_allclose(matrix, np.array([[25.0, 25.0],
[31.0, 31.0]]))
unlink(outfile)
+
+
+def test_multiBamSummary_scalingFactors():
+ outfile = '/tmp/test.scalingFactors.txt'
+ args = 'bins --binSize 50 -b {} {} --scalingFactors {}'.format(BAMA, BAMB, outfile).split()
+ mbs.main(args)
+ resp = open(outfile).read().strip().split('\n')
+ nt.assert_equal(resp, ["sample\tscalingFactor", "testA.bam\t1.1892", "testB.bam\t0.8409"])
+ unlink(outfile)
diff --git a/galaxy/wrapper/multiBamSummary.xml b/galaxy/wrapper/multiBamSummary.xml
index f3c88d9c1..808d43a6d 100644
--- a/galaxy/wrapper/multiBamSummary.xml
+++ b/galaxy/wrapper/multiBamSummary.xml
@@ -23,6 +23,10 @@
--outRawCounts '$outFileRawCounts'
#end if
+ #if $scalingFactors:
+ --scalingFactors '$scalingFactorsFile'
+ #end if
+
#if $mode.modeOpt == "bins":
--binSize '$mode.binSize'
--distanceBetweenBins '$mode.distanceBetweenBins'
@@ -80,6 +84,7 @@
+
@@ -87,6 +92,9 @@
outRawCounts is True
+
+ scalingFactors is True
+