Skip to content

Commit

Permalink
Implement800 (#809)
Browse files Browse the repository at this point in the history
* add --sizeFactors

* Still need to make the galaxy wrapper

* Add tests for scalingFactor

* pep8

* wrong function name

* Add --scalingFactors to the galaxy wrapper

* assert equal

* foo
  • Loading branch information
dpryan79 authored Feb 17, 2019
1 parent 96d7ae4 commit d3bd4a5
Show file tree
Hide file tree
Showing 5 changed files with 68 additions and 9 deletions.
1 change: 1 addition & 0 deletions CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
* Added the `std` plot type to plotProfile in Galaxy (issue #782)
* `bamCompare` now has a `--skipZeroOverZero` option to allow skipping bins where both input files lack coverage (issue #785)
* `bamCompare` and `bigwigCompare` can now take two pseudocounts, in case you want a different value for the numerator and the denominator (issue #784)
* `multiBamSummary` now has a `--scaleFactors` option, which computes scale factors in the same manner as DESeq2 to a file. Note that the produced scaling factors are meant to be used with `bamCoverage`. If you want to use them directly in DESeq2 (or a similar package) you will need to invert them (take 1/scale factor). (issue #800)
* Fixed an issue with large numbers of samples and small genome sizes sometimes causing nothing to be processed. (issue #801)

3.1.3
Expand Down
23 changes: 23 additions & 0 deletions deeptools/countReadsPerBin.py
Original file line number Diff line number Diff line change
Expand Up @@ -949,6 +949,29 @@ def remove_row_of_zeros(matrix):
return matrix[to_keep, :]


def estimateSizeFactors(m):
"""
Compute size factors in the same way as DESeq2.
The inverse of that is returned, as it's then compatible with bamCoverage.
m : a numpy ndarray
>>> m = np.array([[0, 1, 2], [3, 4, 5], [6, 7, 8], [0, 10, 0], [10, 5, 100]])
>>> sf = estimateSizeFactors(m)
>>> assert(np.all(np.abs(sf - [1.305, 0.9932, 0.783]) < 1e-4))
>>> m = np.array([[0, 0], [0, 1], [1, 1], [1, 2]])
>>> sf = estimateSizeFactors(m)
>>> assert(np.all(np.abs(sf - [1.1892, 0.8409]) < 1e-4))
"""
loggeomeans = np.sum(np.log(m), axis=1) / m.shape[1]
# Mask after computing the geometric mean
m = np.ma.masked_where(m <= 0, m)
loggeomeans = np.ma.masked_where(np.isinf(loggeomeans), loggeomeans)
# DESeq2 ratio-based size factor
sf = np.exp(np.ma.median((np.log(m).T - loggeomeans).T, axis=0))
return 1. / sf


class Tester(object):

def __init__(self):
Expand Down
34 changes: 25 additions & 9 deletions deeptools/multiBamSummary.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,8 +105,7 @@ def bamcorrelate_args(case='bins'):
help='File name to save the coverage matrix. This matrix '
'can be subsequently plotted using plotCorrelation or '
'or plotPCA.',
type=parserCommon.writableFile,
required=True)
type=parserCommon.writableFile)

optional = parser.add_argument_group('Optional arguments')

Expand Down Expand Up @@ -172,6 +171,14 @@ def bamcorrelate_args(case='bins'):
type=parserCommon.writableFile,
metavar='FILE')

group.add_argument('--scalingFactors',
help='Compute scaling factors (in the DESeq2 manner) '
'compatible for use with bamCoverage and write them to a '
'file. The file has tab-separated columns "sample" and '
'"scalingFactor".',
type=parserCommon.writableFile,
metavar='FILE')

return parser


Expand Down Expand Up @@ -205,9 +212,9 @@ def main(args=None):
else:
bed_regions = None

if len(args.bamfiles) == 1 and not args.outRawCounts:
if len(args.bamfiles) == 1 and not (args.outRawCounts or args.scalingFactors):
sys.stderr.write("You've input a single BAM file and not specified "
"--outRawCounts. The resulting output will NOT be "
"--outRawCounts or --scalingFactors. The resulting output will NOT be "
"useful with any deepTools program!\n")

stepsize = args.binSize + args.distanceBetweenBins
Expand Down Expand Up @@ -243,11 +250,20 @@ def main(args=None):
"region is covered by reads.\n")

# numpy will append .npz to the file name if we don't do this...
f = open(args.outFileName, "wb")
np.savez_compressed(f,
matrix=num_reads_per_bin,
labels=args.labels)
f.close()
if args.outFileName:
f = open(args.outFileName, "wb")
np.savez_compressed(f,
matrix=num_reads_per_bin,
labels=args.labels)
f.close()

if args.scalingFactors:
f = open(args.scalingFactors, 'w')
f.write("sample\tscalingFactor\n")
scalingFactors = countR.estimateSizeFactors(num_reads_per_bin)
for sample, scalingFactor in zip(args.labels, scalingFactors):
f.write("{}\t{:6.4f}\n".format(sample, scalingFactor))
f.close()

if args.outRawCounts:
# append to the generated file the
Expand Down
11 changes: 11 additions & 0 deletions deeptools/test/test_multiBamSummary.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
BAM = ROOT + "test1.bam"
CRAM = ROOT + "test1.cram"
GTF = ROOT + "test.gtf"
BAMA = ROOT + "testA.bam"
BAMB = ROOT + "testB.bam"


def test_multiBamSummary_gtf():
Expand Down Expand Up @@ -43,3 +45,12 @@ def test_multiBamSummary_metagene():
nt.assert_allclose(matrix, np.array([[25.0, 25.0],
[31.0, 31.0]]))
unlink(outfile)


def test_multiBamSummary_scalingFactors():
outfile = '/tmp/test.scalingFactors.txt'
args = 'bins --binSize 50 -b {} {} --scalingFactors {}'.format(BAMA, BAMB, outfile).split()
mbs.main(args)
resp = open(outfile).read().strip().split('\n')
nt.assert_equal(resp, ["sample\tscalingFactor", "testA.bam\t1.1892", "testB.bam\t0.8409"])
unlink(outfile)
8 changes: 8 additions & 0 deletions galaxy/wrapper/multiBamSummary.xml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,10 @@
--outRawCounts '$outFileRawCounts'
#end if
#if $scalingFactors:
--scalingFactors '$scalingFactorsFile'
#end if
#if $mode.modeOpt == "bins":
--binSize '$mode.binSize'
--distanceBetweenBins '$mode.distanceBetweenBins'
Expand Down Expand Up @@ -80,13 +84,17 @@
</expand>

<param argument="--outRawCounts" type="boolean" label="Save raw counts (coverages) to file" help=""/>
<param argument="--scalingFactors" type="boolean" label="Save scaling factors" help="Scaling factors calculated as in DESeq2 and made directly compatible with bamCoverage."/>

</inputs>
<outputs>
<data format="deeptools_coverage_matrix" name="outFile" label="${tool.name} on ${on_string}: correlation matrix" />
<data format="tabular" name="outFileRawCounts" label="${tool.name} on ${on_string}: bin counts">
<filter>outRawCounts is True</filter>
</data>
<data format="tabular" name="scalingFactorsFile" label="${tool.name} on ${on_string}: scaling factors">
<filter>scalingFactors is True</filter>
</data>
</outputs>
<tests>
<test>
Expand Down

0 comments on commit d3bd4a5

Please sign in to comment.