Skip to content

Commit

Permalink
1.3 - Fix extend-sam-basespace, allow for extension over impure As, f…
Browse files Browse the repository at this point in the history
…ilter peaks with short tail length, local alignment with bowtie2
  • Loading branch information
pfh committed Sep 13, 2018
1 parent e76f301 commit ceedc4e
Show file tree
Hide file tree
Showing 8 changed files with 144 additions and 32 deletions.
5 changes: 5 additions & 0 deletions CHANGES
Original file line number Diff line number Diff line change
@@ -1,4 +1,9 @@

1.3 - Long standing bug in extend-sam-basespace did not extend over lower case letters in reference, fixed.
Allow up to 60% mismatch on A extension in extend-sam-basespace.
Filter out peaks with short average tail length.
Use --sensitive-local mode in bowtie2.

1.2 - Allow raw poly(A) lengthh and templated width download from mPAT shiny report.
Fix bug setting three prime bounds for cumulative read distribution plots (missing dplyr:: for lead and lag)

Expand Down
14 changes: 14 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -217,6 +217,10 @@ for name, tags in tags:
# clip_runs_basespace = tail_tools.Clip_runs_basespace(
# adaptor='AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC',
# clip_quality=0, length=20),

#To allow for looser mispriming, lower this.
#To only allow for stricter mispriming, raise this (maximum 1).
#extension_prop_a = 0.6
))


Expand All @@ -237,6 +241,16 @@ action = tail_tools.Analyse_polya_batch(
# (Left blank since it's easy to forget to change.)
extension = ... ,

# Size of peak features generated
# ie how far back from a site a read can end and still be counted towards it
# Should be read length or a little shorter
peak_length = 300,

# Minimum average tail length required to call a peak.
# Set higher then 0.0 if there is mispriming.
# 15.0 may be reasonable.
peak_min_tail = ...,

# Optional: Species to use in GO term analysis, choices are: Sc Ce Mm Hs
species="Sc",

Expand Down
3 changes: 3 additions & 0 deletions doc/bam-files.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,9 @@ AA:i:...
AN:i:...
- Gives the observed non-templated poly(A) tail length

AG:i:...
- Gives the number of templated "A"s. Note this is the length in the genome, but may be as low as 60% actual "A"s in the genome. Non-"A"s are still counted in this length.

AD:i:...
- Gives the number of adaptor sequence bases observed after the end of
the poly(A) sequence. If the adaptor sequence is observed, then we have
Expand Down
2 changes: 1 addition & 1 deletion tail_tools/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
VERSION = '1.2dev'
VERSION = '1.3'
#^ Note: this first line is read by the setup.py script to get the version

import nesoni
Expand Down
6 changes: 6 additions & 0 deletions tail_tools/clip_runs.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@ def run(self):
@config.Int_flag('min_score', 'Minimum score to call a poly(A) tail, essentially number of As+adaptor bases matched.')
@config.String_flag('adaptor', 'Adaptor sequence expected after poly-A tail (basespace only).')
@config.Int_flag('length', 'Minimum length.')
@config.Int_flag('only', 'Only use first NNN reads (for debugging). 0 means use all reads.')
@config.Bool_flag('debug', 'Show detected poly-A region and adaptor location in each read.')
@config.Main_section('filenames', 'Input FASTQ files.')
class Clip_runs_basespace(config.Action_with_prefix):
Expand All @@ -104,6 +105,7 @@ class Clip_runs_basespace(config.Action_with_prefix):
min_score = 10
length = 20
debug = False
only = 0
filenames = [ ]

def run(self):
Expand Down Expand Up @@ -278,6 +280,10 @@ def run(self):

if n%10000 == 0:
grace.status('Clip-runs ' + self.sample + ' ' + grace.pretty_number(n)) # + ' (' + grace.pretty_number(len(dstates)) + ' dstates)')

# Option to do a quick subsample
if self.only and self.only <= n:
break

grace.status('')

Expand Down
60 changes: 50 additions & 10 deletions tail_tools/extend_sam.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@

import nesoni
from nesoni import config, io
from nesoni import config, io, annotation

import sys, os, array, itertools

Expand Down Expand Up @@ -326,10 +326,12 @@ def run(self):
The actual tail length is stored in an 'AN' attribute.
""")
@config.Int_flag('tail', 'Minimum tail length.')
@config.Float_flag('prop_a', 'Percent genomic A required to extend.')
@config.Main_section('reference_filenames', 'Reference sequences in FASTA format.')
@config.Section('clips', '.clips.gz file produced by "clip-reads-basespace:".')
class Extend_sam_basespace(config.Action_filter):
tail = 4
prop_a = 0.6
reference_filenames = [ ]
clips = [ ]

Expand All @@ -353,6 +355,10 @@ def run(self):
in_file = self.begin_input()
out_file = self.begin_output()

assert self.prop_a >= 0.0 and self.prop_a <= 1.0
a_score = 1-self.prop_a
non_a_score = -self.prop_a

for line in in_file:
line = line.rstrip()
if line.startswith('@'):
Expand All @@ -364,7 +370,7 @@ def run(self):
if al.flag & FLAG_UNMAPPED:
continue

ref = references[al.rname]
#ref = references[al.rname]

reverse = al.flag & FLAG_REVERSE
if reverse:
Expand All @@ -378,19 +384,53 @@ def run(self):

n_tail = tail_lengths[al.qname]

#if reverse:
# if al.pos-1-n_tail < 0: continue #TODO: handle tail extending beyond end of reference
# bases_ref = rev_comp(ref[al.pos-1-n_tail:al.pos-1])
#else:
# if al.pos-1+al.length+n_tail > len(ref): continue #TODO: handle tail extending beyond end of reference
# bases_ref = ref[al.pos-1+al.length:al.pos-1+al.length+n_tail] .upper()#upper was missing for a long time. Bug!
#
#extension = 0
#while extension < n_tail and bases_ref[extension] == 'A':
# extension += 1

if reverse:
if al.pos-1-n_tail < 0: continue #TODO: handle tail extending beyond end of reference
bases_ref = rev_comp(ref[al.pos-1-n_tail:al.pos-1])
feat = annotation.Annotation(al.rname, start=al.pos-1-n_tail, end=al.pos-1, strand=-1)
else:
if al.pos-1+al.length+n_tail > len(ref): continue #TODO: handle tail extending beyond end of reference
bases_ref = ref[al.pos-1+al.length:al.pos-1+al.length+n_tail]
feat = annotation.Annotation(al.rname, start=al.pos-1+al.length, end=al.pos-1+al.length+n_tail, strand=1)
bases_ref = feat.get_seq(references).upper()

# Allow up to 60% mismatch on As
# Treat soft clipping as insertion for simplicity
cigar = cigar.replace("S","I")
assert "H" not in cigar, "Can't handle hard clipping"

extension = 0
while extension < n_tail and bases_ref[extension] == 'A':
extension += 1
best_score = 0.0
score = 0.0

# Soft clipping treated as a mismatch
i = len(cigar)-1
while i >= 0 and cigar[i] in "I":
score += non_a_score
i -= 1

for i in xrange(n_tail):
if bases_ref[i] == "A":
score += a_score
else:
score += non_a_score

if score >= best_score:
extension = i+1
best_score = score
#print >> sys.stderr, reverse!=0, n_tail, extension, bases_ref


if n_tail-extension > 0:
al.extra.append('AN:i:%d' % (n_tail-extension))
al.extra.append('AG:i:%d' % (extension))
if adaptor_bases[al.qname]:
al.extra.append('AD:i:%d' % adaptor_bases[al.qname])
if n_tail-extension >= self.tail:
Expand All @@ -402,7 +442,7 @@ def run(self):
al.extra.append('AA:i:1')

cigar += 'M' * extension
read_bases += 'A' * extension
read_bases += 'N' * extension #Since mispriming is so common (and loading the original sequence here would be a pain)
read_qual += chr(33+20) * extension #Arbitrarily give quality 20
al.length += extension
if reverse:
Expand All @@ -414,7 +454,7 @@ def run(self):
al.seq = read_bases
al.qual = read_qual
al.cigar = cigar_encode(cigar)

print >> out_file, al

self.end_output(out_file)
Expand Down
54 changes: 43 additions & 11 deletions tail_tools/peaks.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,10 @@ class Find_peaks(config.Action_with_prefix):
def run(self):
spans = collections.defaultdict(list)

for item in legion.parallel_imap(self._load_bam, self.filenames):
for key,value in item.items():
#for item in legion.parallel_imap(self._load_bam, self.filenames):
# for key,value in item.items():
for filename in self.filenames:
for key,value in self._load_bam(filename).items():
spans[key].extend(value)

grace.status('Calling peaks')
Expand All @@ -64,13 +66,22 @@ def run(self):
n = 0

for (rname, strand), span_list in spans.items():
depth = [ 0.0 ] * (1+max( item[1] for item in span_list ))
for start, end in span_list:
length = 1+max( item[1] for item in span_list )
depth = [ 0.0 ] * length
AN_total = [ 0.0 ] * length
AG_total = [ 0.0 ] * length
for start, end, AN, AG in span_list:
depth[start] += 1.0
depth[end] -= 1.0
AN_total[start] += AN
AN_total[end] -= AN
AG_total[start] += AG
AG_total[end] -= AG

for i in xrange(1,len(depth)):
for i in xrange(1,length):
depth[i] += depth[i-1]
AN_total[i] += AN_total[i-1]
AG_total[i] += AG_total[i-1]

for start, end in self._find_spans(depth):
if end-self.lap-start <= 0: continue
Expand All @@ -80,16 +91,20 @@ def run(self):
id = 'peak%d' % n

ann = annotation.Annotation()
ann.source = 'nesoni'
ann.source = 'tailtools'
ann.type = self.type
ann.seqid = rname
ann.start = start
ann.end = end - self.lap
assert ann.end == ann.start+1
ann.strand = strand
ann.score = None
ann.phase = None
ann.attr = {
'id' : id,
'n' : str(depth[start+self.lap//2]),
'mean_tail' : str(AN_total[start+self.lap//2]/depth[start+self.lap//2]),
'mean_genomic' : str(AG_total[start+self.lap//2]/depth[start+self.lap//2]),
'color' : '#00ff00' if strand > 0 else '#0000ff' if strand < 0 else '#008080',
}
print >> f, ann.as_gff()
Expand All @@ -114,6 +129,12 @@ def _load_bam(self, filename):

if self.polya and not any( item.startswith("AA:i:") for item in alignment.extra ):
continue

AN = 0.0
AG = 0.0
for item in alignment.extra:
if item.startswith("AN:i:"): AN = float(item[5:])
if item.startswith("AG:i:"): AG = float(item[5:])

strand = -1 if alignment.flag&sam.FLAG_REVERSE else 1

Expand All @@ -131,7 +152,7 @@ def _load_bam(self, filename):
rname = alignment.reference_name
if (rname,strand) not in spans:
spans[(rname,strand)] = [ ]
spans[(rname, strand)].append((start,end+self.lap))
spans[(rname, strand)].append((start,end+self.lap, AN,AG))

return spans

Expand Down Expand Up @@ -179,13 +200,17 @@ def _three_prime(feature):
result.parents = feature.parents
return result


# Note: this now also does some filtering.
@config.String_flag('parent')
@config.String_flag('child')
@config.Int_flag('extension', 'How far downstrand of the gene can the peak be.')
class Relate_peaks_to_genes(config.Action_with_prefix):
@config.Float_flag('min_tail', 'Minimum tail length to retain peak.')
class Filter_and_relate_peaks_to_genes(config.Action_with_prefix):
parent = None
child = None
extension = None
min_tail = 0.0

def run(self):
items = list(annotation.read_annotations(self.parent))
Expand All @@ -204,7 +229,11 @@ def run(self):
exon_index = span_index.index_annotations(exons)
utr_index = span_index.index_annotations(utrs)

peaks = list(annotation.read_annotations(self.child))
peaks = [ ]
for peak in annotation.read_annotations(self.child):
if float(peak.attr.get("mean_tail","0.0")) < self.min_tail:
continue
peaks.append(peak)

for peak in peaks:
# Query is final base in genome before poly(A) starts
Expand Down Expand Up @@ -272,13 +301,15 @@ def run(self):
@config.String_flag('annotations', 'Annotation file. A GFF file containing genes, which contain mRNAs, which contain exons and a three_prime_utr.')
@config.Int_flag('extension', 'How far downstrand of the gene can the peak be.')
@config.Bool_flag('polya', 'Only use poly(A) reads.')
@config.Float_flag('min_tail', 'Minimum average tail length to retain peak.')
@config.Main_section('samples', 'List of sample directories as produced by "analyse-polya:" or "analyse-polya-batch:".')
class Call_peaks(config.Action_with_output_dir):
lap = 10
radius = 50
min_depth = 50
peak_length = 100
polya = True
min_tail = 15.0

annotations = None
extension = None
Expand Down Expand Up @@ -307,11 +338,12 @@ def run(self):
shift_start = str(-self.peak_length),
).make()

Relate_peaks_to_genes(
Filter_and_relate_peaks_to_genes(
outspace/'relation',
parent = self.annotations,
child = working/'peaks.gff',
extension = self.extension
extension = self.extension,
min_tail = self.min_tail,
).make()


Expand Down
Loading

0 comments on commit ceedc4e

Please sign in to comment.