1.3 - Fix extend-sam-basespace, allow for extension over impure As, f…

…ilter peaks with short tail length, local alignment with bowtie2
Monash-RNA-Systems-Biology-Laboratory · Sep 13, 2018 · ceedc4e · ceedc4e
1 parent e76f301
commit ceedc4e
Show file tree

Hide file tree

Showing 8 changed files with 144 additions and 32 deletions.
diff --git a/CHANGES b/CHANGES
@@ -1,4 +1,9 @@
 
+1.3  - Long standing bug in extend-sam-basespace did not extend over lower case letters in reference, fixed.
+       Allow up to 60% mismatch on A extension in extend-sam-basespace.
+       Filter out peaks with short average tail length.
+       Use --sensitive-local mode in bowtie2.
+
 1.2  - Allow raw poly(A) lengthh and templated width download from mPAT shiny report.
        Fix bug setting three prime bounds for cumulative read distribution plots (missing dplyr:: for lead and lag)
 

diff --git a/README.md b/README.md
@@ -217,6 +217,10 @@ for name, tags in tags:
         # clip_runs_basespace = tail_tools.Clip_runs_basespace(
         #    adaptor='AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC', 
         #    clip_quality=0, length=20),
+
+        #To allow for looser mispriming, lower this.
+        #To only allow for stricter mispriming, raise this (maximum 1).
+        #extension_prop_a = 0.6
         ))
 
 
@@ -237,6 +241,16 @@ action = tail_tools.Analyse_polya_batch(
         # (Left blank since it's easy to forget to change.)
         extension = ... ,
 
+        # Size of peak features generated
+        # ie how far back from a site a read can end and still be counted towards it
+        # Should be read length or a little shorter
+        peak_length = 300,
+
+        # Minimum average tail length required to call a peak.
+        # Set higher then 0.0 if there is mispriming.
+        # 15.0 may be reasonable.
+        peak_min_tail = ...,
+
         # Optional: Species to use in GO term analysis, choices are: Sc Ce Mm Hs
         species="Sc",
 

diff --git a/doc/bam-files.md b/doc/bam-files.md
@@ -11,6 +11,9 @@ AA:i:...
 AN:i:...  
 - Gives the observed non-templated poly(A) tail length
 
+AG:i:...
+- Gives the number of templated "A"s. Note this is the length in the genome, but may be as low as 60% actual "A"s in the genome. Non-"A"s are still counted in this length.
+
 AD:i:...  
 - Gives the number of adaptor sequence bases observed after the end of 
   the poly(A) sequence. If the adaptor sequence is observed, then we have

diff --git a/tail_tools/__init__.py b/tail_tools/__init__.py
@@ -1,4 +1,4 @@
-VERSION = '1.2dev'
+VERSION = '1.3'
 #^ Note: this first line is read by the setup.py script to get the version
 
 import nesoni

diff --git a/tail_tools/clip_runs.py b/tail_tools/clip_runs.py
@@ -94,6 +94,7 @@ def run(self):
 @config.Int_flag('min_score', 'Minimum score to call a poly(A) tail, essentially number of As+adaptor bases matched.')
 @config.String_flag('adaptor', 'Adaptor sequence expected after poly-A tail (basespace only).')
 @config.Int_flag('length', 'Minimum length.')
+@config.Int_flag('only', 'Only use first NNN reads (for debugging). 0 means use all reads.')
 @config.Bool_flag('debug', 'Show detected poly-A region and adaptor location in each read.')
 @config.Main_section('filenames', 'Input FASTQ files.')
 class Clip_runs_basespace(config.Action_with_prefix):
@@ -104,6 +105,7 @@ class Clip_runs_basespace(config.Action_with_prefix):
     min_score = 10
     length = 20
     debug = False
+    only = 0
     filenames = [ ]
 
     def run(self):
@@ -278,6 +280,10 @@ def run(self):
 
                     if n%10000 == 0: 
                         grace.status('Clip-runs ' + self.sample + ' ' + grace.pretty_number(n)) # + ' (' + grace.pretty_number(len(dstates)) + ' dstates)')
+
+                    # Option to do a quick subsample
+                    if self.only and self.only <= n:
+                        break
 
         grace.status('')
 

diff --git a/tail_tools/extend_sam.py b/tail_tools/extend_sam.py
@@ -1,6 +1,6 @@
 
 import nesoni
-from nesoni import config, io
+from nesoni import config, io, annotation
 
 import sys, os, array, itertools
 
@@ -326,10 +326,12 @@ def run(self):
 The actual tail length is stored in an 'AN' attribute.
 """)
 @config.Int_flag('tail', 'Minimum tail length.')
+@config.Float_flag('prop_a', 'Percent genomic A required to extend.')
 @config.Main_section('reference_filenames', 'Reference sequences in FASTA format.')
 @config.Section('clips', '.clips.gz file produced by "clip-reads-basespace:".')
 class Extend_sam_basespace(config.Action_filter):
     tail = 4
+    prop_a = 0.6
     reference_filenames = [ ]
     clips = [ ]
 
@@ -353,6 +355,10 @@ def run(self):
         in_file = self.begin_input()
         out_file = self.begin_output()
 
+        assert self.prop_a >= 0.0 and self.prop_a <= 1.0
+        a_score = 1-self.prop_a
+        non_a_score = -self.prop_a
+
         for line in in_file:
             line = line.rstrip()
             if line.startswith('@'):
@@ -364,7 +370,7 @@ def run(self):
             if al.flag & FLAG_UNMAPPED:
                 continue
 
-            ref = references[al.rname]
+            #ref = references[al.rname]
 
             reverse = al.flag & FLAG_REVERSE
             if reverse:
@@ -378,19 +384,53 @@ def run(self):
 
             n_tail = tail_lengths[al.qname]
 
+            #if reverse:
+            #    if al.pos-1-n_tail < 0: continue #TODO: handle tail extending beyond end of reference
+            #    bases_ref = rev_comp(ref[al.pos-1-n_tail:al.pos-1])    
+            #else:
+            #    if al.pos-1+al.length+n_tail > len(ref): continue #TODO: handle tail extending beyond end of reference
+            #    bases_ref = ref[al.pos-1+al.length:al.pos-1+al.length+n_tail] .upper()#upper was missing for a long time. Bug!
+            #
+            #extension = 0
+            #while extension < n_tail and bases_ref[extension] == 'A':
+            #    extension += 1
+
             if reverse:
-                if al.pos-1-n_tail < 0: continue #TODO: handle tail extending beyond end of reference
-                bases_ref = rev_comp(ref[al.pos-1-n_tail:al.pos-1])    
+                feat = annotation.Annotation(al.rname, start=al.pos-1-n_tail, end=al.pos-1, strand=-1)
             else:
-                if al.pos-1+al.length+n_tail > len(ref): continue #TODO: handle tail extending beyond end of reference
-                bases_ref = ref[al.pos-1+al.length:al.pos-1+al.length+n_tail]
+                feat = annotation.Annotation(al.rname, start=al.pos-1+al.length, end=al.pos-1+al.length+n_tail, strand=1)
+            bases_ref = feat.get_seq(references).upper()
+
+            # Allow up to 60% mismatch on As
+            # Treat soft clipping as insertion for simplicity
+            cigar = cigar.replace("S","I")
+            assert "H" not in cigar, "Can't handle hard clipping"
 
             extension = 0
-            while extension < n_tail and bases_ref[extension] == 'A':
-                extension += 1
+            best_score = 0.0
+            score = 0.0
+
+            # Soft clipping treated as a mismatch
+            i = len(cigar)-1
+            while i >= 0 and cigar[i] in "I":
+                score += non_a_score
+                i -= 1
+
+            for i in xrange(n_tail):
+                if bases_ref[i] == "A":
+                    score += a_score
+                else:
+                    score += non_a_score
+
+                if score >= best_score:
+                    extension = i+1
+                    best_score = score
+            #print >> sys.stderr, reverse!=0, n_tail, extension, bases_ref
+
 
             if n_tail-extension > 0:
                 al.extra.append('AN:i:%d' % (n_tail-extension))
+                al.extra.append('AG:i:%d' % (extension))
             if adaptor_bases[al.qname]:
                 al.extra.append('AD:i:%d' % adaptor_bases[al.qname])
             if n_tail-extension >= self.tail:
@@ -402,7 +442,7 @@ def run(self):
                 al.extra.append('AA:i:1')
 
             cigar += 'M' * extension
-            read_bases += 'A' * extension
+            read_bases += 'N' * extension #Since mispriming is so common (and loading the original sequence here would be a pain)
             read_qual += chr(33+20) * extension #Arbitrarily give quality 20
             al.length += extension
             if reverse:
@@ -414,7 +454,7 @@ def run(self):
                 al.seq = read_bases
                 al.qual = read_qual
                 al.cigar = cigar_encode(cigar)
-
+                
             print >> out_file, al
 
         self.end_output(out_file)

diff --git a/tail_tools/peaks.py b/tail_tools/peaks.py
@@ -52,8 +52,10 @@ class Find_peaks(config.Action_with_prefix):
     def run(self):
         spans = collections.defaultdict(list)
 
-        for item in legion.parallel_imap(self._load_bam, self.filenames):
-            for key,value in item.items():
+        #for item in legion.parallel_imap(self._load_bam, self.filenames):
+        #    for key,value in item.items():
+        for filename in self.filenames:
+            for key,value in self._load_bam(filename).items():
                 spans[key].extend(value)
 
         grace.status('Calling peaks')
@@ -64,13 +66,22 @@ def run(self):
         n = 0
 
         for (rname, strand), span_list in spans.items():
-            depth = [ 0.0 ] * (1+max( item[1] for item in span_list ))
-            for start, end in span_list:
+            length = 1+max( item[1] for item in span_list )
+            depth = [ 0.0 ] * length
+            AN_total = [ 0.0 ] * length
+            AG_total = [ 0.0 ] * length 
+            for start, end, AN, AG in span_list:
                 depth[start] += 1.0
                 depth[end] -= 1.0
+                AN_total[start] += AN
+                AN_total[end] -= AN
+                AG_total[start] += AG
+                AG_total[end] -= AG
 
-            for i in xrange(1,len(depth)):
+            for i in xrange(1,length):
                 depth[i] += depth[i-1]
+                AN_total[i] += AN_total[i-1]
+                AG_total[i] += AG_total[i-1]
 
             for start, end in self._find_spans(depth):
                 if end-self.lap-start <= 0: continue
@@ -80,16 +91,20 @@ def run(self):
                 id = 'peak%d' % n
 
                 ann = annotation.Annotation()
-                ann.source = 'nesoni'
+                ann.source = 'tailtools'
                 ann.type = self.type
                 ann.seqid = rname
                 ann.start = start
                 ann.end = end - self.lap
+                assert ann.end == ann.start+1
                 ann.strand = strand
                 ann.score = None
                 ann.phase = None
                 ann.attr = { 
                     'id' : id,
+                    'n' : str(depth[start+self.lap//2]),
+                    'mean_tail' : str(AN_total[start+self.lap//2]/depth[start+self.lap//2]),
+                    'mean_genomic' : str(AG_total[start+self.lap//2]/depth[start+self.lap//2]),
                     'color' : '#00ff00' if strand > 0 else '#0000ff' if strand < 0 else '#008080',
                     }
                 print >> f, ann.as_gff()
@@ -114,6 +129,12 @@ def _load_bam(self, filename):
 
             if self.polya and not any( item.startswith("AA:i:") for item in alignment.extra ):
                 continue
+
+            AN = 0.0
+            AG = 0.0
+            for item in alignment.extra:
+                if item.startswith("AN:i:"): AN = float(item[5:])
+                if item.startswith("AG:i:"): AG = float(item[5:])
 
             strand = -1 if alignment.flag&sam.FLAG_REVERSE else 1
 
@@ -131,7 +152,7 @@ def _load_bam(self, filename):
             rname = alignment.reference_name
             if (rname,strand) not in spans: 
                 spans[(rname,strand)] = [ ]          
-            spans[(rname, strand)].append((start,end+self.lap))
+            spans[(rname, strand)].append((start,end+self.lap, AN,AG))
 
         return spans
 
@@ -179,13 +200,17 @@ def _three_prime(feature):
     result.parents = feature.parents
     return result
 
+
+# Note: this now also does some filtering.
 @config.String_flag('parent')
 @config.String_flag('child')
 @config.Int_flag('extension', 'How far downstrand of the gene can the peak be.')
-class Relate_peaks_to_genes(config.Action_with_prefix):
+@config.Float_flag('min_tail', 'Minimum tail length to retain peak.')
+class Filter_and_relate_peaks_to_genes(config.Action_with_prefix):
     parent = None
     child = None
     extension = None
+    min_tail = 0.0
 
     def run(self):
         items = list(annotation.read_annotations(self.parent))
@@ -204,7 +229,11 @@ def run(self):
         exon_index = span_index.index_annotations(exons)
         utr_index = span_index.index_annotations(utrs)
 
-        peaks = list(annotation.read_annotations(self.child))
+        peaks = [ ]
+        for peak in annotation.read_annotations(self.child):
+            if float(peak.attr.get("mean_tail","0.0")) < self.min_tail:
+                continue
+            peaks.append(peak)
 
         for peak in peaks:
             # Query is final base in genome before poly(A) starts
@@ -272,13 +301,15 @@ def run(self):
 @config.String_flag('annotations', 'Annotation file. A GFF file containing genes, which contain mRNAs, which contain exons and a three_prime_utr.')
 @config.Int_flag('extension', 'How far downstrand of the gene can the peak be.')
 @config.Bool_flag('polya', 'Only use poly(A) reads.')
+@config.Float_flag('min_tail', 'Minimum average tail length to retain peak.')
 @config.Main_section('samples', 'List of sample directories as produced by "analyse-polya:" or "analyse-polya-batch:".')
 class Call_peaks(config.Action_with_output_dir):
     lap = 10
     radius = 50
     min_depth = 50
     peak_length = 100
     polya = True
+    min_tail = 15.0
 
     annotations = None
     extension = None
@@ -307,11 +338,12 @@ def run(self):
             shift_start = str(-self.peak_length),
             ).make()
 
-        Relate_peaks_to_genes(
+        Filter_and_relate_peaks_to_genes(
             outspace/'relation',
             parent = self.annotations,
             child = working/'peaks.gff',
-            extension = self.extension
+            extension = self.extension,
+            min_tail = self.min_tail,
             ).make()