Update version Sniffles 2.2
lfpaulin committed Jul 6, 2023
1 parent 60e7015 commit 31cf016
Showing 13 changed files with 334 additions and 60 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -1,3 +1,5 @@
.DS_Store
+src/sniffles.egg-info
+dist
__pycache__
*.pyc
10 changes: 5 additions & 5 deletions README.md
@@ -21,7 +21,7 @@ or

If you previously installed Sniffles1 using conda and want to upgrade to Sniffles2, you can use:

-`conda update sniffles=2.0`
+`conda update sniffles=2.2`

## Requirements
* Python >= 3.7
@@ -36,7 +36,7 @@ Please cite our paper at:

https://www.nature.com/articles/s41592-018-0001-7

A new preprint for the new methods and improvements introduced with Sniffles2 is here: https://www.biorxiv.org/content/10.1101/2022.04.04.487055v1

## Use-Cases / Modes

@@ -55,10 +55,10 @@ Multi-sample SV calling using Sniffles2 population mode works in two steps:
Alternatively, for step 2 you can supply a .tsv file containing a list of .snf files, with custom sample IDs in an optional second column (one sample per line), e.g.:
2. Combined calling using a .tsv as sample list: `sniffles --input snf_files_list.tsv --vcf multisample.vcf`
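
For illustration, a minimal hypothetical `snf_files_list.tsv` could look like this (tab-separated; paths and sample IDs are made up, and the second column may be omitted):

```
/data/sample1.snf	SAMPLE1
/data/sample2.snf	SAMPLE2
/data/sample3.snf	SAMPLE3
```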

-### C. Non-Germline SV Calling (Somatic)
-To call non-germline SVs (i.e. somatic/mosaic) SVs, the *--non-germline* option should be added, i.e.:
+### C. Mosaic SV Calling (Non-germline or somatic SVs)
+To call mosaic SVs, the *--mosaic* option should be added, i.e.:

-`sniffles --input mapped_input.bam --vcf output.vcf --non-germline`
+`sniffles --input mapped_input.bam --vcf output.vcf --mosaic`

### D. Genotyping a known set of SVs (Force Calling)
Example command, to determine the genotype of each SV in *input_known_svs.vcf* for *sample.bam* and write the re-genotyped SVs to *output_genotypes.vcf*:
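
In current Sniffles2 releases this mode is driven by the `--genotype-vcf` option; a command along these lines (file names illustrative) should work:

`sniffles --input sample.bam --genotype-vcf input_known_svs.vcf --vcf output_genotypes.vcf`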
2 changes: 1 addition & 1 deletion setup.cfg
@@ -1,6 +1,6 @@
[metadata]
name = sniffles
-version = 2.0.7
+version = 2.2
author = Moritz Smolka
author_email = moritz.g.smolka@gmail.com
description = A fast structural variation caller for long-read sequencing data
9 changes: 8 additions & 1 deletion src/sniffles/cluster.py
@@ -36,6 +36,7 @@ def compute_metrics(self,max_n=100):
self.stdev_start=0
return

# REVIEW: might need fix based on issue #407
step=int(len(self.leads)/n)
if n>1:
self.mean_svlen=sum(self.leads[i].svlen for i in range(0,len(self.leads),step))/float(n)
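
The stride sampling above estimates per-cluster statistics from at most `max_n` evenly spaced leads rather than all of them. A self-contained sketch of the idea (data invented; note that the original divides by `n` even though `range(0,len,step)` can visit more than `n` elements when `len/n` is not integral, which may be what the REVIEW comment above refers to):

```python
# Sketch of stride sampling as in compute_metrics: approximate the mean SVLEN
# from at most max_n evenly spaced leads instead of the full list.
def approx_mean_svlen(svlens, max_n=100):
    n = min(max_n, len(svlens))
    if n == 0:
        return 0
    step = int(len(svlens) / n)  # step >= 1 because n <= len(svlens)
    sampled = [svlens[i] for i in range(0, len(svlens), step)]
    # Dividing by the actual sample count keeps the estimate unbiased.
    return sum(sampled) / float(len(sampled))

print(approx_mean_svlen(list(range(1, 1001))))  # ~496 vs. the true mean 500.5
```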
@@ -247,6 +248,12 @@ def resolve(svtype,leadtab_provider,config,tr):
i=max(0,i-2)
i+=1

if config.dev_trace_read:
for c in clusters:
for ld in c.leads:
if ld.read_qname==config.dev_trace_read:
print(f"[DEV_TRACE_READ [2/4] [cluster.resolve] Read lead {ld} is in cluster {c.id}, containing a total of {len(c.leads)} leads")

if config.dev_dump_clusters:
filename=f"{config.input}.clusters.{svtype}.{leadtab_provider.contig}.{leadtab_provider.start}.{leadtab_provider.end}.bed"
print(f"Dumping clusters to {filename}")
@@ -265,7 +272,7 @@ def resolve(svtype,leadtab_provider,config,tr):
if config.dev_no_resplit:
yield cluster
else:
-for new_cluster in resplit_bnd(cluster,merge_threshold=config.bnd_cluster_resplit):
+for new_cluster in resplit_bnd(cluster,merge_threshold=config.cluster_merge_bnd):
yield new_cluster
else:
if svtype=="INS" or svtype=="DEL":
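
The DEV_TRACE_READ prints added here and in leadprov.py and parallel.py below form a numbered [0b/4] through [4/4] trail that follows a single read, identified by its query name, from fetching and lead extraction through clustering to the finalized SV call. Assuming the config value is exposed as a CLI flag of the same name (not confirmed in this diff), usage would look like:

`sniffles --input mapped_input.bam --vcf output.vcf --dev-trace-read <read_query_name>`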
76 changes: 50 additions & 26 deletions src/sniffles/config.py

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions src/sniffles/consensus.py
@@ -354,6 +354,7 @@ def novel_from_reads(best_lead,other_leads,klen,skip,skip_repetitive,debug=False
conseq_new.append("-"*len(buffer))
conseq="".join(conseq_new)

# FIXME: can lead to ZeroDivisionError
if span/float(len(best_lead.seq)) > minspan:
alignments.append(conseq)

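
The FIXME above flags the ratio test `span/float(len(best_lead.seq)) > minspan`, which raises ZeroDivisionError when the backbone lead carries an empty sequence. One possible guard, shown here only as a sketch and not as the project's actual fix:

```python
def passes_minspan(span, seq, minspan):
    # Guarded version of the consensus span check: an empty backbone
    # sequence fails the test instead of raising ZeroDivisionError.
    return len(seq) > 0 and span / float(len(seq)) > minspan

print(passes_minspan(80, "A" * 100, 0.5))  # True  (0.8 > 0.5)
print(passes_minspan(80, "", 0.5))         # False (no crash)
```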
99 changes: 94 additions & 5 deletions src/sniffles/leadprov.py
@@ -162,6 +162,7 @@ def read_iterindels(read_id,read,contig,config,use_clips,read_nm):

pos_read=0
pos_ref=read.reference_start

for op,oplength in read.cigartuples:
add_read,add_ref,event=OPLIST[op]
if event and oplength >= minsvlen:
@@ -212,6 +213,34 @@ def read_iterindels(read_id,read,contig,config,use_clips,read_nm):
pos_read+=add_read*oplength
pos_ref+=add_ref*oplength

def get_cigar_indels(read_id,read,contig,config,use_clips,read_nm):
minsvlen=config.minsvlen_screen
longinslen=config.long_ins_length/2.0
seq_cache_maxlen=config.dev_seq_cache_maxlen
qname=read.query_name
mapq=read.mapping_quality
strand="-" if read.is_reverse else "+"
CINS=pysam.CINS
CDEL=pysam.CDEL
CSOFT_CLIP=pysam.CSOFT_CLIP

INS_SUM=0
DEL_SUM=0

pos_read=0
pos_ref=read.reference_start
for op,oplength in read.cigartuples:
add_read,add_ref,event=OPLIST[op]
if event:
if op==CINS:
INS_SUM+=oplength
elif op==CDEL:
DEL_SUM+=oplength
pos_read+=add_read*oplength
pos_ref+=add_ref*oplength

return INS_SUM,DEL_SUM

def read_itersplits_bnd(read_id,read,contig,config,read_nm):
assert(read.is_supplementary)
#SA:refname,pos,strand,CIGAR,MAPQ,NM
@@ -276,7 +305,7 @@ def read_itersplits_bnd(read_id,read,contig,config,read_nm):
split_qry_start+readspan,
strand,
mapq,
-nm/float(readspan+1),
+read_nm,
"SPLIT_SUP",
"?"))

@@ -307,10 +336,14 @@ def read_itersplits(read_id,read,contig,config,read_nm):
#SA:refname,pos,strand,CIGAR,MAPQ,NM
all_leads=[]
supps=[part.split(",") for part in read.get_tag("SA").split(";") if len(part)>0]
trace_read=config.dev_trace_read!=False and config.dev_trace_read==read.query_name

if len(supps) > config.max_splits_base + config.max_splits_kb*(read.query_length/1000.0):
return

if trace_read:
print(f"[DEV_TRACE_READ] [0c/4] [LeadProvider.read_itersplits] [{read.query_name}] passed max_splits check")

#QC on: 18Aug21, HG002.ont.chr22; O.K.
#cigarl=CIGAR_tolist(read.cigarstring)
#if read.is_reverse:
@@ -376,7 +409,7 @@ def read_itersplits(read_id,read,contig,config,read_nm):
split_qry_start+readspan,
strand,
mapq,
-nm/float(readspan+1),
+read_nm,
"SPLIT_SUP",
"?"))

@@ -388,8 +421,28 @@ def read_itersplits(read_id,read,contig,config,read_nm):
#assert(CIGAR_listreadstart_rev(cigarl)==readstart_rev)
#End QC

if trace_read:
print(f"[DEV_TRACE_READ] [0c/4] [LeadProvider.read_itersplits] [{read.query_name}] all_leads: {all_leads}")

sv.classify_splits(read,all_leads,config,contig)

if trace_read:
print(f"[DEV_TRACE_READ] [0c/4] [LeadProvider.read_itersplits] [{read.query_name}] classify_splits(all_leads): {all_leads}")


"""
if config.dev_trace_read != False:
print(read.query_name)
if read.query_name == config.dev_trace_read:
for lead_i, lead in enumerate(all_leads):
for svtype, svstart, arg in lead.svtypes_starts_lens:
min_mapq=min(lead.mapq,all_leads[max(0,lead_i-1)].mapq)
keep=True
if not config.dev_keep_lowqual_splits and min_mapq < config.mapq:
keep=False
print(f"[DEV_TRACE_READ] [REPORT_LEAD_SPLIT] Splits identified from read {read.read_id}")
"""

for lead_i, lead in enumerate(all_leads):
for svtype, svstart, arg in lead.svtypes_starts_lens:
min_mapq=min(lead.mapq,all_leads[max(0,lead_i-1)].mapq)
@@ -488,7 +541,6 @@ def build_leadtab(self,contig,start,end,bam):
for ld in self.iter_region(bam,contig,start,end):
ld_contig,ld_ref_start=ld.contig,ld.ref_start

-#TODO: Handle leads overlapping region ends (start/end)
if contig==ld_contig and ld_ref_start >= start and ld_ref_start < end:
pos_leadtab=int(ld_ref_start/ld_binsize)*ld_binsize
self.record_lead(ld,pos_leadtab)
@@ -507,13 +559,20 @@ def iter_region(self,bam,contig,start=None,end=None):
coverage_shift_bins=self.config.coverage_shift_bins
coverage_shift_min_aln_len=self.config.coverage_shift_bins_min_aln_length
long_ins_threshold=self.config.long_ins_length*0.5
-qc_nm=self.config.qc_nm
+qc_nm=self.config.qc_nm_measure
phase=self.config.phase
advanced_tags=qc_nm or phase
mapq_min=self.config.mapq
alen_min=self.config.min_alignment_length
nm_sum=0
nm_count=0
trace_read=self.config.dev_trace_read

for read in bam.fetch(contig,start,end,until_eof=False):
if trace_read!=False:
if trace_read==read.query_name:
print(f"[DEV_TRACE_READ] [0b/4] [LeadProvider.iter_region] [{contig}:{start}-{end}] [{read.query_name}] has been fetched and is entering pre-filtering")

#if self.read_count % 1000000 == 0:
# gc.collect()
if read.reference_start < start or read.reference_start >= end:
@@ -534,22 +593,48 @@ def iter_region(self,bam,contig,start=None,end=None):
if advanced_tags:
if qc_nm:
if read.has_tag("NM"):
-nm=read.get_tag("NM")/float(read.query_alignment_length+1)
+nm_raw=read.get_tag("NM")
+nm_ratio=read.get_tag("NM")/float(read.query_alignment_length+1)
+ins_sum,del_sum=get_cigar_indels(curr_read_id,read,contig,self.config,use_clips,read_nm=nm)
+nm_adj=(nm_raw-(ins_sum+del_sum))
+nm_adj_ratio=nm_adj/float(read.query_alignment_length+1)
+nm=nm_adj_ratio
nm_sum+=nm
nm_count+=1

if phase:
curr_read_id=(self.read_id,str(read.get_tag("HP")) if read.has_tag("HP") else "NULL",str(read.get_tag("PS")) if read.has_tag("PS") else "NULL")

if trace_read!=False:
if trace_read==read.query_name:
print(f"[DEV_TRACE_READ] [0b/4] [LeadProvider.iter_region] [{contig}:{start}-{end}] [{read.query_name}] passed pre-filtering (whole-read), begin to extract leads")

#Extract small indels
for lead in read_iterindels(curr_read_id,read,contig,self.config,use_clips,read_nm=nm):
if trace_read!=False:
if trace_read==read.query_name:
print(f"[DEV_TRACE_READ] [1/4] [leadprov.read_iterindels] [{contig}:{start}-{end}] [{read.query_name}] new lead: {lead}")
yield lead

#Extract read splits
if has_sa:
if read.is_supplementary:
if trace_read!=False:
if trace_read==read.query_name:
print(f"[DEV_TRACE_READ] [1/4] [leadprov.read_itersplits_bnd] [{contig}:{start}-{end}] [{read.query_name}] is entering read_itersplits_bnd")
for lead in read_itersplits_bnd(curr_read_id,read,contig,self.config,read_nm=nm):
if trace_read!=False:
if trace_read==read.query_name:
print(f"[DEV_TRACE_READ] [1/4] [leadprov.read_itersplits_bnd] [{contig}:{start}-{end}] [{read.query_name}] new lead: {lead}")
yield lead
else:
if trace_read!=False:
if trace_read==read.query_name:
print(f"[DEV_TRACE_READ] [1/4] [leadprov.read_itersplits] [{contig}:{start}-{end}] [{read.query_name}] is entering read_itersplits")
for lead in read_itersplits(curr_read_id,read,contig,self.config,read_nm=nm):
if trace_read!=False:
if trace_read==read.query_name:
print(f"[DEV_TRACE_READ] [1/4] [leadprov.read_itersplits] [{contig}:{start}-{end}] [{read.query_name}] new lead: {lead}")
yield lead

#Record in coverage table
@@ -570,6 +655,10 @@ def iter_region(self,bam,contig,start=None,end=None):
if read_end <= self.end:
target_tab[covr_end_bin]=target_tab[covr_end_bin]-1 if covr_end_bin in target_tab else -1

self.config.average_regional_nm=nm_sum/float(max(1,nm_count))
self.config.qc_nm_threshold=self.config.average_regional_nm
#print(f"Contig {contig} avg. regional NM={self.config.average_regional_nm}, threshold={self.config.qc_nm_threshold}")


def dev_leadtab_filename(self,contig,start,end):
scriptloc=os.path.dirname(os.path.realpath(sys.argv[0]))
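
The new `get_cigar_indels` path above changes what `nm` means for a read: the raw NM tag counts mismatches plus inserted and deleted bases, so subtracting the summed CIGAR insertion and deletion lengths leaves a substitution-dominated error estimate, whose per-region average then becomes `qc_nm_threshold`. A minimal sketch of the arithmetic (numbers invented):

```python
# Sketch of the indel-adjusted NM ratio computed in iter_region.
def adjusted_nm_ratio(nm_raw, ins_sum, del_sum, query_alignment_length):
    # NM = mismatches + inserted bases + deleted bases, so removing the
    # CIGAR indel bases approximates a mismatch-only error rate.
    nm_adj = nm_raw - (ins_sum + del_sum)
    return nm_adj / float(query_alignment_length + 1)

# A read with NM=120 over a ~10 kb alignment, 50 bp inserted, 40 bp deleted:
print(adjusted_nm_ratio(120, 50, 40, 10_000))  # ~0.003, i.e. ~0.3% mismatches
```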
24 changes: 23 additions & 1 deletion src/sniffles/parallel.py
@@ -47,6 +47,16 @@ def call_candidates(self,keep_qc_fails,config):
for svtype in sv.TYPES:
for svcluster in cluster.resolve(svtype,self.lead_provider,config,self.tandem_repeats):
for svcall in sv.call_from(svcluster,config,keep_qc_fails,self):
if config.dev_trace_read!=False:
cluster_has_read=False
for ld in svcluster.leads:
if ld.read_qname==config.dev_trace_read:
cluster_has_read=True
if cluster_has_read:
import copy
svcall_copy=copy.deepcopy(svcall)
svcall_copy.postprocess=None
print(f"[DEV_TRACE_READ] [3/4] [Task.call_candidates] Read {config.dev_trace_read} -> Cluster {svcluster.id} -> preliminary SVCall {svcall_copy}")
candidates.append(svcall)

self.coverage_average_fwd,self.coverage_average_rev=postprocessing.coverage(candidates,self.lead_provider,config)
@@ -66,6 +76,18 @@ def finalize_candidates(self,candidates,keep_qc_fails,config):
postprocessing.annotate_sv(svcall,config)

svcall.qc=svcall.qc and postprocessing.qc_sv_post_annotate(svcall,config)

if config.dev_trace_read!=False:
cluster_has_read=False
for ld in svcall.postprocess.cluster.leads:
if ld.read_qname==config.dev_trace_read:
cluster_has_read=True
if cluster_has_read:
import copy
svcall_copy=copy.deepcopy(svcall)
svcall_copy.postprocess=None
print(f"[DEV_TRACE_READ] [4/4] [Task.finalize_candidates] Read {config.dev_trace_read} -> Cluster {svcall.postprocess.cluster.id} -> finalized SVCall, QC={svcall_copy.qc}: {svcall_copy}")

if not keep_qc_fails and not svcall.qc:
continue

@@ -80,7 +102,7 @@ def combine(self,config):
snf_in.read_header()
samples_headers_snf[snf_info["internal_id"]]=(snf_info["filename"],snf_in.header,snf_in)

-if not config.snf_combine_keep_open:
+if config.combine_close_handles:
snf_in.close()

svcalls=[]
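
Both trace blocks above deep-copy the SV call and null its `postprocess` attribute before printing, so the log line does not pull the whole cluster state behind that back-reference into the formatted output. The pattern in isolation (class and fields hypothetical):

```python
import copy

class SVCall:
    # Hypothetical stand-in: postprocess holds bulky cluster back-references.
    def __init__(self, pos, postprocess):
        self.pos = pos
        self.postprocess = postprocess
    def __repr__(self):
        return f"SVCall(pos={self.pos}, postprocess={self.postprocess})"

call = SVCall(42_000, postprocess=["...many leads..."])
printable = copy.deepcopy(call)  # leave the live object untouched
printable.postprocess = None     # strip the heavy state from the log output
print(printable)                 # SVCall(pos=42000, postprocess=None)
```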
