Update version Sniffles 2.2
lfpaulin committed Jul 6, 2023
1 parent 60e7015 commit 31cf016
Showing 13 changed files with 334 additions and 60 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -1,3 +1,5 @@
.DS_Store
+src/sniffles.egg-info
+dist
__pycache__
*.pyc
10 changes: 5 additions & 5 deletions README.md
@@ -21,7 +21,7 @@ or

If you previously installed Sniffles1 using conda and want to upgrade to Sniffles2, you can use:

-`conda update sniffles=2.0`
+`conda update sniffles=2.2`

## Requirements
* Python >= 3.7
@@ -36,7 +36,7 @@ Please cite our paper at:

https://www.nature.com/articles/s41592-018-0001-7

A new preprint for the new methods and improvements introduced with Sniffles2 is here: https://www.biorxiv.org/content/10.1101/2022.04.04.487055v1

## Use-Cases / Modes

@@ -55,10 +55,10 @@ Multi-sample SV calling using Sniffles2 population mode works in two steps:
Alternatively, for step 2 you can supply a .tsv file containing a list of .snf files, with custom sample IDs in an optional second column (one sample per line), e.g.:
2. Combined calling using a .tsv as sample list: `sniffles --input snf_files_list.tsv --vcf multisample.vcf`
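
For illustration, a minimal hypothetical `snf_files_list.tsv` could look like this (tab-separated; paths and sample IDs are made up, and the second column may be omitted):

```
/data/sample1.snf	SAMPLE1
/data/sample2.snf	SAMPLE2
/data/sample3.snf	SAMPLE3
```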

-### C. Non-Germline SV Calling (Somatic)
-To call non-germline SVs (i.e. somatic/mosaic) SVs, the *--non-germline* option should be added, i.e.:
+### C. Mosaic SV Calling (Non-germline or somatic SVs)
+To call mosaic SVs, the *--mosaic* option should be added, i.e.:

-`sniffles --input mapped_input.bam --vcf output.vcf --non-germline`
+`sniffles --input mapped_input.bam --vcf output.vcf --mosaic`

### D. Genotyping a known set of SVs (Force Calling)
Example command, to determine the genotype of each SV in *input_known_svs.vcf* for *sample.bam* and write the re-genotyped SVs to *output_genotypes.vcf*:
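
In current Sniffles2 releases this mode is driven by the `--genotype-vcf` option; a command along these lines (file names illustrative) should work:

`sniffles --input sample.bam --genotype-vcf input_known_svs.vcf --vcf output_genotypes.vcf`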
2 changes: 1 addition & 1 deletion setup.cfg
@@ -1,6 +1,6 @@
[metadata]
name = sniffles
-version = 2.0.7
+version = 2.2
author = Moritz Smolka
author_email = moritz.g.smolka@gmail.com
description = A fast structural variation caller for long-read sequencing data
9 changes: 8 additions & 1 deletion src/sniffles/cluster.py
@@ -36,6 +36,7 @@ def compute_metrics(self,max_n=100):
self.stdev_start=0
return

# REVIEW: might need fix based on issue #407
step=int(len(self.leads)/n)
if n>1:
self.mean_svlen=sum(self.leads[i].svlen for i in range(0,len(self.leads),step))/float(n)
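
The stride sampling above estimates per-cluster statistics from at most `max_n` evenly spaced leads rather than all of them. A self-contained sketch of the idea (data invented; note that the original divides by `n` even though `range(0,len,step)` can visit more than `n` elements when `len/n` is not integral, which may be what the REVIEW comment above refers to):

```python
# Sketch of stride sampling as in compute_metrics: approximate the mean SVLEN
# from at most max_n evenly spaced leads instead of the full list.
def approx_mean_svlen(svlens, max_n=100):
    n = min(max_n, len(svlens))
    if n == 0:
        return 0
    step = int(len(svlens) / n)  # step >= 1 because n <= len(svlens)
    sampled = [svlens[i] for i in range(0, len(svlens), step)]
    # Dividing by the actual sample count keeps the estimate unbiased.
    return sum(sampled) / float(len(sampled))

print(approx_mean_svlen(list(range(1, 1001))))  # ~496 vs. the true mean 500.5
```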
@@ -247,6 +248,12 @@ def resolve(svtype,leadtab_provider,config,tr):
i=max(0,i-2)
i+=1

if config.dev_trace_read:
for c in clusters:
for ld in c.leads:
if ld.read_qname==config.dev_trace_read:
print(f"[DEV_TRACE_READ [2/4] [cluster.resolve] Read lead {ld} is in cluster {c.id}, containing a total of {len(c.leads)} leads")

if config.dev_dump_clusters:
filename=f"{config.input}.clusters.{svtype}.{leadtab_provider.contig}.{leadtab_provider.start}.{leadtab_provider.end}.bed"
print(f"Dumping clusters to {filename}")
@@ -265,7 +272,7 @@ def resolve(svtype,leadtab_provider,config,tr):
if config.dev_no_resplit:
yield cluster
else:
-for new_cluster in resplit_bnd(cluster,merge_threshold=config.bnd_cluster_resplit):
+for new_cluster in resplit_bnd(cluster,merge_threshold=config.cluster_merge_bnd):
yield new_cluster
else:
if svtype=="INS" or svtype=="DEL":
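
The DEV_TRACE_READ prints added here and in leadprov.py and parallel.py below form a numbered [0b/4] through [4/4] trail that follows a single read, identified by its query name, from fetching and lead extraction through clustering to the finalized SV call. Assuming the config value is exposed as a CLI flag of the same name (not confirmed in this diff), usage would look like:

`sniffles --input mapped_input.bam --vcf output.vcf --dev-trace-read <read_query_name>`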
76 changes: 50 additions & 26 deletions src/sniffles/config.py

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions src/sniffles/consensus.py
@@ -354,6 +354,7 @@ def novel_from_reads(best_lead,other_leads,klen,skip,skip_repetitive,debug=False
conseq_new.append("-"*len(buffer))
conseq="".join(conseq_new)

# FIXME: can lead to ZeroDivisionError
if span/float(len(best_lead.seq)) > minspan:
alignments.append(conseq)

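
The FIXME above flags the ratio test `span/float(len(best_lead.seq)) > minspan`, which raises ZeroDivisionError when the backbone lead carries an empty sequence. One possible guard, shown here only as a sketch and not as the project's actual fix:

```python
def passes_minspan(span, seq, minspan):
    # Guarded version of the consensus span check: an empty backbone
    # sequence fails the test instead of raising ZeroDivisionError.
    return len(seq) > 0 and span / float(len(seq)) > minspan

print(passes_minspan(80, "A" * 100, 0.5))  # True  (0.8 > 0.5)
print(passes_minspan(80, "", 0.5))         # False (no crash)
```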
99 changes: 94 additions & 5 deletions src/sniffles/leadprov.py
@@ -162,6 +162,7 @@ def read_iterindels(read_id,read,contig,config,use_clips,read_nm):

pos_read=0
pos_ref=read.reference_start

for op,oplength in read.cigartuples:
add_read,add_ref,event=OPLIST[op]
if event and oplength >= minsvlen:
@@ -212,6 +213,34 @@ def read_iterindels(read_id,read,contig,config,use_clips,read_nm):
pos_read+=add_read*oplength
pos_ref+=add_ref*oplength

def get_cigar_indels(read_id,read,contig,config,use_clips,read_nm):
minsvlen=config.minsvlen_screen
longinslen=config.long_ins_length/2.0
seq_cache_maxlen=config.dev_seq_cache_maxlen
qname=read.query_name
mapq=read.mapping_quality
strand="-" if read.is_reverse else "+"
CINS=pysam.CINS
CDEL=pysam.CDEL
CSOFT_CLIP=pysam.CSOFT_CLIP

INS_SUM=0
DEL_SUM=0

pos_read=0
pos_ref=read.reference_start
for op,oplength in read.cigartuples:
add_read,add_ref,event=OPLIST[op]
if event:
if op==CINS:
INS_SUM+=oplength
elif op==CDEL:
DEL_SUM+=oplength
pos_read+=add_read*oplength
pos_ref+=add_ref*oplength

return INS_SUM,DEL_SUM

def read_itersplits_bnd(read_id,read,contig,config,read_nm):
assert(read.is_supplementary)
#SA:refname,pos,strand,CIGAR,MAPQ,NM
@@ -276,7 +305,7 @@ def read_itersplits_bnd(read_id,read,contig,config,read_nm):
split_qry_start+readspan,
strand,
mapq,
-nm/float(readspan+1),
+read_nm,
"SPLIT_SUP",
"?"))

@@ -307,10 +336,14 @@ def read_itersplits(read_id,read,contig,config,read_nm):
#SA:refname,pos,strand,CIGAR,MAPQ,NM
all_leads=[]
supps=[part.split(",") for part in read.get_tag("SA").split(";") if len(part)>0]
trace_read=config.dev_trace_read!=False and config.dev_trace_read==read.query_name

if len(supps) > config.max_splits_base + config.max_splits_kb*(read.query_length/1000.0):
return

if trace_read:
print(f"[DEV_TRACE_READ] [0c/4] [LeadProvider.read_itersplits] [{read.query_name}] passed max_splits check")

#QC on: 18Aug21, HG002.ont.chr22; O.K.
#cigarl=CIGAR_tolist(read.cigarstring)
#if read.is_reverse:
@@ -376,7 +409,7 @@ def read_itersplits(read_id,read,contig,config,read_nm):
split_qry_start+readspan,
strand,
mapq,
-nm/float(readspan+1),
+read_nm,
"SPLIT_SUP",
"?"))

@@ -388,8 +421,28 @@ def read_itersplits(read_id,read,contig,config,read_nm):
#assert(CIGAR_listreadstart_rev(cigarl)==readstart_rev)
#End QC

if trace_read:
print(f"[DEV_TRACE_READ] [0c/4] [LeadProvider.read_itersplits] [{read.query_name}] all_leads: {all_leads}")

sv.classify_splits(read,all_leads,config,contig)

if trace_read:
print(f"[DEV_TRACE_READ] [0c/4] [LeadProvider.read_itersplits] [{read.query_name}] classify_splits(all_leads): {all_leads}")


"""
if config.dev_trace_read != False:
print(read.query_name)
if read.query_name == config.dev_trace_read:
for lead_i, lead in enumerate(all_leads):
for svtype, svstart, arg in lead.svtypes_starts_lens:
min_mapq=min(lead.mapq,all_leads[max(0,lead_i-1)].mapq)
keep=True
if not config.dev_keep_lowqual_splits and min_mapq < config.mapq:
keep=False
print(f"[DEV_TRACE_READ] [REPORT_LEAD_SPLIT] Splits identified from read {read.read_id}")
"""

for lead_i, lead in enumerate(all_leads):
for svtype, svstart, arg in lead.svtypes_starts_lens:
min_mapq=min(lead.mapq,all_leads[max(0,lead_i-1)].mapq)
@@ -488,7 +541,6 @@ def build_leadtab(self,contig,start,end,bam):
for ld in self.iter_region(bam,contig,start,end):
ld_contig,ld_ref_start=ld.contig,ld.ref_start

-#TODO: Handle leads overlapping region ends (start/end)
if contig==ld_contig and ld_ref_start >= start and ld_ref_start < end:
pos_leadtab=int(ld_ref_start/ld_binsize)*ld_binsize
self.record_lead(ld,pos_leadtab)
@@ -507,13 +559,20 @@ def iter_region(self,bam,contig,start=None,end=None):
coverage_shift_bins=self.config.coverage_shift_bins
coverage_shift_min_aln_len=self.config.coverage_shift_bins_min_aln_length
long_ins_threshold=self.config.long_ins_length*0.5
-qc_nm=self.config.qc_nm
+qc_nm=self.config.qc_nm_measure
phase=self.config.phase
advanced_tags=qc_nm or phase
mapq_min=self.config.mapq
alen_min=self.config.min_alignment_length
nm_sum=0
nm_count=0
trace_read=self.config.dev_trace_read

for read in bam.fetch(contig,start,end,until_eof=False):
if trace_read!=False:
if trace_read==read.query_name:
print(f"[DEV_TRACE_READ] [0b/4] [LeadProvider.iter_region] [{contig}:{start}-{end}] [{read.query_name}] has been fetched and is entering pre-filtering")

#if self.read_count % 1000000 == 0:
# gc.collect()
if read.reference_start < start or read.reference_start >= end:
@@ -534,22 +593,48 @@ def iter_region(self,bam,contig,start=None,end=None):
if advanced_tags:
if qc_nm:
if read.has_tag("NM"):
-nm=read.get_tag("NM")/float(read.query_alignment_length+1)
+nm_raw=read.get_tag("NM")
+nm_ratio=read.get_tag("NM")/float(read.query_alignment_length+1)
+ins_sum,del_sum=get_cigar_indels(curr_read_id,read,contig,self.config,use_clips,read_nm=nm)
+nm_adj=(nm_raw-(ins_sum+del_sum))
+nm_adj_ratio=nm_adj/float(read.query_alignment_length+1)
+nm=nm_adj_ratio
nm_sum+=nm
nm_count+=1

if phase:
curr_read_id=(self.read_id,str(read.get_tag("HP")) if read.has_tag("HP") else "NULL",str(read.get_tag("PS")) if read.has_tag("PS") else "NULL")

if trace_read!=False:
if trace_read==read.query_name:
print(f"[DEV_TRACE_READ] [0b/4] [LeadProvider.iter_region] [{contig}:{start}-{end}] [{read.query_name}] passed pre-filtering (whole-read), begin to extract leads")

#Extract small indels
for lead in read_iterindels(curr_read_id,read,contig,self.config,use_clips,read_nm=nm):
if trace_read!=False:
if trace_read==read.query_name:
print(f"[DEV_TRACE_READ] [1/4] [leadprov.read_iterindels] [{contig}:{start}-{end}] [{read.query_name}] new lead: {lead}")
yield lead

#Extract read splits
if has_sa:
if read.is_supplementary:
if trace_read!=False:
if trace_read==read.query_name:
print(f"[DEV_TRACE_READ] [1/4] [leadprov.read_itersplits_bnd] [{contig}:{start}-{end}] [{read.query_name}] is entering read_itersplits_bnd")
for lead in read_itersplits_bnd(curr_read_id,read,contig,self.config,read_nm=nm):
if trace_read!=False:
if trace_read==read.query_name:
print(f"[DEV_TRACE_READ] [1/4] [leadprov.read_itersplits_bnd] [{contig}:{start}-{end}] [{read.query_name}] new lead: {lead}")
yield lead
else:
if trace_read!=False:
if trace_read==read.query_name:
print(f"[DEV_TRACE_READ] [1/4] [leadprov.read_itersplits] [{contig}:{start}-{end}] [{read.query_name}] is entering read_itersplits")
for lead in read_itersplits(curr_read_id,read,contig,self.config,read_nm=nm):
if trace_read!=False:
if trace_read==read.query_name:
print(f"[DEV_TRACE_READ] [1/4] [leadprov.read_itersplits] [{contig}:{start}-{end}] [{read.query_name}] new lead: {lead}")
yield lead

#Record in coverage table
@@ -570,6 +655,10 @@ def iter_region(self,bam,contig,start=None,end=None):
if read_end <= self.end:
target_tab[covr_end_bin]=target_tab[covr_end_bin]-1 if covr_end_bin in target_tab else -1

self.config.average_regional_nm=nm_sum/float(max(1,nm_count))
self.config.qc_nm_threshold=self.config.average_regional_nm
#print(f"Contig {contig} avg. regional NM={self.config.average_regional_nm}, threshold={self.config.qc_nm_threshold}")


def dev_leadtab_filename(self,contig,start,end):
scriptloc=os.path.dirname(os.path.realpath(sys.argv[0]))
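
The new `get_cigar_indels` path above changes what `nm` means for a read: the raw NM tag counts mismatches plus inserted and deleted bases, so subtracting the summed CIGAR insertion and deletion lengths leaves a substitution-dominated error estimate, whose per-region average then becomes `qc_nm_threshold`. A minimal sketch of the arithmetic (numbers invented):

```python
# Sketch of the indel-adjusted NM ratio computed in iter_region.
def adjusted_nm_ratio(nm_raw, ins_sum, del_sum, query_alignment_length):
    # NM = mismatches + inserted bases + deleted bases, so removing the
    # CIGAR indel bases approximates a mismatch-only error rate.
    nm_adj = nm_raw - (ins_sum + del_sum)
    return nm_adj / float(query_alignment_length + 1)

# A read with NM=120 over a ~10 kb alignment, 50 bp inserted, 40 bp deleted:
print(adjusted_nm_ratio(120, 50, 40, 10_000))  # ~0.003, i.e. ~0.3% mismatches
```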
24 changes: 23 additions & 1 deletion src/sniffles/parallel.py
@@ -47,6 +47,16 @@ def call_candidates(self,keep_qc_fails,config):
for svtype in sv.TYPES:
for svcluster in cluster.resolve(svtype,self.lead_provider,config,self.tandem_repeats):
for svcall in sv.call_from(svcluster,config,keep_qc_fails,self):
if config.dev_trace_read!=False:
cluster_has_read=False
for ld in svcluster.leads:
if ld.read_qname==config.dev_trace_read:
cluster_has_read=True
if cluster_has_read:
import copy
svcall_copy=copy.deepcopy(svcall)
svcall_copy.postprocess=None
print(f"[DEV_TRACE_READ] [3/4] [Task.call_candidates] Read {config.dev_trace_read} -> Cluster {svcluster.id} -> preliminary SVCall {svcall_copy}")
candidates.append(svcall)

self.coverage_average_fwd,self.coverage_average_rev=postprocessing.coverage(candidates,self.lead_provider,config)
@@ -66,6 +76,18 @@ def finalize_candidates(self,candidates,keep_qc_fails,config):
postprocessing.annotate_sv(svcall,config)

svcall.qc=svcall.qc and postprocessing.qc_sv_post_annotate(svcall,config)

if config.dev_trace_read!=False:
cluster_has_read=False
for ld in svcall.postprocess.cluster.leads:
if ld.read_qname==config.dev_trace_read:
cluster_has_read=True
if cluster_has_read:
import copy
svcall_copy=copy.deepcopy(svcall)
svcall_copy.postprocess=None
print(f"[DEV_TRACE_READ] [4/4] [Task.finalize_candidates] Read {config.dev_trace_read} -> Cluster {svcall.postprocess.cluster.id} -> finalized SVCall, QC={svcall_copy.qc}: {svcall_copy}")

if not keep_qc_fails and not svcall.qc:
continue

@@ -80,7 +102,7 @@ def combine(self,config):
snf_in.read_header()
samples_headers_snf[snf_info["internal_id"]]=(snf_info["filename"],snf_in.header,snf_in)

-if not config.snf_combine_keep_open:
+if config.combine_close_handles:
snf_in.close()

svcalls=[]
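
Both trace blocks above deep-copy the SV call and null its `postprocess` attribute before printing, so the log line does not pull the whole cluster state behind that back-reference into the formatted output. The pattern in isolation (class and fields hypothetical):

```python
import copy

class SVCall:
    # Hypothetical stand-in: postprocess holds bulky cluster back-references.
    def __init__(self, pos, postprocess):
        self.pos = pos
        self.postprocess = postprocess
    def __repr__(self):
        return f"SVCall(pos={self.pos}, postprocess={self.postprocess})"

call = SVCall(42_000, postprocess=["...many leads..."])
printable = copy.deepcopy(call)  # leave the live object untouched
printable.postprocess = None     # strip the heavy state from the log output
print(printable)                 # SVCall(pos=42000, postprocess=None)
```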
