diff --git a/README.md b/README.md index 50785d3..48a7d84 100644 --- a/README.md +++ b/README.md @@ -6,6 +6,8 @@ NanoCaller is a computational method that integrates long reads in deep convolut NanoCaller is distributed under the [MIT License by Wang Genomics Lab](https://wglab.mit-license.org/). ## Latest Updates +_**v3.6.0** (April 22 2024)_: CSI indices generated for VCF files instead of TBI to accommodate larger contigs. + _**v3.5.0** (March 27 2024)_: CRAM files are supported in input as well as in phased output if whatshap version>=2 is being used with NanoCaller. _**v3.4.0** (July 31 2023)_: VCF files contain total and strand-specific allele depths for SNP calls from SNP calling models. A new mode for short ONT reads (5-10kbp) added. `--phase_qual_score` parameter filters out low quality SNP calls from phasing by WhatsHap; these SNP calls are kept in the output, but neither phased nor used for phasing reads. diff --git a/nanocaller_src/indelCaller.py b/nanocaller_src/indelCaller.py index fff9000..1bc7684 100644 --- a/nanocaller_src/indelCaller.py +++ b/nanocaller_src/indelCaller.py @@ -197,7 +197,7 @@ def phase_run(contig_dict, params, indel_dict, job_Q, counter_Q, phased_snp_file input_snp_vcf = params['snp_vcf'] output_contig_vcf = os.path.join(phase_dir, '%s.snps.phased.vcf.gz' %contig) phased_snp_files_list.append(output_contig_vcf) - run_cmd('bcftools view %s -r %s|bgziptabix %s' %(input_snp_vcf, contig, output_contig_vcf)) + run_cmd('bcftools view %s -r %s| bgzip > %s && tabix -fp vcf --csi %s' %(input_snp_vcf, contig, output_contig_vcf, output_contig_vcf)) if params['mode']=='snps': counter_Q.put(1) @@ -232,13 +232,13 @@ def phase_run(contig_dict, params, indel_dict, job_Q, counter_Q, phased_snp_file # extract region from VCF run_cmd('bcftools view %s -r %s -i "QUAL>=%.4f" -o %s' %(input_snp_vcf, contig, phase_qual_score,input_contig_vcf), verbose=params['verbose']) - run_cmd('bcftools view %s -r %s -i "QUAL<%.4f"|bgziptabix %s' %(input_snp_vcf, contig, phase_qual_score,lowq_input_contig_vcf), verbose=params['verbose']) + run_cmd('bcftools view %s -r %s -i "QUAL<%.4f"| bgzip > %s && tabix -fp vcf --csi %s' %(input_snp_vcf, contig, phase_qual_score, lowq_input_contig_vcf, lowq_input_contig_vcf), verbose=params['verbose']) #phase VCF run_cmd("whatshap phase %s %s -o %s -r %s --ignore-read-groups --chromosome %s %s" %(input_contig_vcf, params['sam_path'], raw_output_contig_vcf, params['fasta_path'], contig , enable_whatshap), verbose=params['verbose']) - run_cmd("bcftools view -e 'GT=\"0\\0\"' %s|bgziptabix %s" %(raw_output_contig_vcf, output_contig_vcf), verbose=params['verbose']) + run_cmd("bcftools view -e 'GT=\"0\\0\"' %s| bgzip > %s && tabix -fp vcf --csi %s" %(raw_output_contig_vcf, output_contig_vcf, output_contig_vcf), verbose=params['verbose']) if file_type=='BAM': phased_bam = os.path.join(phase_dir, '%s.phased.bam' %contig) @@ -361,7 +361,7 @@ def call_manager(params): output_files['snps']=os.path.join(params['vcf_path'],'%s.snps.phased.vcf.gz' %params['prefix']) phased_snps_files='\n'.join(phased_snp_files_list) - cmd='bcftools concat -a -f - |bgziptabix %s' %(output_files['snps']) + cmd='bcftools concat -a -f - |bgzip > %s && tabix -fp vcf --csi %s' %(output_files['snps'], output_files['snps']) stream=Popen(cmd, shell=True, stdin=PIPE, stdout=PIPE, stderr=PIPE) stdout, stderr = stream.communicate(input=phased_snps_files.encode()) @@ -388,13 +388,13 @@ def call_manager(params): print('\n%s: Compressing and indexing indel calls.' %str(datetime.datetime.now())) - run_cmd(' bcftools sort %s|rtg RTG_MEM=2G vcfdecompose -i - -o - |rtg RTG_MEM=2G vcffilter -i - --non-snps-only -o %s' %(raw_indel_vcf, output_files['indels']), verbose=params['verbose']) + run_cmd(' bcftools sort %s|rtg RTG_MEM=2G vcfdecompose -i - -o - |rtg RTG_MEM=2G vcffilter -i - --non-snps-only -o - | bgzip > %s && tabix -fp vcf --csi %s' %(raw_indel_vcf, output_files['indels'], output_files['indels']), verbose=params['verbose']) if params['mode']=='all': final_vcf=os.path.join(params['vcf_path'],'%s.vcf.gz' %params['prefix']) output_files['final']=final_vcf - run_cmd('bcftools concat %s %s -a | bgziptabix %s' %(output_files['snps'], output_files['indels'], output_files['final']), verbose=params['verbose']) + run_cmd('bcftools concat %s %s -a | bgzip > %s && tabix -fp vcf --csi %s' %(output_files['snps'], output_files['indels'], output_files['final'], output_files['final']), verbose=params['verbose']) return output_files diff --git a/nanocaller_src/snpCaller.py b/nanocaller_src/snpCaller.py index 2c61a3a..5d234b2 100644 --- a/nanocaller_src/snpCaller.py +++ b/nanocaller_src/snpCaller.py @@ -269,8 +269,8 @@ def call_manager(params): print('\n%s: Compressing and indexing SNP calls.' %str(datetime.datetime.now())) - run_cmd("bcftools sort %s|bgziptabix %a" %(temp_output_file_path, all_snps_file_path), error=True) - run_cmd("bcftools view %s -f PASS|bgziptabix %a" %(all_snps_file_path, pass_snps_file_path), error=True) + run_cmd("bcftools sort %s|bgzip > %a && tabix -fp vcf --csi %a" %(temp_output_file_path, all_snps_file_path, all_snps_file_path), error=True) + run_cmd("bcftools view %s -f PASS|bgzip > %a && tabix -fp vcf --csi %a" %(all_snps_file_path, pass_snps_file_path, pass_snps_file_path), error=True) return pass_snps_file_path