unfiltered SNP calls with raw probability output, and adjusted qualit…

…y score
WGLab · Jul 14, 2023 · 7443e29 · 7443e29
1 parent 5f039fb
commit 7443e29
Show file tree

Hide file tree

Showing 2 changed files with 43 additions and 28 deletions.
diff --git a/nanocaller_src/indelCaller.py b/nanocaller_src/indelCaller.py
@@ -24,14 +24,14 @@
                  }
 
 def get_indel_model(indel_model):
-    if os.path.exists(indel_model):
-        if os.path.isdir(indel_model):
-            indel_model_path=glob.glob(os.path.join(indel_model,'*.index'))[0].rstrip('.index')
-
-    elif indel_model in indel_model_dict:
+    if indel_model in indel_model_dict:
         dirname = os.path.dirname(__file__)
         indel_model_path=os.path.join(dirname, indel_model_dict[indel_model])
 
+    elif os.path.exists(indel_model):
+        if os.path.isdir(indel_model):
+            indel_model_path=glob.glob(os.path.join(indel_model,'*.index'))[0].rstrip('.index')
+
     else:
         return None
 
@@ -89,16 +89,16 @@ def indel_run(params, indel_dict, job_Q, counter_Q, indel_files_list):
                             qual_all=-10*np.log10(1e-6+1-batch_prob_all)
 
                             for j in range(len(batch_pred_all)):
-                                q=min(999,qual_all[j,batch_pred_all[j]])
+                                q=min(99,qual_all[j,batch_pred_all[j]])
                                 if batch_pos[j]>prev:
 
                                     if batch_prob_all[j,0]<=0.95:
 
-                                        q=-100*np.log10(1e-6+batch_prob_all[j,0])
+                                        q=-10*np.log10(1e-6+batch_prob_all[j,0])
                                         allele0_data, allele1_data,allele_total_data= batch_alleles_seq[j]
 
                                         if batch_pred_all[j]==1 and allele_total_data[0]:
-                                                    gq=-100*np.log10(1+1e-6-batch_prob_all[j,1])
+                                                    gq=-10*np.log10(1+1e-6-batch_prob_all[j,1])
                                                     s='%s\t%d\t.\t%s\t%s\t%.2f\tPASS\t.\tGT:GQ\t1/1:%.2f\n' %(chrom, batch_pos[j], allele_total_data[0], allele_total_data[1], q, gq)
                                                     f.write(s)
                                                     prev=batch_pos[j]+max(len(allele_total_data[0]), len(allele_total_data[1]))
@@ -107,7 +107,7 @@ def indel_run(params, indel_dict, job_Q, counter_Q, indel_files_list):
                                             if allele0_data[0] and allele1_data[0]:
 
                                                 if allele0_data[0]==allele1_data[0] and allele0_data[1]==allele1_data[1]:
-                                                    gq=-100*np.log10(1+1e-6-batch_prob_all[j,1])
+                                                    gq=-10*np.log10(1+1e-6-batch_prob_all[j,1])
                                                     s='%s\t%d\t.\t%s\t%s\t%.2f\tPASS\t.\tGT:GQ\t1/1:%.2f\n' %(chrom, batch_pos[j], allele0_data[0], allele0_data[1], q, gq)
                                                     f.write(s)
                                                     prev=batch_pos[j]+max(len(allele0_data[0]), len(allele0_data[1]))
@@ -125,7 +125,7 @@ def indel_run(params, indel_dict, job_Q, counter_Q, indel_files_list):
                                                     else:
                                                         ref=ref2
                                                         alt1=alt1+ref2[l:]
-                                                    gq=-100*np.log10(1+1e-6-batch_prob_all[j,3])
+                                                    gq=-10*np.log10(1+1e-6-batch_prob_all[j,3])
                                                     if batch_phase[j]:
                                                         s='%s\t%d\t.\t%s\t%s,%s\t%.2f\tPASS\t.\tGT:GQ:PS\t1|2:%.2f:%d\n' %(chrom, batch_pos[j], ref, alt1, alt2, q, gq, batch_phase[j])
                                                     else:
@@ -134,7 +134,7 @@ def indel_run(params, indel_dict, job_Q, counter_Q, indel_files_list):
                                                     prev=batch_pos[j]+max(len(ref), len(alt1),len(alt2))
 
                                             elif allele0_data[0]:
-                                                gq=-100*np.log10(1+1e-6-batch_prob_all[j,2])
+                                                gq=-10*np.log10(1+1e-6-batch_prob_all[j,2])
                                                 if batch_phase[j]:
                                                     s='%s\t%d\t.\t%s\t%s\t%.2f\tPASS\t.\tGT:GQ:PS\t0|1:%.2f:%d\n' %(chrom, batch_pos[j], allele0_data[0], allele0_data[1], q, gq, batch_phase[j])
                                                 else:
@@ -143,7 +143,7 @@ def indel_run(params, indel_dict, job_Q, counter_Q, indel_files_list):
                                                 prev=batch_pos[j]+max(len(allele0_data[0]), len(allele0_data[1]))
 
                                             elif allele1_data[0]:
-                                                gq=-100*np.log10(1+1e-6-batch_prob_all[j,2])
+                                                gq=-10*np.log10(1+1e-6-batch_prob_all[j,2])
                                                 if batch_phase[j]:
                                                     s='%s\t%d\t.\t%s\t%s\t%.2f\tPASS\t.\tGT:GQ:PS\t1|0:%.2f:%d\n' %(chrom, batch_pos[j], allele1_data[0], allele1_data[1], q, gq, batch_phase[j])
                                                 else:

diff --git a/nanocaller_src/snpCaller.py b/nanocaller_src/snpCaller.py
@@ -34,14 +34,14 @@
                }
 
 def get_SNP_model(snp_model):
-    if os.path.exists(snp_model):
-        if os.path.isdir(snp_model):
-            snp_model_path=glob.glob(os.path.join(snp_model,'*.index'))[0].rstrip('.index')
-
-    elif snp_model in snp_model_dict:
+    if snp_model in snp_model_dict:
         dirname = os.path.dirname(__file__)
         snp_model_path=os.path.join(dirname, snp_model_dict[snp_model])
-
+
+    elif os.path.exists(snp_model):
+        if os.path.isdir(snp_model):
+            snp_model_path=glob.glob(os.path.join(snp_model,'*.index'))[0].rstrip('.index')
+
     else:
         return None,None
 
@@ -118,26 +118,36 @@ def caller(params, chunks_Q, counter_Q, snp_files):
                         sort_probs=np.sort(batch_probs,axis=1)
 
                         for j in range(len(batch_pred_GT)):
-
+                            info_field='PR='+','.join("{:.4f}".format(x) for x in batch_probs[j,[0,3,1,2]])
                             if batch_pred_GT[j]>=2: # if het
                                     pred1,pred2=batch_pred[j,-1],batch_pred[j,-2]
                                     if pred1==batch_ref[j]:
-                                                s='%s\t%d\t.\t%s\t%s\t%.3f\t%s\t.\tGT:DP:FQ\t%s:%d:%.4f\n' %(chrom, batch_pos[j], num_to_base_map[batch_ref[j]], num_to_base_map[pred2], min(999,-100*np.log10(1e-10+ 1-batch_probs[j,pred2])),'PASS','0/1', batch_dp[j], batch_freq[j])
+                                                s='%s\t%d\t.\t%s\t%s\t%.3f\t%s\t%s\tGT:DP:FQ\t%s:%d:%.4f\n' %(chrom, batch_pos[j], num_to_base_map[batch_ref[j]], num_to_base_map[pred2], min(99,-10*np.log10(1e-10+ 1-batch_probs[j,pred2])),'PASS',info_field, '0/1', batch_dp[j], batch_freq[j])
                                                 f.write(s)
 
                                     elif pred2==batch_ref[j] and batch_probs[j,pred2]>=0.5:
-                                        s='%s\t%d\t.\t%s\t%s\t%.3f\t%s\t.\tGT:DP:FQ\t%s:%d:%.4f\n' %(chrom,batch_pos[j], num_to_base_map[batch_ref[j]], num_to_base_map[pred1], min(999,-100*np.log10(1e-10+ 1-batch_probs[j,pred2])),'PASS','1/0', batch_dp[j], batch_freq[j])
+                                        s='%s\t%d\t.\t%s\t%s\t%.3f\t%s\t%s\tGT:DP:FQ\t%s:%d:%.4f\n' %(chrom,batch_pos[j], num_to_base_map[batch_ref[j]], num_to_base_map[pred1], min(99,-10*np.log10(1e-10+ 1-batch_probs[j,pred2])),'PASS',info_field, '1/0', batch_dp[j], batch_freq[j])
                                         f.write(s)
 
                                     elif pred2!=batch_ref[j] and pred1!=batch_ref[j] and batch_probs[j,pred2]>=0.5:
-                                        s='%s\t%d\t.\t%s\t%s,%s\t%.3f\t%s\t.\tGT:DP:FQ\t%s:%d:%.4f\n' %\
-                            (chrom,batch_pos[j],num_to_base_map[batch_ref[j]],num_to_base_map[pred1],num_to_base_map[pred2],min(999,-100*np.log10(1e-10+ 1-batch_probs[j,pred2])),'PASS','1/2', batch_dp[j], batch_freq[j])
+                                        s='%s\t%d\t.\t%s\t%s,%s\t%.3f\t%s\t%s\tGT:DP:FQ\t%s:%d:%.4f\n' %\
+                            (chrom,batch_pos[j],num_to_base_map[batch_ref[j]],num_to_base_map[pred1],num_to_base_map[pred2],min(99,-10*np.log10(1e-10+ 1-batch_probs[j,pred2])),'PASS',info_field,'1/2', batch_dp[j], batch_freq[j])
                                         f.write(s)
 
                             elif batch_pred_GT[j]==1 and batch_ref[j]!=batch_pred[j,-1] and batch_probs[j,batch_pred[j,-1]]>=0.5:
                                 pred1=batch_pred[j,-1]
-                                s='%s\t%d\t.\t%s\t%s\t%.3f\t%s\t.\tGT:DP:FQ\t%s:%d:%.4f\n' %(chrom, batch_pos[j], num_to_base_map[batch_ref[j]], num_to_base_map[pred1], min(999,-100*np.log10(1e-10+ 1-batch_probs[j,pred1])),'PASS','1/1', batch_dp[j], batch_freq[j])
+                                s='%s\t%d\t.\t%s\t%s\t%.3f\t%s\t%s\tGT:DP:FQ\t%s:%d:%.4f\n' %(chrom, batch_pos[j], num_to_base_map[batch_ref[j]], num_to_base_map[pred1], min(99,-10*np.log10(1e-10+ 1-batch_probs[j,pred1])),'PASS',info_field,'1/1', batch_dp[j], batch_freq[j])
                                 f.write(s)
+
+                            else:
+                                if batch_pred_GT[j]==1 and batch_ref[j]==batch_pred[j,-1]:
+                                    pred1=batch_pred[j,-1]
+                                    s='%s\t%d\t.\t%s\t%s\t%.3f\t%s\t%s\tGT:DP:FQ\t%s:%d:%.4f\n' %(chrom, batch_pos[j], num_to_base_map[batch_ref[j]], '.', min(99,-10*np.log10(1e-10+ 1-batch_probs[j,pred1])),'REF',info_field,'./.', batch_dp[j], batch_freq[j])
+                                    f.write(s)
+                                else:
+                                    s='%s\t%d\t.\t%s\t%s\t%.3f\t%s\t%s\tGT:DP:FQ\t%s:%d:%.4f\n' %(chrom, batch_pos[j], num_to_base_map[batch_ref[j]], '.', 0,'LOW',info_field,'./.', batch_dp[j], batch_freq[j])
+                                    f.write(s)
+
 
                 elif chunk['ploidy']=='haploid':
                     test_ref=test_ref.astype(np.float16)
@@ -214,7 +224,8 @@ def call_manager(params):
     counter_Q.put(None)
     tqdm_proc.join()
 
-    output_file_path=os.path.join(params['vcf_path'],'%s.snps.vcf.gz' %params['prefix'])
+    all_snps_file_path=os.path.join(params['vcf_path'],'%s.unfiltered.snps.vcf.gz' %params['prefix'])
+    pass_snps_file_path=os.path.join(params['vcf_path'],'%s.snps.vcf.gz' %params['prefix'])
 
     temp_output_file_path=os.path.join(params['intermediate_snp_files_dir'],'combined.snps.vcf')
 
@@ -223,14 +234,17 @@ def call_manager(params):
     with open(temp_output_file_path,'wb') as outfile:
         outfile.write(b'##fileformat=VCFv4.2\n')
         outfile.write(b'##FILTER=<ID=PASS,Description="All filters passed">\n')
-
+        outfile.write(b'##FILTER=<ID=LOW,Description="All alleles have probability less than 50%.">\n')
+        outfile.write(b'##FILTER=<ID=REF,Description="Homozygous Reference. Only reference allele has greater than 50% probability. All alternative alleles having probability less than 50%.">\n')
         #loop through contigs
         for chrom in chrom_list:
             outfile.write(b'##contig=<ID=%s>\n' %bytes(chrom.encode('utf-8')))
 
         outfile.write(b'##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n')
         outfile.write(b'##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Depth">\n')
         outfile.write(b'##FORMAT=<ID=FQ,Number=1,Type=Float,Description="Alternative Allele Frequency">\n')
+        outfile.write(b'##INFO=<ID=PR,Number=4,Type=Float,Description="Probability of presence of alleles A, C, G and T, in the given order. Probability of each base is out of 1, independent of each other.">\n')
+
         outfile.write(b'#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t%s\n' %bytes(params['sample'].encode('utf-8')))
 
         for int_file in snp_files:
@@ -239,7 +253,8 @@ def call_manager(params):
 
     print('\n%s: Compressing and indexing SNP calls.' %str(datetime.datetime.now()))
 
-    run_cmd("bcftools sort %s|bgziptabix %a" %(temp_output_file_path, output_file_path), error=True)
+    run_cmd("bcftools sort %s|bgziptabix %a" %(temp_output_file_path, all_snps_file_path), error=True)
+    run_cmd("bcftools view %s -f PASS|bgziptabix %a" %(all_snps_file_path, pass_snps_file_path), error=True)
 
-    return output_file_path
+    return pass_snps_file_path