Skip to content

Commit

Permalink
unfiltered SNP calls with raw probability output, and adjusted qualit…
Browse files Browse the repository at this point in the history
…y score
  • Loading branch information
umahsn committed Jul 14, 2023
1 parent 5f039fb commit 7443e29
Show file tree
Hide file tree
Showing 2 changed files with 43 additions and 28 deletions.
24 changes: 12 additions & 12 deletions nanocaller_src/indelCaller.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,14 +24,14 @@
}

def get_indel_model(indel_model):
if os.path.exists(indel_model):
if os.path.isdir(indel_model):
indel_model_path=glob.glob(os.path.join(indel_model,'*.index'))[0].rstrip('.index')

elif indel_model in indel_model_dict:
if indel_model in indel_model_dict:
dirname = os.path.dirname(__file__)
indel_model_path=os.path.join(dirname, indel_model_dict[indel_model])

elif os.path.exists(indel_model):
if os.path.isdir(indel_model):
indel_model_path=glob.glob(os.path.join(indel_model,'*.index'))[0].rstrip('.index')

else:
return None

Expand Down Expand Up @@ -89,16 +89,16 @@ def indel_run(params, indel_dict, job_Q, counter_Q, indel_files_list):
qual_all=-10*np.log10(1e-6+1-batch_prob_all)

for j in range(len(batch_pred_all)):
q=min(999,qual_all[j,batch_pred_all[j]])
q=min(99,qual_all[j,batch_pred_all[j]])
if batch_pos[j]>prev:

if batch_prob_all[j,0]<=0.95:

q=-100*np.log10(1e-6+batch_prob_all[j,0])
q=-10*np.log10(1e-6+batch_prob_all[j,0])
allele0_data, allele1_data,allele_total_data= batch_alleles_seq[j]

if batch_pred_all[j]==1 and allele_total_data[0]:
gq=-100*np.log10(1+1e-6-batch_prob_all[j,1])
gq=-10*np.log10(1+1e-6-batch_prob_all[j,1])
s='%s\t%d\t.\t%s\t%s\t%.2f\tPASS\t.\tGT:GQ\t1/1:%.2f\n' %(chrom, batch_pos[j], allele_total_data[0], allele_total_data[1], q, gq)
f.write(s)
prev=batch_pos[j]+max(len(allele_total_data[0]), len(allele_total_data[1]))
Expand All @@ -107,7 +107,7 @@ def indel_run(params, indel_dict, job_Q, counter_Q, indel_files_list):
if allele0_data[0] and allele1_data[0]:

if allele0_data[0]==allele1_data[0] and allele0_data[1]==allele1_data[1]:
gq=-100*np.log10(1+1e-6-batch_prob_all[j,1])
gq=-10*np.log10(1+1e-6-batch_prob_all[j,1])
s='%s\t%d\t.\t%s\t%s\t%.2f\tPASS\t.\tGT:GQ\t1/1:%.2f\n' %(chrom, batch_pos[j], allele0_data[0], allele0_data[1], q, gq)
f.write(s)
prev=batch_pos[j]+max(len(allele0_data[0]), len(allele0_data[1]))
Expand All @@ -125,7 +125,7 @@ def indel_run(params, indel_dict, job_Q, counter_Q, indel_files_list):
else:
ref=ref2
alt1=alt1+ref2[l:]
gq=-100*np.log10(1+1e-6-batch_prob_all[j,3])
gq=-10*np.log10(1+1e-6-batch_prob_all[j,3])
if batch_phase[j]:
s='%s\t%d\t.\t%s\t%s,%s\t%.2f\tPASS\t.\tGT:GQ:PS\t1|2:%.2f:%d\n' %(chrom, batch_pos[j], ref, alt1, alt2, q, gq, batch_phase[j])
else:
Expand All @@ -134,7 +134,7 @@ def indel_run(params, indel_dict, job_Q, counter_Q, indel_files_list):
prev=batch_pos[j]+max(len(ref), len(alt1),len(alt2))

elif allele0_data[0]:
gq=-100*np.log10(1+1e-6-batch_prob_all[j,2])
gq=-10*np.log10(1+1e-6-batch_prob_all[j,2])
if batch_phase[j]:
s='%s\t%d\t.\t%s\t%s\t%.2f\tPASS\t.\tGT:GQ:PS\t0|1:%.2f:%d\n' %(chrom, batch_pos[j], allele0_data[0], allele0_data[1], q, gq, batch_phase[j])
else:
Expand All @@ -143,7 +143,7 @@ def indel_run(params, indel_dict, job_Q, counter_Q, indel_files_list):
prev=batch_pos[j]+max(len(allele0_data[0]), len(allele0_data[1]))

elif allele1_data[0]:
gq=-100*np.log10(1+1e-6-batch_prob_all[j,2])
gq=-10*np.log10(1+1e-6-batch_prob_all[j,2])
if batch_phase[j]:
s='%s\t%d\t.\t%s\t%s\t%.2f\tPASS\t.\tGT:GQ:PS\t1|0:%.2f:%d\n' %(chrom, batch_pos[j], allele1_data[0], allele1_data[1], q, gq, batch_phase[j])
else:
Expand Down
47 changes: 31 additions & 16 deletions nanocaller_src/snpCaller.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,14 +34,14 @@
}

def get_SNP_model(snp_model):
if os.path.exists(snp_model):
if os.path.isdir(snp_model):
snp_model_path=glob.glob(os.path.join(snp_model,'*.index'))[0].rstrip('.index')

elif snp_model in snp_model_dict:
if snp_model in snp_model_dict:
dirname = os.path.dirname(__file__)
snp_model_path=os.path.join(dirname, snp_model_dict[snp_model])


elif os.path.exists(snp_model):
if os.path.isdir(snp_model):
snp_model_path=glob.glob(os.path.join(snp_model,'*.index'))[0].rstrip('.index')

else:
return None,None

Expand Down Expand Up @@ -118,26 +118,36 @@ def caller(params, chunks_Q, counter_Q, snp_files):
sort_probs=np.sort(batch_probs,axis=1)

for j in range(len(batch_pred_GT)):

info_field='PR='+','.join("{:.4f}".format(x) for x in batch_probs[j,[0,3,1,2]])
if batch_pred_GT[j]>=2: # if het
pred1,pred2=batch_pred[j,-1],batch_pred[j,-2]
if pred1==batch_ref[j]:
s='%s\t%d\t.\t%s\t%s\t%.3f\t%s\t.\tGT:DP:FQ\t%s:%d:%.4f\n' %(chrom, batch_pos[j], num_to_base_map[batch_ref[j]], num_to_base_map[pred2], min(999,-100*np.log10(1e-10+ 1-batch_probs[j,pred2])),'PASS','0/1', batch_dp[j], batch_freq[j])
s='%s\t%d\t.\t%s\t%s\t%.3f\t%s\t%s\tGT:DP:FQ\t%s:%d:%.4f\n' %(chrom, batch_pos[j], num_to_base_map[batch_ref[j]], num_to_base_map[pred2], min(99,-10*np.log10(1e-10+ 1-batch_probs[j,pred2])),'PASS',info_field, '0/1', batch_dp[j], batch_freq[j])
f.write(s)

elif pred2==batch_ref[j] and batch_probs[j,pred2]>=0.5:
s='%s\t%d\t.\t%s\t%s\t%.3f\t%s\t.\tGT:DP:FQ\t%s:%d:%.4f\n' %(chrom,batch_pos[j], num_to_base_map[batch_ref[j]], num_to_base_map[pred1], min(999,-100*np.log10(1e-10+ 1-batch_probs[j,pred2])),'PASS','1/0', batch_dp[j], batch_freq[j])
s='%s\t%d\t.\t%s\t%s\t%.3f\t%s\t%s\tGT:DP:FQ\t%s:%d:%.4f\n' %(chrom,batch_pos[j], num_to_base_map[batch_ref[j]], num_to_base_map[pred1], min(99,-10*np.log10(1e-10+ 1-batch_probs[j,pred2])),'PASS',info_field, '1/0', batch_dp[j], batch_freq[j])
f.write(s)

elif pred2!=batch_ref[j] and pred1!=batch_ref[j] and batch_probs[j,pred2]>=0.5:
s='%s\t%d\t.\t%s\t%s,%s\t%.3f\t%s\t.\tGT:DP:FQ\t%s:%d:%.4f\n' %\
(chrom,batch_pos[j],num_to_base_map[batch_ref[j]],num_to_base_map[pred1],num_to_base_map[pred2],min(999,-100*np.log10(1e-10+ 1-batch_probs[j,pred2])),'PASS','1/2', batch_dp[j], batch_freq[j])
s='%s\t%d\t.\t%s\t%s,%s\t%.3f\t%s\t%s\tGT:DP:FQ\t%s:%d:%.4f\n' %\
(chrom,batch_pos[j],num_to_base_map[batch_ref[j]],num_to_base_map[pred1],num_to_base_map[pred2],min(99,-10*np.log10(1e-10+ 1-batch_probs[j,pred2])),'PASS',info_field,'1/2', batch_dp[j], batch_freq[j])
f.write(s)

elif batch_pred_GT[j]==1 and batch_ref[j]!=batch_pred[j,-1] and batch_probs[j,batch_pred[j,-1]]>=0.5:
pred1=batch_pred[j,-1]
s='%s\t%d\t.\t%s\t%s\t%.3f\t%s\t.\tGT:DP:FQ\t%s:%d:%.4f\n' %(chrom, batch_pos[j], num_to_base_map[batch_ref[j]], num_to_base_map[pred1], min(999,-100*np.log10(1e-10+ 1-batch_probs[j,pred1])),'PASS','1/1', batch_dp[j], batch_freq[j])
s='%s\t%d\t.\t%s\t%s\t%.3f\t%s\t%s\tGT:DP:FQ\t%s:%d:%.4f\n' %(chrom, batch_pos[j], num_to_base_map[batch_ref[j]], num_to_base_map[pred1], min(99,-10*np.log10(1e-10+ 1-batch_probs[j,pred1])),'PASS',info_field,'1/1', batch_dp[j], batch_freq[j])
f.write(s)

else:
if batch_pred_GT[j]==1 and batch_ref[j]==batch_pred[j,-1]:
pred1=batch_pred[j,-1]
s='%s\t%d\t.\t%s\t%s\t%.3f\t%s\t%s\tGT:DP:FQ\t%s:%d:%.4f\n' %(chrom, batch_pos[j], num_to_base_map[batch_ref[j]], '.', min(99,-10*np.log10(1e-10+ 1-batch_probs[j,pred1])),'REF',info_field,'./.', batch_dp[j], batch_freq[j])
f.write(s)
else:
s='%s\t%d\t.\t%s\t%s\t%.3f\t%s\t%s\tGT:DP:FQ\t%s:%d:%.4f\n' %(chrom, batch_pos[j], num_to_base_map[batch_ref[j]], '.', 0,'LOW',info_field,'./.', batch_dp[j], batch_freq[j])
f.write(s)


elif chunk['ploidy']=='haploid':
test_ref=test_ref.astype(np.float16)
Expand Down Expand Up @@ -214,7 +224,8 @@ def call_manager(params):
counter_Q.put(None)
tqdm_proc.join()

output_file_path=os.path.join(params['vcf_path'],'%s.snps.vcf.gz' %params['prefix'])
all_snps_file_path=os.path.join(params['vcf_path'],'%s.unfiltered.snps.vcf.gz' %params['prefix'])
pass_snps_file_path=os.path.join(params['vcf_path'],'%s.snps.vcf.gz' %params['prefix'])

temp_output_file_path=os.path.join(params['intermediate_snp_files_dir'],'combined.snps.vcf')

Expand All @@ -223,14 +234,17 @@ def call_manager(params):
with open(temp_output_file_path,'wb') as outfile:
outfile.write(b'##fileformat=VCFv4.2\n')
outfile.write(b'##FILTER=<ID=PASS,Description="All filters passed">\n')

outfile.write(b'##FILTER=<ID=LOW,Description="All alleles have probability less than 50%.">\n')
outfile.write(b'##FILTER=<ID=REF,Description="Homozygous Reference. Only reference allele has greater than 50% probability. All alternative alleles having probability less than 50%.">\n')
#loop through contigs
for chrom in chrom_list:
outfile.write(b'##contig=<ID=%s>\n' %bytes(chrom.encode('utf-8')))

outfile.write(b'##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n')
outfile.write(b'##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Depth">\n')
outfile.write(b'##FORMAT=<ID=FQ,Number=1,Type=Float,Description="Alternative Allele Frequency">\n')
outfile.write(b'##INFO=<ID=PR,Number=4,Type=Float,Description="Probability of presence of alleles A, C, G and T, in the given order. Probability of each base is out of 1, independent of each other.">\n')

outfile.write(b'#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t%s\n' %bytes(params['sample'].encode('utf-8')))

for int_file in snp_files:
Expand All @@ -239,7 +253,8 @@ def call_manager(params):

print('\n%s: Compressing and indexing SNP calls.' %str(datetime.datetime.now()))

run_cmd("bcftools sort %s|bgziptabix %a" %(temp_output_file_path, output_file_path), error=True)
run_cmd("bcftools sort %s|bgziptabix %a" %(temp_output_file_path, all_snps_file_path), error=True)
run_cmd("bcftools view %s -f PASS|bgziptabix %a" %(all_snps_file_path, pass_snps_file_path), error=True)

return output_file_path
return pass_snps_file_path

0 comments on commit 7443e29

Please sign in to comment.