-
Notifications
You must be signed in to change notification settings - Fork 0
/
preprocess_gagp_bcfs_for_ancestral_id.py
80 lines (56 loc) · 2.23 KB
/
preprocess_gagp_bcfs_for_ancestral_id.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
#!/usr/bin/env python
# Generate a summary file of the variants in a single species & chromosome to build references w/ ancestral alleles
import csv
import glob
import re
import gzip
import pdb
import argparse
import os
import pysam
import math
import itertools
import numpy as np
from common import hwe, filter_var, rev_comp, generate_nmer_mutation_list, load_hg18_ref, complement
def preprocess_bcf(species, chr):
bed_file = "./data/hg18.bed"
# point to bcf
gagp_bcf_filename = '/net/harris/vol1/data/great_ape_genome_project/eichlerlab.gs.washington.edu/greatape/data/VCFs/SNPs/' + species + '.bcf'
# Need to make this directory to store preprocessing files
with open('./preprocessed_gagp_bcfs_for_ancestral_identification/' + species + '_' + chr + '_var_summaries.txt', 'w') as open_output_file:
with pysam.VariantFile(gagp_bcf_filename) as bcf_in:
with open(bed_file, 'rU') as open_bed: # Slice by chromosome
curr_output = []
line = ''
while not line.startswith(chr + '\t'):
line = open_bed.readline() # Incrementing bed file to correct chromosome
while line.startswith(chr + '\t'):
(chr, start, stop) = line.split()[:3]
vars = bcf_in.fetch(chr, int(start), int(stop))
for v in vars:
var_state = None
if len(v.alts) > 1:
var_state = 'multiallelic'
elif v.info['AF'][0] == 0.0:
var_state = 'fixed_ref'
elif v.info['AF'][0] == 1.0:
var_state = 'fixed_alt'
else:
var_state = 'biallelic'
curr_output.append('%s\t%i\t%s\t%s\t%s\t%s' % (chr, v.pos, v.ref, '|'.join(v.alts), var_state, species))
if len(curr_output) > 5000:
open_output_file.write('\n'.join(curr_output) + '\n')
curr_output = []
open_output_file.write('\n'.join(curr_output) + '\n')
curr_output = []
line = open_bed.readline()
return None
def argument_parse():
parser = argparse.ArgumentParser()
parser.add_argument("species", help="Name of species, as defined by GAGP (e.g. Homo, Pan_troglodytes)")
parser.add_argument("chr", help="Chromosome (e.g. 'chr10')")
args = parser.parse_args()
return args
if __name__ == '__main__':
args = argument_parse()
preprocess_bcf(args.species, args.chr)