-
Notifications
You must be signed in to change notification settings - Fork 0
/
make_dataset.py
79 lines (70 loc) · 3.09 KB
/
make_dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
#!/bin/python
import sys
import os
import numpy as np
import pandas as pd
import joblib
_README_="""
A script to subset pickled dataframes (output from master_dataset.py) for use in DEGAS (tsvd.py).
(you'll probably want at least 64Gb memory for this)
Author: Matthew Aguirre (SUNET: magu)
"""
global data_root=None # redacted
global proj_dir=None
def load_p_and_se(dataset):
    """Load the p-value and standard-error frames that accompany *dataset*.

    The companion pickles live at the same path as the data pickle, with
    the statistic token ('_z_' or '_beta_') swapped for '_p_' / '_se_'.
    Returns the (p, se) pair of loaded objects.
    """
    def swap_token(path, key):
        # Replace whichever statistic token the path carries with *key*.
        marker = '_' + key + '_'
        return path.replace('_z_', marker).replace('_beta_', marker)

    p_file = swap_token(dataset, 'p')
    se_file = swap_token(dataset, 'se')
    return joblib.load(p_file), joblib.load(se_file)
def make_dataset(dataset, out, p_star=0.01, center=True):
    """Subset a pickled summary-statistic matrix by p-value/SE and write it out.

    dataset : path to the pickled data matrix (loaded via joblib; its p/SE
              companions are resolved by load_p_and_se).
    out     : output path for the filtered, gzip-pickled dataframe.
    p_star  : per-cell p-value threshold for inclusion.
    center  : if True, standardize columns before writing.
    """
    # keep variants not in LD, MAF > 0.01%, and QC'd; get their alt alleles
    with open('../reference/variant_qc_v2.prune.in', 'r') as f:
        var_set = set([line.rstrip() for line in f])
    var_alt = {var:None for var in var_set}
    with open(data_root+'array_combined/pgen/ukb24983_cal_hla_cnv.pvar', 'r') as f:
        for line in f:
            # assumes every .pvar line splits into exactly 5 whitespace
            # fields (CHROM POS ID REF ALT) -- a header line would raise
            # ValueError here; TODO confirm the file has no header.
            chrom,pos,var,ref,alt = line.rstrip().split()
            var_alt[var] = alt
    # NOTE(review): var_set and var_alt are built but never used below --
    # the p/SE mask is applied to the full matrix. Presumably a variant
    # filter was intended; verify against master_dataset.py.
    # load data
    data=joblib.load(dataset)
    p,se=load_p_and_se(dataset)
    # filter by p-value and SE (0.2 if binary else 0.08); center if specified
    # Quantitative-trait columns are identified by name substring.
    qts=[x for x in se.columns if 'INI' in x or 'QT' in x]
    # Elementwise boolean mask (non-matching cells become NaN): keep cells
    # with p < p_star and SE < 0.2, additionally dropping cells whose SE is
    # >= 0.08 when that SE *value* appears among the quantitative columns.
    # NOTE(review): se.isin(se[qts]) tests value membership, not column
    # membership -- it looks like a proxy for "this cell is in a QT column";
    # confirm it implements the intended per-column 0.08 cutoff.
    df=data[(p < p_star) & (se < 0.2) & ~((se.isin(se[qts])) & (se >= 0.08))]
    if center:
        # Centers with the filtered mean but scales by the *unfiltered*
        # std (data.std(), not df.std()) -- TODO confirm this is deliberate.
        df=df.subtract(df.mean()).divide(data.std())
    # write to file
    df.to_pickle(out, compression='gzip')
    return
if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=_README_
    )
    def p_value(p):
        """argparse type-check: a p-value must lie in [0, 1]."""
        if float(p) < 0 or float(p) > 1:
            raise argparse.ArgumentTypeError("%s is an invalid p-value" % p)
        return float(p)
    # BUG FIX: the original declared `nargs=1` (making args.p a one-element
    # list) together with a scalar default of 0.01 and required=True -- had
    # the flag ever become optional, `args.p[0]` would crash on the bare
    # float default. Dropping nargs makes args.p a float in every case, and
    # the documented default now actually applies when --p is omitted
    # (backward compatible: explicit --p invocations behave identically).
    parser.add_argument('--p', dest="p", type=p_value, default=0.01,
                        help='Minimum p-value threshold for inclusion.')
    parser.add_argument('--center', dest="center", action='store_true',
                        help='Standardize columns (phenotypes).')
    # --beta and --z are mutually exclusive; only args.z is consulted below
    # (--beta simply selects the default 'beta' naming by excluding --z).
    bz = parser.add_mutually_exclusive_group()
    bz.add_argument('--beta', dest="beta", action='store_false',
                    help='Use regression coefficients for dataset.')
    bz.add_argument('--z', dest="z", action='store_true',
                    help='Use regression z-statistics for dataset.')
    args = parser.parse_args()
    c, p = args.center, float(args.p)
    # input/output naming: the source pickle, then an output name encoding
    # the statistic (z/beta), centering choice, and p-value threshold.
    path = os.path.join(proj_dir + 'datasets/train/v2/',
                        '_'.join(('all', 'z' if args.z else 'beta',
                                  'nonCenter_20200427.full_df.pkl.gz')))
    outP = os.path.join(os.path.dirname(path),
                        '_'.join(('all', 'z' if args.z else 'beta',
                                  'center' if c else 'nonCenter',
                                  'p' + str(p).replace('.', ''),
                                  '20200427.full_df.pkl.gz')))
    # everything else goes here
    make_dataset(dataset=path, out=outP, p_star=p, center=c)