tofu.py
#!/usr/bin/env python
"""
Script used to generate synthetic data for UK Biobank.
"""
import argparse

import pandas as pd
from tqdm import tqdm

import helpers

def jitter_type(x):
    """Validate -j/--jitter: an integer percentage in the range [0, 100)."""
    x = int(x)
    if x < 0 or x >= 100:
        raise argparse.ArgumentTypeError(
            "Jitter value must be >= 0 and < 100")
    return x
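
# For illustration (hypothetical inputs): jitter_type("25") returns 25, while
# jitter_type("150") or jitter_type("-1") raises argparse.ArgumentTypeError,
# which argparse reports as a usage error for -j/--jitter.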

parser = argparse.ArgumentParser(
    description='Generate synthetic UK Biobank baseline data.')
group = parser.add_mutually_exclusive_group()
group.add_argument('-f', '--field', type=int, nargs='*',
                   help="specify one or more field ids to use")
group.add_argument('-F', '--file', type=str,
                   help="specify input file with a list of fields to use, "
                        "one per line; lines starting with # will be ignored")
parser.add_argument('-n', type=int, default=10,
                    help="specify number of patients to generate")
parser.add_argument('-v', '--verbose', action='store_true',
                    help="be verbose")
parser.add_argument('-j', '--jitter', type=jitter_type, default=0,
                    help="jitter percentage for missingness")
parser.add_argument('-o', '--out', type=str,
                    default=helpers.gen_output_filename(),
                    help="specify output file, defaults to timestamped file")
parser.add_argument('-H', '--human', action='store_true',
                    help="decode values into human readable format")

args = parser.parse_args()
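
# Example invocations (sketch; the field ids below are placeholders and may
# not exist in the lookup data shipped with this repo):
#   python tofu.py -n 100                # 100 patients, all known fields
#   python tofu.py -f 31 21022 -j 5 -H   # two fields, 5% missingness, decoded
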
if args.verbose:
    print(args)

TOTAL_PATIENTS = args.n

if __name__ == '__main__':
    df_output = pd.DataFrame()
    df_output['eid'] = helpers.gen_fake_ids(TOTAL_PATIENTS)

    # The user may have supplied a field list, either directly or via a file.
    fields = set()
    if args.field is not None:
        fields = set(args.field)
    elif args.file is not None:
        fields = helpers.get_fields_from_file(args.file)
    if fields:
        all_field_ids = helpers.get_field_ids()
        fields_to_process = set(all_field_ids).intersection(fields)
        assert len(fields_to_process) > 0, "Fields not found in lookup file."
    else:
        fields_to_process = helpers.get_field_ids()
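
    # When -F/--file is used, helpers.get_fields_from_file reads one field id
    # per line and ignores lines starting with # (per the help text above),
    # e.g. a file containing (placeholder ids):
    #   # demographics
    #   31
    #   21022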

    for field_id in tqdm(fields_to_process):
        r = helpers.get_field_metadata(field_id)
        if args.verbose:
            print(r)
        field_title = r['title']
        field_data_type = r['value_type']
        field_encoding_id = r['encoding_id']

        # Check if a field is instanced, i.e. it has been collected at
        # different recruitment instances.
        field_instance_max = r.get('instance_max')
        if field_instance_max == 0:
            field_instance_max = 1

        # Check if a field is arrayed, i.e. there can be multiple values
        # selected for it.
        field_array_max = r.get('array_max')
        if field_array_max == 0:
            field_array_max = 1
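
        # Column names presumably follow the UK Biobank convention of
        # <field_id>-<instance>.<array> (e.g. 21022-0.0); the exact format is
        # an assumption here, as it is defined by helpers.gen_field_name.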
        for i in range(field_instance_max):
            for a in range(field_array_max):
                field_canonical_name = helpers.gen_field_name(
                    field_id, i, a, args.human)
                dummy_values = helpers.gen_dummy_data_for_field(
                    field_id,
                    TOTAL_PATIENTS)
                if args.human:
                    dummy_values = helpers.decode_values(
                        dummy_values, field_id)
                if args.jitter > 0:
                    dummy_values = helpers.insert_missingness(
                        dummy_values,
                        args.jitter)
                df_output[field_canonical_name] = dummy_values

    df_output.to_csv(args.out, index=False)
    print("Wrote %s shape (%d,%d)." % (
        args.out,
        df_output.shape[0],
        df_output.shape[1]))