-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfastq_info.py
111 lines (84 loc) · 3.32 KB
/
fastq_info.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
#!/usr/bin/env python
import jak_utils
from jakomics import colors
from jakomics.fastq import FASTQ, run_info
import argparse
import os
from tqdm import tqdm
import multiprocessing
multiprocessing.set_start_method("fork") # python 3.8 fix
# stop those pesky future warnings....
# import warnings
# warnings.simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
# OPTIONS #####################################################################
# Command-line interface. `args` is parsed at module level because get_info()
# reads it as a global (args.md5).
parser = argparse.ArgumentParser(
    description='Summarize the FASTQ files listed in a sample sheet: '
                'run info, total reads and (optionally) MD5 checksums')

# Sample sheet; it is read with pandas.read_excel further down.
parser.add_argument('-s', '--samples',
                    help="excel file with samples in S, F, R, I columns",
                    required=True)

# MD5 is opt-in; without this flag the MD5 column is filled with "NA".
parser.add_argument('--md5',
                    action='store_true',
                    help='Run md5 (very slow)')

# Destination of the tab-separated results table.
parser.add_argument('--out',
                    help="File to write results to",
                    default="fastq_info_results.txt",
                    required=False)

args = parser.parse_args()
def get_info(sample):
    """Gather per-file details for one sample and store them in ``results``.

    For each entry of ``sample.files``: the MD5 is computed when ``--md5``
    was given (otherwise the ``md5`` attribute is set to "NA"), ``run_info``
    is called on the file path, and a summary record is written to the
    module-level ``results`` mapping under the key ``"<sample>_<read>"``.

    Args:
        sample: object exposing ``files``, ``sample`` and ``type``; each file
            exposes ``file_path``, ``read``, ``md5`` and ``get_md5()``.

    Returns:
        None. The output is the side effect on ``results``.
    """
    for fq in sample.files:
        # Checksumming is opt-in; without it the field is a literal "NA".
        if not args.md5:
            fq.md5 = "NA"
        else:
            fq.get_md5()

        info = run_info(fq.file_path)

        # Copy the run info key by key and add up its values for the total.
        per_run = {key: info[key] for key in info}
        total_reads = sum(info[key] for key in info)

        record = {
            "SAMPLE": sample.sample,
            "FILE": fq.file_path,
            "MD5": fq.md5,
            "TYPE": sample.type,
            "PAIR": fq.read,
            "TOTAL_READS": total_reads,
            "RUN_INFO": per_run,
        }
        results[f'{sample.sample}_{fq.read}'] = record
# Column order of the results table written to --out.
RESULT_COLUMNS = ['INDEX', 'SAMPLE', 'FILE', 'MD5',
                  'TYPE', 'PAIR', 'TOTAL_READS', 'RUN_INFO']


def results_to_frame(results):
    """Convert the per-file result mapping into a DataFrame.

    Args:
        results: mapping of ``"<sample>_<read>"`` keys to record dicts with
            the keys SAMPLE, FILE, MD5, TYPE, PAIR, TOTAL_READS and RUN_INFO
            (a plain dict or a ``multiprocessing`` manager dict).

    Returns:
        A DataFrame with one row per record and the columns in
        ``RESULT_COLUMNS``; RUN_INFO is stored as its ``str()`` form. An
        empty mapping gives an empty frame that still has all columns.
    """
    # .items() fetches each record once instead of once per field, and the
    # frame is built in a single step rather than by repeated pd.concat.
    rows = [
        {
            'INDEX': key,
            'SAMPLE': info["SAMPLE"],
            'FILE': info["FILE"],
            'MD5': info["MD5"],
            'TYPE': info["TYPE"],
            'PAIR': info["PAIR"],
            'TOTAL_READS': info["TOTAL_READS"],
            'RUN_INFO': str(info["RUN_INFO"]),
        }
        for key, info in results.items()
    ]
    return pd.DataFrame(rows, columns=RESULT_COLUMNS)


if __name__ == "__main__":
    # `results` has to stay a module-level name: get_info() writes into it
    # from the pool workers.
    manager = multiprocessing.Manager()
    results = manager.dict()

    jak_utils.header()

    # One FASTQ object per row of the sample sheet (first column is the index).
    files = pd.read_excel(args.samples, index_col=0, engine='openpyxl')
    sample_list = [FASTQ(sample, row) for sample, row in files.iterrows()]

    # The context manager cleans the pool up even if a worker raises; the
    # loop only drains the iterator so tqdm can show progress.
    with multiprocessing.Pool(processes=8) as pool:
        for _ in tqdm(pool.imap_unordered(get_info, sample_list), total=len(sample_list), desc="Finished", unit=" samples"):
            pass

    df = results_to_frame(results)

    # Rows keep the shared dict's iteration order; the original sort by INDEX
    # is left disabled:
    # df = df.sort_values(by=['INDEX'])

    # write to file with comments
    # Mode 'w' truncates any previous output, and the with-block guarantees
    # the handle is flushed and closed.
    with open(args.out, 'w') as f:
        for c in jak_utils.header(r=True):
            print(f'# {c}', file=f)

        for arg in vars(args):
            print(f'# ARG {arg} = {getattr(args, arg)}', file=f)

        df.to_csv(f, sep="\t", index=False)