-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathcreate_sarek_samplesheet.py
46 lines (34 loc) · 1.29 KB
/
create_sarek_samplesheet.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
#!/usr/bin/env python
import os
import sys
import re
# FASTQ file name layout: <sample>_S<idx>_L<lane>_R<read>_001<anything>
fqpattern = r'^(.*)_(S\d+)_L(\d+)_(R\d)_001.*$'
# Flowcell id: last "_"-separated field of the directory name, with an
# optional leading "A"/"B" removed.
fcpattern = r'^.*_[AB]?([^_]+)$'


def _resolve_fastq(root, name):
    """Return the FASTQ path to record for directory entry *name* in *root*.

    Symlinks are followed one level, as with ``os.readlink``. An absolute
    target is returned unchanged; a relative target is resolved against the
    directory holding the link. Entries that are not symlinks are recorded
    by their own absolute path instead of raising ``OSError``.
    """
    entry = os.path.join(root, name)
    if not os.path.islink(entry):
        return os.path.abspath(entry)
    target = os.readlink(entry)
    if os.path.isabs(target):
        return target
    # A relative link target is relative to the link's directory, not the CWD.
    return os.path.abspath(os.path.join(root, target))


def collect_read_groups(fqdir):
    """Walk *fqdir* and group FASTQ files into read-group records.

    Files whose names do not match ``fqpattern`` are skipped before any
    filesystem call is made on them.

    Returns a dict keyed by ``(sample, readgrp)`` whose values are dicts with
    the keys ``"sample"``, ``"R1"`` and ``"R2"`` (plus any other ``R<digit>``
    read found). ``readgrp`` is ``"<flowcell>.<lane>.<S-index>"``; the
    flowcell is taken from the name of the directory containing the resolved
    FASTQ, or ``"NA"`` when that name does not match ``fcpattern``.
    """
    fq_re = re.compile(fqpattern)
    fc_re = re.compile(fcpattern)
    sampledict = {}
    for root, _dirs, names in os.walk(fqdir):
        for name in names:
            fq = fq_re.match(name)
            if not fq:
                continue
            sample = fq.group(1)
            ssheet_idx = fq.group(2)
            laneno = int(fq.group(3))  # int() drops the zero padding
            readnr = fq.group(4)

            fqpath = _resolve_fastq(root, name)
            fc = fc_re.match(os.path.basename(os.path.dirname(fqpath)))
            fcid = fc.group(1) if fc else "NA"

            # PU (platform unit); will be used as read tag in BAM files.
            readgrp = f"{fcid}.{laneno}.{ssheet_idx}"
            # Key on the sample as well, so that two samples sharing a read
            # group id (e.g. both with flowcell "NA") are not merged.
            record = sampledict.setdefault(
                (sample, readgrp), {"sample": sample, "R1": "", "R2": ""}
            )
            record[readnr] = fqpath
    return sampledict


def write_samplesheet(sampledict, outfile):
    """Write the records from ``collect_read_groups`` to *outfile* as CSV.

    The header is ``patient,sample,lane,fastq_1,fastq_2``; the sample name is
    used for both the patient and sample columns. Rows are sorted by
    (sample, read group) so the output does not depend on directory order.
    """
    with open(outfile, 'w') as fout:
        fout.write("patient,sample,lane,fastq_1,fastq_2\n")
        for sample, readgrp in sorted(sampledict):
            record = sampledict[(sample, readgrp)]
            entry = ",".join(
                [sample, sample, readgrp, record["R1"], record["R2"]]
            )
            fout.write(f"{entry}\n")


def main(argv):
    """Run the script with *argv* (``sys.argv`` layout); return an exit code.

    Returns 2 after printing a usage message when fewer than two positional
    arguments are given, otherwise 0. Extra arguments are ignored.
    """
    if len(argv) < 3:
        prog = os.path.basename(argv[0]) if argv else "create_sarek_samplesheet.py"
        sys.stderr.write(f"usage: {prog} <fastq_dir> <output_csv>\n")
        return 2
    write_samplesheet(collect_read_groups(argv[1]), argv[2])
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))