-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathSnakefile
126 lines (109 loc) · 4.15 KB
/
Snakefile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
"""
Run pipeline from location of the Snakefile:
snakemake --config exp=exp13 run=run1 --cores 4 --use-conda
Pipeline requires:
Input:
- Cellranger multi output of TCR, and pMHC as a minimum.
- Additional Cellranger features such as gene expression (GEX) and cell hashing barcodes can be handled.
- An experimental design template (see barcode_specificity_annotations.xlsx).
"""
import os
RAW_DATA = "experiments/exp13/run1/data/raw.csv.gz" # OBS! Change this to fit your path
WRK_DIR = workflow.basedir
EXP_DIR = os.path.join(WRK_DIR, "experiments", config["exp"], config["run"])
RES_DIR = os.path.join(EXP_DIR, "res")
PLT_DIR = os.path.join(EXP_DIR, "plt")
rule all:
input: EXP_DIR + '/done.ok'
#################################################################
# Perform grid search for optimal filters #
#################################################################
rule eval_clonotypes:
"""
Identify clonotypes with sufficient data to infer threshold based on.
Plots barcode distribution for clonotypes with more than 10 GEMs.
"""
input:
data = RAW_DATA
params:
plots = PLT_DIR + "/eval_clonotypes/%s/%d.pdf"
output:
data = RES_DIR + "/tables/tcr_barcode.valid.csv",
done = touch(expand(PLT_DIR + "/eval_clonotypes/{flag}/dir.done", flag=['significant_match','significant_mismatch','insignificant']))
conda:
"envs/basic_dependencies.yaml"
shell:
"python scripts/F_comp_cred_specificities.py \
--input {input.data} \
--output {output.data} \
--plots {params.plots}"
rule grid_search:
"""
Run grid search on UMI values to define optimal set of thresholds.
"""
input:
valid_df = rules.eval_clonotypes.output.data
output:
grid = RES_DIR + "/eval_clonotypes/grid_search/{ext_thr}.csv"
shell:
"python scripts/G1_grid_search.py \
--input {input.valid_df} \
--ext_thr {wildcards.ext_thr} \
--output {output.grid}"
rule extract_optimal_threshold:
"""
Plot grid and extract optimal thresholds.
"""
input:
valid = rules.eval_clonotypes.output.data,
grids = expand(rules.grid_search.output.grid, ext_thr=[0,1,2])
output:
plots = expand(PLT_DIR + "/eval_clonotypes/grid_search/grid.{ext}", ext=["pdf", "png"]),
opt_thr = RES_DIR + "/eval_clonotypes/threshold/opt.csv"
shell:
"python scripts/G2_extract_optimal_threshold.py \
--data {input.valid} \
--grids {input.grids} \
--output {output.opt_thr} \
--plot {output.plots}"
#################################################################
# Plots #
#################################################################
rule get_filters:
input:
opt_thr = rules.extract_optimal_threshold.output.opt_thr,
valid = rules.eval_clonotypes.output.data
params:
flt = '{filtering_set}'
output:
lbl = RES_DIR + "/eval_clonotypes/threshold/{filtering_set}.yaml",
flt = RES_DIR + "/eval_clonotypes/threshold/{filtering_set}.csv"
shell:
"python scripts/H_set_filters.py \
--data {input.valid} \
--opt-thr {input.opt_thr} \
--setting {params.flt} \
--labels {output.lbl} \
--filters {output.flt}"
rule filter_impact_staircase:
input:
df = rules.eval_clonotypes.output.data,
lbl = rules.get_filters.output.lbl,
flt = rules.get_filters.output.flt
output:
png = PLT_DIR + "/specificity_matrix/{filtering_set}/total.png"
params:
lambda wildcards, output: os.path.dirname(output.png)
conda:
"envs/basic_dependencies.yaml"
shell:
"python scripts/I_filter_impact.staircase.py \
--data {input.df} \
--labels {input.lbl} \
--filters {input.flt} \
--out-dir {params}"
rule ok:
input:
fn = expand(rules.filter_impact_staircase.output.png, filtering_set=['indv','comb'])
output:
tag = touch(EXP_DIR + '/done.ok')