-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathpipeline.yml
131 lines (101 loc) · 5.56 KB
/
pipeline.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
# ==============================================
# Preprocessing, spatial transcriptomics
# ==============================================
# ---------------------------
# 0. Compute resource options
# ---------------------------
# Thread settings for the three job tiers, plus the optional conda environment.
resources:
  # Number of threads used for parallel jobs
  # Must be enough memory to load all input files and create MuDatas
  threads_high: 1
  # Must be enough memory to load your mudata and do computationally light tasks
  threads_medium: 1
  # Must be enough memory to load text files and do plotting, requires much less memory
  threads_low: 1
# Path to conda env, leave blank if running native or your cluster
# automatically inherits the login node environment
condaenv:
# ------------------
## 1. Specify input
# ------------------
# With the preprocess_spatial workflow, one or multiple MuData objects can be preprocessed in one run.
# For the preprocessing, it reads in all '.h5mu' objects of a directory, for example:
# ./qc.data
# ├── Human_LN_alt_unfilt.h5mu
# ├── Human_LN_unfilt.h5mu
# └── Human_heart_unfilt.h5mu
# The workflow then runs the preprocessing for each MuData object separately with the same parameters that are specified in this yaml file.
# Note that all MuData objects in the directory must come from the same assay (Vizgen or Visium).
# Directory containing the '.h5mu' objects to preprocess
input_dir: ../ingestion/qc.data/
# Spatial transcriptomics assay, one of "visium" or "vizgen".
# If left blank, will use default "visium"
assay: visium
# --------------
## 2. Filtering
# --------------
# The filtering is fully customisable to any columns in the .obs or .var. You are not restricted by the columns given as default. When specifying a column name, make sure it exactly matches the column name in the h5mu object.
# General structure:
# spatial:
#   obs:                        <- spot/cell level filtering
#     min:                      <-- any column for which you want to run a minimum filter
#       n_genes_by_counts: 500  <--- i.e. each spot/cell must have a minimum of 500 in the n_genes_by_counts column
#     max:                      <-- any column for which you want to run a maximum filter
#       pct_counts_mt: 20       <--- i.e. each spot/cell may have a maximum of 20 in the pct_counts_mt column
#     bool:
#       highly_variable: True   <--- boolean columns you want to filter on, in this case any obs['highly_variable'] that are True will be retained in the dataset.
#   var:                        <- gene level filtering
#     min:
#     max:
#     bool:
# Filter thresholds. Keys under min/max/bool are column names in .obs or .var;
# a blank value means no threshold is set for that column.
filtering:
  run: True
  # A file containing only barcodes you want to keep, leave blank if not applicable
  keep_barcodes:
  # ------------------------------------------------------
  spatial:
    # ------------------------------------------------------
    ## Obs filtering: spot/cell level filtering here
    obs:
      min:
        total_counts:
      max:
        total_counts: 30000
        n_genes_by_counts:
        # Percent filtering: this should be a value between 0 and 100
        pct_counts_mt:
        pct_counts_rp:
      bool:
    ## Var filtering: gene level filtering here
    var:
      min:
        n_cells_by_counts:
      max:
        total_counts:
        n_cells_by_counts:
# -------------------------
## 3. Post-filter plotting
# -------------------------
# All metrics should be comma-separated without spaces, e.g. a,b,c
# If a metric is present in both .obs and .var, it is plotted for both
# Options for the QC plots produced after filtering.
plotqc:
  # Categorical variables to plot by. Not mandatory, can be left empty
  grouping_var:
  # Metrics to plot, comma-separated without spaces (e.g. a,b,c)
  spatial_metrics: total_counts
# -----------------------------------------
# 4. Spatial transcriptomics preprocessing
# Normalization + HVG selection + PCA
# -----------------------------------------
# Normalization, highly-variable-gene (HVG) selection and PCA settings.
spatial:
  # How to perform normalization and HVG selection: ['seurat', 'squidpy']
  # 'seurat': HVG selection and normalization by analytic Pearson residuals using
  #           sc.experimental.pp.normalize_pearson_residuals() and
  #           sc.experimental.pp.highly_variable_genes()
  # 'squidpy': HVG selection and normalization using the standard scanpy functions
  #            (sc.pp.normalize_total(), sc.pp.log1p(), sc.pp.highly_variable_genes())
  norm_hvg_flavour: seurat

  # The following parameters are only for the case norm_hvg_flavour == squidpy:
  # Parameters for the HVG selection using sc.pp.highly_variable_genes():
  # Flavour for the sc.pp.highly_variable_genes() function, one of
  # ['seurat', 'cellranger', 'seurat_v3']; leave blank to use default ('seurat')
  squidpy_hvg_flavour:
  # Leave blank to use default (0.05), parameter in sc.pp.highly_variable_genes()
  min_mean:
  # Leave blank to use default (1.5), parameter in sc.pp.highly_variable_genes()
  max_mean:
  # Leave blank to use default (0.5), parameter in sc.pp.highly_variable_genes()
  min_disp:

  # The following parameters are only for the case norm_hvg_flavour == seurat:
  # Leave blank to use default (100), the negative binomial overdispersion
  # parameter for Pearson residuals
  theta:
  # Leave blank to use default (None)
  # 'clip' can be specified as:
  #   None: residuals are clipped to the interval [-sqrt(n_obs), sqrt(n_obs)]
  #   a float value c: residuals are clipped to the interval [-c, c]
  #   np.inf: no clipping
  clip:

  # Parameters for both cases norm_hvg_flavour = "seurat" and "squidpy":
  # Default is None; mandatory for norm_hvg_flavour = "seurat" and
  # squidpy_hvg_flavour = "seurat_v3"
  n_top_genes: 2000
  # Leave blank to use default (False); if True, subset the data to
  # highly-variable genes after finding them
  filter_by_hvg: False
  # Leave blank to use default (None); if specified, highly-variable genes are
  # selected within each batch separately and merged
  hvg_batch_key:
  # Leave blank to use default (50); how many PCs to compute with sc.pp.pca()
  n_pcs: 50