diff --git a/docs/yaml_docs/index.rst b/docs/yaml_docs/index.rst
index f3dcb1d2..64eb6eb2 100644
--- a/docs/yaml_docs/index.rst
+++ b/docs/yaml_docs/index.rst
@@ -12,4 +12,5 @@ Workflows configuration files
spatial_qc
spatial_preprocess
spatial_deconvolution
- pipeline_refmap_yml.md
+ pipeline_refmap_yml
+
diff --git a/docs/yaml_docs/pipeline_clustering_yml.md b/docs/yaml_docs/pipeline_clustering_yml.md
index bc5a22dd..7f476833 100644
--- a/docs/yaml_docs/pipeline_clustering_yml.md
+++ b/docs/yaml_docs/pipeline_clustering_yml.md
@@ -14,7 +14,10 @@ In this documentation, the parameters of the `clustering` configuration yaml fil
This file is generated running `panpipes clustering config`.
The individual steps run by the pipeline are described in [clustering workflow](https://panpipes-pipelines.readthedocs.io/en/latest/workflows/clustering.html)
-When running the clustering workflow, panpipes provides a basic `pipeline.yml` file.
+The `clustering` workflow works with outputs generated by the `integration` workflow, and expects a `MuData` object with
+`neighbors` saved in the `.uns` of the global layer to run clustering on the multimodal embedding. If `neighbors` are calculated on each modality layer, these can be reused or re-calculated on the fly.
+
+When running the clustering workflow, panpipes provides a basic `pipeline.yml` file to customize with parameters.
To run the workflow on your own data, you need to specify the parameters described below in the `pipeline.yml` file to meet the requirements of your data.
However, we do provide pre-filled versions of the `pipeline.yml` file for individual [tutorials](https://panpipes-pipelines.readthedocs.io/en/latest/tutorials/index.html).
@@ -62,16 +65,22 @@ Prefix for the sample that comes out of the filtering/ preprocessing steps of th
Specify the full object if your scaled_obj contains only HVG. If your scaled_obj contains all the genes then leave full_obj blank.
panpipes will use the full object to do marker genes analysis (rank_gene_groups) and for plotting those genes.
- modalities
- - rna `Boolean`, Default: True
+ Which modalities to run clustering on.
+ - rna `Boolean`, Default: True
If set to `True`, the workflow will stop if it doesn't find a modality named 'rna'
- prot `Boolean`, Default: True
+ If set to `True`, the workflow will stop if it doesn't find a modality named 'prot'
- atac `Boolean`, Default: False
+ If set to `True`, the workflow will stop if it doesn't find a modality named 'atac'
+
- spatial `Boolean`, Default: False
- Run clustering on each individual modality.
+ If set to `True`, the workflow will stop if it doesn't find a modality named 'spatial'
+
- multimodal
- - rna_clustering `Boolean`, Default: True
- - integration_method `String`, Default: WNN
- Options here include WNN, mofa, and totalVI, and it tells us where to look for.
+ - run_clustering `Boolean`, Default: False
If set to True, runs clustering on multimodal embedding
+ - integration_method `String`, Default: None
+ In case you have run WNN and want to run clustering on the WNN embedding, specify "WNN" here. The neighbours are saved with a different `--neighbors_key` param only for WNN; for every other method (totalvi, multivi, mofa) leave this parameter blank.
+
## Parameters for finding neighbours
@@ -79,7 +88,7 @@ Prefix for the sample that comes out of the filtering/ preprocessing steps of th
Sets the number of neighbors to use when calculating the graph for clustering and umap.
- rna:
- - use_existing `Boolean`, Default: True
+ - use_existing `Boolean`, Default: True
Use existing neighbours in .uns calculated in the `integration` workflow. If `False`, it will recalculate using the following parameters
- dim_red `String`, Default: X_pca
Defines which representation in .obsm to use for nearest neighbors
- n_dim_red `Integer`, Default: 30
@@ -94,7 +103,7 @@ Prefix for the sample that comes out of the filtering/ preprocessing steps of th
- prot:
- - use_existing `Boolean`, Default: True
+ - use_existing `Boolean`, Default: True
Use existing neighbours in .uns calculated in the `integration` workflow. If `False`, it will recalculate using the following parameters
- dim_red `String`, Default: X_pca
Defines which representation in .obsm to use for nearest neighbors
- n_dim_red `Integer`, Default: 30
@@ -109,7 +118,7 @@ Prefix for the sample that comes out of the filtering/ preprocessing steps of th
- atac:
- - use_existing `Boolean`, Default: True
+ - use_existing `Boolean`, Default: True
Use existing neighbours in .uns calculated in the `integration` workflow. If `False`, it will recalculate using the following parameters
- dim_red `String`, Default: X_lsi
Defines which representation in .obsm to use for nearest neighbors
- n_dim_red `Integer`, Default: 1
@@ -125,7 +134,7 @@ Prefix for the sample that comes out of the filtering/ preprocessing steps of th
- spatial:
- - use_existing `Boolean`, Default: False
+ - use_existing `Boolean`, Default: False
Use existing neighbours in .uns calculated in the `integration` workflow. If `False`, it will recalculate using the following parameters
- dim_red `String`, Default: X_pca
Defines which representation in .obsm to use for nearest neighbors
- n_dim_red `Integer`, Default: 30
@@ -142,51 +151,51 @@ Prefix for the sample that comes out of the filtering/ preprocessing steps of th
- umap:
- - run `Boolean`, Default: True
+ - run `Boolean`, Default: True
Set to `True` runs the umap calculation and plotting.
- rna:
- mindist `Float`, Default: 0.5
- Can specify an array: 0.25,0.5
+ Can specify a single float or an array: 0.25,0.5
- prot:
- mindist `Float`, Default: 0.5
- Can specify an array: 0.25,0.5,0.8
+ Can specify a single float or an array: 0.25,0.5,0.8
- atac:
- mindist `Float`, Default: 0.5
- Can specify an array: 0.25,0.5,0.8
+ Can specify a single float or an array: 0.25,0.5,0.8
- multimodal:
- mindist `Float`, Default: 0.5
- Can specify an array: 0.25,0.5,0.8
+ Can specify a single float or an array: 0.25,0.5,0.8
- spatial:
- mindist `Float`, Default: 0.5
- Can specify an array: 0.25,0.5,0.8
+ Can specify a single float or an array: 0.25,0.5,0.8
## Parameters for clustering
- clusterspecs:
- rna:
- resolutions `Float`, Default: 0.2, 0.6, 1
- Can specify an array: 0.2,0.6,1
+ Can specify a single float or an array: 0.2,0.6,1
- algorithm `String`, Default: leiden
Options include louvain or leiden.
- prot:
- resolutions `Float`, Default: 0.2, 0.6, 1
- Can specify an array: 0.2,0.6,1
+ Can specify a single float or an array: 0.2,0.6,1
- algorithm `String`, Default: leiden
Options include louvain or leiden.
- atac:
- resolutions `Float`, Default: 0.2, 0.6, 1
- Can specify an array to compute in parallel: 0.2,0.6,1
+ Can specify a single float or an array to compute in parallel: 0.2,0.6,1
- algorithm `String`, Default: leiden
Options include louvain or leiden.
- multimodal:
- resolutions `Float`, Default: 0.5, 0.7
- Can specify an array to compute in parallel: 0.2,0.6,1
+ Can specify a single float or an array to compute in parallel: 0.2,0.6,1
- algorithm `String`, Default: leiden
Options include louvain or leiden.
- spatial:
- resolutions `Float`, Default: 0.2, 0.6, 1
- Can specify an array to compute in parallel: 0.2,0.6,1
+ Can specify a single float or an array to compute in parallel: 0.2,0.6,1
- algorithm `String`, Default: leiden
Options include louvain or leiden.
@@ -207,8 +216,10 @@ When pseudo_seurat is set to True then a [python implementation](https://github.
Marker analysis is run for clusters >= mincells. If a cluster ncells < mincells , then the cluster is excluded from marker analysis
- pseudo_seurat `Boolean`, Default: False
- minpct `Float`, Default: 0.1
+ Only test genes that are detected in a minimum fraction of min.pct cells in either of the two populations.
This parameter is mandatory if pseudo_seurat is set to True
- threshuse `Float`, Default: 0.25
+ Limit testing to genes which show, on average, at least X-fold difference (log-scale) between the two groups of cells.
This parameter is mandatory if pseudo_seurat is set to True
- prot:
- run `Boolean`, Default: True
@@ -219,8 +230,10 @@ When pseudo_seurat is set to True then a [python implementation](https://github.
- method `String`, Default: wilcoxon
- pseudo_seurat `Boolean`, Default: False
- minpct `Float`, Default: 0.1
+ Only test genes that are detected in a minimum fraction of min.pct cells in either of the two populations.
This parameter is mandatory if pseudo_seurat is set to True
- threshuse `Float`, Default: 0.25
+ Limit testing to genes which show, on average, at least X-fold difference (log-scale) between the two groups of cells.
This parameter is mandatory if pseudo_seurat is set to True
- atac:
@@ -234,8 +247,10 @@ When pseudo_seurat is set to True then a [python implementation](https://github.
Options include: ‘logreg’, ‘t-test’, ‘wilcoxon’, ‘t-test_overestim_var’
- pseudo_seurat `Boolean`, Default: False
- minpct `Float`, Default: 0.1
+ Only test genes that are detected in a minimum fraction of min.pct cells in either of the two populations.
This parameter is mandatory if pseudo_seurat is set to True
- threshuse `Float`, Default: 0.25
+ Limit testing to genes which show, on average, at least X-fold difference (log-scale) between the two groups of cells.
This parameter is mandatory if pseudo_seurat is set to True
@@ -246,9 +261,9 @@ When pseudo_seurat is set to True then a [python implementation](https://github.
Options include: ‘logreg’, ‘t-test’, ‘wilcoxon’, ‘t-test_overestim_var’
- pseudo_seurat `Boolean`, Default: False
- minpct `Float`, Default: 0.1
- This parameter is mandatory if pseudo_seurat is set to True
+ Only test genes that are detected in a minimum fraction of min.pct cells in either of the two populations. This parameter is mandatory if pseudo_seurat is set to True
- threshuse `Float`, Default: 0.25
- This parameter is mandatory if pseudo_seurat is set to True
+ Limit testing to genes which show, on average, at least X-fold difference (log-scale) between the two groups of cells. This parameter is mandatory if pseudo_seurat is set to True
- spatial:
@@ -261,11 +276,12 @@ When pseudo_seurat is set to True then a [python implementation](https://github.
Marker analysis is run for clusters >= mincells. If a cluster ncells < mincells , then the cluster is excluded from marker analysis
- pseudo_seurat `Boolean`, Default: False
- minpct `Float`, Default: 0.1
- This parameter is mandatory if pseudo_seurat is set to True
+ Only test genes that are detected in a minimum fraction of min.pct cells in either of the two populations. This parameter is mandatory if pseudo_seurat is set to True
- threshuse `Float`, Default: 0.25
+ Limit testing to genes which show, on average, at least X-fold difference (log-scale) between the two groups of cells.
This parameter is mandatory if pseudo_seurat is set to True
## Plot specifications
-Used to define which metadata columns are used in the visualizations
+Define which layers are used in the markers visualization
- plotspecs:
- layers:
- rna `String`, Default: logged_counts
diff --git a/panpipes/panpipes/pipeline_clustering.py b/panpipes/panpipes/pipeline_clustering.py
index 99837875..a3caad38 100644
--- a/panpipes/panpipes/pipeline_clustering.py
+++ b/panpipes/panpipes/pipeline_clustering.py
@@ -43,9 +43,10 @@ def set_up_dirs(log_file):
## Single modality scripts
## ------------------------------------
-# -----------------------------------=
+# --------------------------------------
# neighbors
# --------------------------------------
+# TODO: create task to re-run neighbours on multimodal outer representations (this script can only read in each mod layer)
@follows(set_up_dirs)
@originate(PARAMS['mudata_with_knn'])
def run_neighbors(outfile):
diff --git a/panpipes/panpipes/pipeline_clustering/pipeline.yml b/panpipes/panpipes/pipeline_clustering/pipeline.yml
index 2562d2d6..5c841d8a 100644
--- a/panpipes/panpipes/pipeline_clustering/pipeline.yml
+++ b/panpipes/panpipes/pipeline_clustering/pipeline.yml
@@ -29,7 +29,7 @@ modalities:
atac: False
spatial: False
-# if True, will look for WNN, or totalVI output
+# if True, will look for WNN, mofa, multivi, totalVI embeddings
multimodal:
run_clustering: True
integration_method:
@@ -40,8 +40,10 @@ multimodal:
# ---------------------------------------
#
# -----------------------------
+
neighbors:
rna:
+ #use the knn calculated in the integration workflow. If False it will recalculate
use_existing: True
dim_red: X_pca
n_dim_red: 30
@@ -49,6 +51,7 @@ neighbors:
metric: euclidean
method: scanpy
prot:
+ #use the knn calculated in the integration workflow. If False it will recalculate
use_existing: True
dim_red: X_pca
n_dim_red: 30
@@ -56,6 +59,7 @@ neighbors:
metric: euclidean
method: scanpy
atac:
+ #use the knn calculated in the integration workflow. If False it will recalculate
use_existing: True
dim_red: X_lsi
dim_remove: 1
@@ -64,6 +68,7 @@ neighbors:
metric: euclidean
method: scanpy
spatial:
+ #use the knn calculated in the integration workflow. If False it will recalculate
use_existing: False
dim_red: X_pca
n_dim_red: 30
diff --git a/panpipes/python_scripts/run_umap.py b/panpipes/python_scripts/run_umap.py
index 6a5b957b..e4fe42b0 100644
--- a/panpipes/python_scripts/run_umap.py
+++ b/panpipes/python_scripts/run_umap.py
@@ -33,7 +33,7 @@
default=0.1,
help="no. neighbours parameters for sc.pp.neighbors()")
parser.add_argument("--neighbors_key",
- default="neighbors", help="algortihm choice from louvain and leiden")
+ default="neighbors", help="name of the saved knn neighbors")
args, opt = parser.parse_known_args()
L.info(args)