Feature snakemake modular (#41)

* making snakemake runners modular Additional snakemake runners for only running training, only association testing, and full train + association testing pipelines. * bug-fix pretrained model path * Adding additional snakemake pipeline run option to readthedocs * update train snakefile pipeline from PR #42 * bug-fix model path for snakemake pipeline runners * bug-fix f string syntax * Update github-actions.yml * Update github-actions.yml * Update github-actions.yml * Revert "Update github-actions.yml" This reverts commit b8b82e5. * Revert "Update github-actions.yml" This reverts commit d27e6a8. * Revert "Update github-actions.yml" This reverts commit 6cde84f. * Update github-actions.yml * Update github-actions.yml * fix-model path string variable in rules --------- Co-authored-by: Magnus Wahlberg <endast@gmail.com>
PMBio · Dec 22, 2023 · fc14c51 · fc14c51
1 parent 2d4a387
commit fc14c51
Show file tree

Hide file tree

Showing 10 changed files with 363 additions and 459 deletions.
diff --git a/docs/usage.md b/docs/usage.md
@@ -56,6 +56,20 @@ Replace `[path_to_deeprvat]` with the path to your clone of the repository.
 Note that the example data is randomly generated, and so is only suited for testing whether the `deeprvat` package has been correctly installed.
 
 
+### Run the training pipeline on some example data
+
+```shell
+mkdir example
+cd example
+ln -s [path_to_deeprvat]/example/* .
+snakemake -j 1 --snakefile [path_to_deeprvat]/pipelines/run_training.snakefile
+```
+
+Replace `[path_to_deeprvat]` with the path to your clone of the repository.
+
+Note that the example data is randomly generated, and so is only suited for testing whether the `deeprvat` package has been correctly installed.
+
+
 ### Run the association testing pipeline with pretrained models
 
 ```shell

diff --git a/pipelines/association_testing/association_dataset.snakefile b/pipelines/association_testing/association_dataset.snakefile
@@ -0,0 +1,12 @@
+
+# Build the association-testing dataset for one phenotype.
+# Runs `deeprvat_associate make-dataset` on the phenotype's hpopt_config.yaml
+# and writes the result to association_dataset.pkl in the same directory.
+# NOTE(review): `debug` is not defined in this file; it is concatenated into
+# the command when the Snakefile is parsed. Presumably it is either an empty
+# string or a CLI flag ending in a space -- confirm where it is defined.
+rule association_dataset:
+    input:
+        config = '{phenotype}/deeprvat/hpopt_config.yaml'
+    output:
+        '{phenotype}/deeprvat/association_dataset.pkl'
+    threads: 4
+    shell:
+        'deeprvat_associate make-dataset '
+        + debug +
+        '{input.config} '
+        '{output}'
diff --git a/pipelines/association_testing/burdens.snakefile b/pipelines/association_testing/burdens.snakefile
@@ -0,0 +1,74 @@
+
+# Run `deeprvat_associate compute-burdens` for one chunk of a phenotype with
+# the `--link-burdens` option pointing at the burdens.zarr of the first
+# phenotype (`phenotypes[0]`), then touch a `chunk{chunk}.linked` marker file.
+# Presumably this links to the first phenotype's burdens instead of
+# recomputing them -- confirm against the deeprvat_associate CLI.
+#
+# Inputs: one checkpoint per (repeat, bag) pair under `model_path`, the
+# phenotype's association dataset and data config, and the model config.
+# `model_path`, `n_repeats`, `n_bags`, `n_burden_chunks`, `phenotypes` and
+# `debug` are defined outside this file.
+rule link_burdens:
+    priority: 1
+    input:
+        checkpoints = lambda wildcards: [
+            f'{model_path}/repeat_{repeat}/best/bag_{bag}.ckpt'
+            for repeat in range(n_repeats) for bag in range(n_bags)
+        ],
+        dataset = '{phenotype}/deeprvat/association_dataset.pkl',
+        data_config = '{phenotype}/deeprvat/hpopt_config.yaml',
+        model_config = model_path / 'config.yaml',
+    output:
+        '{phenotype}/deeprvat/burdens/chunk{chunk}.linked'
+    threads: 8
+    # The two commands are joined with ' && ', so the marker file is only
+    # touched when compute-burdens exits successfully. The f-string with
+    # `phenotypes[0]` is evaluated when the Snakefile is parsed; the
+    # `{wildcards...}` / `{input...}` placeholders are filled in per job.
+    # The `../../../` prefix is relative to `{phenotype}/deeprvat/burdens`.
+    shell:
+        ' && '.join([
+            ('deeprvat_associate compute-burdens '
+             + debug +
+             ' --n-chunks '+ str(n_burden_chunks) + ' '
+             f'--link-burdens ../../../{phenotypes[0]}/deeprvat/burdens/burdens.zarr '
+             '--chunk {wildcards.chunk} '
+             '--dataset-file {input.dataset} '
+             '{input.data_config} '
+             '{input.model_config} '
+             '{input.checkpoints} '
+             '{wildcards.phenotype}/deeprvat/burdens'),
+            'touch {output}'
+        ])
+
+# Run `deeprvat_associate compute-burdens` for one chunk of a phenotype and
+# touch a `chunk{chunk}.finished` marker file afterwards.
+#
+# Besides the checkpoints, dataset and configs, this rule requires
+# `reverse_finished.tmp` under `model_path`, which is the output of the
+# `reverse_models` rule, so model reversal runs before any burden computation.
+# `model_path`, `n_repeats`, `n_bags`, `n_burden_chunks` and `debug` are
+# defined outside this file.
+rule compute_burdens:
+    priority: 10
+    input:
+        reversed = model_path / "reverse_finished.tmp",
+        checkpoints = lambda wildcards: [
+            model_path / f'repeat_{repeat}/best/bag_{bag}.ckpt'
+            for repeat in range(n_repeats) for bag in range(n_bags)
+        ],
+        dataset = '{phenotype}/deeprvat/association_dataset.pkl',
+        data_config = '{phenotype}/deeprvat/hpopt_config.yaml',
+        model_config = model_path / 'config.yaml',
+    output:
+        '{phenotype}/deeprvat/burdens/chunk{chunk}.finished'
+    threads: 8
+    # The two commands are joined with ' && ', so the marker file is only
+    # touched when compute-burdens exits successfully.
+    shell:
+        ' && '.join([
+            ('deeprvat_associate compute-burdens '
+             + debug +
+             ' --n-chunks '+ str(n_burden_chunks) + ' '
+             '--chunk {wildcards.chunk} '
+             '--dataset-file {input.dataset} '
+             '{input.data_config} '
+             '{input.model_config} '
+             '{input.checkpoints} '
+             '{wildcards.phenotype}/deeprvat/burdens'),
+            'touch {output}'
+        ])
+
+# Run `deeprvat_associate reverse-models` once for the whole model directory.
+# It receives the model config, the data config of the first phenotype
+# (`phenotypes[0]`) and every (repeat, bag) checkpoint under `model_path`.
+# On success it touches `reverse_finished.tmp`, which `compute_burdens`
+# lists as an input. The marker is declared with temp().
+# `model_path`, `n_repeats`, `n_bags` and `phenotypes` are defined outside
+# this file.
+rule reverse_models:
+    input:
+        checkpoints = expand(model_path / 'repeat_{repeat}/best/bag_{bag}.ckpt',
+                             bag=range(n_bags), repeat=range(n_repeats)),
+        model_config = model_path / 'config.yaml',
+        data_config = Path(phenotypes[0]) / "deeprvat/hpopt_config.yaml",
+    output:
+        temp(model_path / "reverse_finished.tmp")
+    threads: 4
+    shell:
+        " && ".join([
+            ("deeprvat_associate reverse-models "
+             "{input.model_config} "
+             "{input.data_config} "
+             "{input.checkpoints}"),
+            "touch {output}"
+        ])
diff --git a/pipelines/association_testing/regress_eval.snakefile b/pipelines/association_testing/regress_eval.snakefile
@@ -0,0 +1,63 @@
+
+# Evaluate the association results of one phenotype across all repeats.
+# Takes the combined burden_associations.parquet of every repeat plus the
+# phenotype's config and runs `deeprvat_evaluate` with `--use-seed-genes` and
+# `--correction-method FDR`, writing significant.parquet and
+# all_results.parquet into `{phenotype}/deeprvat/eval`.
+# In the expand() pattern, `{{phenotype}}` is escaped so it stays a rule
+# wildcard while `{repeat}` is expanded over range(n_repeats).
+# `n_repeats` and `debug` are defined outside this file; `{n_repeats}` in the
+# command is left as a placeholder for Snakemake to fill at run time.
+rule evaluate:
+    input:
+        associations = expand('{{phenotype}}/deeprvat/repeat_{repeat}/results/burden_associations.parquet',
+                              repeat=range(n_repeats)),
+        config = '{phenotype}/deeprvat/hpopt_config.yaml',
+    output:
+        "{phenotype}/deeprvat/eval/significant.parquet",
+        "{phenotype}/deeprvat/eval/all_results.parquet"
+    threads: 1
+    shell:
+        'deeprvat_evaluate '
+        + debug +
+        '--use-seed-genes '
+        '--n-repeats {n_repeats} '
+        '--correction-method FDR '
+        '{input.associations} '
+        '{input.config} '
+        '{wildcards.phenotype}/deeprvat/eval'
+
+# Aggregate target: requests the combined burden-association results of every
+# phenotype in `phenotypes` for every repeat in range(n_repeats). It has no
+# output or command of its own. `phenotypes` and `n_repeats` are defined
+# outside this file.
+# The pattern only contains {phenotype} and {repeat}, so only those two
+# keywords are passed to expand().
+rule all_regression:
+    input:
+        expand('{phenotype}/deeprvat/repeat_{repeat}/results/burden_associations.parquet',
+               phenotype=phenotypes, repeat=range(n_repeats)),
+
+# Merge the per-chunk regression results of one (phenotype, repeat) pair into
+# a single burden_associations.parquet via
+# `deeprvat_associate combine-regression-results`.
+# `{{phenotype}}` and `{{repeat}}` are escaped so they remain rule wildcards;
+# only `{chunk}` is expanded, over range(n_regression_chunks), which is
+# defined outside this file.
+rule combine_regression_chunks:
+    input:
+        expand('{{phenotype}}/deeprvat/repeat_{{repeat}}/results/burden_associations_{chunk}.parquet', chunk=range(n_regression_chunks)),
+    output:
+        '{phenotype}/deeprvat/repeat_{repeat}/results/burden_associations.parquet',
+    threads: 1
+    shell:
+        'deeprvat_associate combine-regression-results '
+        '--model-name repeat_{wildcards.repeat} '
+        '{input} '
+        '{output}'
+
+# Run `deeprvat_associate regress` for one regression chunk of one
+# (phenotype, repeat) pair, writing a temp() per-chunk parquet file that
+# `combine_regression_chunks` consumes.
+#
+# Inputs:
+#   config             - the phenotype's hpopt_config.yaml.
+#   chunks             - burden marker files for every burden chunk of this
+#                        phenotype: `.finished` markers when the phenotype is
+#                        `phenotypes[0]`, `.linked` markers otherwise.
+#   phenotype_0_chunks - the `.finished` markers of `phenotypes[0]`, required
+#                        for every phenotype. Presumably because linked
+#                        burdens refer to the first phenotype's burden store
+#                        -- confirm.
+#
+# Note that the `{chunk}` wildcard of this rule indexes regression chunks
+# (it is passed with `--n-chunks n_regression_chunks`), whereas the `chunk`
+# keyword inside the expand() calls enumerates burden chunks
+# (range(n_burden_chunks)).
+# `phenotypes`, `n_burden_chunks`, `n_regression_chunks`, `debug` and
+# `do_scoretest` are defined outside this file.
+rule regress:
+    input:
+        config = "{phenotype}/deeprvat/hpopt_config.yaml",
+        chunks = lambda wildcards: expand(
+            ('{{phenotype}}/deeprvat/burdens/chunk{chunk}.' +
+             ("finished" if wildcards.phenotype == phenotypes[0] else "linked")),
+            chunk=range(n_burden_chunks)
+        ),
+        phenotype_0_chunks =  expand(
+            phenotypes[0] + '/deeprvat/burdens/chunk{chunk}.finished',
+            chunk=range(n_burden_chunks)
+        ),
+    output:
+        temp('{phenotype}/deeprvat/repeat_{repeat}/results/burden_associations_{chunk}.parquet'),
+    threads: 2
+    shell:
+        'deeprvat_associate regress '
+        + debug +
+        '--chunk {wildcards.chunk} '
+        '--n-chunks ' + str(n_regression_chunks) + ' '
+        '--use-bias '
+        '--repeat {wildcards.repeat} '
+        + do_scoretest +
+        '{input.config} '
+        '{wildcards.phenotype}/deeprvat/burdens ' #TODO make this w/o repeats
+        '{wildcards.phenotype}/deeprvat/repeat_{wildcards.repeat}/results'