PMBio · meyerkm · Dec 22, 2023 · Dec 14, 2023 · Dec 14, 2023 · Dec 15, 2023
diff --git a/.github/workflows/github-actions.yml b/.github/workflows/github-actions.yml
@@ -8,26 +8,33 @@ jobs:
     steps:
       - name: Check out repository code
         uses: actions/checkout@v3
-      - name: Training Association Testing smoke test
-        uses: snakemake/snakemake-github-action@v1.24.0
+      # Build the micromamba environment described by deeprvat_env_no_gpu.yml.
+      # Both the solved environment and the package downloads are cached.
+      - uses: mamba-org/setup-micromamba@v1.4.3
         with:
-          directory: 'example'
-          snakefile: 'pipelines/training_association_testing.snakefile'
-          args: '-j 2 -n'
+          environment-name: deeprvat-gh-action
+          environment-file: ${{ github.workspace }}/deeprvat_env_no_gpu.yml
+          cache-environment: true
+          cache-downloads: true
+      # Each smoke test below is a Snakemake dry run (-n) with the example/
+      # directory as working directory; "micromamba-shell" runs the command
+      # inside the environment created by setup-micromamba.
+      - name: Smoketest training_association_testing pipeline
+        run: |
+          python -m snakemake -n -j 2 --directory ${{ github.workspace }}/example \
+          --snakefile ${{ github.workspace }}/pipelines/training_association_testing.snakefile --show-failed-logs
+        shell: micromamba-shell {0}
+      # Symlink ../pretrained_models into example/ before the pretrained
+      # pipeline is dry-run.
       - name: Link pretrained models
         run: cd ${{ github.workspace }}/example && ln -s ../pretrained_models
-      - name: Association Testing Pretrained Smoke Test
-        uses: snakemake/snakemake-github-action@v1.24.0
-        with:
-          directory: 'example'
-          snakefile: 'pipelines/association_testing_pretrained.snakefile'
-          args: '-j 2 -n'
-      - name: Seed Gene Discovery Smoke Test
-        uses: snakemake/snakemake-github-action@v1.24.0
-        with:
-          directory: 'example'
-          snakefile: 'pipelines/seed_gene_discovery.snakefile'
-          args: '-j 2 -n'
+        shell: bash -el {0}
+      - name: Smoketest association_testing_pretrained pipeline
+        run: |
+          python -m snakemake -n -j 2 --directory ${{ github.workspace }}/example \
+          --snakefile ${{ github.workspace }}/pipelines/association_testing_pretrained.snakefile --show-failed-logs
+        shell: micromamba-shell {0}
+      # Copies deeprvat/seed_gene_discovery/config.yaml into example/ as
+      # config.yaml (replacing any file of that name already there).
+      # NOTE(review): presumably the seed gene discovery snakefile reads
+      # config.yaml from its working directory - confirm.
+      - name: Copy seed gene discovery snakemake config
+        run: cd ${{ github.workspace }}/example && cp ../deeprvat/seed_gene_discovery/config.yaml .
+        shell: bash -el {0}
+      - name: Smoketest seed_gene_discovery pipeline
+        run: |
+          python -m snakemake -n -j 2 --directory ${{ github.workspace }}/example \
+          --snakefile ${{ github.workspace }}/pipelines/seed_gene_discovery.snakefile --show-failed-logs
+        shell: micromamba-shell {0}
 
   DeepRVAT-Pipeline-Tests:
     runs-on: ubuntu-latest
@@ -76,41 +83,57 @@ jobs:
     steps:
       - name: Check out repository code
         uses: actions/checkout@v3
-      - name: Preprocessing Smoke Test With QC
-        uses: snakemake/snakemake-github-action@v1.24.0
+      # Build the micromamba environment described by
+      # deeprvat_preprocessing_env.yml; the solved environment and the package
+      # downloads are cached.
+      - uses: mamba-org/setup-micromamba@v1.4.3
         with:
-          directory: 'example/preprocess'
-          snakefile: 'pipelines/preprocess_with_qc.snakefile'
-          args: '-j 2 -n --configfile pipelines/config/deeprvat_preprocess_config.yaml'
-          stagein: 'touch example/preprocess/workdir/reference/GRCh38.primary_assembly.genome.fa'
+          environment-name: deeprvat-preprocess-gh-action
+          environment-file: ${{ github.workspace }}/deeprvat_preprocessing_env.yml
+          cache-environment: true
+          cache-downloads: true
 
-      - name: Preprocessing Smoke Test No QC
-        uses: snakemake/snakemake-github-action@v1.24.0
-        with:
-          directory: 'example/preprocess'
-          snakefile: 'pipelines/preprocess_no_qc.snakefile'
-          args: '-j 2 -n --configfile pipelines/config/deeprvat_preprocess_config.yaml'
-          stagein: 'touch example/preprocess/workdir/reference/GRCh38.primary_assembly.genome.fa'
+      # Create an empty placeholder for the reference FASTA before the dry runs
+      # below. Runs unconditionally: this job has no cache step whose
+      # cache-hit output could gate it.
+      - name: Fake fasta data
+        run: |
+          cd ${{ github.workspace }}/example/preprocess && touch workdir/reference/GRCh38.primary_assembly.genome.fa
+
+      # Both steps below are Snakemake dry runs (-n) with example/preprocess as
+      # working directory and the shared preprocessing config file.
+      - name: Run preprocessing pipeline no qc Smoke Test
+        run: |
+          python -m snakemake -n -j 2 --directory ${{ github.workspace }}/example/preprocess \
+          --snakefile ${{ github.workspace }}/pipelines/preprocess_no_qc.snakefile \
+          --configfile ${{ github.workspace }}/pipelines/config/deeprvat_preprocess_config.yaml --show-failed-logs
+        shell: micromamba-shell {0}
+
+
+      - name: Preprocessing pipeline with qc Smoke Test
+        run: |
+          python -m snakemake -n -j 2 --directory ${{ github.workspace }}/example/preprocess \
+          --snakefile ${{ github.workspace }}/pipelines/preprocess_with_qc.snakefile \
+          --configfile ${{ github.workspace }}/pipelines/config/deeprvat_preprocess_config.yaml --show-failed-logs
+        shell: micromamba-shell {0}
 
 
   DeepRVAT-Annotation-Pipeline-Smoke-Tests:
     runs-on: ubuntu-latest
     steps:
       - name: Check out repository code
         uses: actions/checkout@v3
-      - name: Annotations Smoke Test
-        uses: snakemake/snakemake-github-action@v1.25.1
+      # Build the micromamba environment from deeprvat_preprocessing_env.yml,
+      # with environment and download caching enabled.
+      # NOTE(review): this reuses the preprocessing environment file and name
+      # for the annotation pipeline - confirm that is intended.
+      - uses: mamba-org/setup-micromamba@v1.4.3
         with:
-          directory: 'example/annotations'
-          snakefile: 'pipelines/annotations.snakefile'
-          args: '-j 2 -n --configfile pipelines/config/deeprvat_annotation_config.yaml'
+          environment-name: deeprvat-preprocess-gh-action
+          environment-file: ${{ github.workspace }}/deeprvat_preprocessing_env.yml
+          cache-environment: true
+          cache-downloads: true
+      # Snakemake dry run (-n) of the annotation pipeline with
+      # example/annotations as working directory.
+      - name: Annotations Smoke Test
+        run: |
+          python -m snakemake -n -j 2 --directory ${{ github.workspace }}/example/annotations \
+          --snakefile ${{ github.workspace }}/pipelines/annotations.snakefile \
+          --configfile ${{ github.workspace }}/pipelines/config/deeprvat_annotation_config.yaml --show-failed-logs
+        shell: micromamba-shell {0}
 
 
   DeepRVAT-Preprocessing-Pipeline-Tests-No-QC:
     runs-on: ubuntu-latest
     needs: DeepRVAT-Preprocessing-Pipeline-Smoke-Tests
     steps:
-
       - name: Check out repository code
         uses: actions/checkout@v3
       - uses: mamba-org/setup-micromamba@v1.4.3

diff --git a/docs/usage.md b/docs/usage.md
@@ -56,6 +56,20 @@ Replace `[path_to_deeprvat]` with the path to your clone of the repository.
 Note that the example data is randomly generated, and so is only suited for testing whether the `deeprvat` package has been correctly installed.
 
 
+### Run the training pipeline on some example data
+
+```shell
+mkdir example
+cd example
+ln -s [path_to_deeprvat]/example/* .
+snakemake -j 1 --snakefile [path_to_deeprvat]/pipelines/run_training.snakefile
+```
+
+Replace `[path_to_deeprvat]` with the path to your clone of the repository.
+
+Note that the example data is randomly generated, and so is only suited for testing whether the `deeprvat` package has been correctly installed.
+
+
 ### Run the association testing pipeline with pretrained models
 
 ```shell

diff --git a/pipelines/association_testing/association_dataset.snakefile b/pipelines/association_testing/association_dataset.snakefile
@@ -0,0 +1,12 @@
+
+# Runs `deeprvat_associate make-dataset` for one phenotype: reads the
+# phenotype's hpopt_config.yaml and writes association_dataset.pkl next to it.
+# `debug` is defined outside this file and is spliced verbatim into the
+# command line (presumably an optional flag ending in a space - confirm).
+rule association_dataset:
+    input:
+        config = '{phenotype}/deeprvat/hpopt_config.yaml'
+    output:
+        '{phenotype}/deeprvat/association_dataset.pkl'
+    threads: 4
+    shell:
+        'deeprvat_associate make-dataset '
+        + debug +
+        '{input.config} '
+        '{output}'
diff --git a/pipelines/association_testing/burdens.snakefile b/pipelines/association_testing/burdens.snakefile
@@ -0,0 +1,74 @@
+
+# Runs `deeprvat_associate compute-burdens` for one chunk of a phenotype with
+# `--link-burdens` pointing at the burdens.zarr of the first phenotype
+# (phenotypes[0]), then touches the chunk's `.linked` marker file.
+# NOTE(review): presumably this links to the first phenotype's burdens instead
+# of recomputing them - confirm against deeprvat_associate.
+# Inputs are every repeat/bag checkpoint under model_path plus the phenotype's
+# dataset and config and the model config. `model_path`, `n_repeats`, `n_bags`,
+# `n_burden_chunks`, `phenotypes` and `debug` are defined outside this file.
+rule link_burdens:
+    priority: 1
+    input:
+        checkpoints = lambda wildcards: [
+            f'{model_path}/repeat_{repeat}/best/bag_{bag}.ckpt'
+            for repeat in range(n_repeats) for bag in range(n_bags)
+        ],
+        dataset = '{phenotype}/deeprvat/association_dataset.pkl',
+        data_config = '{phenotype}/deeprvat/hpopt_config.yaml',
+        model_config = model_path / 'config.yaml',
+    output:
+        '{phenotype}/deeprvat/burdens/chunk{chunk}.linked'
+    threads: 8
+    # The two commands are joined with '&&', so the marker file is only
+    # touched when compute-burdens exits successfully.
+    shell:
+        ' && '.join([
+            ('deeprvat_associate compute-burdens '
+             + debug +
+             ' --n-chunks '+ str(n_burden_chunks) + ' '
+             f'--link-burdens ../../../{phenotypes[0]}/deeprvat/burdens/burdens.zarr '
+             '--chunk {wildcards.chunk} '
+             '--dataset-file {input.dataset} '
+             '{input.data_config} '
+             '{input.model_config} '
+             '{input.checkpoints} '
+             '{wildcards.phenotype}/deeprvat/burdens'),
+            'touch {output}'
+        ])
+
+# Runs `deeprvat_associate compute-burdens` for one chunk of a phenotype and
+# touches the chunk's `.finished` marker file afterwards.
+# Besides the checkpoints, dataset and configs, it requires the
+# reverse_finished.tmp marker written by rule reverse_models, so that rule
+# always runs first.
+# `model_path`, `n_repeats`, `n_bags`, `n_burden_chunks` and `debug` are
+# defined outside this file.
+rule compute_burdens:
+    priority: 10
+    input:
+        reversed = model_path / "reverse_finished.tmp",
+        checkpoints = lambda wildcards: [
+            model_path / f'repeat_{repeat}/best/bag_{bag}.ckpt'
+            for repeat in range(n_repeats) for bag in range(n_bags)
+        ],
+        dataset = '{phenotype}/deeprvat/association_dataset.pkl',
+        data_config = '{phenotype}/deeprvat/hpopt_config.yaml',
+        model_config = model_path / 'config.yaml',
+    output:
+        '{phenotype}/deeprvat/burdens/chunk{chunk}.finished'
+    threads: 8
+    # The two commands are joined with '&&', so the marker file is only
+    # touched when compute-burdens exits successfully.
+    shell:
+        ' && '.join([
+            ('deeprvat_associate compute-burdens '
+             + debug +
+             ' --n-chunks '+ str(n_burden_chunks) + ' '
+             '--chunk {wildcards.chunk} '
+             '--dataset-file {input.dataset} '
+             '{input.data_config} '
+             '{input.model_config} '
+             '{input.checkpoints} '
+             '{wildcards.phenotype}/deeprvat/burdens'),
+            'touch {output}'
+        ])
+
+# Runs `deeprvat_associate reverse-models` once over all repeat/bag
+# checkpoints, using the model config and the data config of the first
+# phenotype (phenotypes[0]), then touches reverse_finished.tmp under
+# model_path. The marker is declared temp(), so Snakemake removes it once the
+# jobs that consume it have finished.
+rule reverse_models:
+    input:
+        checkpoints = expand(model_path / 'repeat_{repeat}/best/bag_{bag}.ckpt',
+                             bag=range(n_bags), repeat=range(n_repeats)),
+        model_config = model_path / 'config.yaml',
+        data_config = Path(phenotypes[0]) / "deeprvat/hpopt_config.yaml",
+    output:
+        temp(model_path / "reverse_finished.tmp")
+    threads: 4
+    shell:
+        " && ".join([
+            ("deeprvat_associate reverse-models "
+             "{input.model_config} "
+             "{input.data_config} "
+             "{input.checkpoints}"),
+            "touch {output}"
+        ])
diff --git a/pipelines/association_testing/regress_eval.snakefile b/pipelines/association_testing/regress_eval.snakefile
@@ -0,0 +1,63 @@
+
+# Runs `deeprvat_evaluate` for one phenotype on the burden_associations.parquet
+# files of all repeats, with `--use-seed-genes` and FDR correction. Declares
+# significant.parquet and all_results.parquet in the phenotype's eval
+# directory as outputs; the command itself is only given that directory.
+# `n_repeats` and `debug` are defined outside this file.
+rule evaluate:
+    input:
+        associations = expand('{{phenotype}}/deeprvat/repeat_{repeat}/results/burden_associations.parquet',
+                              repeat=range(n_repeats)),
+        config = '{phenotype}/deeprvat/hpopt_config.yaml',
+    output:
+        "{phenotype}/deeprvat/eval/significant.parquet",
+        "{phenotype}/deeprvat/eval/all_results.parquet"
+    threads: 1
+    shell:
+        'deeprvat_evaluate '
+        + debug +
+        '--use-seed-genes '
+        '--n-repeats {n_repeats} '
+        '--correction-method FDR '
+        '{input.associations} '
+        '{input.config} '
+        '{wildcards.phenotype}/deeprvat/eval'
+
+# Aggregate target: requests the combined burden_associations.parquet of every
+# phenotype in `phenotypes` for every repeat in range(n_repeats). It has no
+# output or command of its own. `phenotypes` and `n_repeats` are defined
+# outside this file.
+rule all_regression:
+    input:
+        expand('{phenotype}/deeprvat/repeat_{repeat}/results/burden_associations.parquet',
+               phenotype=phenotypes, repeat=range(n_repeats)),
+
+# Merges the per-chunk regression results of one phenotype and repeat
+# (burden_associations_{chunk}.parquet for every chunk in
+# range(n_regression_chunks)) into a single burden_associations.parquet via
+# `deeprvat_associate combine-regression-results`.
+# `n_regression_chunks` is defined outside this file.
+rule combine_regression_chunks:
+    input:
+        expand('{{phenotype}}/deeprvat/repeat_{{repeat}}/results/burden_associations_{chunk}.parquet', chunk=range(n_regression_chunks)),
+    output:
+        '{phenotype}/deeprvat/repeat_{repeat}/results/burden_associations.parquet',
+    threads: 1
+    shell:
+        'deeprvat_associate combine-regression-results '
+        '--model-name repeat_{wildcards.repeat} '
+        '{input} '
+        '{output}'
+
+# Runs `deeprvat_associate regress` for one regression chunk of one phenotype
+# and repeat, reading burdens from the phenotype's burdens directory and
+# writing a temporary per-chunk parquet file into the repeat's results
+# directory.
+# The job waits for the burden marker files of all burden chunks: `.finished`
+# markers for the first phenotype (phenotypes[0]), `.linked` markers for any
+# other phenotype. `phenotype_0_chunks` additionally requires the first
+# phenotype's `.finished` markers for every phenotype.
+# `phenotypes`, `n_burden_chunks`, `n_regression_chunks`, `debug` and
+# `do_scoretest` are defined outside this file; the last two are spliced
+# verbatim into the command line.
+rule regress:
+    input:
+        config = "{phenotype}/deeprvat/hpopt_config.yaml",
+        chunks = lambda wildcards: expand(
+            ('{{phenotype}}/deeprvat/burdens/chunk{chunk}.' +
+             ("finished" if wildcards.phenotype == phenotypes[0] else "linked")),
+            chunk=range(n_burden_chunks)
+        ),
+        phenotype_0_chunks =  expand(
+            phenotypes[0] + '/deeprvat/burdens/chunk{chunk}.finished',
+            chunk=range(n_burden_chunks)
+        ),
+    output:
+        temp('{phenotype}/deeprvat/repeat_{repeat}/results/burden_associations_{chunk}.parquet'),
+    threads: 2
+    shell:
+        'deeprvat_associate regress '
+        + debug +
+        '--chunk {wildcards.chunk} '
+        '--n-chunks ' + str(n_regression_chunks) + ' '
+        '--use-bias '
+        '--repeat {wildcards.repeat} '
+        + do_scoretest +
+        '{input.config} '
+        '{wildcards.phenotype}/deeprvat/burdens ' #TODO make this w/o repeats
+        '{wildcards.phenotype}/deeprvat/repeat_{wildcards.repeat}/results'