Skip to content

Commit

Permalink
Merge branch 'main' into feature/add-sample-missingness-rule-back
Browse files Browse the repository at this point in the history
  • Loading branch information
endast committed Feb 5, 2024
2 parents 63a9b55 + 21a8e46 commit 0736725
Show file tree
Hide file tree
Showing 15 changed files with 436 additions and 524 deletions.
95 changes: 59 additions & 36 deletions .github/workflows/github-actions.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,26 +8,33 @@ jobs:
steps:
- name: Check out repository code
uses: actions/checkout@v3
- name: Training Association Testing smoke test
uses: snakemake/snakemake-github-action@v1.24.0
- uses: mamba-org/setup-micromamba@v1.4.3
with:
directory: 'example'
snakefile: 'pipelines/training_association_testing.snakefile'
args: '-j 2 -n'
environment-name: deeprvat-gh-action
environment-file: ${{ github.workspace }}/deeprvat_env_no_gpu.yml
cache-environment: true
cache-downloads: true
- name: Smoketest training_association_testing pipeline
run: |
python -m snakemake -n -j 2 --directory ${{ github.workspace }}/example \
--snakefile ${{ github.workspace }}/pipelines/training_association_testing.snakefile --show-failed-logs
shell: micromamba-shell {0}
- name: Link pretrained models
run: cd ${{ github.workspace }}/example && ln -s ../pretrained_models
- name: Association Testing Pretrained Smoke Test
uses: snakemake/snakemake-github-action@v1.24.0
with:
directory: 'example'
snakefile: 'pipelines/association_testing_pretrained.snakefile'
args: '-j 2 -n'
- name: Seed Gene Discovery Smoke Test
uses: snakemake/snakemake-github-action@v1.24.0
with:
directory: 'example'
snakefile: 'pipelines/seed_gene_discovery.snakefile'
args: '-j 2 -n'
shell: bash -el {0}
- name: Smoketest association_testing_pretrained pipeline
run: |
python -m snakemake -n -j 2 --directory ${{ github.workspace }}/example \
--snakefile ${{ github.workspace }}/pipelines/association_testing_pretrained.snakefile --show-failed-logs
shell: micromamba-shell {0}
- name: Copy seed gene discovery snakemake config
run: cd ${{ github.workspace }}/example && cp ../deeprvat/seed_gene_discovery/config.yaml .
shell: bash -el {0}
- name: Smoketest seed_gene_discovery pipeline
run: |
python -m snakemake -n -j 2 --directory ${{ github.workspace }}/example \
--snakefile ${{ github.workspace }}/pipelines/seed_gene_discovery.snakefile --show-failed-logs
shell: micromamba-shell {0}

DeepRVAT-Pipeline-Tests:
runs-on: ubuntu-latest
Expand Down Expand Up @@ -76,41 +83,57 @@ jobs:
steps:
- name: Check out repository code
uses: actions/checkout@v3
- name: Preprocessing Smoke Test With QC
uses: snakemake/snakemake-github-action@v1.24.0
- uses: mamba-org/setup-micromamba@v1.4.3
with:
directory: 'example/preprocess'
snakefile: 'pipelines/preprocess_with_qc.snakefile'
args: '-j 2 -n --configfile pipelines/config/deeprvat_preprocess_config.yaml'
stagein: 'touch example/preprocess/workdir/reference/GRCh38.primary_assembly.genome.fa'
environment-name: deeprvat-preprocess-gh-action
environment-file: ${{ github.workspace }}/deeprvat_preprocessing_env.yml
cache-environment: true
cache-downloads: true

- name: Preprocessing Smoke Test No QC
uses: snakemake/snakemake-github-action@v1.24.0
with:
directory: 'example/preprocess'
snakefile: 'pipelines/preprocess_no_qc.snakefile'
args: '-j 2 -n --configfile pipelines/config/deeprvat_preprocess_config.yaml'
stagein: 'touch example/preprocess/workdir/reference/GRCh38.primary_assembly.genome.fa'
- name: Fake fasta data
if: steps.cache-fasta.outputs.cache-hit != 'true'
run: |
cd ${{ github.workspace }}/example/preprocess && touch workdir/reference/GRCh38.primary_assembly.genome.fa
- name: Run preprocessing pipeline no qc Smoke Test
run: |
python -m snakemake -n -j 2 --directory ${{ github.workspace }}/example/preprocess \
--snakefile ${{ github.workspace }}/pipelines/preprocess_no_qc.snakefile \
--configfile ${{ github.workspace }}/pipelines/config/deeprvat_preprocess_config.yaml --show-failed-logs
shell: micromamba-shell {0}


- name: Preprocessing pipeline with qc Smoke Test
run: |
python -m snakemake -n -j 2 --directory ${{ github.workspace }}/example/preprocess \
--snakefile ${{ github.workspace }}/pipelines/preprocess_with_qc.snakefile \
--configfile ${{ github.workspace }}/pipelines/config/deeprvat_preprocess_config.yaml --show-failed-logs
shell: micromamba-shell {0}


DeepRVAT-Annotation-Pipeline-Smoke-Tests:
runs-on: ubuntu-latest
steps:
- name: Check out repository code
uses: actions/checkout@v3
- name: Annotations Smoke Test
uses: snakemake/snakemake-github-action@v1.25.1
- uses: mamba-org/setup-micromamba@v1.4.3
with:
directory: 'example/annotations'
snakefile: 'pipelines/annotations.snakefile'
args: '-j 2 -n --configfile pipelines/config/deeprvat_annotation_config.yaml'
environment-name: deeprvat-preprocess-gh-action
environment-file: ${{ github.workspace }}/deeprvat_preprocessing_env.yml
cache-environment: true
cache-downloads: true
- name: Annotations Smoke Test
run: |
python -m snakemake -n -j 2 --directory ${{ github.workspace }}/example/annotations \
--snakefile ${{ github.workspace }}/pipelines/annotations.snakefile \
--configfile ${{ github.workspace }}/pipelines/config/deeprvat_annotation_config.yaml --show-failed-logs
shell: micromamba-shell {0}


DeepRVAT-Preprocessing-Pipeline-Tests-No-QC:
runs-on: ubuntu-latest
needs: DeepRVAT-Preprocessing-Pipeline-Smoke-Tests
steps:

- name: Check out repository code
uses: actions/checkout@v3
- uses: mamba-org/setup-micromamba@v1.4.3
Expand Down
2 changes: 1 addition & 1 deletion docs/annotations.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ BCFtools as well as HTSlib should be installed on the machine,

will be installed by the pipeline together with the [plugins](https://www.ensembl.org/info/docs/tools/vep/script/vep_plugins.html) for primateAI and spliceAI. Annotation data for CADD, spliceAI and primateAI should be downloaded. The path to the data may be specified in the corresponding [config file](https://github.com/PMBio/deeprvat/blob/main/pipelines/config/deeprvat_annotation_config.yaml).
Download paths:
- [CADD](http://cadd.gs.washington.edu/download): "All possible SNVs of GRCh38/hg38" and "gnomad.genomes.r3.0.indel.tsv.gz" incl. their Tabix Indices
- [CADD](https://cadd.bihealth.org/download): "All possible SNVs of GRCh38/hg38" and "gnomad.genomes.r3.0.indel.tsv.gz" incl. their Tabix Indices
- [SpliceAI](https://basespace.illumina.com/s/otSPW8hnhaZR): "genome_scores_v1.3"/"spliceai_scores.raw.snv.hg38.vcf.gz" and "spliceai_scores.raw.indel.hg38.vcf.gz"
- [PrimateAI](https://basespace.illumina.com/s/yYGFdGih1rXL) PrimateAI supplementary data/"PrimateAI_scores_v0.2_GRCh38_sorted.tsv.bgz"

Expand Down
22 changes: 7 additions & 15 deletions docs/preprocessing.md
Original file line number Diff line number Diff line change
Expand Up @@ -50,17 +50,18 @@ An example file is included in this repo: [example config](https://github.com/PM
# What chromosomes should be processed
included_chromosomes : [21,22]

# A text file listing the "raw" vcf files to process, one path per line
vcf_files_list: vcf_files_list.txt

# Number of threads to use in the preprocessing script, separate from snakemake threads
preprocess_threads: 16

# If you need to run a cmd to load bcf and samtools specify it here, see example
bcftools_load_cmd : # module load bcftools/1.10.2 &&
samtools_load_cmd : # module load samtools/1.9 &&

# Path to where you want to write results and intermediate data
working_dir: workdir
# Path to ukbb data
data_dir: data

# These paths are all relative to the data dir
metadata_dir_name: metadata

# These paths are all relative to the working dir
# Here will the finished preprocessed files end up
Expand All @@ -75,23 +76,14 @@ sparse_dir_name : sparse
# Expected to be found in working_dir/reference_dir
reference_fasta_file : GRCh38.primary_assembly.genome.fa

# A text file listing the "raw" vcf files to process, one path per line
vcf_files_list: vcf_files_list.txt

# Number of threads to use in the preprocessing script, separate from snakemake threads
preprocess_threads: 16

# You can specify a different zcat command here (for example gzcat); defaults to zcat
zcat_cmd: gzcat
zcat_cmd:
```
The config above would use the following directory structure:
```shell
parent_directory
|-- data
| |-- metadata
| `-- vcf
`-- workdir
|-- norm
| |-- bcf
Expand Down
14 changes: 14 additions & 0 deletions docs/usage.md
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,20 @@ Replace `[path_to_deeprvat]` with the path to your clone of the repository.
Note that the example data is randomly generated, and so is only suited for testing whether the `deeprvat` package has been correctly installed.


### Run the training pipeline on some example data

```shell
mkdir example
cd example
ln -s [path_to_deeprvat]/example/* .
snakemake -j 1 --snakefile [path_to_deeprvat]/pipelines/run_training.snakefile
```

Replace `[path_to_deeprvat]` with the path to your clone of the repository.

Note that the example data is randomly generated, and so is only suited for testing whether the `deeprvat` package has been correctly installed.


### Run the association testing pipeline with pretrained models

```shell
Expand Down
12 changes: 12 additions & 0 deletions pipelines/association_testing/association_dataset.snakefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@

rule association_dataset:
input:
config = '{phenotype}/deeprvat/hpopt_config.yaml'
output:
'{phenotype}/deeprvat/association_dataset.pkl'
threads: 4
shell:
'deeprvat_associate make-dataset '
+ debug +
'{input.config} '
'{output}'
74 changes: 74 additions & 0 deletions pipelines/association_testing/burdens.snakefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@

rule link_burdens:
priority: 1
input:
checkpoints = lambda wildcards: [
f'{model_path}/repeat_{repeat}/best/bag_{bag}.ckpt'
for repeat in range(n_repeats) for bag in range(n_bags)
],
dataset = '{phenotype}/deeprvat/association_dataset.pkl',
data_config = '{phenotype}/deeprvat/hpopt_config.yaml',
model_config = model_path / 'config.yaml',
output:
'{phenotype}/deeprvat/burdens/chunk{chunk}.linked'
threads: 8
shell:
' && '.join([
('deeprvat_associate compute-burdens '
+ debug +
' --n-chunks '+ str(n_burden_chunks) + ' '
f'--link-burdens ../../../{phenotypes[0]}/deeprvat/burdens/burdens.zarr '
'--chunk {wildcards.chunk} '
'--dataset-file {input.dataset} '
'{input.data_config} '
'{input.model_config} '
'{input.checkpoints} '
'{wildcards.phenotype}/deeprvat/burdens'),
'touch {output}'
])

rule compute_burdens:
priority: 10
input:
reversed = model_path / "reverse_finished.tmp",
checkpoints = lambda wildcards: [
model_path / f'repeat_{repeat}/best/bag_{bag}.ckpt'
for repeat in range(n_repeats) for bag in range(n_bags)
],
dataset = '{phenotype}/deeprvat/association_dataset.pkl',
data_config = '{phenotype}/deeprvat/hpopt_config.yaml',
model_config = model_path / 'config.yaml',
output:
'{phenotype}/deeprvat/burdens/chunk{chunk}.finished'
threads: 8
shell:
' && '.join([
('deeprvat_associate compute-burdens '
+ debug +
' --n-chunks '+ str(n_burden_chunks) + ' '
'--chunk {wildcards.chunk} '
'--dataset-file {input.dataset} '
'{input.data_config} '
'{input.model_config} '
'{input.checkpoints} '
'{wildcards.phenotype}/deeprvat/burdens'),
'touch {output}'
])

rule reverse_models:
input:
checkpoints = expand(model_path / 'repeat_{repeat}/best/bag_{bag}.ckpt',
bag=range(n_bags), repeat=range(n_repeats)),
model_config = model_path / 'config.yaml',
data_config = Path(phenotypes[0]) / "deeprvat/hpopt_config.yaml",
output:
temp(model_path / "reverse_finished.tmp")
threads: 4
shell:
" && ".join([
("deeprvat_associate reverse-models "
"{input.model_config} "
"{input.data_config} "
"{input.checkpoints}"),
"touch {output}"
])
63 changes: 63 additions & 0 deletions pipelines/association_testing/regress_eval.snakefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@

rule evaluate:
input:
associations = expand('{{phenotype}}/deeprvat/repeat_{repeat}/results/burden_associations.parquet',
repeat=range(n_repeats)),
config = '{phenotype}/deeprvat/hpopt_config.yaml',
output:
"{phenotype}/deeprvat/eval/significant.parquet",
"{phenotype}/deeprvat/eval/all_results.parquet"
threads: 1
shell:
'deeprvat_evaluate '
+ debug +
'--use-seed-genes '
'--n-repeats {n_repeats} '
'--correction-method FDR '
'{input.associations} '
'{input.config} '
'{wildcards.phenotype}/deeprvat/eval'

rule all_regression:
input:
expand('{phenotype}/deeprvat/repeat_{repeat}/results/burden_associations.parquet',
phenotype=phenotypes, type=['deeprvat'], repeat=range(n_repeats)),

rule combine_regression_chunks:
input:
expand('{{phenotype}}/deeprvat/repeat_{{repeat}}/results/burden_associations_{chunk}.parquet', chunk=range(n_regression_chunks)),
output:
'{phenotype}/deeprvat/repeat_{repeat}/results/burden_associations.parquet',
threads: 1
shell:
'deeprvat_associate combine-regression-results '
'--model-name repeat_{wildcards.repeat} '
'{input} '
'{output}'

rule regress:
input:
config = "{phenotype}/deeprvat/hpopt_config.yaml",
chunks = lambda wildcards: expand(
('{{phenotype}}/deeprvat/burdens/chunk{chunk}.' +
("finished" if wildcards.phenotype == phenotypes[0] else "linked")),
chunk=range(n_burden_chunks)
),
phenotype_0_chunks = expand(
phenotypes[0] + '/deeprvat/burdens/chunk{chunk}.finished',
chunk=range(n_burden_chunks)
),
output:
temp('{phenotype}/deeprvat/repeat_{repeat}/results/burden_associations_{chunk}.parquet'),
threads: 2
shell:
'deeprvat_associate regress '
+ debug +
'--chunk {wildcards.chunk} '
'--n-chunks ' + str(n_regression_chunks) + ' '
'--use-bias '
'--repeat {wildcards.repeat} '
+ do_scoretest +
'{input.config} '
'{wildcards.phenotype}/deeprvat/burdens ' #TODO make this w/o repeats
'{wildcards.phenotype}/deeprvat/repeat_{wildcards.repeat}/results'
Loading

0 comments on commit 0736725

Please sign in to comment.