Skip to content

Commit

Permalink
Merge branch 'main' into feature/add-sample-missingness-rule-back
Browse files Browse the repository at this point in the history
  • Loading branch information
endast committed Feb 5, 2024
2 parents 63a9b55 + 21a8e46 commit 0736725
Show file tree
Hide file tree
Showing 15 changed files with 436 additions and 524 deletions.
95 changes: 59 additions & 36 deletions .github/workflows/github-actions.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,26 +8,33 @@ jobs:
steps:
- name: Check out repository code
uses: actions/checkout@v3
- name: Training Association Testing smoke test
uses: snakemake/snakemake-github-action@v1.24.0
- uses: mamba-org/setup-micromamba@v1.4.3
with:
directory: 'example'
snakefile: 'pipelines/training_association_testing.snakefile'
args: '-j 2 -n'
environment-name: deeprvat-gh-action
environment-file: ${{ github.workspace }}/deeprvat_env_no_gpu.yml
cache-environment: true
cache-downloads: true
- name: Smoketest training_association_testing pipeline
run: |
python -m snakemake -n -j 2 --directory ${{ github.workspace }}/example \
--snakefile ${{ github.workspace }}/pipelines/training_association_testing.snakefile --show-failed-logs
shell: micromamba-shell {0}
- name: Link pretrained models
run: cd ${{ github.workspace }}/example && ln -s ../pretrained_models
- name: Association Testing Pretrained Smoke Test
uses: snakemake/snakemake-github-action@v1.24.0
with:
directory: 'example'
snakefile: 'pipelines/association_testing_pretrained.snakefile'
args: '-j 2 -n'
- name: Seed Gene Discovery Smoke Test
uses: snakemake/snakemake-github-action@v1.24.0
with:
directory: 'example'
snakefile: 'pipelines/seed_gene_discovery.snakefile'
args: '-j 2 -n'
shell: bash -el {0}
- name: Smoketest association_testing_pretrained pipeline
run: |
python -m snakemake -n -j 2 --directory ${{ github.workspace }}/example \
--snakefile ${{ github.workspace }}/pipelines/association_testing_pretrained.snakefile --show-failed-logs
shell: micromamba-shell {0}
- name: Copy seed gene discovery snakemake config
run: cd ${{ github.workspace }}/example && cp ../deeprvat/seed_gene_discovery/config.yaml .
shell: bash -el {0}
- name: Smoketest seed_gene_discovery pipeline
run: |
python -m snakemake -n -j 2 --directory ${{ github.workspace }}/example \
--snakefile ${{ github.workspace }}/pipelines/seed_gene_discovery.snakefile --show-failed-logs
shell: micromamba-shell {0}

DeepRVAT-Pipeline-Tests:
runs-on: ubuntu-latest
Expand Down Expand Up @@ -76,41 +83,57 @@ jobs:
steps:
- name: Check out repository code
uses: actions/checkout@v3
- name: Preprocessing Smoke Test With QC
uses: snakemake/snakemake-github-action@v1.24.0
- uses: mamba-org/setup-micromamba@v1.4.3
with:
directory: 'example/preprocess'
snakefile: 'pipelines/preprocess_with_qc.snakefile'
args: '-j 2 -n --configfile pipelines/config/deeprvat_preprocess_config.yaml'
stagein: 'touch example/preprocess/workdir/reference/GRCh38.primary_assembly.genome.fa'
environment-name: deeprvat-preprocess-gh-action
environment-file: ${{ github.workspace }}/deeprvat_preprocessing_env.yml
cache-environment: true
cache-downloads: true

- name: Preprocessing Smoke Test No QC
uses: snakemake/snakemake-github-action@v1.24.0
with:
directory: 'example/preprocess'
snakefile: 'pipelines/preprocess_no_qc.snakefile'
args: '-j 2 -n --configfile pipelines/config/deeprvat_preprocess_config.yaml'
stagein: 'touch example/preprocess/workdir/reference/GRCh38.primary_assembly.genome.fa'
- name: Fake fasta data
if: steps.cache-fasta.outputs.cache-hit != 'true'
run: |
cd ${{ github.workspace }}/example/preprocess && touch workdir/reference/GRCh38.primary_assembly.genome.fa
- name: Run preprocessing pipeline no qc Smoke Test
run: |
python -m snakemake -n -j 2 --directory ${{ github.workspace }}/example/preprocess \
--snakefile ${{ github.workspace }}/pipelines/preprocess_no_qc.snakefile \
--configfile ${{ github.workspace }}/pipelines/config/deeprvat_preprocess_config.yaml --show-failed-logs
shell: micromamba-shell {0}


- name: Preprocessing pipeline with qc Smoke Test
run: |
python -m snakemake -n -j 2 --directory ${{ github.workspace }}/example/preprocess \
--snakefile ${{ github.workspace }}/pipelines/preprocess_with_qc.snakefile \
--configfile ${{ github.workspace }}/pipelines/config/deeprvat_preprocess_config.yaml --show-failed-logs
shell: micromamba-shell {0}


DeepRVAT-Annotation-Pipeline-Smoke-Tests:
runs-on: ubuntu-latest
steps:
- name: Check out repository code
uses: actions/checkout@v3
- name: Annotations Smoke Test
uses: snakemake/snakemake-github-action@v1.25.1
- uses: mamba-org/setup-micromamba@v1.4.3
with:
directory: 'example/annotations'
snakefile: 'pipelines/annotations.snakefile'
args: '-j 2 -n --configfile pipelines/config/deeprvat_annotation_config.yaml'
environment-name: deeprvat-preprocess-gh-action
environment-file: ${{ github.workspace }}/deeprvat_preprocessing_env.yml
cache-environment: true
cache-downloads: true
- name: Annotations Smoke Test
run: |
python -m snakemake -n -j 2 --directory ${{ github.workspace }}/example/annotations \
--snakefile ${{ github.workspace }}/pipelines/annotations.snakefile \
--configfile ${{ github.workspace }}/pipelines/config/deeprvat_annotation_config.yaml --show-failed-logs
shell: micromamba-shell {0}


DeepRVAT-Preprocessing-Pipeline-Tests-No-QC:
runs-on: ubuntu-latest
needs: DeepRVAT-Preprocessing-Pipeline-Smoke-Tests
steps:

- name: Check out repository code
uses: actions/checkout@v3
- uses: mamba-org/setup-micromamba@v1.4.3
Expand Down
2 changes: 1 addition & 1 deletion docs/annotations.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ BCFtools as well as HTSlib should be installed on the machine,

will be installed by the pipeline together with the [plugins](https://www.ensembl.org/info/docs/tools/vep/script/vep_plugins.html) for primateAI and spliceAI. Annotation data for CADD, spliceAI and primateAI should be downloaded. The path to the data may be specified in the corresponding [config file](https://github.com/PMBio/deeprvat/blob/main/pipelines/config/deeprvat_annotation_config.yaml).
Download paths:
- [CADD](http://cadd.gs.washington.edu/download): "All possible SNVs of GRCh38/hg38" and "gnomad.genomes.r3.0.indel.tsv.gz" incl. their Tabix Indices
- [CADD](https://cadd.bihealth.org/download): "All possible SNVs of GRCh38/hg38" and "gnomad.genomes.r3.0.indel.tsv.gz" incl. their Tabix Indices
- [SpliceAI](https://basespace.illumina.com/s/otSPW8hnhaZR): "genome_scores_v1.3"/"spliceai_scores.raw.snv.hg38.vcf.gz" and "spliceai_scores.raw.indel.hg38.vcf.gz"
- [PrimateAI](https://basespace.illumina.com/s/yYGFdGih1rXL) PrimateAI supplementary data/"PrimateAI_scores_v0.2_GRCh38_sorted.tsv.bgz"

Expand Down
22 changes: 7 additions & 15 deletions docs/preprocessing.md
Original file line number Diff line number Diff line change
Expand Up @@ -50,17 +50,18 @@ An example file is included in this repo: [example config](https://github.com/PM
# What chromosomes should be processed
included_chromosomes : [21,22]

# A text file listing the "raw" vcf files to process, one path per line
vcf_files_list: vcf_files_list.txt

# Number of threads to use in the preprocessing script, separate from snakemake threads
preprocess_threads: 16

# If you need to run a cmd to load bcf and samtools specify it here, see example
bcftools_load_cmd : # module load bcftools/1.10.2 &&
samtools_load_cmd : # module load samtools/1.9 &&

# Path to where you want to write results and intermediate data
working_dir: workdir
# Path to ukbb data
data_dir: data

# These paths are all relative to the data dir
metadata_dir_name: metadata

# These paths are all relative to the working dir
# Here will the finished preprocessed files end up
Expand All @@ -75,23 +76,14 @@ sparse_dir_name : sparse
# Expected to be found in working_dir/reference_dir
reference_fasta_file : GRCh38.primary_assembly.genome.fa

# A text file listing the "raw" vcf files to process, one path per line
vcf_files_list: vcf_files_list.txt

# Number of threads to use in the preprocessing script, separate from snakemake threads
preprocess_threads: 16

# You can specify a different zcat command here (for example gzcat); defaults to zcat
zcat_cmd: gzcat
zcat_cmd:
```
The config above would use the following directory structure:
```shell
parent_directory
|-- data
| |-- metadata
| `-- vcf
`-- workdir
|-- norm
| |-- bcf
Expand Down
14 changes: 14 additions & 0 deletions docs/usage.md
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,20 @@ Replace `[path_to_deeprvat]` with the path to your clone of the repository.
Note that the example data is randomly generated, and so is only suited for testing whether the `deeprvat` package has been correctly installed.


### Run the training pipeline on some example data

```shell
mkdir example
cd example
ln -s [path_to_deeprvat]/example/* .
snakemake -j 1 --snakefile [path_to_deeprvat]/pipelines/run_training.snakefile
```

Replace `[path_to_deeprvat]` with the path to your clone of the repository.

Note that the example data is randomly generated, and so is only suited for testing whether the `deeprvat` package has been correctly installed.


### Run the association testing pipeline with pretrained models

```shell
Expand Down
12 changes: 12 additions & 0 deletions pipelines/association_testing/association_dataset.snakefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@

rule association_dataset:
input:
config = '{phenotype}/deeprvat/hpopt_config.yaml'
output:
'{phenotype}/deeprvat/association_dataset.pkl'
threads: 4
shell:
'deeprvat_associate make-dataset '
+ debug +
'{input.config} '
'{output}'
74 changes: 74 additions & 0 deletions pipelines/association_testing/burdens.snakefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@

rule link_burdens:
priority: 1
input:
checkpoints = lambda wildcards: [
f'{model_path}/repeat_{repeat}/best/bag_{bag}.ckpt'
for repeat in range(n_repeats) for bag in range(n_bags)
],
dataset = '{phenotype}/deeprvat/association_dataset.pkl',
data_config = '{phenotype}/deeprvat/hpopt_config.yaml',
model_config = model_path / 'config.yaml',
output:
'{phenotype}/deeprvat/burdens/chunk{chunk}.linked'
threads: 8
shell:
' && '.join([
('deeprvat_associate compute-burdens '
+ debug +
' --n-chunks '+ str(n_burden_chunks) + ' '
f'--link-burdens ../../../{phenotypes[0]}/deeprvat/burdens/burdens.zarr '
'--chunk {wildcards.chunk} '
'--dataset-file {input.dataset} '
'{input.data_config} '
'{input.model_config} '
'{input.checkpoints} '
'{wildcards.phenotype}/deeprvat/burdens'),
'touch {output}'
])

rule compute_burdens:
priority: 10
input:
reversed = model_path / "reverse_finished.tmp",
checkpoints = lambda wildcards: [
model_path / f'repeat_{repeat}/best/bag_{bag}.ckpt'
for repeat in range(n_repeats) for bag in range(n_bags)
],
dataset = '{phenotype}/deeprvat/association_dataset.pkl',
data_config = '{phenotype}/deeprvat/hpopt_config.yaml',
model_config = model_path / 'config.yaml',
output:
'{phenotype}/deeprvat/burdens/chunk{chunk}.finished'
threads: 8
shell:
' && '.join([
('deeprvat_associate compute-burdens '
+ debug +
' --n-chunks '+ str(n_burden_chunks) + ' '
'--chunk {wildcards.chunk} '
'--dataset-file {input.dataset} '
'{input.data_config} '
'{input.model_config} '
'{input.checkpoints} '
'{wildcards.phenotype}/deeprvat/burdens'),
'touch {output}'
])

rule reverse_models:
input:
checkpoints = expand(model_path / 'repeat_{repeat}/best/bag_{bag}.ckpt',
bag=range(n_bags), repeat=range(n_repeats)),
model_config = model_path / 'config.yaml',
data_config = Path(phenotypes[0]) / "deeprvat/hpopt_config.yaml",
output:
temp(model_path / "reverse_finished.tmp")
threads: 4
shell:
" && ".join([
("deeprvat_associate reverse-models "
"{input.model_config} "
"{input.data_config} "
"{input.checkpoints}"),
"touch {output}"
])
63 changes: 63 additions & 0 deletions pipelines/association_testing/regress_eval.snakefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@

rule evaluate:
input:
associations = expand('{{phenotype}}/deeprvat/repeat_{repeat}/results/burden_associations.parquet',
repeat=range(n_repeats)),
config = '{phenotype}/deeprvat/hpopt_config.yaml',
output:
"{phenotype}/deeprvat/eval/significant.parquet",
"{phenotype}/deeprvat/eval/all_results.parquet"
threads: 1
shell:
'deeprvat_evaluate '
+ debug +
'--use-seed-genes '
'--n-repeats {n_repeats} '
'--correction-method FDR '
'{input.associations} '
'{input.config} '
'{wildcards.phenotype}/deeprvat/eval'

rule all_regression:
input:
expand('{phenotype}/deeprvat/repeat_{repeat}/results/burden_associations.parquet',
phenotype=phenotypes, type=['deeprvat'], repeat=range(n_repeats)),

rule combine_regression_chunks:
input:
expand('{{phenotype}}/deeprvat/repeat_{{repeat}}/results/burden_associations_{chunk}.parquet', chunk=range(n_regression_chunks)),
output:
'{phenotype}/deeprvat/repeat_{repeat}/results/burden_associations.parquet',
threads: 1
shell:
'deeprvat_associate combine-regression-results '
'--model-name repeat_{wildcards.repeat} '
'{input} '
'{output}'

rule regress:
input:
config = "{phenotype}/deeprvat/hpopt_config.yaml",
chunks = lambda wildcards: expand(
('{{phenotype}}/deeprvat/burdens/chunk{chunk}.' +
("finished" if wildcards.phenotype == phenotypes[0] else "linked")),
chunk=range(n_burden_chunks)
),
phenotype_0_chunks = expand(
phenotypes[0] + '/deeprvat/burdens/chunk{chunk}.finished',
chunk=range(n_burden_chunks)
),
output:
temp('{phenotype}/deeprvat/repeat_{repeat}/results/burden_associations_{chunk}.parquet'),
threads: 2
shell:
'deeprvat_associate regress '
+ debug +
'--chunk {wildcards.chunk} '
'--n-chunks ' + str(n_regression_chunks) + ' '
'--use-bias '
'--repeat {wildcards.repeat} '
+ do_scoretest +
'{input.config} '
'{wildcards.phenotype}/deeprvat/burdens ' #TODO make this w/o repeats
'{wildcards.phenotype}/deeprvat/repeat_{wildcards.repeat}/results'
Loading

0 comments on commit 0736725

Please sign in to comment.