diff --git a/docs/_static/preprocess_rulegraph_no_qc.svg b/docs/_static/preprocess_no_qc_rulegraph.svg
similarity index 65%
rename from docs/_static/preprocess_rulegraph_no_qc.svg
rename to docs/_static/preprocess_no_qc_rulegraph.svg
index 6ddf980d..c631689f 100644
--- a/docs/_static/preprocess_rulegraph_no_qc.svg
+++ b/docs/_static/preprocess_no_qc_rulegraph.svg
[SVG hunks omitted: the Graphviz markup was stripped during extraction. The only visible label change renames the rule node "preprocess_no_qc" to "preprocess"; the remaining nodes (all, combine_genotypes, add_variant_ids, concatenate_variants, create_parquet_variant_ids, variants, normalize, sparsify, extract_samples, index_fasta) and their edges are unchanged.]
diff --git a/docs/_static/preprocess_rulegraph_with_qc.svg b/docs/_static/preprocess_with_qc_rulegraph.svg
similarity index 61%
rename from docs/_static/preprocess_rulegraph_with_qc.svg
rename to docs/_static/preprocess_with_qc_rulegraph.svg
index 167d7839..24d6061c 100644
--- a/docs/_static/preprocess_rulegraph_with_qc.svg
+++ b/docs/_static/preprocess_with_qc_rulegraph.svg
[SVG hunks omitted: same relabelling, "preprocess_with_qc" to "preprocess". This graph additionally contains the qc_varmiss, qc_hwe, qc_read_depth, qc_allelic_imbalance, qc_indmiss and process_individual_missingness nodes, which are unchanged.]
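The two rulegraph images are renamed so the file names follow the `*_rulegraph.svg` pattern, and their `preprocess_no_qc` / `preprocess_with_qc` nodes are relabelled to `preprocess` to match the renamed rule in the snakefiles below. Graphs like these are normally generated from the pipelines rather than edited by hand; as a sketch only, a hypothetical docs-maintenance rule (not part of this PR) could rebuild both files, assuming `snakemake` and Graphviz's `dot` are on the PATH and the pipelines parse without additional `--configfile` arguments:

```python
# Hypothetical helper rule, not in this PR: regenerate both rulegraph SVGs
# directly from the pipeline definitions.
rule preprocess_rulegraphs:
    output:
        no_qc="docs/_static/preprocess_no_qc_rulegraph.svg",
        with_qc="docs/_static/preprocess_with_qc_rulegraph.svg",
    shell:
        "snakemake --snakefile pipelines/preprocess_no_qc.snakefile --rulegraph"
        " | dot -Tsvg > {output.no_qc} && "
        "snakemake --snakefile pipelines/preprocess_with_qc.snakefile --rulegraph"
        " | dot -Tsvg > {output.with_qc}"
```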
diff --git a/docs/preprocessing.md b/docs/preprocessing.md
index b9564cf1..c9c3bc79 100644
--- a/docs/preprocessing.md
+++ b/docs/preprocessing.md
@@ -3,7 +3,7 @@
 The DeepRVAT preprocessing pipeline is based on [snakemake](https://snakemake.readthedocs.io/en/stable/); it uses
 [bcftools + samtools](https://www.htslib.org/) and a [python script](https://github.com/PMBio/deeprvat/blob/main/deeprvat/preprocessing/preprocess.py), preprocess.py.
 
-![DeepRVAT preprocessing pipeline](_static/preprocess_rulegraph_no_qc.svg)
+![DeepRVAT preprocessing pipeline](_static/preprocess_no_qc_rulegraph.svg)
 
 ## Output
 
@@ -126,7 +126,7 @@
 we used when we wrote the paper. The qc is specific to the UKBB data, so if you are not working with UKBB data you can run the
 pipeline without qc.
 
 ### Run the preprocess pipeline with example data and qc
-![DeepRVAT preprocessing pipeline](_static/preprocess_rulegraph_with_qc.svg)
+![DeepRVAT preprocessing pipeline](_static/preprocess_with_qc_rulegraph.svg)
 
 *The vcf files in the example data folder were generated using [fake-vcf](https://github.com/endast/fake-vcf) (with some manual editing).
@@ -170,7 +170,7 @@ total 48
 
 ### Run the preprocess pipeline with example data and no qc
 
-![DeepRVAT preprocessing pipeline](_static/preprocess_rulegraph_no_qc.svg)
+![DeepRVAT preprocessing pipeline](_static/preprocess_no_qc_rulegraph.svg)
 
 *The vcf files in the example data folder were generated using [fake-vcf](https://github.com/endast/fake-vcf) (with some manual editing).
diff --git a/pipelines/preprocess_no_qc.snakefile b/pipelines/preprocess_no_qc.snakefile
index 4675972f..43cd58c8 100644
--- a/pipelines/preprocess_no_qc.snakefile
+++ b/pipelines/preprocess_no_qc.snakefile
@@ -3,17 +3,17 @@ include: "preprocessing/preprocess.snakefile"
 
 rule all:
     input:
-        preprocessed_dir / "genotypes.h5",
-        norm_variants_dir / "variants.tsv.gz",
-        variants=norm_variants_dir / "variants.parquet",
+        combined_genotypes=rules.combine_genotypes.output,
+        variants_tsv=rules.add_variant_ids.output.variants,
+        variants_parquet=rules.create_parquet_variant_ids.output.variants,
 
 
-rule preprocess_no_qc:
+rule preprocess:
     input:
-        variants=norm_variants_dir / "variants.tsv.gz",
-        variants_parquet=norm_variants_dir / "variants.parquet",
-        samples=norm_dir / "samples_chr.csv",
-        sparse_tg=expand(sparse_dir / "{vcf_stem}.tsv.gz", vcf_stem=vcf_stems),
+        variants=rules.add_variant_ids.output.variants,
+        variants_parquet=rules.create_parquet_variant_ids.output.variants,
+        samples=rules.extract_samples.output,
+        sparse_tg=expand(rules.sparsify.output.tsv, vcf_stem=vcf_stems),
     output:
         expand(preprocessed_dir / "genotypes_chr{chr}.h5", chr=chromosomes),
     shell:
         " ".join(
             [
@@ -27,6 +27,6 @@ rule preprocess_no_qc:
                 "{input.variants_parquet}",
                 "{input.samples}",
                 f"{sparse_dir}",
-                f"{preprocessed_dir / 'genotypes'}",
+                f"{preprocessed_dir/ 'genotypes'}",
             ]
         )
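The recurring change in both pipeline snakefiles is the same: instead of repeating path expressions such as `norm_variants_dir / "variants.tsv.gz"`, downstream rules now reference the producing rule through Snakemake's `rules` object (`rules.add_variant_ids.output.variants`, `expand(rules.sparsify.output.tsv, vcf_stem=vcf_stems)`, ...), so each path is defined in exactly one place. A minimal, self-contained sketch of the pattern, with hypothetical directories and placeholder shell commands rather than the real pipeline:

```python
# Sketch of the rules.<name>.output pattern; directories, wildcard values and
# shell commands are placeholders, not the real DeepRVAT commands.
from pathlib import Path

sparse_dir = Path("example/sparse")
preprocessed_dir = Path("example/preprocessed")
vcf_stems = ["chr21", "chr22"]


rule sparsify:
    input:
        bcf="example/norm/{vcf_stem}.bcf",
    output:
        tsv=sparse_dir / "{vcf_stem}.tsv.gz",
    shell:
        "touch {output.tsv}"  # stand-in for the real sparsify command


rule preprocess:
    input:
        # the same files as before, but addressed through the producing rule,
        # so a renamed or relocated output only has to be changed once
        sparse_tg=expand(rules.sparsify.output.tsv, vcf_stem=vcf_stems),
    output:
        preprocessed_dir / "genotypes.h5",
    shell:
        "touch {output}"  # stand-in for the real preprocess command
```

Requesting the final target (e.g. `snakemake -c1 example/preprocessed/genotypes.h5`) then pulls in the `sparsify` jobs through the expanded references, just as the renamed `preprocess` rule does above.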
diff --git a/pipelines/preprocess_with_qc.snakefile b/pipelines/preprocess_with_qc.snakefile
index 70b778bb..630c85a1 100644
--- a/pipelines/preprocess_with_qc.snakefile
+++ b/pipelines/preprocess_with_qc.snakefile
@@ -4,28 +4,26 @@ include: "preprocessing/qc.snakefile"
 
 rule all:
     input:
-        preprocessed_dir / "genotypes.h5",
-        norm_variants_dir / "variants.tsv.gz",
-        variants=norm_variants_dir / "variants.parquet",
+        combined_genotypes=rules.combine_genotypes.output,
+        variants_tsv=rules.add_variant_ids.output.variants,
+        variants_parquet=rules.create_parquet_variant_ids.output.variants,
 
 
-rule preprocess_with_qc:
+rule preprocess:
     input:
-        variants=norm_variants_dir / "variants.tsv.gz",
-        variants_parquet=norm_variants_dir / "variants.parquet",
-        samples=norm_dir / "samples_chr.csv",
-        sparse_tg=expand(sparse_dir / "{vcf_stem}.tsv.gz",vcf_stem=vcf_stems),
-        qc_varmiss=expand(qc_varmiss_dir / "{vcf_stem}.tsv.gz",vcf_stem=vcf_stems),
-        qc_hwe=expand(qc_hwe_dir / "{vcf_stem}.tsv.gz",vcf_stem=vcf_stems),
-        qc_read_depth=expand(
-            qc_read_depth_dir / "{vcf_stem}.tsv.gz",vcf_stem=vcf_stems
-        ),
+        variants=rules.add_variant_ids.output.variants,
+        variants_parquet=rules.create_parquet_variant_ids.output.variants,
+        samples=rules.extract_samples.output,
+        sparse_tg=expand(rules.sparsify.output.tsv, vcf_stem=vcf_stems),
+        qc_varmiss=expand(rules.qc_varmiss.output, vcf_stem=vcf_stems),
+        qc_hwe=expand(rules.qc_hwe.output, vcf_stem=vcf_stems),
+        qc_read_depth=expand(rules.qc_read_depth.output, vcf_stem=vcf_stems),
         qc_allelic_imbalance=expand(
-            qc_allelic_imbalance_dir / "{vcf_stem}.tsv.gz",vcf_stem=vcf_stems
+            rules.qc_allelic_imbalance.output, vcf_stem=vcf_stems
         ),
-        qc_indmiss_samples=qc_filtered_samples_dir / "indmiss_samples.csv",
+        qc_indmiss_samples=rules.process_individual_missingness.output,
     output:
-        expand(preprocessed_dir / "genotypes_chr{chr}.h5",chr=chromosomes),
+        expand(preprocessed_dir / "genotypes_chr{chr}.h5", chr=chromosomes),
     shell:
         " ".join(
             [
@@ -42,6 +40,6 @@
                 "{input.variants_parquet}",
                 "{input.samples}",
                 f"{sparse_dir}",
-                f"{preprocessed_dir / 'genotypes'}",
+                f"{preprocessed_dir/ 'genotypes'}",
             ]
         )
diff --git a/pipelines/preprocessing/preprocess.snakefile b/pipelines/preprocessing/preprocess.snakefile
index cad20b3c..69edc18d 100644
--- a/pipelines/preprocessing/preprocess.snakefile
+++ b/pipelines/preprocessing/preprocess.snakefile
@@ -50,10 +50,17 @@ rule combine_genotypes:
     shell:
         f"{preprocessing_cmd} combine-genotypes {{input}} {{output}}"
 
+
+rule extract_samples:
+    input:
+        vcf_files,
+    output:
+        norm_dir / "samples_chr.csv",
+    shell:
+        f"{load_bcftools} bcftools query --list-samples {{input}} > {{output}}"
 
 rule normalize:
     input:
-        samplefile=norm_dir / "samples_chr.csv",
+        samplefile=rules.extract_samples.output,
         fasta=fasta_file,
         fastaindex=fasta_index_file,
     params:
@@ -77,7 +84,7 @@ rule index_fasta:
 
 rule sparsify:
     input:
-        bcf=bcf_dir / "{vcf_stem}.bcf",
+        bcf=rules.normalize.output.bcf_file,
     output:
         tsv=sparse_dir / "{vcf_stem}.tsv.gz",
     resources:
@@ -89,7 +96,7 @@
 
 rule variants:
     input:
-        bcf=bcf_dir / "{vcf_stem}.bcf",
+        bcf=rules.normalize.output.bcf_file,
     output:
         norm_variants_dir / "{vcf_stem}.tsv.gz",
     resources:
@@ -100,7 +107,7 @@
 
 rule concatenate_variants:
     input:
-        expand(norm_variants_dir / "{vcf_stem}.tsv.gz",vcf_stem=vcf_stems),
+        expand(rules.variants.output, vcf_stem=vcf_stems),
     output:
         norm_variants_dir / "variants_no_id.tsv.gz",
     resources:
@@ -111,7 +118,7 @@
 
 rule add_variant_ids:
     input:
-        norm_variants_dir / "variants_no_id.tsv.gz",
+        rules.concatenate_variants.output,
     output:
         variants=norm_variants_dir / "variants.tsv.gz",
         duplicates=qc_duplicate_vars_dir / "duplicates.tsv",
@@ -123,7 +130,7 @@
 
 rule create_parquet_variant_ids:
     input:
-        norm_variants_dir / "variants_no_id.tsv.gz",
+        rules.concatenate_variants.output,
     output:
         variants=norm_variants_dir / "variants.parquet",
         duplicates=qc_duplicate_vars_dir / "duplicates.parquet",
@@ -131,13 +138,3 @@
         mem_mb=2048,
     shell:
         f"{preprocessing_cmd} add-variant-ids {{input}} {{output.variants}} {{output.duplicates}}"
-
-
-rule extract_samples:
-    input:
-        vcf_files,
-    output:
-        norm_dir / "samples_chr.csv",
-    shell:
-        f"{load_bcftools} bcftools query --list-samples {{input}} > {{output}}"
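A side effect of this style, and apparently the reason `extract_samples` moves from the bottom of `preprocess.snakefile` to just above `normalize`: Snakemake resolves `rules.<name>` while it parses the workflow, so a rule can only be referenced by rules defined later in the file. A minimal sketch using the same two rule names but placeholder paths and commands:

```python
# Sketch only: real rule names from this PR, placeholder paths and commands.
rule extract_samples:
    output:
        "norm/samples_chr.csv",
    shell:
        "touch {output}"  # stand-in for `bcftools query --list-samples`


rule normalize:
    input:
        # valid because extract_samples is already defined above this rule;
        # with the order reversed, parsing would fail on `rules.extract_samples`
        samplefile=rules.extract_samples.output,
    output:
        bcf_file="norm/{vcf_stem}.bcf",
    shell:
        "touch {output.bcf_file}"  # stand-in for the real normalization command
```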