Summary of this patch (text before the first "diff --git" line is skipped by patch tools):
  * pipelines/preprocess_no_qc.snakefile and pipelines/preprocess_with_qc.snakefile:
    the target rule in each file is renamed to `preprocess`, and the inputs of
    `all` / `preprocess` are expressed as `rules.<name>.output` references
    instead of repeated path expressions.
  * pipelines/preprocessing/preprocess.snakefile: `extract_samples` is moved
    from the end of the file to just before `normalize`, which now refers to
    `rules.extract_samples.output`; `sparsify`, `variants`,
    `concatenate_variants`, `add_variant_ids` and `create_parquet_variant_ids`
    take their inputs from the output of the rule that produces them.

diff --git a/pipelines/preprocess_no_qc.snakefile b/pipelines/preprocess_no_qc.snakefile
index 4675972f..4f098e7b 100644
--- a/pipelines/preprocess_no_qc.snakefile
+++ b/pipelines/preprocess_no_qc.snakefile
@@ -1,21 +1,19 @@
 include: "preprocessing/preprocess.snakefile"
-
 
 rule all:
     input:
-        preprocessed_dir / "genotypes.h5",
-        norm_variants_dir / "variants.tsv.gz",
-        variants=norm_variants_dir / "variants.parquet",
-
+        combined_genotypes=rules.combine_genotypes.output,
+        variants_tsv=rules.add_variant_ids.output.variants,
+        variants_parquet=rules.create_parquet_variant_ids.output.variants
 
-rule preprocess_no_qc:
+rule preprocess:
     input:
-        variants=norm_variants_dir / "variants.tsv.gz",
-        variants_parquet=norm_variants_dir / "variants.parquet",
-        samples=norm_dir / "samples_chr.csv",
-        sparse_tg=expand(sparse_dir / "{vcf_stem}.tsv.gz", vcf_stem=vcf_stems),
+        variants=rules.add_variant_ids.output.variants,
+        variants_parquet=rules.create_parquet_variant_ids.output.variants,
+        samples=rules.extract_samples.output,
+        sparse_tg=expand(rules.sparsify.output.tsv,vcf_stem=vcf_stems),
     output:
-        expand(preprocessed_dir / "genotypes_chr{chr}.h5", chr=chromosomes),
+        expand(preprocessed_dir / "genotypes_chr{chr}.h5",chr=chromosomes),
     shell:
         " ".join(
             [
diff --git a/pipelines/preprocess_with_qc.snakefile b/pipelines/preprocess_with_qc.snakefile
index 70b778bb..93e3519f 100644
--- a/pipelines/preprocess_with_qc.snakefile
+++ b/pipelines/preprocess_with_qc.snakefile
@@ -4,26 +4,27 @@ include: "preprocessing/qc.snakefile"
 
 rule all:
     input:
-        preprocessed_dir / "genotypes.h5",
-        norm_variants_dir / "variants.tsv.gz",
-        variants=norm_variants_dir / "variants.parquet",
+        combined_genotypes=rules.combine_genotypes.output,
+        variants_tsv=rules.add_variant_ids.output.variants,
+        variants_parquet=rules.create_parquet_variant_ids.output.variants
 
 
-rule preprocess_with_qc:
+rule preprocess:
     input:
-        variants=norm_variants_dir / "variants.tsv.gz",
-        variants_parquet=norm_variants_dir / "variants.parquet",
-        samples=norm_dir / "samples_chr.csv",
-        sparse_tg=expand(sparse_dir / "{vcf_stem}.tsv.gz",vcf_stem=vcf_stems),
-        qc_varmiss=expand(qc_varmiss_dir / "{vcf_stem}.tsv.gz",vcf_stem=vcf_stems),
-        qc_hwe=expand(qc_hwe_dir / "{vcf_stem}.tsv.gz",vcf_stem=vcf_stems),
+        variants=rules.add_variant_ids.output.variants,
+        variants_parquet=rules.create_parquet_variant_ids.output.variants,
+        samples=rules.extract_samples.output,
+        sparse_tg=expand(rules.sparsify.output.tsv,vcf_stem=vcf_stems),
+
+        qc_varmiss=expand(rules.qc_varmiss.output,vcf_stem=vcf_stems),
+        qc_hwe=expand(rules.qc_hwe.output,vcf_stem=vcf_stems),
         qc_read_depth=expand(
-            qc_read_depth_dir / "{vcf_stem}.tsv.gz",vcf_stem=vcf_stems
+            rules.qc_read_depth.output,vcf_stem=vcf_stems
         ),
         qc_allelic_imbalance=expand(
-            qc_allelic_imbalance_dir / "{vcf_stem}.tsv.gz",vcf_stem=vcf_stems
+            rules.qc_allelic_imbalance.output,vcf_stem=vcf_stems
         ),
-        qc_indmiss_samples=qc_filtered_samples_dir / "indmiss_samples.csv",
+        qc_indmiss_samples=rules.process_individual_missingness.output
     output:
         expand(preprocessed_dir / "genotypes_chr{chr}.h5",chr=chromosomes),
     shell:
diff --git a/pipelines/preprocessing/preprocess.snakefile b/pipelines/preprocessing/preprocess.snakefile
index cad20b3c..69edc18d 100644
--- a/pipelines/preprocessing/preprocess.snakefile
+++ b/pipelines/preprocessing/preprocess.snakefile
@@ -50,10 +50,17 @@ rule combine_genotypes:
     shell:
         f"{preprocessing_cmd} combine-genotypes {{input}} {{output}}"
 
+rule extract_samples:
+    input:
+        vcf_files,
+    output:
+        norm_dir / "samples_chr.csv",
+    shell:
+        f"{load_bcftools} bcftools query --list-samples {{input}} > {{output}}"
 
 rule normalize:
     input:
-        samplefile=norm_dir / "samples_chr.csv",
+        samplefile=rules.extract_samples.output,
         fasta=fasta_file,
         fastaindex=fasta_index_file,
     params:
@@ -77,7 +84,7 @@ rule index_fasta:
 
 rule sparsify:
     input:
-        bcf=bcf_dir / "{vcf_stem}.bcf",
+        bcf=rules.normalize.output.bcf_file
     output:
         tsv=sparse_dir / "{vcf_stem}.tsv.gz",
     resources:
@@ -89,7 +96,7 @@ rule sparsify:
 
 rule variants:
     input:
-        bcf=bcf_dir / "{vcf_stem}.bcf",
+        bcf=rules.normalize.output.bcf_file,
     output:
         norm_variants_dir / "{vcf_stem}.tsv.gz",
     resources:
@@ -100,7 +107,7 @@ rule variants:
 
 rule concatenate_variants:
     input:
-        expand(norm_variants_dir / "{vcf_stem}.tsv.gz",vcf_stem=vcf_stems),
+        expand(rules.variants.output,vcf_stem=vcf_stems),
     output:
         norm_variants_dir / "variants_no_id.tsv.gz",
     resources:
@@ -111,7 +118,7 @@ rule concatenate_variants:
 
 rule add_variant_ids:
     input:
-        norm_variants_dir / "variants_no_id.tsv.gz",
+        rules.concatenate_variants.output
     output:
         variants=norm_variants_dir / "variants.tsv.gz",
         duplicates=qc_duplicate_vars_dir / "duplicates.tsv",
@@ -123,7 +130,7 @@ rule add_variant_ids:
 
 rule create_parquet_variant_ids:
     input:
-        norm_variants_dir / "variants_no_id.tsv.gz",
+        rules.concatenate_variants.output
     output:
         variants=norm_variants_dir / "variants.parquet",
         duplicates=qc_duplicate_vars_dir / "duplicates.parquet",
@@ -131,13 +138,3 @@ rule create_parquet_variant_ids:
         mem_mb=2048,
     shell:
         f"{preprocessing_cmd} add-variant-ids {{input}} {{output.variants}} {{output.duplicates}}"
-
-
-rule extract_samples:
-    input:
-        vcf_files,
-    output:
-        norm_dir / "samples_chr.csv",
-    shell:
-        f"{load_bcftools} bcftools query --list-samples {{input}} > {{output}}"
-