Use dynamic paths from rules for preprocessing pipeline
endast committed May 8, 2024
1 parent c6f7abd commit d1c6191
Showing 3 changed files with 36 additions and 40 deletions.
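In short, the commit replaces hard-coded file paths in rule inputs with references to the producing rule's declared outputs (Snakemake's rules.<name>.output accessor), so downstream rules automatically follow whatever paths the upstream rules define. A minimal sketch of the pattern, using hypothetical rule names and paths rather than the pipeline's own:

# Hypothetical illustration of the pattern applied in this commit.
rule make_table:
    output:
        tsv="results/table.tsv",
    shell:
        "echo hello > {output.tsv}"

# Before: the consumer repeats the literal path, which breaks silently if make_table's output moves.
# rule summarize:
#     input:
#         "results/table.tsv",

# After: the consumer points at the producing rule's declared output instead.
rule summarize:
    input:
        rules.make_table.output.tsv,
    output:
        "results/summary.txt",
    shell:
        "wc -l {input} > {output}"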
pipelines/preprocess_no_qc.snakefile: 20 changes (9 additions, 11 deletions)
@@ -1,21 +1,19 @@
 include: "preprocessing/preprocess.snakefile"
 
 
 rule all:
     input:
-        preprocessed_dir / "genotypes.h5",
-        norm_variants_dir / "variants.tsv.gz",
-        variants=norm_variants_dir / "variants.parquet",
-
+        combined_genotypes=rules.combine_genotypes.output,
+        variants_tsv=rules.add_variant_ids.output.variants,
+        variants_parquet=rules.create_parquet_variant_ids.output.variants
 
-rule preprocess_no_qc:
+rule preprocess:
     input:
-        variants=norm_variants_dir / "variants.tsv.gz",
-        variants_parquet=norm_variants_dir / "variants.parquet",
-        samples=norm_dir / "samples_chr.csv",
-        sparse_tg=expand(sparse_dir / "{vcf_stem}.tsv.gz", vcf_stem=vcf_stems),
+        variants=rules.add_variant_ids.output.variants,
+        variants_parquet=rules.create_parquet_variant_ids.output.variants,
+        samples=rules.extract_samples.output,
+        sparse_tg=expand(rules.sparsify.output.tsv,vcf_stem=vcf_stems),
     output:
-        expand(preprocessed_dir / "genotypes_chr{chr}.h5", chr=chromosomes),
+        expand(preprocessed_dir / "genotypes_chr{chr}.h5",chr=chromosomes),
     shell:
         " ".join(
             [
pipelines/preprocess_with_qc.snakefile: 27 changes (14 additions, 13 deletions)
@@ -4,26 +4,27 @@ include: "preprocessing/qc.snakefile"
 
 rule all:
     input:
-        preprocessed_dir / "genotypes.h5",
-        norm_variants_dir / "variants.tsv.gz",
-        variants=norm_variants_dir / "variants.parquet",
+        combined_genotypes=rules.combine_genotypes.output,
+        variants_tsv=rules.add_variant_ids.output.variants,
+        variants_parquet=rules.create_parquet_variant_ids.output.variants
 
 
-rule preprocess_with_qc:
+rule preprocess:
     input:
-        variants=norm_variants_dir / "variants.tsv.gz",
-        variants_parquet=norm_variants_dir / "variants.parquet",
-        samples=norm_dir / "samples_chr.csv",
-        sparse_tg=expand(sparse_dir / "{vcf_stem}.tsv.gz",vcf_stem=vcf_stems),
-        qc_varmiss=expand(qc_varmiss_dir / "{vcf_stem}.tsv.gz",vcf_stem=vcf_stems),
-        qc_hwe=expand(qc_hwe_dir / "{vcf_stem}.tsv.gz",vcf_stem=vcf_stems),
+        variants=rules.add_variant_ids.output.variants,
+        variants_parquet=rules.create_parquet_variant_ids.output.variants,
+        samples=rules.extract_samples.output,
+        sparse_tg=expand(rules.sparsify.output.tsv,vcf_stem=vcf_stems),
+
+        qc_varmiss=expand(rules.qc_varmiss.output,vcf_stem=vcf_stems),
+        qc_hwe=expand(rules.qc_hwe.output,vcf_stem=vcf_stems),
         qc_read_depth=expand(
-            qc_read_depth_dir / "{vcf_stem}.tsv.gz",vcf_stem=vcf_stems
+            rules.qc_read_depth.output,vcf_stem=vcf_stems
         ),
         qc_allelic_imbalance=expand(
-            qc_allelic_imbalance_dir / "{vcf_stem}.tsv.gz",vcf_stem=vcf_stems
+            rules.qc_allelic_imbalance.output,vcf_stem=vcf_stems
         ),
-        qc_indmiss_samples=qc_filtered_samples_dir / "indmiss_samples.csv",
+        qc_indmiss_samples=rules.process_individual_missingness.output
     output:
         expand(preprocessed_dir / "genotypes_chr{chr}.h5",chr=chromosomes),
     shell:
pipelines/preprocessing/preprocess.snakefile: 29 changes (13 additions, 16 deletions)
@@ -50,10 +50,17 @@ rule combine_genotypes:
     shell:
         f"{preprocessing_cmd} combine-genotypes {{input}} {{output}}"
 
+rule extract_samples:
+    input:
+        vcf_files,
+    output:
+        norm_dir / "samples_chr.csv",
+    shell:
+        f"{load_bcftools} bcftools query --list-samples {{input}} > {{output}}"
 
 rule normalize:
     input:
-        samplefile=norm_dir / "samples_chr.csv",
+        samplefile=rules.extract_samples.output,
         fasta=fasta_file,
         fastaindex=fasta_index_file,
     params:
@@ -77,7 +84,7 @@ rule index_fasta:
 
 rule sparsify:
     input:
-        bcf=bcf_dir / "{vcf_stem}.bcf",
+        bcf=rules.normalize.output.bcf_file
     output:
         tsv=sparse_dir / "{vcf_stem}.tsv.gz",
     resources:
@@ -89,7 +96,7 @@ rule sparsify:
 
 rule variants:
     input:
-        bcf=bcf_dir / "{vcf_stem}.bcf",
+        bcf=rules.normalize.output.bcf_file,
     output:
         norm_variants_dir / "{vcf_stem}.tsv.gz",
     resources:
@@ -100,7 +107,7 @@ rule variants:
 
 rule concatenate_variants:
     input:
-        expand(norm_variants_dir / "{vcf_stem}.tsv.gz",vcf_stem=vcf_stems),
+        expand(rules.variants.output,vcf_stem=vcf_stems),
     output:
         norm_variants_dir / "variants_no_id.tsv.gz",
     resources:
@@ -111,7 +118,7 @@ rule concatenate_variants:
 
 rule add_variant_ids:
     input:
-        norm_variants_dir / "variants_no_id.tsv.gz",
+        rules.concatenate_variants.output
     output:
         variants=norm_variants_dir / "variants.tsv.gz",
         duplicates=qc_duplicate_vars_dir / "duplicates.tsv",
@@ -123,21 +130,11 @@ rule add_variant_ids:
 
 rule create_parquet_variant_ids:
     input:
-        norm_variants_dir / "variants_no_id.tsv.gz",
+        rules.concatenate_variants.output
     output:
         variants=norm_variants_dir / "variants.parquet",
         duplicates=qc_duplicate_vars_dir / "duplicates.parquet",
     resources:
         mem_mb=2048,
     shell:
         f"{preprocessing_cmd} add-variant-ids {{input}} {{output.variants}} {{output.duplicates}}"
 
 
-rule extract_samples:
-    input:
-        vcf_files,
-    output:
-        norm_dir / "samples_chr.csv",
-    shell:
-        f"{load_bcftools} bcftools query --list-samples {{input}} > {{output}}"
