added resource specifications for mem_mb back into pipelines, removed load specifications (#72)

* added resource specifications for mem_mb back into pipelines, removed load specifications

* Add resources back to preprocessing pipeline

---------

Co-authored-by: Mück <m991k@b260-pc003.inet.dkfz-heidelberg.de>
Co-authored-by: Magnus Wahlberg <endast@gmail.com>
3 people authored Apr 19, 2024
1 parent c08f2a7 commit ed98449
Showing 9 changed files with 41 additions and 13 deletions.
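
The pattern added throughout is a per-rule mem_mb reservation, in most rules scaled by Snakemake's retry counter so that a job killed for exceeding its memory reservation is resubmitted with a larger one. A minimal sketch of the idiom, with illustrative rule and file names (the 15,000 MB base matches several rules below but is otherwise arbitrary):

rule example_step:
    input:
        "data/input.parquet",
    output:
        "data/output.parquet",
    # `attempt` is 1 on the first try and increments on each retry
    # (retries are enabled with e.g. `snakemake --retries 2`), so this
    # rule asks for 30 GB, then 45 GB, then 60 GB.
    resources:
        mem_mb=lambda wildcards, attempt: 15_000 * (attempt + 1),
    shell:
        "python process.py {input} {output}"
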
21 changes: 21 additions & 0 deletions pipelines/annotations.snakefile
@@ -162,6 +162,7 @@ rule select_rename_fill_columns:
annotations_path = anno_dir / "vep_deepripe_deepsea_absplice_maf_pIDs_filtered.parquet",
output:
anno_dir / "vep_deepripe_deepsea_absplice_maf_pIDs_filtered_filled.parquet",
resources: mem_mb = lambda wildcards, attempt: 15_000 * (attempt + 1),
shell:
" ".join([
f"python {annotation_python_file}",
@@ -178,6 +179,7 @@ if not gene_id_file:
rule create_gene_id_file:
input: gtf_file
output: gene_id_file
resources: mem_mb = lambda wildcards, attempt: 15_000 * (attempt + 1),
shell:
" ".join([
f"python {annotation_python_file}",
@@ -193,6 +195,7 @@ rule filter_by_exon_distance:
protein_coding_genes = gene_id_file
output:
anno_dir / "vep_deepripe_deepsea_absplice_maf_pIDs_filtered.parquet",
resources: mem_mb = lambda wildcards, attempt: 25_000 * (attempt + 1),
shell:
" ".join([
f"python {annotation_python_file}",
@@ -208,6 +211,7 @@ rule add_gene_ids:
gene_id_file = gene_id_file,
annotations_path = anno_dir / "vep_deepripe_deepsea_absplice_maf.parquet",
output: anno_dir / "vep_deepripe_deepsea_absplice_maf_pIDs.parquet",
resources: mem_mb = lambda wildcards, attempt: 19_000 * (attempt + 1),
shell:
" ".join([
f"python {annotation_python_file}",
@@ -223,6 +227,7 @@ rule calculate_MAF:
anno_dir / "vep_deepripe_deepsea_absplice_af.parquet"
output:
anno_dir / "vep_deepripe_deepsea_absplice_maf.parquet"
resources: mem_mb = lambda wildcards, attempt: 15_000 * (attempt + 1),
shell:
" ".join([
f"python {annotation_python_file}",
@@ -241,6 +246,7 @@ rule merge_allele_frequency:
annotation_file = anno_dir / "vep_deepripe_deepsea_absplice.parquet"
output:
anno_dir / "vep_deepripe_deepsea_absplice_af.parquet"
resources: mem_mb = lambda wildcards, attempt: 15_000 * (attempt + 1),
shell:
" ".join([
f"python {annotation_python_file}",
@@ -261,6 +267,7 @@ rule calculate_allele_frequency:
variants = variant_file
output:
allele_frequencies = anno_tmp_dir / "af_df.parquet"
resources: mem_mb = lambda wildcards, attempt: 15_000 * (attempt + 1),
shell:
" ".join([
f"python {annotation_python_file}",
@@ -282,6 +289,7 @@ rule merge_absplice_scores:
output:
anno_dir / "vep_deepripe_deepsea_absplice.parquet"
threads: ncores_merge_absplice
resources: mem_mb = lambda wildcards, attempt: 19_000 * (attempt + 1),
shell:
" ".join(
[
@@ -300,6 +308,7 @@ rule aggregate_absplice_scores:
output:
score_file = anno_tmp_dir / "abSplice_score_file.parquet",
threads: ncores_agg_absplice
resources: mem_mb = lambda wildcards, attempt: 15_000 * (attempt + 1),
shell:
" ".join(
[
@@ -318,6 +327,7 @@ rule merge_deepsea_pcas:
col_yaml_file = annotation_columns_yaml_file
output:
anno_dir / "vep_deepripe_deepsea.parquet",
resources: mem_mb = lambda wildcards, attempt: 30_000 * (attempt + 1),
shell:
" ".join(
[
@@ -341,6 +351,7 @@ rule concat_annotations:
output:
anno_dir / "vep_deepripe.parquet",
params: joined=lambda w, input: ",".join(input.vcf_files)
resources: mem_mb = lambda wildcards, attempt: 15_000 * (attempt + 1),
shell:
" ".join(
[
@@ -366,6 +377,7 @@ rule merge_annotations:
vcf_file= anno_tmp_dir / (source_variant_file_pattern + "_variants.vcf"),
output:
anno_dir / f"{source_variant_file_pattern}_merged.parquet",
resources: mem_mb = lambda wildcards, attempt: 5_000 * (attempt + 1),
shell:
(
"HEADER=$(grep -n '#Uploaded_variation' "
@@ -380,6 +392,7 @@ rule deepSea_PCA:
deepsea_anno = str(anno_dir / "all_variants.deepSea.parquet")
output:
deepSEA_tmp_dir / "deepsea_pca.parquet",
resources: mem_mb = lambda wildcards, attempt: 50_000 * (attempt + 1),
shell:
" ".join(
["mkdir -p",
@@ -404,6 +417,7 @@ rule add_ids_deepSea:
output:
directory(anno_dir / "all_variants.wID.deepSea.parquet"),
threads: ncores_addis
resources: mem_mb = lambda wildcards, attempt: 50_000 * (attempt + 1),
shell:
" ".join(
[
@@ -429,6 +443,7 @@ rule concat_deepSea:
),
params: joined=lambda w, input: ",".join(input.deepSEAscoreFiles)
threads:8
resources: mem_mb = lambda wildcards, attempt: 50_000 * (attempt + 1),
output:
anno_dir / "all_variants.deepSea.parquet",
shell:
@@ -452,6 +467,8 @@ rule deepSea:
output:
anno_dir / (source_variant_file_pattern + ".CLI.deepseapredict.diff.tsv"),
threads: n_jobs_deepripe
resources: mem_mb = lambda wildcards, attempt: 5_000 * (attempt + 1),

conda:
"kipoi-veff2"
shell:
@@ -465,6 +482,7 @@ rule deepRiPe_parclip:
output:
anno_dir / (source_variant_file_pattern + "_variants.parclip_deepripe.csv.gz"),
threads: n_jobs_deepripe
resources: mem_mb = lambda wildcards, attempt: 5_000 * (attempt + 1),
shell:
f"mkdir -p {pybedtools_tmp_path / 'parclip'} && python {annotation_python_file} scorevariants-deepripe {{input.variants}} {anno_dir} {{input.fasta}} {pybedtools_tmp_path / 'parclip'} {saved_deepripe_models_path} {{threads}} 'parclip'"

@@ -476,6 +494,7 @@ rule deepRiPe_eclip_hg2:
output:
anno_dir / (source_variant_file_pattern + "_variants.eclip_hg2_deepripe.csv.gz"),
threads: lambda wildcards, attempt: n_jobs_deepripe * attempt
resources: mem_mb = lambda wildcards, attempt: 5_000 * (attempt + 1),
shell:
f"mkdir -p {pybedtools_tmp_path / 'hg2'} && python {annotation_python_file} scorevariants-deepripe {{input.variants}} {anno_dir} {{input.fasta}} {pybedtools_tmp_path / 'hg2'} {saved_deepripe_models_path} {{threads}} 'eclip_hg2'"

@@ -487,6 +506,7 @@ rule deepRiPe_eclip_k5:
output:
anno_dir / (source_variant_file_pattern + "_variants.eclip_k5_deepripe.csv.gz"),
threads: lambda wildcards, attempt: n_jobs_deepripe * attempt
resources: mem_mb = lambda wildcards, attempt: 5_000 * (attempt + 1),
shell:
f"mkdir -p {pybedtools_tmp_path / 'k5'} && python {annotation_python_file} scorevariants-deepripe {{input.variants}} {anno_dir} {{input.fasta}} {pybedtools_tmp_path / 'k5'} {saved_deepripe_models_path} {{threads}} 'eclip_k5'"

@@ -498,6 +518,7 @@ rule vep:
output:
anno_dir / (source_variant_file_pattern + "_vep_anno.tsv"),
threads: vep_nfork
resources: mem_mb = lambda wildcards, attempt: 5_000 * (attempt + 1),
shell:
" ".join(
[
@@ -11,7 +11,6 @@ rule association_dataset:
threads: 4
resources:
mem_mb = lambda wildcards, attempt: 32000 * (attempt + 1),
load = 64000
priority: 30
shell:
'deeprvat_associate make-dataset '
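
The load entries removed here and in the files below were a user-defined Snakemake resource: such names carry no built-in meaning and only throttle concurrency when the scheduler is given a matching budget on the command line (an assumed invocation; it is not part of this diff). mem_mb, by contrast, is understood natively by Snakemake and by its cluster executors, so one declaration serves both purposes:

# hypothetical invocations, for illustration only
snakemake --cores 16 --resources load=64000     # before: cap the custom "load" resource
snakemake --cores 16 --resources mem_mb=128000  # after: cap total reserved memory instead
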
4 changes: 0 additions & 4 deletions pipelines/association_testing/burdens.snakefile
@@ -15,7 +15,6 @@ rule average_burdens:
threads: 1
resources:
mem_mb = lambda wildcards, attempt: 4098 + (attempt - 1) * 4098,
load = 4000,
priority: 10,
shell:
' && '.join([
@@ -46,7 +45,6 @@ rule link_burdens:
threads: 8
resources:
mem_mb = lambda wildcards, attempt: 20480 + (attempt - 1) * 4098,
load = lambda wildcards, attempt: 16000 + (attempt - 1) * 4000
shell:
' && '.join([
('deeprvat_associate compute-burdens '
@@ -80,7 +78,6 @@ rule compute_burdens:
threads: 8
resources:
mem_mb = 20000,
load = 8000,
gpus = 1
shell:
' && '.join([
@@ -107,7 +104,6 @@ rule reverse_models:
threads: 4
resources:
mem_mb = 20480,
load = 20480
shell:
" && ".join([
("deeprvat_associate reverse-models "
4 changes: 0 additions & 4 deletions pipelines/association_testing/regress_eval.snakefile
@@ -12,7 +12,6 @@ rule evaluate:
threads: 1
resources:
mem_mb = 16000,
load = 16000
params:
n_combis = 1,
use_baseline_results = '--use-baseline-results'
@@ -35,7 +34,6 @@ rule combine_regression_chunks:
threads: 1
resources:
mem_mb = lambda wildcards, attempt: 12000 + (attempt - 1) * 4098,
load = 2000
shell:
'deeprvat_associate combine-regression-results '
'--model-name repeat_0 '
@@ -60,8 +58,6 @@ rule regress:
threads: 2
resources:
mem_mb = lambda wildcards, attempt: 28676 + (attempt - 1) * 4098,
# mem_mb = 16000,
load = lambda wildcards, attempt: 28000 + (attempt - 1) * 4000
params:
burden_file = f'{phenotypes[0]}/deeprvat/burdens/burdens_average.zarr',
burden_dir = '{phenotype}/deeprvat/burdens',
@@ -118,7 +118,6 @@ rule regression_correct_common:
threads: 2
resources:
mem_mb = lambda wildcards, attempt: 28676 + (attempt - 1) * 4098,
load = lambda wildcards, attempt: 28000 + (attempt - 1) * 4000
params:
burden_file = f'{burden_phenotype}/deeprvat/burdens/burdens_{{burden_agg_fct}}_{{n_avg_repeats}}_{{combi}}.zarr',
burden_dir = '{phenotype}/deeprvat/burdens/',
1 change: 0 additions & 1 deletion pipelines/cv_training/cv_training.snakefile
@@ -18,7 +18,6 @@ rule spread_config:
threads: 1
resources:
mem_mb = 1024,
load = 1000
shell:
' && '.join([
conda_check,
12 changes: 12 additions & 0 deletions pipelines/preprocessing/preprocess.snakefile
@@ -60,6 +60,8 @@ rule normalize:
vcf_file=lambda wildcards: vcf_look_up[wildcards.vcf_stem],
output:
bcf_file=bcf_dir / "{vcf_stem}.bcf",
resources:
mem_mb=lambda wildcards, attempt: 16384 * (attempt + 1),
shell:
f"""{load_bcftools} bcftools view --samples-file {{input.samplefile}} --output-type u {{params.vcf_file}} | bcftools view --include 'COUNT(GT="alt") > 0' --output-type u | bcftools norm -m-both -f {{input.fasta}} --output-type b --output {{output.bcf_file}}"""

@@ -78,6 +80,8 @@ rule sparsify:
bcf=bcf_dir / "{vcf_stem}.bcf",
output:
tsv=sparse_dir / "{vcf_stem}.tsv.gz",
resources:
mem_mb=512,
shell:
f"""{load_bcftools} bcftools query --format '[%CHROM\t%POS\t%REF\t%ALT\t%SAMPLE\t%GT\n]' --include 'GT!="RR" & GT!="mis"' {{input.bcf}} \
| sed 's/0[/,|]1/1/; s/1[/,|]0/1/; s/1[/,|]1/2/; s/0[/,|]0/0/' | gzip > {{output.tsv}}"""
@@ -88,6 +92,8 @@ rule variants:
bcf=bcf_dir / "{vcf_stem}.bcf",
output:
norm_variants_dir / "{vcf_stem}.tsv.gz",
resources:
mem_mb=512,
shell:
f"{load_bcftools} bcftools query --format '%CHROM\t%POS\t%REF\t%ALT\n' {{input}} | gzip > {{output}}"

@@ -97,6 +103,8 @@ rule concatenate_variants:
expand(norm_variants_dir / "{vcf_stem}.tsv.gz",vcf_stem=vcf_stems),
output:
norm_variants_dir / "variants_no_id.tsv.gz",
resources:
mem_mb=256,
shell:
"{zcat_cmd} {input} | gzip > {output}"

@@ -107,6 +115,8 @@ rule add_variant_ids:
output:
variants=norm_variants_dir / "variants.tsv.gz",
duplicates=qc_duplicate_vars_dir / "duplicates.tsv",
resources:
mem_mb=2048,
shell:
f"{preprocessing_cmd} add-variant-ids {{input}} {{output.variants}} {{output.duplicates}}"

@@ -117,6 +127,8 @@ rule create_parquet_variant_ids:
output:
variants=norm_variants_dir / "variants.parquet",
duplicates=qc_duplicate_vars_dir / "duplicates.parquet",
resources:
mem_mb=2048,
shell:
f"{preprocessing_cmd} add-variant-ids {{input}} {{output.variants}} {{output.duplicates}}"

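
Most preprocessing rules get small constant reservations, since the bcftools jobs stream their input, while the memory-hungry normalize rule keeps the retry-scaled lambda. A budget on the command line makes these numbers effective when running locally; a sketch, assuming a machine with 64 GB to spare:

# Snakemake starts a job only while the sum of mem_mb over running jobs
# stays within the budget, so many 512 MB sparsify/variants jobs can run
# next to one ~32 GB normalize job (16384 MB * 2 on the first attempt).
snakemake --cores 32 --resources mem_mb=64000
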
8 changes: 8 additions & 0 deletions pipelines/preprocessing/qc.snakefile
@@ -5,6 +5,8 @@ rule qc_allelic_imbalance:
bcf_dir / "{vcf_stem}.bcf",
output:
qc_allelic_imbalance_dir / "{vcf_stem}.tsv.gz",
resources:
mem_mb=lambda wildcards, attempt: 256 * attempt,
shell:
f"""{load_bcftools} bcftools query --format '%CHROM\t%POS\t%REF\t%ALT\n' --exclude 'COUNT(GT="het")=0 || (GT="het" & ((TYPE="snp" & (FORMAT/AD[*:1] / FORMAT/AD[*:0]) > 0.15) | (TYPE="indel" & (FORMAT/AD[*:1] / FORMAT/AD[*:0]) > 0.20)))' {{input}} | gzip > {{output}}"""

@@ -14,6 +16,8 @@ rule qc_varmiss:
bcf_dir / "{vcf_stem}.bcf",
output:
qc_varmiss_dir / "{vcf_stem}.tsv.gz",
resources:
mem_mb=lambda wildcards, attempt: 256 * attempt,
shell:
f'{load_bcftools} bcftools query --format "%CHROM\t%POS\t%REF\t%ALT\n" --include "F_MISSING >= 0.1" {{input}} | gzip > {{output}}'

@@ -23,6 +27,8 @@ rule qc_hwe:
bcf_dir / "{vcf_stem}.bcf",
output:
qc_hwe_dir / "{vcf_stem}.tsv.gz",
resources:
mem_mb=lambda wildcards, attempt: 256 * (attempt + 1),
shell:
f'{load_bcftools} bcftools +fill-tags --output-type u {{input}} -- --tags HWE | bcftools query --format "%CHROM\t%POS\t%REF\t%ALT\n" --include "INFO/HWE <= 1e-15" | gzip > {{output}}'

@@ -32,6 +38,8 @@ rule qc_read_depth:
bcf_dir / "{vcf_stem}.bcf",
output:
qc_read_depth_dir / "{vcf_stem}.tsv.gz",
resources:
mem_mb=lambda wildcards, attempt: 256 * attempt,
shell:
f"""{load_bcftools} bcftools query --format '[%CHROM\\t%POS\\t%REF\\t%ALT\\t%SAMPLE\\n]' --include '(GT!="RR" & GT!="mis" & TYPE="snp" & FORMAT/DP < 7) | (GT!="RR" & GT!="mis" & TYPE="indel" & FORMAT/DP < 10)' {{input}} | gzip > {{output}}"""

2 changes: 0 additions & 2 deletions pipelines/training/train.snakefile
@@ -21,7 +21,6 @@ rule best_training_run:
threads: 1
resources:
mem_mb = 2048,
load = 2000
shell:
(
'deeprvat_train best-training-run '
@@ -58,7 +57,6 @@ rule train:
priority: 1000
resources:
mem_mb = 20000,
load = 8000,
gpus = 1
shell:
f"parallel --jobs {n_parallel_training_jobs} --halt now,fail=1 --results train_repeat{{{{1}}}}_trial{{{{2}}}}/ "
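
On a cluster the same declarations become scheduler requests: Snakemake's SLURM executor plugin, for example, submits each job with its resources.mem_mb as the memory request (a sketch assuming that plugin is installed; flags differ across Snakemake versions):

# Each rule's mem_mb becomes the per-job memory request. With --retries 2,
# a regress job asks for 28,676 MB, then 32,774 MB, then 36,872 MB
# on successive attempts.
snakemake --executor slurm --jobs 100 --retries 2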
