From ed98449dbc87d4fc362a4faec9c7b0e46032da3a Mon Sep 17 00:00:00 2001
From: Marcel Mück
Date: Fri, 19 Apr 2024 08:44:09 +0200
Subject: [PATCH] added resource specifications for mem_mb back into
 pipelines, removed… (#72)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* added resource specifications for mem_mb back into pipelines, removed load specifications

* Add resources back to preprocessing pipeline

---------

Co-authored-by: Mück
Co-authored-by: Magnus Wahlberg
---
 pipelines/annotations.snakefile               | 21 +++++++++++++++++++
 .../association_dataset.snakefile             |  1 -
 .../association_testing/burdens.snakefile     |  4 ----
 .../regress_eval.snakefile                    |  4 ----
 ...ting_control_for_common_variants.snakefile |  1 -
 pipelines/cv_training/cv_training.snakefile   |  1 -
 pipelines/preprocessing/preprocess.snakefile  | 12 +++++++++++
 pipelines/preprocessing/qc.snakefile          |  8 +++++++
 pipelines/training/train.snakefile            |  2 --
 9 files changed, 41 insertions(+), 13 deletions(-)

diff --git a/pipelines/annotations.snakefile b/pipelines/annotations.snakefile
index a97f38ed..f3b617c5 100644
--- a/pipelines/annotations.snakefile
+++ b/pipelines/annotations.snakefile
@@ -162,6 +162,7 @@ rule select_rename_fill_columns:
         annotations_path = anno_dir / "vep_deepripe_deepsea_absplice_maf_pIDs_filtered.parquet",
     output:
         anno_dir / "vep_deepripe_deepsea_absplice_maf_pIDs_filtered_filled.parquet",
+    resources: mem_mb = lambda wildcards, attempt: 15_000 * (attempt + 1),
     shell:
         " ".join([
             f"python {annotation_python_file}",
@@ -178,6 +179,7 @@ if not gene_id_file:
     rule create_gene_id_file:
         input: gtf_file
         output: gene_id_file
+        resources: mem_mb = lambda wildcards, attempt: 15_000 * (attempt + 1),
         shell:
             " ".join([
                 f"python {annotation_python_file}",
@@ -193,6 +195,7 @@ rule filter_by_exon_distance:
         protein_coding_genes = gene_id_file
     output:
         anno_dir / "vep_deepripe_deepsea_absplice_maf_pIDs_filtered.parquet",
+    resources: mem_mb = lambda wildcards, attempt: 25_000 * (attempt + 1),
     shell:
         " ".join([
             f"python {annotation_python_file}",
@@ -208,6 +211,7 @@ rule add_gene_ids:
         gene_id_file = gene_id_file,
         annotations_path = anno_dir / "vep_deepripe_deepsea_absplice_maf.parquet",
     output: anno_dir / "vep_deepripe_deepsea_absplice_maf_pIDs.parquet",
+    resources: mem_mb = lambda wildcards, attempt: 19_000 * (attempt + 1),
     shell:
         " ".join([
             f"python {annotation_python_file}",
@@ -223,6 +227,7 @@ rule calculate_MAF:
         anno_dir / "vep_deepripe_deepsea_absplice_af.parquet"
     output:
         anno_dir / "vep_deepripe_deepsea_absplice_maf.parquet"
+    resources: mem_mb = lambda wildcards, attempt: 15_000 * (attempt + 1),
     shell:
         " ".join([
             f"python {annotation_python_file}",
@@ -241,6 +246,7 @@ rule merge_allele_frequency:
         annotation_file = anno_dir / "vep_deepripe_deepsea_absplice.parquet"
     output:
         anno_dir / "vep_deepripe_deepsea_absplice_af.parquet"
+    resources: mem_mb = lambda wildcards, attempt: 15_000 * (attempt + 1),
     shell:
         " ".join([
             f"python {annotation_python_file}",
@@ -261,6 +267,7 @@ rule calculate_allele_frequency:
         variants = variant_file
     output:
         allele_frequencies = anno_tmp_dir / "af_df.parquet"
+    resources: mem_mb = lambda wildcards, attempt: 15_000 * (attempt + 1),
     shell:
         " ".join([
             f"python {annotation_python_file}",
@@ -282,6 +289,7 @@ rule merge_absplice_scores:
     output:
         anno_dir / "vep_deepripe_deepsea_absplice.parquet"
     threads: ncores_merge_absplice
+    resources: mem_mb = lambda wildcards, attempt: 19_000 * (attempt + 1),
     shell:
" ".join( [ @@ -300,6 +308,7 @@ rule aggregate_absplice_scores: output: score_file = anno_tmp_dir / "abSplice_score_file.parquet", threads: ncores_agg_absplice + resources: mem_mb = lambda wildcards, attempt: 15_000 * (attempt + 1), shell: " ".join( [ @@ -318,6 +327,7 @@ rule merge_deepsea_pcas: col_yaml_file = annotation_columns_yaml_file output: anno_dir / "vep_deepripe_deepsea.parquet", + resources: mem_mb = lambda wildcards, attempt: 30_000 * (attempt + 1), shell: " ".join( [ @@ -341,6 +351,7 @@ rule concat_annotations: output: anno_dir / "vep_deepripe.parquet", params: joined=lambda w, input: ",".join(input.vcf_files) + resources: mem_mb = lambda wildcards, attempt: 15_000 * (attempt + 1), shell: " ".join( [ @@ -366,6 +377,7 @@ rule merge_annotations: vcf_file= anno_tmp_dir / (source_variant_file_pattern + "_variants.vcf"), output: anno_dir / f"{source_variant_file_pattern}_merged.parquet", + resources: mem_mb = lambda wildcards, attempt: 5_000 * (attempt + 1), shell: ( "HEADER=$(grep -n '#Uploaded_variation' " @@ -380,6 +392,7 @@ rule deepSea_PCA: deepsea_anno = str(anno_dir / "all_variants.deepSea.parquet") output: deepSEA_tmp_dir / "deepsea_pca.parquet", + resources: mem_mb = lambda wildcards, attempt: 50_000 * (attempt + 1), shell: " ".join( ["mkdir -p", @@ -404,6 +417,7 @@ rule add_ids_deepSea: output: directory(anno_dir / "all_variants.wID.deepSea.parquet"), threads: ncores_addis + resources: mem_mb = lambda wildcards, attempt: 50_000 * (attempt + 1), shell: " ".join( [ @@ -429,6 +443,7 @@ rule concat_deepSea: ), params: joined=lambda w, input: ",".join(input.deepSEAscoreFiles) threads:8 + resources: mem_mb = lambda wildcards, attempt: 50_000 * (attempt + 1), output: anno_dir / "all_variants.deepSea.parquet", shell: @@ -452,6 +467,8 @@ rule deepSea: output: anno_dir / (source_variant_file_pattern + ".CLI.deepseapredict.diff.tsv"), threads: n_jobs_deepripe + resources: mem_mb = lambda wildcards, attempt: 5_000 * (attempt + 1), + conda: "kipoi-veff2" shell: @@ -465,6 +482,7 @@ rule deepRiPe_parclip: output: anno_dir / (source_variant_file_pattern + "_variants.parclip_deepripe.csv.gz"), threads: n_jobs_deepripe + resources: mem_mb = lambda wildcards, attempt: 5_000 * (attempt + 1), shell: f"mkdir -p {pybedtools_tmp_path / 'parclip'} && python {annotation_python_file} scorevariants-deepripe {{input.variants}} {anno_dir} {{input.fasta}} {pybedtools_tmp_path / 'parclip'} {saved_deepripe_models_path} {{threads}} 'parclip'" @@ -476,6 +494,7 @@ rule deepRiPe_eclip_hg2: output: anno_dir / (source_variant_file_pattern + "_variants.eclip_hg2_deepripe.csv.gz"), threads: lambda wildcards, attempt: n_jobs_deepripe * attempt + resources: mem_mb = lambda wildcards, attempt: 5_000 * (attempt + 1), shell: f"mkdir -p {pybedtools_tmp_path / 'hg2'} && python {annotation_python_file} scorevariants-deepripe {{input.variants}} {anno_dir} {{input.fasta}} {pybedtools_tmp_path / 'hg2'} {saved_deepripe_models_path} {{threads}} 'eclip_hg2'" @@ -487,6 +506,7 @@ rule deepRiPe_eclip_k5: output: anno_dir / (source_variant_file_pattern + "_variants.eclip_k5_deepripe.csv.gz"), threads: lambda wildcards, attempt: n_jobs_deepripe * attempt + resources: mem_mb = lambda wildcards, attempt: 5_000 * (attempt + 1), shell: f"mkdir -p {pybedtools_tmp_path / 'k5'} && python {annotation_python_file} scorevariants-deepripe {{input.variants}} {anno_dir} {{input.fasta}} {pybedtools_tmp_path / 'k5'} {saved_deepripe_models_path} {{threads}} 'eclip_k5'" @@ -498,6 +518,7 @@ rule vep: output: anno_dir / (source_variant_file_pattern 
+ "_vep_anno.tsv"), threads: vep_nfork + resources: mem_mb = lambda wildcards, attempt: 5_000 * (attempt + 1), shell: " ".join( [ diff --git a/pipelines/association_testing/association_dataset.snakefile b/pipelines/association_testing/association_dataset.snakefile index ad6d1870..9c8ba228 100644 --- a/pipelines/association_testing/association_dataset.snakefile +++ b/pipelines/association_testing/association_dataset.snakefile @@ -11,7 +11,6 @@ rule association_dataset: threads: 4 resources: mem_mb = lambda wildcards, attempt: 32000 * (attempt + 1), - load = 64000 priority: 30 shell: 'deeprvat_associate make-dataset ' diff --git a/pipelines/association_testing/burdens.snakefile b/pipelines/association_testing/burdens.snakefile index 5ac7cf1f..99f72d95 100644 --- a/pipelines/association_testing/burdens.snakefile +++ b/pipelines/association_testing/burdens.snakefile @@ -15,7 +15,6 @@ rule average_burdens: threads: 1 resources: mem_mb = lambda wildcards, attempt: 4098 + (attempt - 1) * 4098, - load = 4000, priority: 10, shell: ' && '.join([ @@ -46,7 +45,6 @@ rule link_burdens: threads: 8 resources: mem_mb = lambda wildcards, attempt: 20480 + (attempt - 1) * 4098, - load = lambda wildcards, attempt: 16000 + (attempt - 1) * 4000 shell: ' && '.join([ ('deeprvat_associate compute-burdens ' @@ -80,7 +78,6 @@ rule compute_burdens: threads: 8 resources: mem_mb = 20000, - load = 8000, gpus = 1 shell: ' && '.join([ @@ -107,7 +104,6 @@ rule reverse_models: threads: 4 resources: mem_mb = 20480, - load = 20480 shell: " && ".join([ ("deeprvat_associate reverse-models " diff --git a/pipelines/association_testing/regress_eval.snakefile b/pipelines/association_testing/regress_eval.snakefile index 2f5325fb..70aa086b 100644 --- a/pipelines/association_testing/regress_eval.snakefile +++ b/pipelines/association_testing/regress_eval.snakefile @@ -12,7 +12,6 @@ rule evaluate: threads: 1 resources: mem_mb = 16000, - load = 16000 params: n_combis = 1, use_baseline_results = '--use-baseline-results' @@ -35,7 +34,6 @@ rule combine_regression_chunks: threads: 1 resources: mem_mb = lambda wildcards, attempt: 12000 + (attempt - 1) * 4098, - load = 2000 shell: 'deeprvat_associate combine-regression-results ' '--model-name repeat_0 ' @@ -60,8 +58,6 @@ rule regress: threads: 2 resources: mem_mb = lambda wildcards, attempt: 28676 + (attempt - 1) * 4098, - # mem_mb = 16000, - load = lambda wildcards, attempt: 28000 + (attempt - 1) * 4000 params: burden_file = f'{phenotypes[0]}/deeprvat/burdens/burdens_average.zarr', burden_dir = '{phenotype}/deeprvat/burdens', diff --git a/pipelines/association_testing_control_for_common_variants.snakefile b/pipelines/association_testing_control_for_common_variants.snakefile index 37729478..944edbf8 100644 --- a/pipelines/association_testing_control_for_common_variants.snakefile +++ b/pipelines/association_testing_control_for_common_variants.snakefile @@ -118,7 +118,6 @@ rule regression_correct_common: threads: 2 resources: mem_mb = lambda wildcards, attempt: 28676 + (attempt - 1) * 4098, - load = lambda wildcards, attempt: 28000 + (attempt - 1) * 4000 params: burden_file = f'{burden_phenotype}/deeprvat/burdens/burdens_{{burden_agg_fct}}_{{n_avg_repeats}}_{{combi}}.zarr', burden_dir = '{phenotype}/deeprvat/burdens/', diff --git a/pipelines/cv_training/cv_training.snakefile b/pipelines/cv_training/cv_training.snakefile index be8bfaed..3c4bb674 100644 --- a/pipelines/cv_training/cv_training.snakefile +++ b/pipelines/cv_training/cv_training.snakefile @@ -18,7 +18,6 @@ rule spread_config: threads: 
     threads: 1
     resources:
         mem_mb = 1024,
-        load = 1000
     shell:
         ' && '.join([
             conda_check,
diff --git a/pipelines/preprocessing/preprocess.snakefile b/pipelines/preprocessing/preprocess.snakefile
index 20eb0c89..cad20b3c 100644
--- a/pipelines/preprocessing/preprocess.snakefile
+++ b/pipelines/preprocessing/preprocess.snakefile
@@ -60,6 +60,8 @@ rule normalize:
         vcf_file=lambda wildcards: vcf_look_up[wildcards.vcf_stem],
     output:
         bcf_file=bcf_dir / "{vcf_stem}.bcf",
+    resources:
+        mem_mb=lambda wildcards, attempt: 16384 * (attempt + 1),
     shell:
         f"""{load_bcftools} bcftools view --samples-file {{input.samplefile}} --output-type u {{params.vcf_file}} | bcftools view --include 'COUNT(GT="alt") > 0' --output-type u | bcftools norm -m-both -f {{input.fasta}} --output-type b --output {{output.bcf_file}}"""
@@ -78,6 +80,8 @@ rule sparsify:
         bcf=bcf_dir / "{vcf_stem}.bcf",
     output:
         tsv=sparse_dir / "{vcf_stem}.tsv.gz",
+    resources:
+        mem_mb=512,
     shell:
         f"""{load_bcftools} bcftools query --format '[%CHROM\t%POS\t%REF\t%ALT\t%SAMPLE\t%GT\n]' --include 'GT!="RR" & GT!="mis"' {{input.bcf}} \
         | sed 's/0[/,|]1/1/; s/1[/,|]0/1/; s/1[/,|]1/2/; s/0[/,|]0/0/' | gzip > {{output.tsv}}"""
@@ -88,6 +92,8 @@ rule variants:
         bcf=bcf_dir / "{vcf_stem}.bcf",
     output:
         norm_variants_dir / "{vcf_stem}.tsv.gz",
+    resources:
+        mem_mb=512,
     shell:
         f"{load_bcftools} bcftools query --format '%CHROM\t%POS\t%REF\t%ALT\n' {{input}} | gzip > {{output}}"
@@ -97,6 +103,8 @@ rule concatenate_variants:
         expand(norm_variants_dir / "{vcf_stem}.tsv.gz",vcf_stem=vcf_stems),
     output:
         norm_variants_dir / "variants_no_id.tsv.gz",
+    resources:
+        mem_mb=256,
     shell:
         "{zcat_cmd} {input} | gzip > {output}"
@@ -107,6 +115,8 @@ rule add_variant_ids:
     output:
         variants=norm_variants_dir / "variants.tsv.gz",
         duplicates=qc_duplicate_vars_dir / "duplicates.tsv",
+    resources:
+        mem_mb=2048,
     shell:
         f"{preprocessing_cmd} add-variant-ids {{input}} {{output.variants}} {{output.duplicates}}"
@@ -117,6 +127,8 @@ rule create_parquet_variant_ids:
     output:
         variants=norm_variants_dir / "variants.parquet",
         duplicates=qc_duplicate_vars_dir / "duplicates.parquet",
+    resources:
+        mem_mb=2048,
     shell:
         f"{preprocessing_cmd} add-variant-ids {{input}} {{output.variants}} {{output.duplicates}}"
diff --git a/pipelines/preprocessing/qc.snakefile b/pipelines/preprocessing/qc.snakefile
index 8b499273..2369de45 100644
--- a/pipelines/preprocessing/qc.snakefile
+++ b/pipelines/preprocessing/qc.snakefile
@@ -5,6 +5,8 @@ rule qc_allelic_imbalance:
         bcf_dir / "{vcf_stem}.bcf",
     output:
         qc_allelic_imbalance_dir / "{vcf_stem}.tsv.gz",
+    resources:
+        mem_mb=lambda wildcards, attempt: 256 * attempt,
     shell:
         f"""{load_bcftools} bcftools query --format '%CHROM\t%POS\t%REF\t%ALT\n' --exclude 'COUNT(GT="het")=0 || (GT="het" & ((TYPE="snp" & (FORMAT/AD[*:1] / FORMAT/AD[*:0]) > 0.15) | (TYPE="indel" & (FORMAT/AD[*:1] / FORMAT/AD[*:0]) > 0.20)))' {{input}} | gzip > {{output}}"""
@@ -14,6 +16,8 @@ rule qc_varmiss:
         bcf_dir / "{vcf_stem}.bcf",
     output:
         qc_varmiss_dir / "{vcf_stem}.tsv.gz",
+    resources:
+        mem_mb=lambda wildcards, attempt: 256 * attempt,
     shell:
         f'{load_bcftools} bcftools query --format "%CHROM\t%POS\t%REF\t%ALT\n" --include "F_MISSING >= 0.1" {{input}} | gzip > {{output}}'
@@ -23,6 +27,8 @@ rule qc_hwe:
         bcf_dir / "{vcf_stem}.bcf",
     output:
         qc_hwe_dir / "{vcf_stem}.tsv.gz",
+    resources:
+        mem_mb=lambda wildcards, attempt: 256 * (attempt + 1),
     shell:
         f'{load_bcftools} bcftools +fill-tags --output-type u {{input}} -- --tags HWE | bcftools query --format "%CHROM\t%POS\t%REF\t%ALT\n" --include "INFO/HWE <= 1e-15" | gzip > {{output}}'
@@ -32,6 +38,8 @@ rule qc_read_depth:
         bcf_dir / "{vcf_stem}.bcf",
     output:
         qc_read_depth_dir / "{vcf_stem}.tsv.gz",
+    resources:
+        mem_mb=lambda wildcards, attempt: 256 * attempt,
     shell:
         f"""{load_bcftools} bcftools query --format '[%CHROM\\t%POS\\t%REF\\t%ALT\\t%SAMPLE\\n]' --include '(GT!="RR" & GT!="mis" & TYPE="snp" & FORMAT/DP < 7) | (GT!="RR" & GT!="mis" & TYPE="indel" & FORMAT/DP < 10)' {{input}} | gzip > {{output}}"""
diff --git a/pipelines/training/train.snakefile b/pipelines/training/train.snakefile
index 4c0b9433..f33fd5b6 100644
--- a/pipelines/training/train.snakefile
+++ b/pipelines/training/train.snakefile
@@ -21,7 +21,6 @@ rule best_training_run:
     threads: 1
     resources:
         mem_mb = 2048,
-        load = 2000
     shell:
         (
             'deeprvat_train best-training-run '
@@ -58,7 +57,6 @@ rule train:
     priority: 1000
     resources:
         mem_mb = 20000,
-        load = 8000,
         gpus = 1
     shell:
         f"parallel --jobs {n_parallel_training_jobs} --halt now,fail=1 --results train_repeat{{{{1}}}}_trial{{{{2}}}}/ "
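
Note on the recurring pattern in this patch: mem_mb is declared as a callable so that each Snakemake retry of a failed job requests more memory. A minimal standalone sketch of that retry-scaling idiom follows; the rule, file, and script names are illustrative placeholders, not part of the DeepRVAT pipelines.

# Sketch of the retry-scaling resource pattern (illustrative names only).
rule example_with_retry_scaling:
    input:
        "data/input.parquet",
    output:
        "data/output.parquet",
    resources:
        # Snakemake passes attempt=1 on the first try and increments it on
        # each restart, so this requests about 30 GB first, then 45 GB, etc.
        mem_mb=lambda wildcards, attempt: 15_000 * (attempt + 1),
    shell:
        "python process.py {input} {output}"

The scaling only takes effect when failed jobs are allowed to restart, e.g. with snakemake --restart-times 2 (or --retries in newer Snakemake releases); cluster executors forward the computed mem_mb value to the scheduler's memory request.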