From ed98449dbc87d4fc362a4faec9c7b0e46032da3a Mon Sep 17 00:00:00 2001
From: Marcel Mück
Date: Fri, 19 Apr 2024 08:44:09 +0200
Subject: [PATCH] added resource specifications for mem_mb back into
 pipelines, removed… (#72)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* added resource specifications for mem_mb back into pipelines, removed load specifications

* Add resources back to preprocessing pipeline

---------

Co-authored-by: Mück
Co-authored-by: Magnus Wahlberg
---
 pipelines/annotations.snakefile               | 21 +++++++++++++++++++
 .../association_dataset.snakefile             |  1 -
 .../association_testing/burdens.snakefile     |  4 ----
 .../regress_eval.snakefile                    |  4 ----
 ...ting_control_for_common_variants.snakefile |  1 -
 pipelines/cv_training/cv_training.snakefile   |  1 -
 pipelines/preprocessing/preprocess.snakefile  | 12 +++++++++++
 pipelines/preprocessing/qc.snakefile          |  8 +++++++
 pipelines/training/train.snakefile            |  2 --
 9 files changed, 41 insertions(+), 13 deletions(-)

diff --git a/pipelines/annotations.snakefile b/pipelines/annotations.snakefile
index a97f38ed..f3b617c5 100644
--- a/pipelines/annotations.snakefile
+++ b/pipelines/annotations.snakefile
@@ -162,6 +162,7 @@ rule select_rename_fill_columns:
         annotations_path = anno_dir / "vep_deepripe_deepsea_absplice_maf_pIDs_filtered.parquet",
     output:
         anno_dir / "vep_deepripe_deepsea_absplice_maf_pIDs_filtered_filled.parquet",
+    resources: mem_mb = lambda wildcards, attempt: 15_000 * (attempt + 1),
     shell:
         " ".join([
             f"python {annotation_python_file}",
@@ -178,6 +179,7 @@ if not gene_id_file:
     rule create_gene_id_file:
         input: gtf_file
         output: gene_id_file
+        resources: mem_mb = lambda wildcards, attempt: 15_000 * (attempt + 1),
         shell:
             " ".join([
                 f"python {annotation_python_file}",
@@ -193,6 +195,7 @@ rule filter_by_exon_distance:
         protein_coding_genes = gene_id_file
     output:
         anno_dir / "vep_deepripe_deepsea_absplice_maf_pIDs_filtered.parquet",
+    resources: mem_mb = lambda wildcards, attempt: 25_000 * (attempt + 1),
     shell:
         " ".join([
             f"python {annotation_python_file}",
@@ -208,6 +211,7 @@ rule add_gene_ids:
         gene_id_file = gene_id_file,
         annotations_path = anno_dir / "vep_deepripe_deepsea_absplice_maf.parquet",
     output: anno_dir / "vep_deepripe_deepsea_absplice_maf_pIDs.parquet",
+    resources: mem_mb = lambda wildcards, attempt: 19_000 * (attempt + 1),
     shell:
         " ".join([
             f"python {annotation_python_file}",
@@ -223,6 +227,7 @@ rule calculate_MAF:
         anno_dir / "vep_deepripe_deepsea_absplice_af.parquet"
     output:
         anno_dir / "vep_deepripe_deepsea_absplice_maf.parquet"
+    resources: mem_mb = lambda wildcards, attempt: 15_000 * (attempt + 1),
     shell:
         " ".join([
             f"python {annotation_python_file}",
@@ -241,6 +246,7 @@ rule merge_allele_frequency:
         annotation_file = anno_dir / "vep_deepripe_deepsea_absplice.parquet"
     output:
         anno_dir / "vep_deepripe_deepsea_absplice_af.parquet"
+    resources: mem_mb = lambda wildcards, attempt: 15_000 * (attempt + 1),
     shell:
         " ".join([
             f"python {annotation_python_file}",
@@ -261,6 +267,7 @@ rule calculate_allele_frequency:
         variants = variant_file
     output:
         allele_frequencies = anno_tmp_dir / "af_df.parquet"
+    resources: mem_mb = lambda wildcards, attempt: 15_000 * (attempt + 1),
     shell:
         " ".join([
             f"python {annotation_python_file}",
@@ -282,6 +289,7 @@ rule merge_absplice_scores:
     output:
         anno_dir / "vep_deepripe_deepsea_absplice.parquet"
     threads: ncores_merge_absplice
+    resources: mem_mb = lambda wildcards, attempt: 19_000 * (attempt + 1),
     shell:
" ".join( [ @@ -300,6 +308,7 @@ rule aggregate_absplice_scores: output: score_file = anno_tmp_dir / "abSplice_score_file.parquet", threads: ncores_agg_absplice + resources: mem_mb = lambda wildcards, attempt: 15_000 * (attempt + 1), shell: " ".join( [ @@ -318,6 +327,7 @@ rule merge_deepsea_pcas: col_yaml_file = annotation_columns_yaml_file output: anno_dir / "vep_deepripe_deepsea.parquet", + resources: mem_mb = lambda wildcards, attempt: 30_000 * (attempt + 1), shell: " ".join( [ @@ -341,6 +351,7 @@ rule concat_annotations: output: anno_dir / "vep_deepripe.parquet", params: joined=lambda w, input: ",".join(input.vcf_files) + resources: mem_mb = lambda wildcards, attempt: 15_000 * (attempt + 1), shell: " ".join( [ @@ -366,6 +377,7 @@ rule merge_annotations: vcf_file= anno_tmp_dir / (source_variant_file_pattern + "_variants.vcf"), output: anno_dir / f"{source_variant_file_pattern}_merged.parquet", + resources: mem_mb = lambda wildcards, attempt: 5_000 * (attempt + 1), shell: ( "HEADER=$(grep -n '#Uploaded_variation' " @@ -380,6 +392,7 @@ rule deepSea_PCA: deepsea_anno = str(anno_dir / "all_variants.deepSea.parquet") output: deepSEA_tmp_dir / "deepsea_pca.parquet", + resources: mem_mb = lambda wildcards, attempt: 50_000 * (attempt + 1), shell: " ".join( ["mkdir -p", @@ -404,6 +417,7 @@ rule add_ids_deepSea: output: directory(anno_dir / "all_variants.wID.deepSea.parquet"), threads: ncores_addis + resources: mem_mb = lambda wildcards, attempt: 50_000 * (attempt + 1), shell: " ".join( [ @@ -429,6 +443,7 @@ rule concat_deepSea: ), params: joined=lambda w, input: ",".join(input.deepSEAscoreFiles) threads:8 + resources: mem_mb = lambda wildcards, attempt: 50_000 * (attempt + 1), output: anno_dir / "all_variants.deepSea.parquet", shell: @@ -452,6 +467,8 @@ rule deepSea: output: anno_dir / (source_variant_file_pattern + ".CLI.deepseapredict.diff.tsv"), threads: n_jobs_deepripe + resources: mem_mb = lambda wildcards, attempt: 5_000 * (attempt + 1), + conda: "kipoi-veff2" shell: @@ -465,6 +482,7 @@ rule deepRiPe_parclip: output: anno_dir / (source_variant_file_pattern + "_variants.parclip_deepripe.csv.gz"), threads: n_jobs_deepripe + resources: mem_mb = lambda wildcards, attempt: 5_000 * (attempt + 1), shell: f"mkdir -p {pybedtools_tmp_path / 'parclip'} && python {annotation_python_file} scorevariants-deepripe {{input.variants}} {anno_dir} {{input.fasta}} {pybedtools_tmp_path / 'parclip'} {saved_deepripe_models_path} {{threads}} 'parclip'" @@ -476,6 +494,7 @@ rule deepRiPe_eclip_hg2: output: anno_dir / (source_variant_file_pattern + "_variants.eclip_hg2_deepripe.csv.gz"), threads: lambda wildcards, attempt: n_jobs_deepripe * attempt + resources: mem_mb = lambda wildcards, attempt: 5_000 * (attempt + 1), shell: f"mkdir -p {pybedtools_tmp_path / 'hg2'} && python {annotation_python_file} scorevariants-deepripe {{input.variants}} {anno_dir} {{input.fasta}} {pybedtools_tmp_path / 'hg2'} {saved_deepripe_models_path} {{threads}} 'eclip_hg2'" @@ -487,6 +506,7 @@ rule deepRiPe_eclip_k5: output: anno_dir / (source_variant_file_pattern + "_variants.eclip_k5_deepripe.csv.gz"), threads: lambda wildcards, attempt: n_jobs_deepripe * attempt + resources: mem_mb = lambda wildcards, attempt: 5_000 * (attempt + 1), shell: f"mkdir -p {pybedtools_tmp_path / 'k5'} && python {annotation_python_file} scorevariants-deepripe {{input.variants}} {anno_dir} {{input.fasta}} {pybedtools_tmp_path / 'k5'} {saved_deepripe_models_path} {{threads}} 'eclip_k5'" @@ -498,6 +518,7 @@ rule vep: output: anno_dir / (source_variant_file_pattern 
+ "_vep_anno.tsv"), threads: vep_nfork + resources: mem_mb = lambda wildcards, attempt: 5_000 * (attempt + 1), shell: " ".join( [ diff --git a/pipelines/association_testing/association_dataset.snakefile b/pipelines/association_testing/association_dataset.snakefile index ad6d1870..9c8ba228 100644 --- a/pipelines/association_testing/association_dataset.snakefile +++ b/pipelines/association_testing/association_dataset.snakefile @@ -11,7 +11,6 @@ rule association_dataset: threads: 4 resources: mem_mb = lambda wildcards, attempt: 32000 * (attempt + 1), - load = 64000 priority: 30 shell: 'deeprvat_associate make-dataset ' diff --git a/pipelines/association_testing/burdens.snakefile b/pipelines/association_testing/burdens.snakefile index 5ac7cf1f..99f72d95 100644 --- a/pipelines/association_testing/burdens.snakefile +++ b/pipelines/association_testing/burdens.snakefile @@ -15,7 +15,6 @@ rule average_burdens: threads: 1 resources: mem_mb = lambda wildcards, attempt: 4098 + (attempt - 1) * 4098, - load = 4000, priority: 10, shell: ' && '.join([ @@ -46,7 +45,6 @@ rule link_burdens: threads: 8 resources: mem_mb = lambda wildcards, attempt: 20480 + (attempt - 1) * 4098, - load = lambda wildcards, attempt: 16000 + (attempt - 1) * 4000 shell: ' && '.join([ ('deeprvat_associate compute-burdens ' @@ -80,7 +78,6 @@ rule compute_burdens: threads: 8 resources: mem_mb = 20000, - load = 8000, gpus = 1 shell: ' && '.join([ @@ -107,7 +104,6 @@ rule reverse_models: threads: 4 resources: mem_mb = 20480, - load = 20480 shell: " && ".join([ ("deeprvat_associate reverse-models " diff --git a/pipelines/association_testing/regress_eval.snakefile b/pipelines/association_testing/regress_eval.snakefile index 2f5325fb..70aa086b 100644 --- a/pipelines/association_testing/regress_eval.snakefile +++ b/pipelines/association_testing/regress_eval.snakefile @@ -12,7 +12,6 @@ rule evaluate: threads: 1 resources: mem_mb = 16000, - load = 16000 params: n_combis = 1, use_baseline_results = '--use-baseline-results' @@ -35,7 +34,6 @@ rule combine_regression_chunks: threads: 1 resources: mem_mb = lambda wildcards, attempt: 12000 + (attempt - 1) * 4098, - load = 2000 shell: 'deeprvat_associate combine-regression-results ' '--model-name repeat_0 ' @@ -60,8 +58,6 @@ rule regress: threads: 2 resources: mem_mb = lambda wildcards, attempt: 28676 + (attempt - 1) * 4098, - # mem_mb = 16000, - load = lambda wildcards, attempt: 28000 + (attempt - 1) * 4000 params: burden_file = f'{phenotypes[0]}/deeprvat/burdens/burdens_average.zarr', burden_dir = '{phenotype}/deeprvat/burdens', diff --git a/pipelines/association_testing_control_for_common_variants.snakefile b/pipelines/association_testing_control_for_common_variants.snakefile index 37729478..944edbf8 100644 --- a/pipelines/association_testing_control_for_common_variants.snakefile +++ b/pipelines/association_testing_control_for_common_variants.snakefile @@ -118,7 +118,6 @@ rule regression_correct_common: threads: 2 resources: mem_mb = lambda wildcards, attempt: 28676 + (attempt - 1) * 4098, - load = lambda wildcards, attempt: 28000 + (attempt - 1) * 4000 params: burden_file = f'{burden_phenotype}/deeprvat/burdens/burdens_{{burden_agg_fct}}_{{n_avg_repeats}}_{{combi}}.zarr', burden_dir = '{phenotype}/deeprvat/burdens/', diff --git a/pipelines/cv_training/cv_training.snakefile b/pipelines/cv_training/cv_training.snakefile index be8bfaed..3c4bb674 100644 --- a/pipelines/cv_training/cv_training.snakefile +++ b/pipelines/cv_training/cv_training.snakefile @@ -18,7 +18,6 @@ rule spread_config: threads: 
     threads: 1
     resources:
         mem_mb = 1024,
-        load = 1000
     shell:
         ' && '.join([
             conda_check,
diff --git a/pipelines/preprocessing/preprocess.snakefile b/pipelines/preprocessing/preprocess.snakefile
index 20eb0c89..cad20b3c 100644
--- a/pipelines/preprocessing/preprocess.snakefile
+++ b/pipelines/preprocessing/preprocess.snakefile
@@ -60,6 +60,8 @@ rule normalize:
         vcf_file=lambda wildcards: vcf_look_up[wildcards.vcf_stem],
     output:
         bcf_file=bcf_dir / "{vcf_stem}.bcf",
+    resources:
+        mem_mb=lambda wildcards, attempt: 16384 * (attempt + 1),
     shell:
         f"""{load_bcftools} bcftools view --samples-file {{input.samplefile}} --output-type u {{params.vcf_file}} | bcftools view --include 'COUNT(GT="alt") > 0' --output-type u | bcftools norm -m-both -f {{input.fasta}} --output-type b --output {{output.bcf_file}}"""
@@ -78,6 +80,8 @@ rule sparsify:
         bcf=bcf_dir / "{vcf_stem}.bcf",
     output:
         tsv=sparse_dir / "{vcf_stem}.tsv.gz",
+    resources:
+        mem_mb=512,
     shell:
         f"""{load_bcftools} bcftools query --format '[%CHROM\t%POS\t%REF\t%ALT\t%SAMPLE\t%GT\n]' --include 'GT!="RR" & GT!="mis"' {{input.bcf}} \
         | sed 's/0[/,|]1/1/; s/1[/,|]0/1/; s/1[/,|]1/2/; s/0[/,|]0/0/' | gzip > {{output.tsv}}"""
@@ -88,6 +92,8 @@ rule variants:
         bcf=bcf_dir / "{vcf_stem}.bcf",
     output:
         norm_variants_dir / "{vcf_stem}.tsv.gz",
+    resources:
+        mem_mb=512,
     shell:
         f"{load_bcftools} bcftools query --format '%CHROM\t%POS\t%REF\t%ALT\n' {{input}} | gzip > {{output}}"
@@ -97,6 +103,8 @@ rule concatenate_variants:
         expand(norm_variants_dir / "{vcf_stem}.tsv.gz",vcf_stem=vcf_stems),
     output:
         norm_variants_dir / "variants_no_id.tsv.gz",
+    resources:
+        mem_mb=256,
     shell:
         "{zcat_cmd} {input} | gzip > {output}"
@@ -107,6 +115,8 @@ rule add_variant_ids:
     output:
         variants=norm_variants_dir / "variants.tsv.gz",
         duplicates=qc_duplicate_vars_dir / "duplicates.tsv",
+    resources:
+        mem_mb=2048,
     shell:
         f"{preprocessing_cmd} add-variant-ids {{input}} {{output.variants}} {{output.duplicates}}"
@@ -117,6 +127,8 @@ rule create_parquet_variant_ids:
     output:
         variants=norm_variants_dir / "variants.parquet",
         duplicates=qc_duplicate_vars_dir / "duplicates.parquet",
+    resources:
+        mem_mb=2048,
     shell:
         f"{preprocessing_cmd} add-variant-ids {{input}} {{output.variants}} {{output.duplicates}}"
diff --git a/pipelines/preprocessing/qc.snakefile b/pipelines/preprocessing/qc.snakefile
index 8b499273..2369de45 100644
--- a/pipelines/preprocessing/qc.snakefile
+++ b/pipelines/preprocessing/qc.snakefile
@@ -5,6 +5,8 @@ rule qc_allelic_imbalance:
         bcf_dir / "{vcf_stem}.bcf",
     output:
         qc_allelic_imbalance_dir / "{vcf_stem}.tsv.gz",
+    resources:
+        mem_mb=lambda wildcards, attempt: 256 * attempt,
     shell:
         f"""{load_bcftools} bcftools query --format '%CHROM\t%POS\t%REF\t%ALT\n' --exclude 'COUNT(GT="het")=0 || (GT="het" & ((TYPE="snp" & (FORMAT/AD[*:1] / FORMAT/AD[*:0]) > 0.15) | (TYPE="indel" & (FORMAT/AD[*:1] / FORMAT/AD[*:0]) > 0.20)))' {{input}} | gzip > {{output}}"""
@@ -14,6 +16,8 @@ rule qc_varmiss:
         bcf_dir / "{vcf_stem}.bcf",
     output:
         qc_varmiss_dir / "{vcf_stem}.tsv.gz",
+    resources:
+        mem_mb=lambda wildcards, attempt: 256 * attempt,
     shell:
         f'{load_bcftools} bcftools query --format "%CHROM\t%POS\t%REF\t%ALT\n" --include "F_MISSING >= 0.1" {{input}} | gzip > {{output}}'
@@ -23,6 +27,8 @@ rule qc_hwe:
         bcf_dir / "{vcf_stem}.bcf",
     output:
         qc_hwe_dir / "{vcf_stem}.tsv.gz",
+    resources:
+        mem_mb=lambda wildcards, attempt: 256 * (attempt + 1),
     shell:
         f'{load_bcftools} bcftools +fill-tags --output-type u {{input}} -- --tags HWE | bcftools query --format "%CHROM\t%POS\t%REF\t%ALT\n" --include "INFO/HWE <= 1e-15" | gzip > {{output}}'
@@ -32,6 +38,8 @@ rule qc_read_depth:
         bcf_dir / "{vcf_stem}.bcf",
     output:
         qc_read_depth_dir / "{vcf_stem}.tsv.gz",
+    resources:
+        mem_mb=lambda wildcards, attempt: 256 * attempt,
     shell:
         f"""{load_bcftools} bcftools query --format '[%CHROM\\t%POS\\t%REF\\t%ALT\\t%SAMPLE\\n]' --include '(GT!="RR" & GT!="mis" & TYPE="snp" & FORMAT/DP < 7) | (GT!="RR" & GT!="mis" & TYPE="indel" & FORMAT/DP < 10)' {{input}} | gzip > {{output}}"""
diff --git a/pipelines/training/train.snakefile b/pipelines/training/train.snakefile
index 4c0b9433..f33fd5b6 100644
--- a/pipelines/training/train.snakefile
+++ b/pipelines/training/train.snakefile
@@ -21,7 +21,6 @@ rule best_training_run:
     threads: 1
     resources:
         mem_mb = 2048,
-        load = 2000
     shell:
         (
             'deeprvat_train best-training-run '
@@ -58,7 +57,6 @@ rule train:
     priority: 1000
     resources:
         mem_mb = 20000,
-        load = 8000,
         gpus = 1
     shell:
         f"parallel --jobs {n_parallel_training_jobs} --halt now,fail=1 --results train_repeat{{{{1}}}}_trial{{{{2}}}}/ "
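
Note on the recurring pattern in this patch: mem_mb is declared as a callable so that each Snakemake retry of a failed job requests more memory. A minimal standalone sketch of that retry-scaling idiom follows; the rule, file, and script names are illustrative placeholders, not part of the DeepRVAT pipelines.

# Sketch of the retry-scaling resource pattern (illustrative names only).
rule example_with_retry_scaling:
    input:
        "data/input.parquet",
    output:
        "data/output.parquet",
    resources:
        # Snakemake passes attempt=1 on the first try and increments it on
        # each restart, so this requests about 30 GB first, then 45 GB, etc.
        mem_mb=lambda wildcards, attempt: 15_000 * (attempt + 1),
    shell:
        "python process.py {input} {output}"

The scaling only takes effect when failed jobs are allowed to restart, e.g. with snakemake --restart-times 2 (or --retries in newer Snakemake releases); cluster executors forward the computed mem_mb value to the scheduler's memory request.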