From 198335a485c6b181cb1a87b5d56c2de2e8c7b177 Mon Sep 17 00:00:00 2001 From: Kayla Meyer Date: Thu, 14 Dec 2023 16:21:07 +0100 Subject: [PATCH 01/15] making snakemake runners modular Additional snakemake runners for only running training, only association testing, and full train + association testing pipelines. --- .../association_dataset.snakefile | 12 + .../association_testing/burdens.snakefile | 74 +++++ .../regress_eval.snakefile | 63 ++++ .../association_testing_pretrained.snakefile | 187 +----------- pipelines/run_training.snakefile | 48 +++ pipelines/training/config.snakefile | 29 ++ pipelines/training/train.snakefile | 61 ++++ pipelines/training/training_dataset.snakefile | 37 +++ .../training_association_testing.snakefile | 286 +----------------- 9 files changed, 341 insertions(+), 456 deletions(-) create mode 100644 pipelines/association_testing/association_dataset.snakefile create mode 100644 pipelines/association_testing/burdens.snakefile create mode 100644 pipelines/association_testing/regress_eval.snakefile create mode 100644 pipelines/run_training.snakefile create mode 100644 pipelines/training/config.snakefile create mode 100644 pipelines/training/train.snakefile create mode 100644 pipelines/training/training_dataset.snakefile diff --git a/pipelines/association_testing/association_dataset.snakefile b/pipelines/association_testing/association_dataset.snakefile new file mode 100644 index 00000000..0e63e53f --- /dev/null +++ b/pipelines/association_testing/association_dataset.snakefile @@ -0,0 +1,12 @@ + +rule association_dataset: + input: + config = '{phenotype}/deeprvat/hpopt_config.yaml' + output: + '{phenotype}/deeprvat/association_dataset.pkl' + threads: 4 + shell: + 'deeprvat_associate make-dataset ' + + debug + + '{input.config} ' + '{output}' diff --git a/pipelines/association_testing/burdens.snakefile b/pipelines/association_testing/burdens.snakefile new file mode 100644 index 00000000..7e95372f --- /dev/null +++ b/pipelines/association_testing/burdens.snakefile @@ -0,0 +1,74 @@ + +rule link_burdens: + priority: 1 + input: + checkpoints = lambda wildcards: [ + f'{pretrained_model_path}/repeat_{repeat}/best/bag_{bag}.ckpt' + for repeat in range(n_repeats) for bag in range(n_bags) + ], + dataset = '{phenotype}/deeprvat/association_dataset.pkl', + data_config = '{phenotype}/deeprvat/hpopt_config.yaml', + model_config = pretrained_model_path / 'config.yaml', + output: + '{phenotype}/deeprvat/burdens/chunk{chunk}.linked' + threads: 8 + shell: + ' && '.join([ + ('deeprvat_associate compute-burdens ' + + debug + + ' --n-chunks '+ str(n_burden_chunks) + ' ' + f'--link-burdens ../../../{phenotypes[0]}/deeprvat/burdens/burdens.zarr ' + '--chunk {wildcards.chunk} ' + '--dataset-file {input.dataset} ' + '{input.data_config} ' + '{input.model_config} ' + '{input.checkpoints} ' + '{wildcards.phenotype}/deeprvat/burdens'), + 'touch {output}' + ]) + +rule compute_burdens: + priority: 10 + input: + reversed = pretrained_model_path / "reverse_finished.tmp", + checkpoints = lambda wildcards: [ + pretrained_model_path / f'repeat_{repeat}/best/bag_{bag}.ckpt' + for repeat in range(n_repeats) for bag in range(n_bags) + ], + dataset = '{phenotype}/deeprvat/association_dataset.pkl', + data_config = '{phenotype}/deeprvat/hpopt_config.yaml', + model_config = pretrained_model_path / 'config.yaml', + output: + '{phenotype}/deeprvat/burdens/chunk{chunk}.finished' + threads: 8 + shell: + ' && '.join([ + ('deeprvat_associate compute-burdens ' + + debug + + ' --n-chunks '+ str(n_burden_chunks) + ' ' 
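+            # unlike link_burdens above, no --link-burdens flag is passed:
+            # this rule writes the actual burdens.zarr that the .linked
+            # chunks of the other phenotypes point back to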
+ '--chunk {wildcards.chunk} ' + '--dataset-file {input.dataset} ' + '{input.data_config} ' + '{input.model_config} ' + '{input.checkpoints} ' + '{wildcards.phenotype}/deeprvat/burdens'), + 'touch {output}' + ]) + +rule reverse_models: + input: + checkpoints = expand(pretrained_model_path / 'repeat_{repeat}/best/bag_{bag}.ckpt', + bag=range(n_bags), repeat=range(n_repeats)), + model_config = pretrained_model_path / 'config.yaml', + data_config = Path(phenotypes[0]) / "deeprvat/hpopt_config.yaml", + output: + temp(pretrained_model_path / "reverse_finished.tmp") + threads: 4 + shell: + " && ".join([ + ("deeprvat_associate reverse-models " + "{input.model_config} " + "{input.data_config} " + "{input.checkpoints}"), + "touch {output}" + ]) \ No newline at end of file diff --git a/pipelines/association_testing/regress_eval.snakefile b/pipelines/association_testing/regress_eval.snakefile new file mode 100644 index 00000000..bcb3f369 --- /dev/null +++ b/pipelines/association_testing/regress_eval.snakefile @@ -0,0 +1,63 @@ + +rule evaluate: + input: + associations = expand('{{phenotype}}/deeprvat/repeat_{repeat}/results/burden_associations.parquet', + repeat=range(n_repeats)), + config = '{phenotype}/deeprvat/hpopt_config.yaml', + output: + "{phenotype}/deeprvat/eval/significant.parquet", + "{phenotype}/deeprvat/eval/all_results.parquet" + threads: 1 + shell: + 'deeprvat_evaluate ' + + debug + + '--use-seed-genes ' + '--n-repeats {n_repeats} ' + '--correction-method FDR ' + '{input.associations} ' + '{input.config} ' + '{wildcards.phenotype}/deeprvat/eval' + +rule all_regression: + input: + expand('{phenotype}/deeprvat/repeat_{repeat}/results/burden_associations.parquet', + phenotype=phenotypes, type=['deeprvat'], repeat=range(n_repeats)), + +rule combine_regression_chunks: + input: + expand('{{phenotype}}/deeprvat/repeat_{{repeat}}/results/burden_associations_{chunk}.parquet', chunk=range(n_regression_chunks)), + output: + '{phenotype}/deeprvat/repeat_{repeat}/results/burden_associations.parquet', + threads: 1 + shell: + 'deeprvat_associate combine-regression-results ' + '--model-name repeat_{wildcards.repeat} ' + '{input} ' + '{output}' + +rule regress: + input: + config = "{phenotype}/deeprvat/hpopt_config.yaml", + chunks = lambda wildcards: expand( + ('{{phenotype}}/deeprvat/burdens/chunk{chunk}.' 
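+             # phenotypes[0] must carry real burdens (.finished); all other
+             # phenotypes only link against them (.linked)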
+ + ("finished" if wildcards.phenotype == phenotypes[0] else "linked")), + chunk=range(n_burden_chunks) + ), + phenotype_0_chunks = expand( + phenotypes[0] + '/deeprvat/burdens/chunk{chunk}.finished', + chunk=range(n_burden_chunks) + ), + output: + temp('{phenotype}/deeprvat/repeat_{repeat}/results/burden_associations_{chunk}.parquet'), + threads: 2 + shell: + 'deeprvat_associate regress ' + + debug + + '--chunk {wildcards.chunk} ' + '--n-chunks ' + str(n_regression_chunks) + ' ' + '--use-bias ' + '--repeat {wildcards.repeat} ' + + do_scoretest + + '{input.config} ' + '{wildcards.phenotype}/deeprvat/burdens ' #TODO make this w/o repeats + '{wildcards.phenotype}/deeprvat/repeat_{wildcards.repeat}/results' \ No newline at end of file diff --git a/pipelines/association_testing_pretrained.snakefile b/pipelines/association_testing_pretrained.snakefile index 702302f0..379ba795 100644 --- a/pipelines/association_testing_pretrained.snakefile +++ b/pipelines/association_testing_pretrained.snakefile @@ -8,16 +8,23 @@ phenotypes = list(phenotypes.keys()) if type(phenotypes) == dict else phenotypes n_burden_chunks = config.get('n_burden_chunks', 1) if not debug_flag else 2 n_regression_chunks = config.get('n_regression_chunks', 40) if not debug_flag else 2 +n_trials = config['hyperparameter_optimization']['n_trials'] n_bags = config['training']['n_bags'] if not debug_flag else 3 n_repeats = config['n_repeats'] debug = '--debug ' if debug_flag else '' do_scoretest = '--do-scoretest ' if config.get('do_scoretest', False) else '' -pretrained_model_path = Path(config.get("pretrained_model_path", "pretrained_models")) +tensor_compression_level = config['training'].get('tensor_compression_level', 1) +pretrained_model_path = Path('models') #Path(config.get("pretrained_model_path", "pretrained_models")) wildcard_constraints: repeat="\d+", trial="\d+", +include: "training/config.snakefile" +include: "association_testing/association_dataset.snakefile" +include: "association_testing/burdens.snakefile" +include: "association_testing/regress_eval.snakefile" + rule all: input: expand("{phenotype}/deeprvat/eval/significant.parquet", @@ -25,69 +32,6 @@ rule all: expand("{phenotype}/deeprvat/eval/all_results.parquet", phenotype=phenotypes) -rule evaluate: - input: - associations = expand('{{phenotype}}/deeprvat/repeat_{repeat}/results/burden_associations.parquet', - repeat=range(n_repeats)), - config = '{phenotype}/deeprvat/hpopt_config.yaml', - output: - "{phenotype}/deeprvat/eval/significant.parquet", - "{phenotype}/deeprvat/eval/all_results.parquet" - threads: 1 - shell: - 'deeprvat_evaluate ' - + debug + - '--use-seed-genes ' - '--n-repeats {n_repeats} ' - '--correction-method FDR ' - '{input.associations} ' - '{input.config} ' - '{wildcards.phenotype}/deeprvat/eval' - -rule all_regression: - input: - expand('{phenotype}/deeprvat/repeat_{repeat}/results/burden_associations.parquet', - phenotype=phenotypes, type=['deeprvat'], repeat=range(n_repeats)), - -rule combine_regression_chunks: - input: - expand('{{phenotype}}/deeprvat/repeat_{{repeat}}/results/burden_associations_{chunk}.parquet', chunk=range(n_regression_chunks)), - output: - '{phenotype}/deeprvat/repeat_{repeat}/results/burden_associations.parquet', - threads: 1 - shell: - 'deeprvat_associate combine-regression-results ' - '--model-name repeat_{wildcards.repeat} ' - '{input} ' - '{output}' - -rule regress: - input: - config = "{phenotype}/deeprvat/hpopt_config.yaml", - chunks = lambda wildcards: expand( - ('{{phenotype}}/deeprvat/burdens/chunk{chunk}.' 
+ - ("finished" if wildcards.phenotype == phenotypes[0] else "linked")), - chunk=range(n_burden_chunks) - ), - phenotype_0_chunks = expand( - phenotypes[0] + '/deeprvat/burdens/chunk{chunk}.finished', - chunk=range(n_burden_chunks) - ), - output: - temp('{phenotype}/deeprvat/repeat_{repeat}/results/burden_associations_{chunk}.parquet'), - threads: 2 - shell: - 'deeprvat_associate regress ' - + debug + - '--chunk {wildcards.chunk} ' - '--n-chunks ' + str(n_regression_chunks) + ' ' - '--use-bias ' - '--repeat {wildcards.repeat} ' - + do_scoretest + - '{input.config} ' - '{wildcards.phenotype}/deeprvat/burdens ' #TODO make this w/o repeats - '{wildcards.phenotype}/deeprvat/repeat_{wildcards.repeat}/results' - rule all_burdens: input: [ @@ -97,97 +41,11 @@ rule all_burdens: for c in range(n_burden_chunks) ] -rule link_burdens: - priority: 1 - input: - checkpoints = lambda wildcards: [ - f'{pretrained_model_path}/repeat_{repeat}/best/bag_{bag}.ckpt' - for repeat in range(n_repeats) for bag in range(n_bags) - ], - dataset = '{phenotype}/deeprvat/association_dataset.pkl', - data_config = '{phenotype}/deeprvat/hpopt_config.yaml', - model_config = pretrained_model_path / 'config.yaml', - output: - '{phenotype}/deeprvat/burdens/chunk{chunk}.linked' - threads: 8 - shell: - ' && '.join([ - ('deeprvat_associate compute-burdens ' - + debug + - ' --n-chunks '+ str(n_burden_chunks) + ' ' - f'--link-burdens ../../../{phenotypes[0]}/deeprvat/burdens/burdens.zarr ' - '--chunk {wildcards.chunk} ' - '--dataset-file {input.dataset} ' - '{input.data_config} ' - '{input.model_config} ' - '{input.checkpoints} ' - '{wildcards.phenotype}/deeprvat/burdens'), - 'touch {output}' - ]) - -rule compute_burdens: - priority: 10 - input: - reversed = pretrained_model_path / "reverse_finished.tmp", - checkpoints = lambda wildcards: [ - pretrained_model_path / f'repeat_{repeat}/best/bag_{bag}.ckpt' - for repeat in range(n_repeats) for bag in range(n_bags) - ], - dataset = '{phenotype}/deeprvat/association_dataset.pkl', - data_config = '{phenotype}/deeprvat/hpopt_config.yaml', - model_config = pretrained_model_path / 'config.yaml', - output: - '{phenotype}/deeprvat/burdens/chunk{chunk}.finished' - threads: 8 - shell: - ' && '.join([ - ('deeprvat_associate compute-burdens ' - + debug + - ' --n-chunks '+ str(n_burden_chunks) + ' ' - '--chunk {wildcards.chunk} ' - '--dataset-file {input.dataset} ' - '{input.data_config} ' - '{input.model_config} ' - '{input.checkpoints} ' - '{wildcards.phenotype}/deeprvat/burdens'), - 'touch {output}' - ]) - rule all_association_dataset: input: expand('{phenotype}/deeprvat/association_dataset.pkl', phenotype=phenotypes) -rule association_dataset: - input: - config = '{phenotype}/deeprvat/hpopt_config.yaml' - output: - '{phenotype}/deeprvat/association_dataset.pkl' - threads: 4 - shell: - 'deeprvat_associate make-dataset ' - + debug + - '{input.config} ' - '{output}' - -rule reverse_models: - input: - checkpoints = expand(pretrained_model_path / 'repeat_{repeat}/best/bag_{bag}.ckpt', - bag=range(n_bags), repeat=range(n_repeats)), - model_config = pretrained_model_path / 'config.yaml', - data_config = Path(phenotypes[0]) / "deeprvat/hpopt_config.yaml", - output: - temp(pretrained_model_path / "reverse_finished.tmp") - threads: 4 - shell: - " && ".join([ - ("deeprvat_associate reverse-models " - "{input.model_config} " - "{input.data_config} " - "{input.checkpoints}"), - "touch {output}" - ]) - rule all_config: input: seed_genes = expand('{phenotype}/deeprvat/seed_genes.parquet', @@ -196,32 +54,3 @@ 
rule all_config: phenotype=phenotypes), baseline = expand('{phenotype}/deeprvat/baseline_results.parquet', phenotype=phenotypes), - -rule config: - input: - config = 'config.yaml', - baseline = lambda wildcards: [ - str(Path(r['base']) / wildcards.phenotype / r['type'] / - 'eval/burden_associations.parquet') - for r in config['baseline_results'] - ] - output: - seed_genes = '{phenotype}/deeprvat/seed_genes.parquet', - config = '{phenotype}/deeprvat/hpopt_config.yaml', - baseline = '{phenotype}/deeprvat/baseline_results.parquet', - threads: 1 - params: - baseline_results = lambda wildcards, input: ''.join([ - f'--baseline-results {b} ' - for b in input.baseline - ]) - shell: - ( - 'deeprvat_config update-config ' - '--phenotype {wildcards.phenotype} ' - '{params.baseline_results}' - '--baseline-results-out {output.baseline} ' - '--seed-genes-out {output.seed_genes} ' - '{input.config} ' - '{output.config}' - ) diff --git a/pipelines/run_training.snakefile b/pipelines/run_training.snakefile new file mode 100644 index 00000000..dc5f0254 --- /dev/null +++ b/pipelines/run_training.snakefile @@ -0,0 +1,48 @@ +from pathlib import Path + +configfile: 'config.yaml' + +debug_flag = config.get('debug', False) +phenotypes = config['phenotypes'] +phenotypes = list(phenotypes.keys()) if type(phenotypes) == dict else phenotypes + +n_burden_chunks = config.get('n_burden_chunks', 1) if not debug_flag else 2 +n_regression_chunks = config.get('n_regression_chunks', 40) if not debug_flag else 2 +n_trials = config['hyperparameter_optimization']['n_trials'] +n_bags = config['training']['n_bags'] if not debug_flag else 3 +n_repeats = config['n_repeats'] +debug = '--debug ' if debug_flag else '' +do_scoretest = '--do-scoretest ' if config.get('do_scoretest', False) else '' +tensor_compression_level = config['training'].get('tensor_compression_level', 1) + +wildcard_constraints: + repeat="\d+", + trial="\d+", + +include: "training/config.snakefile" +include: "training/training_dataset.snakefile" +include: "training/train.snakefile" + +rule all: + input: + expand('models/repeat_{repeat}/best/bag_{bag}.ckpt', + bag=range(n_bags), repeat=range(n_repeats)), + "models/config.yaml" + +rule all_training_dataset: + input: + input_tensor = expand('{phenotype}/deeprvat/input_tensor.zarr', + phenotype=phenotypes, repeat=range(n_repeats)), + covariates = expand('{phenotype}/deeprvat/covariates.zarr', + phenotype=phenotypes, repeat=range(n_repeats)), + y = expand('{phenotype}/deeprvat/y.zarr', + phenotype=phenotypes, repeat=range(n_repeats)) + +rule all_config: + input: + seed_genes = expand('{phenotype}/deeprvat/seed_genes.parquet', + phenotype=phenotypes), + config = expand('{phenotype}/deeprvat/hpopt_config.yaml', + phenotype=phenotypes), + baseline = expand('{phenotype}/deeprvat/baseline_results.parquet', + phenotype=phenotypes), \ No newline at end of file diff --git a/pipelines/training/config.snakefile b/pipelines/training/config.snakefile new file mode 100644 index 00000000..3c58a39d --- /dev/null +++ b/pipelines/training/config.snakefile @@ -0,0 +1,29 @@ + +rule config: + input: + config = 'config.yaml', + baseline = lambda wildcards: [ + str(Path(r['base']) / wildcards.phenotype / r['type'] / + 'eval/burden_associations.parquet') + for r in config['baseline_results'] + ] + output: + seed_genes = '{phenotype}/deeprvat/seed_genes.parquet', + config = '{phenotype}/deeprvat/hpopt_config.yaml', + baseline = '{phenotype}/deeprvat/baseline_results.parquet', + threads: 1 + params: + baseline_results = lambda wildcards, 
input: ''.join([ + f'--baseline-results {b} ' + for b in input.baseline + ]) + shell: + ( + 'deeprvat_config update-config ' + '--phenotype {wildcards.phenotype} ' + '{params.baseline_results}' + '--baseline-results-out {output.baseline} ' + '--seed-genes-out {output.seed_genes} ' + '{input.config} ' + '{output.config}' + ) \ No newline at end of file diff --git a/pipelines/training/train.snakefile b/pipelines/training/train.snakefile new file mode 100644 index 00000000..c904180d --- /dev/null +++ b/pipelines/training/train.snakefile @@ -0,0 +1,61 @@ + +rule link_config: + input: + 'models/repeat_0/config.yaml' + output: + "models/config.yaml" + threads: 1 + shell: + "ln -s repeat_0/config.yaml {output}" + + +rule best_training_run: + input: + expand('models/repeat_{{repeat}}/trial{trial_number}/config.yaml', + trial_number=range(n_trials)), + output: + checkpoints = expand('models/repeat_{{repeat}}/best/bag_{bag}.ckpt', + bag=range(n_bags)), + config = 'models/repeat_{repeat}/config.yaml' + threads: 1 + shell: + ( + 'deeprvat_train best-training-run ' + + debug + + 'models/repeat_{wildcards.repeat} ' + 'models/repeat_{wildcards.repeat}/best ' + 'models/repeat_{wildcards.repeat}/hyperparameter_optimization.db ' + '{output.config}' + ) + +rule train: + input: + config = expand('{phenotype}/deeprvat/hpopt_config.yaml', + phenotype=phenotypes), + input_tensor = expand('{phenotype}/deeprvat/input_tensor.zarr', + phenotype=phenotypes), + covariates = expand('{phenotype}/deeprvat/covariates.zarr', + phenotype=phenotypes), + y = expand('{phenotype}/deeprvat/y.zarr', + phenotype=phenotypes), + output: + config = 'models/repeat_{repeat}/trial{trial_number}/config.yaml', + finished = 'models/repeat_{repeat}/trial{trial_number}/finished.tmp' + params: + phenotypes = " ".join( + [f"--phenotype {p} " + f"{p}/deeprvat/input_tensor.zarr " + f"{p}/deeprvat/covariates.zarr " + f"{p}/deeprvat/y.zarr" + for p in phenotypes]) + shell: + ' && '.join([ + 'deeprvat_train train ' + + debug + + '--trial-id {wildcards.trial_number} ' + "{params.phenotypes} " + 'config.yaml ' + 'models/repeat_{wildcards.repeat}/trial{wildcards.trial_number} ' + 'models/repeat_{wildcards.repeat}/hyperparameter_optimization.db', + 'touch {output.finished}' + ]) diff --git a/pipelines/training/training_dataset.snakefile b/pipelines/training/training_dataset.snakefile new file mode 100644 index 00000000..66903b85 --- /dev/null +++ b/pipelines/training/training_dataset.snakefile @@ -0,0 +1,37 @@ + +rule training_dataset: + input: + config = '{phenotype}/deeprvat/hpopt_config.yaml', + training_dataset = '{phenotype}/deeprvat/training_dataset.pkl' + output: + input_tensor = directory('{phenotype}/deeprvat/input_tensor.zarr'), + covariates = directory('{phenotype}/deeprvat/covariates.zarr'), + y = directory('{phenotype}/deeprvat/y.zarr') + threads: 8 + priority: 50 + shell: + ( + 'deeprvat_train make-dataset ' + + debug + + '--compression-level ' + str(tensor_compression_level) + ' ' + '--training-dataset-file {input.training_dataset} ' + '{input.config} ' + '{output.input_tensor} ' + '{output.covariates} ' + '{output.y}' + ) + +rule training_dataset_pickle: + input: + '{phenotype}/deeprvat/hpopt_config.yaml' + output: + '{phenotype}/deeprvat/training_dataset.pkl' + threads: 1 + shell: + ( + 'deeprvat_train make-dataset ' + '--pickle-only ' + '--training-dataset-file {output} ' + '{input} ' + 'dummy dummy dummy' + ) \ No newline at end of file diff --git a/pipelines/training_association_testing.snakefile 
b/pipelines/training_association_testing.snakefile index 069602b6..f0f91134 100644 --- a/pipelines/training_association_testing.snakefile +++ b/pipelines/training_association_testing.snakefile @@ -14,11 +14,19 @@ n_repeats = config['n_repeats'] debug = '--debug ' if debug_flag else '' do_scoretest = '--do-scoretest ' if config.get('do_scoretest', False) else '' tensor_compression_level = config['training'].get('tensor_compression_level', 1) +pretrained_model_path = Path('models') wildcard_constraints: repeat="\d+", trial="\d+", +include: "training/config.snakefile" +include: "training/training_dataset.snakefile" +include: "training/train.snakefile" +include: "association_testing/association_dataset.snakefile" +include: "association_testing/burdens.snakefile" +include: "association_testing/regress_eval.snakefile" + rule all: input: expand("{phenotype}/deeprvat/eval/significant.parquet", @@ -26,69 +34,6 @@ rule all: expand("{phenotype}/deeprvat/eval/all_results.parquet", phenotype=phenotypes) -rule evaluate: - input: - associations = expand('{{phenotype}}/deeprvat/repeat_{repeat}/results/burden_associations.parquet', - repeat=range(n_repeats)), - config = '{phenotype}/deeprvat/hpopt_config.yaml', - output: - "{phenotype}/deeprvat/eval/significant.parquet", - "{phenotype}/deeprvat/eval/all_results.parquet" - threads: 1 - shell: - 'deeprvat_evaluate ' - + debug + - '--use-seed-genes ' - '--n-repeats {n_repeats} ' - '--correction-method FDR ' - '{input.associations} ' - '{input.config} ' - '{wildcards.phenotype}/deeprvat/eval' - -rule all_regression: - input: - expand('{phenotype}/deeprvat/repeat_{repeat}/results/burden_associations.parquet', - phenotype=phenotypes, type=['deeprvat'], repeat=range(n_repeats)), - -rule combine_regression_chunks: - input: - expand('{{phenotype}}/deeprvat/repeat_{{repeat}}/results/burden_associations_{chunk}.parquet', chunk=range(n_regression_chunks)), - output: - '{phenotype}/deeprvat/repeat_{repeat}/results/burden_associations.parquet', - threads: 1 - shell: - 'deeprvat_associate combine-regression-results ' - '--model-name repeat_{wildcards.repeat} ' - '{input} ' - '{output}' - -rule regress: - input: - config = "{phenotype}/deeprvat/hpopt_config.yaml", - chunks = lambda wildcards: expand( - ('{{phenotype}}/deeprvat/burdens/chunk{chunk}.' 
+ - ("finished" if wildcards.phenotype == phenotypes[0] else "linked")), - chunk=range(n_burden_chunks) - ), - phenotype_0_chunks = expand( - phenotypes[0] + '/deeprvat/burdens/chunk{chunk}.finished', - chunk=range(n_burden_chunks) - ), - output: - temp('{phenotype}/deeprvat/repeat_{repeat}/results/burden_associations_{chunk}.parquet'), - threads: 2 - shell: - 'deeprvat_associate regress ' - + debug + - '--chunk {wildcards.chunk} ' - '--n-chunks ' + str(n_regression_chunks) + ' ' - '--use-bias ' - '--repeat {wildcards.repeat} ' - + do_scoretest + - '{input.config} ' - '{wildcards.phenotype}/deeprvat/burdens ' #TODO make this w/o repeats - '{wildcards.phenotype}/deeprvat/repeat_{wildcards.repeat}/results' - rule all_burdens: input: [ @@ -98,164 +43,17 @@ rule all_burdens: for c in range(n_burden_chunks) ] -rule link_burdens: - priority: 1 - input: - checkpoints = lambda wildcards: [ - f'models/repeat_{repeat}/best/bag_{bag}.ckpt' - for repeat in range(n_repeats) for bag in range(n_bags) - ], - dataset = '{phenotype}/deeprvat/association_dataset.pkl', - data_config = '{phenotype}/deeprvat/hpopt_config.yaml', - model_config = 'models/config.yaml', - output: - '{phenotype}/deeprvat/burdens/chunk{chunk}.linked' - threads: 8 - shell: - ' && '.join([ - ('deeprvat_associate compute-burdens ' - + debug + - ' --n-chunks '+ str(n_burden_chunks) + ' ' - f'--link-burdens ../../../{phenotypes[0]}/deeprvat/burdens/burdens.zarr ' - '--chunk {wildcards.chunk} ' - '--dataset-file {input.dataset} ' - '{input.data_config} ' - '{input.model_config} ' - '{input.checkpoints} ' - '{wildcards.phenotype}/deeprvat/burdens'), - 'touch {output}' - ]) - -rule compute_burdens: - priority: 10 - input: - reversed = "models/reverse_finished.tmp", - checkpoints = lambda wildcards: [ - f'models/repeat_{repeat}/best/bag_{bag}.ckpt' - for repeat in range(n_repeats) for bag in range(n_bags) - ], - dataset = '{phenotype}/deeprvat/association_dataset.pkl', - data_config = '{phenotype}/deeprvat/hpopt_config.yaml', - model_config = 'models/config.yaml', - output: - '{phenotype}/deeprvat/burdens/chunk{chunk}.finished' - threads: 8 - shell: - ' && '.join([ - ('deeprvat_associate compute-burdens ' - + debug + - ' --n-chunks '+ str(n_burden_chunks) + ' ' - '--chunk {wildcards.chunk} ' - '--dataset-file {input.dataset} ' - '{input.data_config} ' - '{input.model_config} ' - '{input.checkpoints} ' - '{wildcards.phenotype}/deeprvat/burdens'), - 'touch {output}' - ]) - rule all_association_dataset: input: expand('{phenotype}/deeprvat/association_dataset.pkl', phenotype=phenotypes) -rule association_dataset: - input: - config = '{phenotype}/deeprvat/hpopt_config.yaml' - output: - '{phenotype}/deeprvat/association_dataset.pkl' - threads: 4 - shell: - 'deeprvat_associate make-dataset ' - + debug + - '{input.config} ' - '{output}' - -rule reverse_models: - input: - checkpoints = expand('models/repeat_{repeat}/best/bag_{bag}.ckpt', - bag=range(n_bags), repeat=range(n_repeats)), - model_config = 'models/config.yaml', - data_config = Path(phenotypes[0]) / "deeprvat/hpopt_config.yaml", - output: - "models/reverse_finished.tmp" - threads: 4 - shell: - " && ".join([ - ("deeprvat_associate reverse-models " - "{input.model_config} " - "{input.data_config} " - "{input.checkpoints}"), - "touch {output}" - ]) - rule all_training: input: expand('models/repeat_{repeat}/best/bag_{bag}.ckpt', bag=range(n_bags), repeat=range(n_repeats)), "models/config.yaml" -rule link_config: - input: - 'models/repeat_0/config.yaml' - output: - "models/config.yaml" - threads: 
1 - shell: - "ln -s repeat_0/config.yaml {output}" - - -rule best_training_run: - input: - expand('models/repeat_{{repeat}}/trial{trial_number}/config.yaml', - trial_number=range(n_trials)), - output: - checkpoints = expand('models/repeat_{{repeat}}/best/bag_{bag}.ckpt', - bag=range(n_bags)), - config = 'models/repeat_{repeat}/config.yaml' - threads: 1 - shell: - ( - 'deeprvat_train best-training-run ' - + debug + - 'models/repeat_{wildcards.repeat} ' - 'models/repeat_{wildcards.repeat}/best ' - 'models/repeat_{wildcards.repeat}/hyperparameter_optimization.db ' - '{output.config}' - ) - -rule train: - input: - config = expand('{phenotype}/deeprvat/hpopt_config.yaml', - phenotype=phenotypes), - input_tensor = expand('{phenotype}/deeprvat/input_tensor.zarr', - phenotype=phenotypes), - covariates = expand('{phenotype}/deeprvat/covariates.zarr', - phenotype=phenotypes), - y = expand('{phenotype}/deeprvat/y.zarr', - phenotype=phenotypes), - output: - config = 'models/repeat_{repeat}/trial{trial_number}/config.yaml', - finished = 'models/repeat_{repeat}/trial{trial_number}/finished.tmp' - params: - phenotypes = " ".join( - [f"--phenotype {p} " - f"{p}/deeprvat/input_tensor.zarr " - f"{p}/deeprvat/covariates.zarr " - f"{p}/deeprvat/y.zarr" - for p in phenotypes]) - shell: - ' && '.join([ - 'deeprvat_train train ' - + debug + - '--trial-id {wildcards.trial_number} ' - "{params.phenotypes} " - 'config.yaml ' - 'models/repeat_{wildcards.repeat}/trial{wildcards.trial_number} ' - 'models/repeat_{wildcards.repeat}/hyperparameter_optimization.db', - 'touch {output.finished}' - ]) - rule all_training_dataset: input: input_tensor = expand('{phenotype}/deeprvat/input_tensor.zarr', @@ -265,43 +63,6 @@ rule all_training_dataset: y = expand('{phenotype}/deeprvat/y.zarr', phenotype=phenotypes, repeat=range(n_repeats)) -rule training_dataset: - input: - config = '{phenotype}/deeprvat/hpopt_config.yaml', - training_dataset = '{phenotype}/deeprvat/training_dataset.pkl' - output: - input_tensor = directory('{phenotype}/deeprvat/input_tensor.zarr'), - covariates = directory('{phenotype}/deeprvat/covariates.zarr'), - y = directory('{phenotype}/deeprvat/y.zarr') - threads: 8 - priority: 50 - shell: - ( - 'deeprvat_train make-dataset ' - + debug + - '--compression-level ' + str(tensor_compression_level) + ' ' - '--training-dataset-file {input.training_dataset} ' - '{input.config} ' - '{output.input_tensor} ' - '{output.covariates} ' - '{output.y}' - ) - -rule training_dataset_pickle: - input: - '{phenotype}/deeprvat/hpopt_config.yaml' - output: - '{phenotype}/deeprvat/training_dataset.pkl' - threads: 1 - shell: - ( - 'deeprvat_train make-dataset ' - '--pickle-only ' - '--training-dataset-file {output} ' - '{input} ' - 'dummy dummy dummy' - ) - rule all_config: input: seed_genes = expand('{phenotype}/deeprvat/seed_genes.parquet', @@ -309,33 +70,4 @@ rule all_config: config = expand('{phenotype}/deeprvat/hpopt_config.yaml', phenotype=phenotypes), baseline = expand('{phenotype}/deeprvat/baseline_results.parquet', - phenotype=phenotypes), - -rule config: - input: - config = 'config.yaml', - baseline = lambda wildcards: [ - str(Path(r['base']) / wildcards.phenotype / r['type'] / - 'eval/burden_associations.parquet') - for r in config['baseline_results'] - ] - output: - seed_genes = '{phenotype}/deeprvat/seed_genes.parquet', - config = '{phenotype}/deeprvat/hpopt_config.yaml', - baseline = '{phenotype}/deeprvat/baseline_results.parquet', - threads: 1 - params: - baseline_results = lambda wildcards, input: ''.join([ - 
f'--baseline-results {b} ' - for b in input.baseline - ]) - shell: - ( - 'deeprvat_config update-config ' - '--phenotype {wildcards.phenotype} ' - '{params.baseline_results}' - '--baseline-results-out {output.baseline} ' - '--seed-genes-out {output.seed_genes} ' - '{input.config} ' - '{output.config}' - ) + phenotype=phenotypes), \ No newline at end of file From f827648567c3248a15c8e65d84d039873fe422a4 Mon Sep 17 00:00:00 2001 From: Kayla Meyer Date: Thu, 14 Dec 2023 16:26:50 +0100 Subject: [PATCH 02/15] bug-fix pretrained model path --- pipelines/association_testing_pretrained.snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/association_testing_pretrained.snakefile b/pipelines/association_testing_pretrained.snakefile index 379ba795..dca4fc7c 100644 --- a/pipelines/association_testing_pretrained.snakefile +++ b/pipelines/association_testing_pretrained.snakefile @@ -14,7 +14,7 @@ n_repeats = config['n_repeats'] debug = '--debug ' if debug_flag else '' do_scoretest = '--do-scoretest ' if config.get('do_scoretest', False) else '' tensor_compression_level = config['training'].get('tensor_compression_level', 1) -pretrained_model_path = Path('models') #Path(config.get("pretrained_model_path", "pretrained_models")) +pretrained_model_path = Path(config.get("pretrained_model_path", "pretrained_models")) wildcard_constraints: repeat="\d+", From 5a8696e2d8774291f00f58cfa41aac3cc0a50ecd Mon Sep 17 00:00:00 2001 From: Kayla Meyer Date: Fri, 15 Dec 2023 16:01:13 +0100 Subject: [PATCH 03/15] Adding additional snakemake pipeline run option to readthedocs --- docs/usage.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/docs/usage.md b/docs/usage.md index 93361782..5d7c9170 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -56,6 +56,20 @@ Replace `[path_to_deeprvat]` with the path to your clone of the repository. Note that the example data is randomly generated, and so is only suited for testing whether the `deeprvat` package has been correctly installed. +### Run the training pipeline on some example data + +```shell +mkdir example +cd example +ln -s [path_to_deeprvat]/example/* . +snakemake -j 1 --snakefile [path_to_deeprvat]/pipelines/run_training.snakefile +``` + +Replace `[path_to_deeprvat]` with the path to your clone of the repository. + +Note that the example data is randomly generated, and so is only suited for testing whether the `deeprvat` package has been correctly installed. 
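+
+If the pipeline runs to completion, the trained models are written to
+`models/`: one checkpoint per bag and repeat (e.g.
+`models/repeat_0/best/bag_0.ckpt`) plus a shared `models/config.yaml`.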
+ + ### Run the association testing pipeline with pretrained models ```shell From 3f139db37f2a290f61274f3cafb67afeb118584b Mon Sep 17 00:00:00 2001 From: Kayla Meyer Date: Thu, 21 Dec 2023 12:08:34 +0100 Subject: [PATCH 04/15] update train snakefile pipeline from PR #42 --- pipelines/training/train.snakefile | 37 ++++++++++++++++-------------- 1 file changed, 20 insertions(+), 17 deletions(-) diff --git a/pipelines/training/train.snakefile b/pipelines/training/train.snakefile index c904180d..c66b6858 100644 --- a/pipelines/training/train.snakefile +++ b/pipelines/training/train.snakefile @@ -31,31 +31,34 @@ rule best_training_run: rule train: input: config = expand('{phenotype}/deeprvat/hpopt_config.yaml', - phenotype=phenotypes), + phenotype=training_phenotypes), input_tensor = expand('{phenotype}/deeprvat/input_tensor.zarr', - phenotype=phenotypes), + phenotype=training_phenotypes), covariates = expand('{phenotype}/deeprvat/covariates.zarr', - phenotype=phenotypes), + phenotype=training_phenotypes), y = expand('{phenotype}/deeprvat/y.zarr', - phenotype=phenotypes), + phenotype=training_phenotypes), output: - config = 'models/repeat_{repeat}/trial{trial_number}/config.yaml', - finished = 'models/repeat_{repeat}/trial{trial_number}/finished.tmp' + expand('models/repeat_{repeat}/trial{trial_number}/config.yaml', + repeat=range(n_repeats), trial_number=range(n_trials)), + expand('models/repeat_{repeat}/trial{trial_number}/finished.tmp', + repeat=range(n_repeats), trial_number=range(n_trials)) params: phenotypes = " ".join( [f"--phenotype {p} " f"{p}/deeprvat/input_tensor.zarr " f"{p}/deeprvat/covariates.zarr " f"{p}/deeprvat/y.zarr" - for p in phenotypes]) + for p in training_phenotypes]) shell: - ' && '.join([ - 'deeprvat_train train ' - + debug + - '--trial-id {wildcards.trial_number} ' - "{params.phenotypes} " - 'config.yaml ' - 'models/repeat_{wildcards.repeat}/trial{wildcards.trial_number} ' - 'models/repeat_{wildcards.repeat}/hyperparameter_optimization.db', - 'touch {output.finished}' - ]) + f"parallel --jobs {n_parallel_training_jobs} --halt now,fail=1 --results train_repeat{{{{1}}}}_trial{{{{2}}}}/ " + 'deeprvat_train train ' + + debug + + '--trial-id {{2}} ' + "{params.phenotypes} " + 'config.yaml ' + 'models/repeat_{{1}}/trial{{2}} ' + "models/repeat_{{1}}/hyperparameter_optimization.db '&&' " + "touch models/repeat_{{1}}/trial{{2}}/finished.tmp " + "::: " + " ".join(map(str, range(n_repeats))) + " " + "::: " + " ".join(map(str, range(n_trials))) From 7382e2770a8870844582b4ffbfe965d88228641a Mon Sep 17 00:00:00 2001 From: Kayla Meyer Date: Thu, 21 Dec 2023 15:05:33 +0100 Subject: [PATCH 05/15] bug-fix model path for snakemake pipeline runners --- pipelines/association_testing/burdens.snakefile | 16 ++++++++-------- .../association_testing_pretrained.snakefile | 3 ++- pipelines/training_association_testing.snakefile | 6 +++--- 3 files changed, 13 insertions(+), 12 deletions(-) diff --git a/pipelines/association_testing/burdens.snakefile b/pipelines/association_testing/burdens.snakefile index 7e95372f..550390fa 100644 --- a/pipelines/association_testing/burdens.snakefile +++ b/pipelines/association_testing/burdens.snakefile @@ -3,12 +3,12 @@ rule link_burdens: priority: 1 input: checkpoints = lambda wildcards: [ - f'{pretrained_model_path}/repeat_{repeat}/best/bag_{bag}.ckpt' + f'{model_path}/repeat_{repeat}/best/bag_{bag}.ckpt' for repeat in range(n_repeats) for bag in range(n_bags) ], dataset = '{phenotype}/deeprvat/association_dataset.pkl', data_config = 
'{phenotype}/deeprvat/hpopt_config.yaml', - model_config = pretrained_model_path / 'config.yaml', + model_config = model_path / 'config.yaml', output: '{phenotype}/deeprvat/burdens/chunk{chunk}.linked' threads: 8 @@ -30,14 +30,14 @@ rule link_burdens: rule compute_burdens: priority: 10 input: - reversed = pretrained_model_path / "reverse_finished.tmp", + reversed = model_path / "reverse_finished.tmp", checkpoints = lambda wildcards: [ - pretrained_model_path / f'repeat_{repeat}/best/bag_{bag}.ckpt' + model_path / f'repeat_{repeat}/best/bag_{bag}.ckpt' for repeat in range(n_repeats) for bag in range(n_bags) ], dataset = '{phenotype}/deeprvat/association_dataset.pkl', data_config = '{phenotype}/deeprvat/hpopt_config.yaml', - model_config = pretrained_model_path / 'config.yaml', + model_config = model_path / 'config.yaml', output: '{phenotype}/deeprvat/burdens/chunk{chunk}.finished' threads: 8 @@ -57,12 +57,12 @@ rule compute_burdens: rule reverse_models: input: - checkpoints = expand(pretrained_model_path / 'repeat_{repeat}/best/bag_{bag}.ckpt', + checkpoints = expand(model_path / 'repeat_{repeat}/best/bag_{bag}.ckpt', bag=range(n_bags), repeat=range(n_repeats)), - model_config = pretrained_model_path / 'config.yaml', + model_config = model_path / 'config.yaml', data_config = Path(phenotypes[0]) / "deeprvat/hpopt_config.yaml", output: - temp(pretrained_model_path / "reverse_finished.tmp") + temp(model_path / "reverse_finished.tmp") threads: 4 shell: " && ".join([ diff --git a/pipelines/association_testing_pretrained.snakefile b/pipelines/association_testing_pretrained.snakefile index dca4fc7c..d7aaa006 100644 --- a/pipelines/association_testing_pretrained.snakefile +++ b/pipelines/association_testing_pretrained.snakefile @@ -5,6 +5,7 @@ configfile: 'config.yaml' debug_flag = config.get('debug', False) phenotypes = config['phenotypes'] phenotypes = list(phenotypes.keys()) if type(phenotypes) == dict else phenotypes +training_phenotypes = config["training"].get("phenotypes", phenotypes) n_burden_chunks = config.get('n_burden_chunks', 1) if not debug_flag else 2 n_regression_chunks = config.get('n_regression_chunks', 40) if not debug_flag else 2 @@ -14,7 +15,7 @@ n_repeats = config['n_repeats'] debug = '--debug ' if debug_flag else '' do_scoretest = '--do-scoretest ' if config.get('do_scoretest', False) else '' tensor_compression_level = config['training'].get('tensor_compression_level', 1) -pretrained_model_path = Path(config.get("pretrained_model_path", "pretrained_models")) +model_path = Path(config.get("pretrained_model_path", "pretrained_models")) wildcard_constraints: repeat="\d+", diff --git a/pipelines/training_association_testing.snakefile b/pipelines/training_association_testing.snakefile index b7001bb9..8b28ce26 100644 --- a/pipelines/training_association_testing.snakefile +++ b/pipelines/training_association_testing.snakefile @@ -15,7 +15,7 @@ n_repeats = config['n_repeats'] debug = '--debug ' if debug_flag else '' do_scoretest = '--do-scoretest ' if config.get('do_scoretest', False) else '' tensor_compression_level = config['training'].get('tensor_compression_level', 1) -pretrained_model_path = Path('models') +model_path = Path('models') n_parallel_training_jobs = config["training"].get("n_parallel_jobs", 1) wildcard_constraints: @@ -52,9 +52,9 @@ rule all_association_dataset: rule all_training: input: - expand('models/repeat_{repeat}/best/bag_{bag}.ckpt', + expand('{model_path}/repeat_{repeat}/best/bag_{bag}.ckpt', bag=range(n_bags), repeat=range(n_repeats)), - 
"models/config.yaml" + "{model_path}/config.yaml" rule all_training_dataset: input: From aec735ffbdce1ae9422aea9124709b9f18f1e668 Mon Sep 17 00:00:00 2001 From: Kayla Meyer Date: Thu, 21 Dec 2023 15:32:21 +0100 Subject: [PATCH 06/15] bug-fix f string syntax --- pipelines/training/train.snakefile | 26 +++++++++---------- .../training_association_testing.snakefile | 2 +- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/pipelines/training/train.snakefile b/pipelines/training/train.snakefile index c66b6858..bed870bf 100644 --- a/pipelines/training/train.snakefile +++ b/pipelines/training/train.snakefile @@ -1,9 +1,9 @@ rule link_config: input: - 'models/repeat_0/config.yaml' + '{model_path}/repeat_0/config.yaml' output: - "models/config.yaml" + '{model_path}/config.yaml' threads: 1 shell: "ln -s repeat_0/config.yaml {output}" @@ -11,20 +11,20 @@ rule link_config: rule best_training_run: input: - expand('models/repeat_{{repeat}}/trial{trial_number}/config.yaml', + expand('{model_path}/repeat_{{repeat}}/trial{trial_number}/config.yaml', trial_number=range(n_trials)), output: - checkpoints = expand('models/repeat_{{repeat}}/best/bag_{bag}.ckpt', + checkpoints = expand('{model_path}/repeat_{{repeat}}/best/bag_{bag}.ckpt', bag=range(n_bags)), - config = 'models/repeat_{repeat}/config.yaml' + config = '{model_path}/repeat_{repeat}/config.yaml' threads: 1 shell: ( 'deeprvat_train best-training-run ' + debug + - 'models/repeat_{wildcards.repeat} ' - 'models/repeat_{wildcards.repeat}/best ' - 'models/repeat_{wildcards.repeat}/hyperparameter_optimization.db ' + '{model_path}/repeat_{wildcards.repeat} ' + '{model_path}/repeat_{wildcards.repeat}/best ' + '{model_path}/repeat_{wildcards.repeat}/hyperparameter_optimization.db ' '{output.config}' ) @@ -39,9 +39,9 @@ rule train: y = expand('{phenotype}/deeprvat/y.zarr', phenotype=training_phenotypes), output: - expand('models/repeat_{repeat}/trial{trial_number}/config.yaml', + expand('{model_path}/repeat_{repeat}/trial{trial_number}/config.yaml', repeat=range(n_repeats), trial_number=range(n_trials)), - expand('models/repeat_{repeat}/trial{trial_number}/finished.tmp', + expand('{model_path}/repeat_{repeat}/trial{trial_number}/finished.tmp', repeat=range(n_repeats), trial_number=range(n_trials)) params: phenotypes = " ".join( @@ -57,8 +57,8 @@ rule train: '--trial-id {{2}} ' "{params.phenotypes} " 'config.yaml ' - 'models/repeat_{{1}}/trial{{2}} ' - "models/repeat_{{1}}/hyperparameter_optimization.db '&&' " - "touch models/repeat_{{1}}/trial{{2}}/finished.tmp " + '{model_path}/repeat_{{1}}/trial{{2}} ' + '{model_path}/repeat_{{1}}/hyperparameter_optimization.db "&&" ' + 'touch {model_path}/repeat_{{1}}/trial{{2}}/finished.tmp ' "::: " + " ".join(map(str, range(n_repeats))) + " " "::: " + " ".join(map(str, range(n_trials))) diff --git a/pipelines/training_association_testing.snakefile b/pipelines/training_association_testing.snakefile index 8b28ce26..320cb9fb 100644 --- a/pipelines/training_association_testing.snakefile +++ b/pipelines/training_association_testing.snakefile @@ -15,7 +15,7 @@ n_repeats = config['n_repeats'] debug = '--debug ' if debug_flag else '' do_scoretest = '--do-scoretest ' if config.get('do_scoretest', False) else '' tensor_compression_level = config['training'].get('tensor_compression_level', 1) -model_path = Path('models') +model_path = Path("models") n_parallel_training_jobs = config["training"].get("n_parallel_jobs", 1) wildcard_constraints: From 6cde84f1188a7f3aae910c72b1c53fbd9c80f219 Mon Sep 17 00:00:00 2001 From: 
Magnus Wahlberg Date: Thu, 21 Dec 2023 16:18:15 +0100 Subject: [PATCH 07/15] Update github-actions.yml --- .github/workflows/github-actions.yml | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/.github/workflows/github-actions.yml b/.github/workflows/github-actions.yml index 2f686b17..2f2e534b 100644 --- a/.github/workflows/github-actions.yml +++ b/.github/workflows/github-actions.yml @@ -8,12 +8,31 @@ jobs: steps: - name: Check out repository code uses: actions/checkout@v3 + - uses: mamba-org/setup-micromamba@v1.4.3 + with: + environment-name: deeprvat-preprocess-gh-action + environment-file: ${{ github.workspace }}/deeprvat_preprocessing_env.yml + cache-environment: true + cache-downloads: true + + - name: Fake fasta data + run: touch workdir/reference/GRCh38.primary_assembly.genome.fa.gz + + - name: Run preprocessing pipeline dry run + run: | + python -m snakemake -n -j 2 --directory ${{ github.workspace }}/example/preprocess \ + --snakefile ${{ github.workspace }}/pipelines/preprocess_no_qc.snakefile \ + --configfile ${{ github.workspace }}/pipelines/config/deeprvat_preprocess_config.yaml --show-failed-logs + shell: micromamba-shell {0} + - name: Training Association Testing smoke test uses: snakemake/snakemake-github-action@v1.24.0 with: directory: 'example' snakefile: 'pipelines/training_association_testing.snakefile' args: '-j 2 -n' + + - name: Link pretrained models run: cd ${{ github.workspace }}/example && ln -s ../pretrained_models - name: Association Testing Pretrained Smoke Test From d27e6a8b9d65561717536cd2111f8b91891cc558 Mon Sep 17 00:00:00 2001 From: Magnus Wahlberg Date: Thu, 21 Dec 2023 16:20:09 +0100 Subject: [PATCH 08/15] Update github-actions.yml --- .github/workflows/github-actions.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/github-actions.yml b/.github/workflows/github-actions.yml index 2f2e534b..186e6102 100644 --- a/.github/workflows/github-actions.yml +++ b/.github/workflows/github-actions.yml @@ -16,7 +16,7 @@ jobs: cache-downloads: true - name: Fake fasta data - run: touch workdir/reference/GRCh38.primary_assembly.genome.fa.gz + run: touch ${{ github.workspace }}/example/preprocess/workdir/reference/GRCh38.primary_assembly.genome.fa.gz - name: Run preprocessing pipeline dry run run: | @@ -32,7 +32,7 @@ jobs: snakefile: 'pipelines/training_association_testing.snakefile' args: '-j 2 -n' - + - name: Link pretrained models run: cd ${{ github.workspace }}/example && ln -s ../pretrained_models - name: Association Testing Pretrained Smoke Test From b8b82e568f823fa8030e244c7b2515b7ad9d3813 Mon Sep 17 00:00:00 2001 From: Magnus Wahlberg Date: Thu, 21 Dec 2023 16:21:25 +0100 Subject: [PATCH 09/15] Update github-actions.yml --- .github/workflows/github-actions.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/github-actions.yml b/.github/workflows/github-actions.yml index 186e6102..dbd58b87 100644 --- a/.github/workflows/github-actions.yml +++ b/.github/workflows/github-actions.yml @@ -16,7 +16,7 @@ jobs: cache-downloads: true - name: Fake fasta data - run: touch ${{ github.workspace }}/example/preprocess/workdir/reference/GRCh38.primary_assembly.genome.fa.gz + run: touch ${{ github.workspace }}/example/preprocess/workdir/reference/GRCh38.primary_assembly.genome.fa - name: Run preprocessing pipeline dry run run: | From 90edc4c3987217788a6f885e38d2661504002ad1 Mon Sep 17 00:00:00 2001 From: Magnus Wahlberg Date: Thu, 21 Dec 2023 16:23:31 +0100 Subject: [PATCH 10/15] 
Revert "Update github-actions.yml" This reverts commit b8b82e568f823fa8030e244c7b2515b7ad9d3813. --- .github/workflows/github-actions.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/github-actions.yml b/.github/workflows/github-actions.yml index dbd58b87..186e6102 100644 --- a/.github/workflows/github-actions.yml +++ b/.github/workflows/github-actions.yml @@ -16,7 +16,7 @@ jobs: cache-downloads: true - name: Fake fasta data - run: touch ${{ github.workspace }}/example/preprocess/workdir/reference/GRCh38.primary_assembly.genome.fa + run: touch ${{ github.workspace }}/example/preprocess/workdir/reference/GRCh38.primary_assembly.genome.fa.gz - name: Run preprocessing pipeline dry run run: | From 12c30de79e79050a89379303edada5e58189993c Mon Sep 17 00:00:00 2001 From: Magnus Wahlberg Date: Thu, 21 Dec 2023 16:23:35 +0100 Subject: [PATCH 11/15] Revert "Update github-actions.yml" This reverts commit d27e6a8b9d65561717536cd2111f8b91891cc558. --- .github/workflows/github-actions.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/github-actions.yml b/.github/workflows/github-actions.yml index 186e6102..2f2e534b 100644 --- a/.github/workflows/github-actions.yml +++ b/.github/workflows/github-actions.yml @@ -16,7 +16,7 @@ jobs: cache-downloads: true - name: Fake fasta data - run: touch ${{ github.workspace }}/example/preprocess/workdir/reference/GRCh38.primary_assembly.genome.fa.gz + run: touch workdir/reference/GRCh38.primary_assembly.genome.fa.gz - name: Run preprocessing pipeline dry run run: | @@ -32,7 +32,7 @@ jobs: snakefile: 'pipelines/training_association_testing.snakefile' args: '-j 2 -n' - + - name: Link pretrained models run: cd ${{ github.workspace }}/example && ln -s ../pretrained_models - name: Association Testing Pretrained Smoke Test From 289ee3efaf3912dcea8f30c0418fda8fe4f5199b Mon Sep 17 00:00:00 2001 From: Magnus Wahlberg Date: Thu, 21 Dec 2023 16:23:38 +0100 Subject: [PATCH 12/15] Revert "Update github-actions.yml" This reverts commit 6cde84f1188a7f3aae910c72b1c53fbd9c80f219. 
--- .github/workflows/github-actions.yml | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/.github/workflows/github-actions.yml b/.github/workflows/github-actions.yml index 2f2e534b..2f686b17 100644 --- a/.github/workflows/github-actions.yml +++ b/.github/workflows/github-actions.yml @@ -8,31 +8,12 @@ jobs: steps: - name: Check out repository code uses: actions/checkout@v3 - - uses: mamba-org/setup-micromamba@v1.4.3 - with: - environment-name: deeprvat-preprocess-gh-action - environment-file: ${{ github.workspace }}/deeprvat_preprocessing_env.yml - cache-environment: true - cache-downloads: true - - - name: Fake fasta data - run: touch workdir/reference/GRCh38.primary_assembly.genome.fa.gz - - - name: Run preprocessing pipeline dry run - run: | - python -m snakemake -n -j 2 --directory ${{ github.workspace }}/example/preprocess \ - --snakefile ${{ github.workspace }}/pipelines/preprocess_no_qc.snakefile \ - --configfile ${{ github.workspace }}/pipelines/config/deeprvat_preprocess_config.yaml --show-failed-logs - shell: micromamba-shell {0} - - name: Training Association Testing smoke test uses: snakemake/snakemake-github-action@v1.24.0 with: directory: 'example' snakefile: 'pipelines/training_association_testing.snakefile' args: '-j 2 -n' - - - name: Link pretrained models run: cd ${{ github.workspace }}/example && ln -s ../pretrained_models - name: Association Testing Pretrained Smoke Test From e5b6a2d8dabdf47b64bf4535e940b8f1061fa418 Mon Sep 17 00:00:00 2001 From: Magnus Wahlberg Date: Thu, 21 Dec 2023 16:28:25 +0100 Subject: [PATCH 13/15] Update github-actions.yml --- .github/workflows/github-actions.yml | 99 ++++++++++++++++++---------- 1 file changed, 63 insertions(+), 36 deletions(-) diff --git a/.github/workflows/github-actions.yml b/.github/workflows/github-actions.yml index 2f686b17..83a31730 100644 --- a/.github/workflows/github-actions.yml +++ b/.github/workflows/github-actions.yml @@ -8,26 +8,37 @@ jobs: steps: - name: Check out repository code uses: actions/checkout@v3 - - name: Training Association Testing smoke test - uses: snakemake/snakemake-github-action@v1.24.0 + - uses: mamba-org/setup-micromamba@v1.4.3 with: - directory: 'example' - snakefile: 'pipelines/training_association_testing.snakefile' - args: '-j 2 -n' + environment-name: deeprvat-gh-action + environment-file: ${{ github.workspace }}/deeprvat_env_no_gpu.yml + cache-environment: true + cache-downloads: true + - name: Link pretrained models run: cd ${{ github.workspace }}/example && ln -s ../pretrained_models - - name: Association Testing Pretrained Smoke Test - uses: snakemake/snakemake-github-action@v1.24.0 - with: - directory: 'example' - snakefile: 'pipelines/association_testing_pretrained.snakefile' - args: '-j 2 -n' - - name: Seed Gene Discovery Smoke Test - uses: snakemake/snakemake-github-action@v1.24.0 - with: - directory: 'example' - snakefile: 'pipelines/seed_gene_discovery.snakefile' - args: '-j 2 -n' + + - name: Smoketest training_association_testing pipeline + run: | + python -m snakemake -n -j 2 --directory ${{ github.workspace }}/example \ + --snakefile ${{ github.workspace }}/pipelines/training_association_testing.snakefile --show-failed-logs + shell: micromamba-shell {0} + - name: Link pretrained models + run: cd ${{ github.workspace }}/example && ln -s ../pretrained_models + shell: bash -el {0} + - name: Smoketest association_testing_pretrained pipeline + run: | + python -m snakemake -n -j 2 --directory ${{ github.workspace }}/example \ + --snakefile ${{ github.workspace 
}}/pipelines/association_testing_pretrained.snakefile --show-failed-logs + shell: micromamba-shell {0} + - name: Copy seed gene discovery snakemake config + run: cd ${{ github.workspace }}/example && cp ../deeprvat/seed_gene_discovery/config.yaml . + shell: bash -el {0} + - name: Smoketest seed_gene_discovery pipeline + run: | + python -m snakemake -n -j 2 --directory ${{ github.workspace }}/example \ + --snakefile ${{ github.workspace }}/pipelines/seed_gene_discovery.snakefile --show-failed-logs + shell: micromamba-shell {0} DeepRVAT-Pipeline-Tests: runs-on: ubuntu-latest @@ -76,21 +87,32 @@ jobs: steps: - name: Check out repository code uses: actions/checkout@v3 - - name: Preprocessing Smoke Test With QC - uses: snakemake/snakemake-github-action@v1.24.0 + - uses: mamba-org/setup-micromamba@v1.4.3 with: - directory: 'example/preprocess' - snakefile: 'pipelines/preprocess_with_qc.snakefile' - args: '-j 2 -n --configfile pipelines/config/deeprvat_preprocess_config.yaml' - stagein: 'touch example/preprocess/workdir/reference/GRCh38.primary_assembly.genome.fa' + environment-name: deeprvat-preprocess-gh-action + environment-file: ${{ github.workspace }}/deeprvat_preprocessing_env.yml + cache-environment: true + cache-downloads: true - - name: Preprocessing Smoke Test No QC - uses: snakemake/snakemake-github-action@v1.24.0 - with: - directory: 'example/preprocess' - snakefile: 'pipelines/preprocess_no_qc.snakefile' - args: '-j 2 -n --configfile pipelines/config/deeprvat_preprocess_config.yaml' - stagein: 'touch example/preprocess/workdir/reference/GRCh38.primary_assembly.genome.fa' + - name: Fake fasta data + if: steps.cache-fasta.outputs.cache-hit != 'true' + run: | + cd ${{ github.workspace }}/example/preprocess && touch workdir/reference/GRCh38.primary_assembly.genome.fa + + - name: Run preprocessing pipeline no qc Smoke Test + run: | + python -m snakemake -n -j 2 --directory ${{ github.workspace }}/example/preprocess \ + --snakefile ${{ github.workspace }}/pipelines/preprocess_no_qc.snakefile \ + --configfile ${{ github.workspace }}/pipelines/config/deeprvat_preprocess_config.yaml --show-failed-logs + shell: micromamba-shell {0} + + + - name: Preprocessing pipeline with qc Smoke Test + run: | + python -m snakemake -n -j 2 --directory ${{ github.workspace }}/example/preprocess \ + --snakefile ${{ github.workspace }}/pipelines/preprocess_with_qc.snakefile \ + --configfile ${{ github.workspace }}/pipelines/config/deeprvat_preprocess_config.yaml --show-failed-logs + shell: micromamba-shell {0} DeepRVAT-Annotation-Pipeline-Smoke-Tests: @@ -98,19 +120,24 @@ jobs: steps: - name: Check out repository code uses: actions/checkout@v3 - - name: Annotations Smoke Test - uses: snakemake/snakemake-github-action@v1.25.1 + - uses: mamba-org/setup-micromamba@v1.4.3 with: - directory: 'example/annotations' - snakefile: 'pipelines/annotations.snakefile' - args: '-j 2 -n --configfile pipelines/config/deeprvat_annotation_config.yaml' + environment-name: deeprvat-preprocess-gh-action + environment-file: ${{ github.workspace }}/deeprvat_preprocessing_env.yml + cache-environment: true + cache-downloads: true + - name: Annotations Smoke Test + run: | + python -m snakemake -n -j 2 --directory ${{ github.workspace }}/example/annotations \ + --snakefile ${{ github.workspace }}/pipelines/annotations.snakefile \ + --configfile ${{ github.workspace }}/pipelines/config/deeprvat_annotation_config.yaml --show-failed-logs + shell: micromamba-shell {0} DeepRVAT-Preprocessing-Pipeline-Tests-No-QC: runs-on: ubuntu-latest needs: 
DeepRVAT-Preprocessing-Pipeline-Smoke-Tests steps: - - name: Check out repository code uses: actions/checkout@v3 - uses: mamba-org/setup-micromamba@v1.4.3 From b79409dc5557dbdce53c30a7d352a365f2f00a1a Mon Sep 17 00:00:00 2001 From: Magnus Wahlberg Date: Thu, 21 Dec 2023 16:31:10 +0100 Subject: [PATCH 14/15] Update github-actions.yml --- .github/workflows/github-actions.yml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/.github/workflows/github-actions.yml b/.github/workflows/github-actions.yml index 83a31730..5b3ff8a6 100644 --- a/.github/workflows/github-actions.yml +++ b/.github/workflows/github-actions.yml @@ -14,10 +14,6 @@ jobs: environment-file: ${{ github.workspace }}/deeprvat_env_no_gpu.yml cache-environment: true cache-downloads: true - - - name: Link pretrained models - run: cd ${{ github.workspace }}/example && ln -s ../pretrained_models - - name: Smoketest training_association_testing pipeline run: | python -m snakemake -n -j 2 --directory ${{ github.workspace }}/example \ From 03e04a6697139571dd3100b640c001930b08ea20 Mon Sep 17 00:00:00 2001 From: Kayla Meyer Date: Fri, 22 Dec 2023 11:34:42 +0100 Subject: [PATCH 15/15] fix-model path string variable in rules --- pipelines/run_training.snakefile | 13 ++++++++----- pipelines/training/train.snakefile | 14 +++++++------- pipelines/training_association_testing.snakefile | 4 ++-- 3 files changed, 17 insertions(+), 14 deletions(-) diff --git a/pipelines/run_training.snakefile b/pipelines/run_training.snakefile index dc5f0254..0e10d79e 100644 --- a/pipelines/run_training.snakefile +++ b/pipelines/run_training.snakefile @@ -5,6 +5,7 @@ configfile: 'config.yaml' debug_flag = config.get('debug', False) phenotypes = config['phenotypes'] phenotypes = list(phenotypes.keys()) if type(phenotypes) == dict else phenotypes +training_phenotypes = config["training"].get("phenotypes", phenotypes) n_burden_chunks = config.get('n_burden_chunks', 1) if not debug_flag else 2 n_regression_chunks = config.get('n_regression_chunks', 40) if not debug_flag else 2 @@ -14,6 +15,8 @@ n_repeats = config['n_repeats'] debug = '--debug ' if debug_flag else '' do_scoretest = '--do-scoretest ' if config.get('do_scoretest', False) else '' tensor_compression_level = config['training'].get('tensor_compression_level', 1) +model_path = Path("models") +n_parallel_training_jobs = config["training"].get("n_parallel_jobs", 1) wildcard_constraints: repeat="\d+", @@ -25,18 +28,18 @@ include: "training/train.snakefile" rule all: input: - expand('models/repeat_{repeat}/best/bag_{bag}.ckpt', + expand( model_path / 'repeat_{repeat}/best/bag_{bag}.ckpt', bag=range(n_bags), repeat=range(n_repeats)), - "models/config.yaml" + model_path / "config.yaml" rule all_training_dataset: input: input_tensor = expand('{phenotype}/deeprvat/input_tensor.zarr', - phenotype=phenotypes, repeat=range(n_repeats)), + phenotype=training_phenotypes, repeat=range(n_repeats)), covariates = expand('{phenotype}/deeprvat/covariates.zarr', - phenotype=phenotypes, repeat=range(n_repeats)), + phenotype=training_phenotypes, repeat=range(n_repeats)), y = expand('{phenotype}/deeprvat/y.zarr', - phenotype=phenotypes, repeat=range(n_repeats)) + phenotype=training_phenotypes, repeat=range(n_repeats)) rule all_config: input: diff --git a/pipelines/training/train.snakefile b/pipelines/training/train.snakefile index bed870bf..c747fd1f 100644 --- a/pipelines/training/train.snakefile +++ b/pipelines/training/train.snakefile @@ -1,9 +1,9 @@ rule link_config: input: - '{model_path}/repeat_0/config.yaml' + model_path / 
'repeat_0/config.yaml' output: - '{model_path}/config.yaml' + model_path / 'config.yaml' threads: 1 shell: "ln -s repeat_0/config.yaml {output}" @@ -11,12 +11,12 @@ rule link_config: rule best_training_run: input: - expand('{model_path}/repeat_{{repeat}}/trial{trial_number}/config.yaml', + expand(model_path / 'repeat_{{repeat}}/trial{trial_number}/config.yaml', trial_number=range(n_trials)), output: - checkpoints = expand('{model_path}/repeat_{{repeat}}/best/bag_{bag}.ckpt', + checkpoints = expand(model_path / 'repeat_{{repeat}}/best/bag_{bag}.ckpt', bag=range(n_bags)), - config = '{model_path}/repeat_{repeat}/config.yaml' + config = model_path / 'repeat_{repeat}/config.yaml' threads: 1 shell: ( @@ -39,9 +39,9 @@ rule train: y = expand('{phenotype}/deeprvat/y.zarr', phenotype=training_phenotypes), output: - expand('{model_path}/repeat_{repeat}/trial{trial_number}/config.yaml', + expand(model_path / 'repeat_{repeat}/trial{trial_number}/config.yaml', repeat=range(n_repeats), trial_number=range(n_trials)), - expand('{model_path}/repeat_{repeat}/trial{trial_number}/finished.tmp', + expand(model_path / 'repeat_{repeat}/trial{trial_number}/finished.tmp', repeat=range(n_repeats), trial_number=range(n_trials)) params: phenotypes = " ".join( diff --git a/pipelines/training_association_testing.snakefile b/pipelines/training_association_testing.snakefile index 320cb9fb..60384eaf 100644 --- a/pipelines/training_association_testing.snakefile +++ b/pipelines/training_association_testing.snakefile @@ -52,9 +52,9 @@ rule all_association_dataset: rule all_training: input: - expand('{model_path}/repeat_{repeat}/best/bag_{bag}.ckpt', + expand(model_path / 'repeat_{repeat}/best/bag_{bag}.ckpt', bag=range(n_bags), repeat=range(n_repeats)), - "{model_path}/config.yaml" + model_path / "config.yaml" rule all_training_dataset: input:
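
A closing note on the model-path changes: patches 05 and 06 replace hard-coded `models/` paths with a `model_path` variable, but splice it into rule patterns as a literal `'{model_path}'`. Inside `expand()`, every `{...}` placeholder is a Snakemake wildcard to be filled from the keyword arguments, so that placeholder is never substituted with the Python variable of the same name and instead trips a missing-wildcard error; patch 15 fixes this by joining the `Path` object into the pattern first, leaving only the real wildcards. A minimal sketch, assuming a recent Snakemake install (the same `snakemake.io.expand` helper and `pathlib` support these snakefiles rely on):

```python
from pathlib import Path

from snakemake.io import expand

model_path = Path("models")

# Broken form (introduced in PATCH 05): '{model_path}' looks like a wildcard
# that expand() must fill, but no value is supplied for it, so Snakemake
# raises a missing-wildcard error instead of substituting the Python variable.
try:
    expand("{model_path}/repeat_{repeat}/best/bag_{bag}.ckpt",
           bag=range(2), repeat=range(2))
except Exception as err:
    print(f"{type(err).__name__}: {err}")

# Fixed form (PATCH 15): resolve the Path first, so only the real wildcards
# {repeat} and {bag} remain for expand() to fill in.
print(expand(model_path / "repeat_{repeat}/best/bag_{bag}.ckpt",
             bag=range(2), repeat=range(2)))
# -> ['models/repeat_0/best/bag_0.ckpt', ..., 'models/repeat_1/best/bag_1.ckpt']
```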