From 51c21dd78b8d525e75633a2b8584f285f54ce155 Mon Sep 17 00:00:00 2001 From: Brian Clarke Date: Wed, 20 Sep 2023 10:04:39 +0200 Subject: [PATCH] adaptations for running with DNANexus --- deeprvat/deeprvat/config.py | 10 +- .../association_testing_pretrained.snakefile | 144 ++++++++++++++++ dnanexus/config.yaml | 161 ++++++++++++++++++ dnanexus/deeprvat_burdens/Readme.developer.md | 31 ++++ dnanexus/deeprvat_burdens/Readme.md | 21 +++ dnanexus/deeprvat_burdens/dxapp.json | 64 +++++++ .../deeprvat_burdens/src/DeepRvat-test.sh | 69 ++++++++ dnanexus/deeprvat_burdens/src/run.sh | 30 ++++ 8 files changed, 526 insertions(+), 4 deletions(-) create mode 100644 dnanexus/association_testing_pretrained.snakefile create mode 100644 dnanexus/config.yaml create mode 100644 dnanexus/deeprvat_burdens/Readme.developer.md create mode 100644 dnanexus/deeprvat_burdens/Readme.md create mode 100644 dnanexus/deeprvat_burdens/dxapp.json create mode 100755 dnanexus/deeprvat_burdens/src/DeepRvat-test.sh create mode 100644 dnanexus/deeprvat_burdens/src/run.sh diff --git a/deeprvat/deeprvat/config.py b/deeprvat/deeprvat/config.py index 410f4141..8ccde5de 100644 --- a/deeprvat/deeprvat/config.py +++ b/deeprvat/deeprvat/config.py @@ -42,9 +42,10 @@ def update_config( new_config_file: str, ): if seed_gene_dir is None and len(baseline_results) == 0: - raise ValueError( - "One of --seed-gene-dir and --baseline-results " "must be specified" - ) + logger.warning("Neither --seed-gene-dir nor --baseline-results " + "were specified. 
This will result in errors " + "when trying to train DeepRVAT using the resulting " + "config files.") with open(old_config_file) as f: config = yaml.safe_load(f) @@ -52,7 +53,8 @@ def update_config( if phenotype is not None: logger.info(f"Updating config for phenotype {phenotype}") config["data"]["dataset_config"]["y_phenotypes"] = [phenotype] - config["training_data"]["dataset_config"]["y_phenotypes"] = [phenotype] + if "training_data" in config: + config["training_data"]["dataset_config"]["y_phenotypes"] = [phenotype] # For using seed genes from results of baseline methods if len(baseline_results) > 0: diff --git a/dnanexus/association_testing_pretrained.snakefile b/dnanexus/association_testing_pretrained.snakefile new file mode 100644 index 00000000..768e76e7 --- /dev/null +++ b/dnanexus/association_testing_pretrained.snakefile @@ -0,0 +1,144 @@ +from pathlib import Path +from typing import Iterable, Union + +configfile: 'config.yaml' + +debug_flag = config.get('debug', False) +phenotypes = config['phenotypes'] +phenotypes = list(phenotypes.keys()) if isinstance(phenotypes, dict) else phenotypes + +n_burden_chunks = config.get('n_burden_chunks', 1) if not debug_flag else 2 +n_repeats = config['n_repeats'] +debug = '--debug ' if debug_flag else '' +pretrained_model_path = Path(config.get("pretrained_model_path", "pretrained_models")) + +dnanexus_destination = Path(config["dnanexus"]["destination"]) +dnanexus_applet = config["dnanexus"]["applet"] +dnanexus_priority = config["dnanexus"].get("priority", "low") +dnanexus_configfile = config["dnanexus"]["configfile"] + + +def dx_run( + command: str, + mkdirs: Union[str, Iterable[str]], + instance_type: str, + dx_configfile: str = dnanexus_configfile, + cost_limit: float = 1.00, + destination: str = dnanexus_destination, + applet: str = dnanexus_applet, + dx_priority: str = dnanexus_priority, +): + if isinstance(mkdirs, str): + mkdirs = [mkdirs] + + mkdir_string = " && ".join(f"mkdir -p {d}" for d in mkdirs) + +
dx_run_shell = f"dx run {applet} " + dx_run_shell += f"--instance-type {instance_type} " + dx_run_shell += f"--priority {dx_priority} " + dx_run_shell += f"--cost-limit {cost_limit} " + dx_run_shell += f"-iconfig={dx_configfile} " + dx_run_shell += f"-icommand='" + mkdir_string + dx_run_shell += f" && {command}' " + dx_run_shell += f"--destination {destination} " + dx_run_shell += f"--wait " + dx_run_shell += f"-y " + + return dx_run_shell + +wildcard_constraints: + repeat=r"\d+", + trial=r"\d+", + + +rule all: + input: + expand("{phenotype}/deeprvat/burdens/chunk{chunk}.finished", + phenotype=phenotypes, + chunk=range(n_burden_chunks)) + +rule compute_burdens: + priority: 10 + input: + reversed = pretrained_model_path / "reverse_finished.tmp", + checkpoints = lambda wildcards: [ + pretrained_model_path / f'repeat_{repeat}/best/bag_{bag}.ckpt' + for repeat in range(n_repeats) for bag in range(n_bags) + ], + dataset = '{phenotype}/deeprvat/association_dataset.pkl', + data_config = '{phenotype}/deeprvat/hpopt_config.yaml', + model_config = pretrained_model_path / 'config.yaml', + output: + '{phenotype}/deeprvat/burdens/chunk{chunk}.finished' + threads: 8 + shell: + ' && '.join([ + ('deeprvat_associate compute-burdens ' + + debug + + ' --n-chunks '+ str(n_burden_chunks) + ' ' + '--chunk {wildcards.chunk} ' + '--dataset-file {input.dataset} ' + '{input.data_config} ' + '{input.model_config} ' + '{input.checkpoints} ' + '{wildcards.phenotype}/deeprvat/burdens'), + 'touch {output}' + ]) + +rule all_association_dataset: + input: + expand('{phenotype}/deeprvat/association_dataset.pkl', + phenotype=phenotypes) + +rule association_dataset: + input: + config = '{phenotype}/deeprvat/hpopt_config.yaml' + output: + '{phenotype}/deeprvat/association_dataset.pkl' + threads: 1 + params: + dx_run = lambda wildcards, input, output: dx_run( + command=( + 'deeprvat_associate make-dataset ' + + debug + + str("/mnt/project/DeepRVAT" / dnanexus_destination / f'{input.config} ') + + f'{output}' +
), + mkdirs=f"{wildcards.phenotype}/deeprvat", + instance_type="mem3_ssd1_v2_x4", + cost_limit=1, + ), + shell: + " && ".join([ + "{params.dx_run}", + "touch {output}" + ]) + +rule all_config: + input: + config = expand('{phenotype}/deeprvat/hpopt_config.yaml', + phenotype=phenotypes), + +rule config: + input: + config = 'config.yaml', + output: + config = '{phenotype}/deeprvat/hpopt_config.yaml', + params: + dx_run = lambda wildcards, input, output: dx_run( + command=( + 'deeprvat_config update-config ' + f'--phenotype {wildcards.phenotype} ' + f'{input.config} ' + f'{output.config}' + ), + mkdirs=f"{wildcards.phenotype}/deeprvat", + instance_type="mem1_ssd1_v2_x2", + cost_limit=0.10, + ), + threads: 1 + shell: + " && ".join([ + "{params.dx_run}", + "touch {output}" + ]) diff --git a/dnanexus/config.yaml b/dnanexus/config.yaml new file mode 100644 index 00000000..d140044c --- /dev/null +++ b/dnanexus/config.yaml @@ -0,0 +1,161 @@ +phenotypes: + Calcium: + correction_method: FDR + n_training_genes: 100 + baseline_phenotype: Calcium + +n_burden_chunks: 1 + +n_repeats: 6 + +do_scoretest: True + +dnanexus: + configfile: DeepRVAT/workdir/pretrained_scoring/config.yaml + destination: DeepRVAT/workdir/pretrained_scoring + applet: deeprvat_burdens + priority: low + +model: + type: DeepSet + model_collection: agg_models + checkpoint: combined_agg.pt + config: + phi_layers: 2 + phi_hidden_dim: 20 + rho_layers: 3 + rho_hidden_dim: 10 + activation: LeakyReLU + pool: max + use_sigmoid: True + metrics: + objective: MSE + objective_mode: min + loss: MSE + all: + MSE: {} + PearsonCorrTorch: {} + MAE: {} + RSquared: {} + optimizer: + type: AdamW + config: {} + +data: + gt_file: /mnt/project/DeepRVAT/DeepRVAT/data/preprocessed/genotypes.h5 + variant_file: /mnt/project/DeepRVAT/DeepRVAT/data/variants.parquet + dataset_config: + min_common_af: + MAF: 0.01 + phenotype_file: /mnt/project/DeepRVAT/DeepRVAT/data/phenotypes.parquet + y_transformation: quantile_transform + x_phenotypes: + 
- age + - genetic_sex + - genetic_PC_1 + - genetic_PC_2 + - genetic_PC_3 + - genetic_PC_4 + - genetic_PC_5 + - genetic_PC_6 + - genetic_PC_7 + - genetic_PC_8 + - genetic_PC_9 + - genetic_PC_10 + - genetic_PC_11 + - genetic_PC_12 + - genetic_PC_13 + - genetic_PC_14 + - genetic_PC_15 + - genetic_PC_16 + - genetic_PC_17 + - genetic_PC_18 + - genetic_PC_19 + - genetic_PC_20 + annotation_file: /mnt/project/DeepRVAT/DeepRVAT/data/annotations.parquet + annotations: + - MAF + - MAF_MB + - CADD_PHRED + - CADD_raw + - sift_score + - polyphen_score + - Consequence_splice_acceptor_variant + - Consequence_splice_donor_variant + - Consequence_stop_gained + - Consequence_frameshift_variant + - Consequence_stop_lost + - Consequence_start_lost + - Consequence_inframe_insertion + - Consequence_inframe_deletion + - Consequence_missense_variant + - Consequence_protein_altering_variant + - Consequence_splice_region_variant + - condel_score + - DeepSEA_PC_1 + - DeepSEA_PC_2 + - DeepSEA_PC_3 + - DeepSEA_PC_4 + - DeepSEA_PC_5 + - DeepSEA_PC_6 + - PrimateAI_score + - AbSplice_DNA + - DeepRipe_plus_QKI_lip_hg2 + - DeepRipe_plus_QKI_clip_k5 + - DeepRipe_plus_KHDRBS1_clip_k5 + - DeepRipe_plus_ELAVL1_parclip + - DeepRipe_plus_TARDBP_parclip + - DeepRipe_plus_HNRNPD_parclip + - DeepRipe_plus_MBNL1_parclip + - DeepRipe_plus_QKI_parclip + - SpliceAI_delta_score + gene_file: /mnt/project/DeepRVAT/DeepRVAT/data/protein_coding_genes.parquet + use_common_variants: False + use_rare_variants: True + rare_embedding: + type: PaddedAnnotations + config: + annotations: + - MAF_MB + - CADD_raw + - sift_score + - polyphen_score + - Consequence_splice_acceptor_variant + - Consequence_splice_donor_variant + - Consequence_stop_gained + - Consequence_frameshift_variant + - Consequence_stop_lost + - Consequence_start_lost + - Consequence_inframe_insertion + - Consequence_inframe_deletion + - Consequence_missense_variant + - Consequence_protein_altering_variant + - Consequence_splice_region_variant + - 
condel_score + - DeepSEA_PC_1 + - DeepSEA_PC_2 + - DeepSEA_PC_3 + - DeepSEA_PC_4 + - DeepSEA_PC_5 + - DeepSEA_PC_6 + - PrimateAI_score + - AbSplice_DNA + - DeepRipe_plus_QKI_lip_hg2 + - DeepRipe_plus_QKI_clip_k5 + - DeepRipe_plus_KHDRBS1_clip_k5 + - DeepRipe_plus_ELAVL1_parclip + - DeepRipe_plus_TARDBP_parclip + - DeepRipe_plus_HNRNPD_parclip + - DeepRipe_plus_MBNL1_parclip + - DeepRipe_plus_QKI_parclip + - SpliceAI_delta_score + thresholds: + MAF: "MAF < 1e-3" + CADD_PHRED: "CADD_PHRED > 5" + gene_file: /mnt/project/DeepRVAT/DeepRVAT/data/protein_coding_genes.parquet + verbose: True + low_memory: True + verbose: True + dataloader_config: + batch_size: 16 + num_workers: 10 diff --git a/dnanexus/deeprvat_burdens/Readme.developer.md b/dnanexus/deeprvat_burdens/Readme.developer.md new file mode 100644 index 00000000..13954700 --- /dev/null +++ b/dnanexus/deeprvat_burdens/Readme.developer.md @@ -0,0 +1,31 @@ +# deeprvat_burdens Developer Readme + + + +## Running this app with additional computational resources + +This app has the following entry points: + +* main + +When running this app, you can override the instance type to be used by +providing the ``systemRequirements`` field to ```/applet-XXXX/run``` or +```/app-XXXX/run```, as follows: + + { + systemRequirements: { + "main": {"instanceType": "mem2_hdd2_x2"} + }, + [...] + } + +See [Run +Specification](https://documentation.dnanexus.com/developer/api/running-analyses/io-and-run-specifications#run-specification) in the API documentation for more information about the +available instance types. diff --git a/dnanexus/deeprvat_burdens/Readme.md b/dnanexus/deeprvat_burdens/Readme.md new file mode 100644 index 00000000..2d90b54a --- /dev/null +++ b/dnanexus/deeprvat_burdens/Readme.md @@ -0,0 +1,21 @@ + +# DeepRVAT (DNAnexus Platform App) + +Rare variant association testing using deep learning and data-driven burden scores + +This is the source code for an app that runs on the DNAnexus Platform. +For more information about how to run or modify it, see +https://documentation.dnanexus.com/.
+ + + + + diff --git a/dnanexus/deeprvat_burdens/dxapp.json b/dnanexus/deeprvat_burdens/dxapp.json new file mode 100644 index 00000000..ca408e4c --- /dev/null +++ b/dnanexus/deeprvat_burdens/dxapp.json @@ -0,0 +1,64 @@ +{ +"name": "deeprvat_burdens", +"title": "deeprvat_burdens", +"summary": "compute DeepRVAT burden scores", + "dxapi": "1.0.0", + "version": "0.0.1", + "inputSpec": [ + { + "name": "config", + "label": "config", + "class": "string", + "optional": false, + "patterns": [ + "*" + ], + "help": "" + }, + { + "name": "command", + "label": "command", + "class": "string", + "optional": false, + "help": "" + } + ], + "outputSpec": [ + { + "name": "results", + "label": "results", + "class": "array:file", + "patterns": [ + "*" + ], + "help": "" + } + ], + "runSpec": { + "timeoutPolicy": { + "*": { + "hours": 1 + } + }, + "interpreter": "bash", + "file": "src/run.sh", + "distribution": "Ubuntu", + "release": "20.04", + "version": "0" + }, + "access": { + "network": [ + "*" + ], + "project": "CONTRIBUTE" + }, + "regionalOptions": { + "aws:eu-west-2": { + "systemRequirements": { + "*": { + "instanceType": "mem1_ssd1_v2_x4" + } + } + } + } +} diff --git a/dnanexus/deeprvat_burdens/src/DeepRvat-test.sh b/dnanexus/deeprvat_burdens/src/DeepRvat-test.sh new file mode 100755 index 00000000..0b02d03a --- /dev/null +++ b/dnanexus/deeprvat_burdens/src/DeepRvat-test.sh @@ -0,0 +1,69 @@ +#!/bin/bash +# DeepRvat-test 0.0.1 +# Generated by dx-app-wizard. +# +# Basic execution pattern: Your app will run on a single machine from +# beginning to end. +# +# Your job's input variables (if any) will be loaded as environment +# variables before this script runs. Any array inputs will be loaded +# as bash arrays. +# +# Any code outside of main() (or any entry point you may add) is +# ALWAYS executed, followed by running the entry point itself. +# +# See https://documentation.dnanexus.com/developer for tutorials on how +# to modify this file. 
+ +main() { + + echo "Value of input_file: '$input_file'" + echo "Value of parameter1: '$parameter1'" + echo "Value of parameter2: '$parameter2'" + + # The following line(s) use the dx command-line tool to download your file + # inputs to the local file system using variable names for the filenames. To + # recover the original filenames, you can use the output of "dx describe + # "$variable" --name". + + dx download "$input_file" -o input_file + # Building the environment + mkdir -p deeprvat_non_cuda + tar -xzf /deeprvat_non_cuda.tar.gz -C deeprvat_non_cuda + ./deeprvat_non_cuda/bin/python + source deeprvat_non_cuda/bin/activate + echo $parameter1 + echo $parameter2 + conda-unpack + conda list > output_file + # python /test.py test input_file output_file + + # Fill in your application code here. + # + # To report any recognized errors in the correct format in + # $HOME/job_error.json and exit this script, you can use the + # dx-jobutil-report-error utility as follows: + # + # dx-jobutil-report-error "My error message" + # + # Note however that this entire bash script is executed with -e + # when running in the cloud, so any line which returns a nonzero + # exit code will prematurely exit the script; if no error was + # reported in the job_error.json file, then the failure reason + # will be AppInternalError with a generic error message. + + # The following line(s) use the dx command-line tool to upload your file + # outputs after you have created them on the local file system. It assumes + # that you have used the output field name for the filename for each output, + # but you can change that behavior to suit your needs. Run "dx upload -h" + # to see more options to set metadata. + + output_file=$(dx upload output_file --brief) + + # The following line(s) use the utility dx-jobutil-add-output to format and + # add output variables to your job's output as appropriate for the output + # class. Run "dx-jobutil-add-output -h" for more information on what it + # does. 
+ + dx-jobutil-add-output output_file "$output_file" --class=file +} diff --git a/dnanexus/deeprvat_burdens/src/run.sh b/dnanexus/deeprvat_burdens/src/run.sh new file mode 100644 index 00000000..6dc4f5e8 --- /dev/null +++ b/dnanexus/deeprvat_burdens/src/run.sh @@ -0,0 +1,30 @@ +main() { + echo "Mounting via dxfuse" + mkdir -pv /mnt/project + dxfuse -verbose 2 /mnt/project DeepRVAT + sleep 3 + echo "----------" + + echo "Unpacking conda env" + mkdir -p deeprvat_non_cuda + tar -xzf /mnt/project/DeepRVAT/DeepRVAT/deeprvat_non_cuda.tar.gz -C deeprvat_non_cuda + ./deeprvat_non_cuda/bin/python --version + source deeprvat_non_cuda/bin/activate + conda-unpack + + echo "Installing deeprvat package" + tar -xzf /mnt/project/DeepRVAT/DeepRVAT/deeprvat.tar.gz + pip install -e deeprvat + + echo "Executing command: $command using config $config" + mkdir -p out/results + cd out/results + cp "/mnt/project/DeepRVAT/$config" . + eval "$command" + + echo "Uploading outputs" + rm "$(basename "$config")" + dx-upload-all-outputs + + echo "DONE!" +}