From 33d1798703781cb49e499828201cca10d1a204c6 Mon Sep 17 00:00:00 2001 From: Kayla Meyer Date: Fri, 27 Sep 2024 11:43:17 +0200 Subject: [PATCH] Add pytests for configuration file generation --- tests/deeprvat/test_config.py | 63 ++++ .../expected/deeprvat_config.yaml | 243 ++++++++++++++++ ...eprvat_input_pretrained_models_config.yaml | 80 +++++ .../expected/deeprvat_config.yaml | 256 ++++++++++++++++ ...eprvat_input_pretrained_models_config.yaml | 80 +++++ .../expected/deeprvat_config.yaml | 269 +++++++++++++++++ .../input/deeprvat_input_config.yaml | 159 ++++++++++ .../expected/deeprvat_config.yaml | 273 ++++++++++++++++++ .../input/deeprvat_input_config.yaml | 159 ++++++++++ .../expected/deeprvat_config.yaml | 214 ++++++++++++++ .../input/deeprvat_input_training_config.yaml | 122 ++++++++ 11 files changed, 1918 insertions(+) create mode 100644 tests/deeprvat/test_config.py create mode 100644 tests/deeprvat/test_data/config/association_testing_pretrained/expected/deeprvat_config.yaml create mode 100644 tests/deeprvat/test_data/config/association_testing_pretrained/input/deeprvat_input_pretrained_models_config.yaml create mode 100644 tests/deeprvat/test_data/config/association_testing_pretrained_regenie/expected/deeprvat_config.yaml create mode 100644 tests/deeprvat/test_data/config/association_testing_pretrained_regenie/input/deeprvat_input_pretrained_models_config.yaml create mode 100644 tests/deeprvat/test_data/config/training_association_testing/expected/deeprvat_config.yaml create mode 100644 tests/deeprvat/test_data/config/training_association_testing/input/deeprvat_input_config.yaml create mode 100644 tests/deeprvat/test_data/config/training_association_testing_cv/expected/deeprvat_config.yaml create mode 100644 tests/deeprvat/test_data/config/training_association_testing_cv/input/deeprvat_input_config.yaml create mode 100644 tests/deeprvat/test_data/config/training_only/expected/deeprvat_config.yaml create mode 100644 
tests/deeprvat/test_data/config/training_only/input/deeprvat_input_training_config.yaml

diff --git a/tests/deeprvat/test_config.py b/tests/deeprvat/test_config.py
new file mode 100644
index 00000000..86baf8c3
--- /dev/null
+++ b/tests/deeprvat/test_config.py
@@ -0,0 +1,63 @@
+import logging
+from pprint import pprint
+import pandas as pd
+import yaml
+import pytest
+from click.testing import CliRunner
+
+from pathlib import Path
+from deeprvat.deeprvat.config import cli as config_cli
+from deeprvat.deeprvat.config import create_main_config, load_yaml
+
+script_dir = Path(__file__).resolve().parent
+tests_data_dir = script_dir / "test_data" / "config"
+
+
+@pytest.mark.parametrize(
+    "test_data_name_dir, input_config, clobber",
+    [
+        (
+            "training_only",
+            "deeprvat_input_training_config.yaml",
+            True,
+        ),
+        (
+            "training_association_testing",
+            "deeprvat_input_config.yaml",
+            True,
+        ),
+        (
+            "training_association_testing_cv",
+            "deeprvat_input_config.yaml",
+            True,
+        ),
+        (
+            "association_testing_pretrained_regenie",
+            "deeprvat_input_pretrained_models_config.yaml",
+            True,
+        ),
+        (
+            "association_testing_pretrained",
+            "deeprvat_input_pretrained_models_config.yaml",
+            True,
+        ),
+    ],
+)
+def test_create_main_config(test_data_name_dir, input_config, clobber, tmp_path):
+    """Generate the full config from an input config and compare it to the fixture.
+
+    Each case directory under ``tests_data_dir`` holds ``input/<input_config>``
+    and ``expected/deeprvat_config.yaml``; the generated file is written to
+    pytest's ``tmp_path``. ``clobber`` is forwarded to ``create_main_config``.
+    """
+    case_dir = tests_data_dir / test_data_name_dir
+    config_file_input = case_dir / "input" / input_config
+    expected_config = case_dir / "expected" / "deeprvat_config.yaml"
+    generated_config_file = tmp_path / "deeprvat_config.yaml"
+
+    create_main_config(config_file_input.as_posix(), tmp_path.as_posix(), clobber)
+    assert generated_config_file.exists()
+
+    # `==` compares the loaded structures recursively (nested keys and lists).
+    generated_config = load_yaml(generated_config_file.as_posix())
+    assert generated_config == load_yaml(expected_config.as_posix())
diff --git a/tests/deeprvat/test_data/config/association_testing_pretrained/expected/deeprvat_config.yaml
b/tests/deeprvat/test_data/config/association_testing_pretrained/expected/deeprvat_config.yaml new file mode 100644 index 00000000..c25e4a97 --- /dev/null +++ b/tests/deeprvat/test_data/config/association_testing_pretrained/expected/deeprvat_config.yaml @@ -0,0 +1,243 @@ +association_testing_data: + dataloader_config: + batch_size: 16 + num_workers: 10 + dataset_config: + annotation_file: annotations.parquet + annotations: + - MAF_MB + - MAF + - CADD_PHRED + - CADD_raw + - sift_score + - polyphen_score + - Consequence_splice_acceptor_variant + - Consequence_splice_donor_variant + - Consequence_stop_gained + - Consequence_frameshift_variant + - Consequence_stop_lost + - Consequence_start_lost + - Consequence_inframe_insertion + - Consequence_inframe_deletion + - Consequence_missense_variant + - Consequence_protein_altering_variant + - Consequence_splice_region_variant + - condel_score + - DeepSEA_PC_1 + - DeepSEA_PC_2 + - DeepSEA_PC_3 + - DeepSEA_PC_4 + - DeepSEA_PC_5 + - DeepSEA_PC_6 + - PrimateAI_score + - AbSplice_DNA + - DeepRipe_plus_QKI_lip_hg2 + - DeepRipe_plus_QKI_clip_k5 + - DeepRipe_plus_KHDRBS1_clip_k5 + - DeepRipe_plus_ELAVL1_parclip + - DeepRipe_plus_TARDBP_parclip + - DeepRipe_plus_HNRNPD_parclip + - DeepRipe_plus_MBNL1_parclip + - DeepRipe_plus_QKI_parclip + - SpliceAI_delta_score + - alphamissense + gene_file: protein_coding_genes.parquet + min_common_af: + MAF: 0.001 + phenotype_file: phenotypes.parquet + rare_embedding: + config: + annotations: &id001 + - MAF_MB + - CADD_raw + - sift_score + - polyphen_score + - Consequence_splice_acceptor_variant + - Consequence_splice_donor_variant + - Consequence_stop_gained + - Consequence_frameshift_variant + - Consequence_stop_lost + - Consequence_start_lost + - Consequence_inframe_insertion + - Consequence_inframe_deletion + - Consequence_missense_variant + - Consequence_protein_altering_variant + - Consequence_splice_region_variant + - condel_score + - DeepSEA_PC_1 + - DeepSEA_PC_2 + - DeepSEA_PC_3 + - 
DeepSEA_PC_4 + - DeepSEA_PC_5 + - DeepSEA_PC_6 + - PrimateAI_score + - AbSplice_DNA + - DeepRipe_plus_QKI_lip_hg2 + - DeepRipe_plus_QKI_clip_k5 + - DeepRipe_plus_KHDRBS1_clip_k5 + - DeepRipe_plus_ELAVL1_parclip + - DeepRipe_plus_TARDBP_parclip + - DeepRipe_plus_HNRNPD_parclip + - DeepRipe_plus_MBNL1_parclip + - DeepRipe_plus_QKI_parclip + - SpliceAI_delta_score + - alphamissense + gene_file: protein_coding_genes.parquet + low_memory: true + thresholds: + CADD_PHRED: CADD_PHRED > 5 + MAF: MAF < 1e-3 + verbose: true + type: PaddedAnnotations + use_common_variants: false + use_rare_variants: true + verbose: true + x_phenotypes: &id002 + - age + - age2 + - age_sex + - genetic_sex + - genetic_PC_1 + - genetic_PC_2 + - genetic_PC_3 + - genetic_PC_4 + - genetic_PC_5 + - genetic_PC_6 + - genetic_PC_7 + - genetic_PC_8 + - genetic_PC_9 + - genetic_PC_10 + - genetic_PC_11 + - genetic_PC_12 + - genetic_PC_13 + - genetic_PC_14 + - genetic_PC_15 + - genetic_PC_16 + - genetic_PC_17 + - genetic_PC_18 + - genetic_PC_19 + - genetic_PC_20 + y_transformation: quantile_transform + gt_file: genotypes.h5 + variant_file: variants.parquet +cv_exp: false +deterministic: false +do_scoretest: true +evaluation: + alpha: 0.05 + correction_method: Bonferroni +hyperparameter_optimization: + direction: maximize + n_trials: 1 + sampler: + config: {} + type: TPESampler +model: + checkpoint: combined_agg.pt + config: + activation: LeakyReLU + metrics: + all: + MAE: {} + MSE: {} + PearsonCorrTorch: {} + RSquared: {} + loss: MSE + objective: MSE + objective_mode: min + optimizer: + config: {} + type: AdamW + phi_hidden_dim: 20 + phi_layers: 2 + pool: max + rho_hidden_dim: 10 + rho_layers: 3 + use_sigmoid: true + model_collection: agg_models + type: DeepSet +n_avg_chunks: 1 +n_burden_chunks: 5 +n_regression_chunks: 2 +n_repeats: 30 +phenotypes: + - Apolipoprotein_A + - Apolipoprotein_B + - Calcium +pretrained_model_path: pretrained_models +regenie_exp: false +training: + dataloader_config: + batch_size: 
1024 + cache_tensors: true + chunksize: 100 + num_workers: 0 + temp_dir: $TMPDIR/deeprvat_train + drop_n_bags: 0 + min_variant_count: 0 + n_bags: 1 + n_parallel_jobs: 6 + sample_with_replacement: false + train_proportion: 0.8 +training_data: + dataloader_config: + batch_size: 64 + num_workers: 8 + dataset_config: + annotation_file: annotations.parquet + annotations: + - MAF_MB + - MAF + - CADD_PHRED + - CADD_raw + - sift_score + - polyphen_score + - Consequence_splice_acceptor_variant + - Consequence_splice_donor_variant + - Consequence_stop_gained + - Consequence_frameshift_variant + - Consequence_stop_lost + - Consequence_start_lost + - Consequence_inframe_insertion + - Consequence_inframe_deletion + - Consequence_missense_variant + - Consequence_protein_altering_variant + - Consequence_splice_region_variant + - condel_score + - DeepSEA_PC_1 + - DeepSEA_PC_2 + - DeepSEA_PC_3 + - DeepSEA_PC_4 + - DeepSEA_PC_5 + - DeepSEA_PC_6 + - PrimateAI_score + - AbSplice_DNA + - DeepRipe_plus_QKI_lip_hg2 + - DeepRipe_plus_QKI_clip_k5 + - DeepRipe_plus_KHDRBS1_clip_k5 + - DeepRipe_plus_ELAVL1_parclip + - DeepRipe_plus_TARDBP_parclip + - DeepRipe_plus_HNRNPD_parclip + - DeepRipe_plus_MBNL1_parclip + - DeepRipe_plus_QKI_parclip + - SpliceAI_delta_score + - alphamissense + min_common_af: + MAF: 0.01 + phenotype_file: phenotypes.parquet + rare_embedding: + config: + annotations: *id001 + low_memory: true + thresholds: + CADD_PHRED: CADD_PHRED > 5 + MAF: MAF < 1e-2 + verbose: true + type: PaddedAnnotations + use_common_variants: false + use_rare_variants: true + verbose: true + x_phenotypes: *id002 + y_transformation: quantile_transform + gt_file: genotypes.h5 + variant_file: variants.parquet diff --git a/tests/deeprvat/test_data/config/association_testing_pretrained/input/deeprvat_input_pretrained_models_config.yaml b/tests/deeprvat/test_data/config/association_testing_pretrained/input/deeprvat_input_pretrained_models_config.yaml new file mode 100644 index 00000000..66d4c8c6 --- 
/dev/null +++ b/tests/deeprvat/test_data/config/association_testing_pretrained/input/deeprvat_input_pretrained_models_config.yaml @@ -0,0 +1,80 @@ +use_pretrained_models: True +pretrained_model_path : pretrained_models + +#Phenotypes to be used only for Association Testing +phenotypes_for_association_testing: + - Apolipoprotein_A + - Apolipoprotein_B + - Calcium + +#File paths of necessary input files to DeepRVAT +gt_filename: genotypes.h5 +variant_filename: variants.parquet +phenotype_filename: phenotypes.parquet +annotation_filename: annotations.parquet +gene_filename: protein_coding_genes.parquet + +covariates: #x_phenotypes + - age + - age2 + - age_sex + - genetic_sex + - genetic_PC_1 + - genetic_PC_2 + - genetic_PC_3 + - genetic_PC_4 + - genetic_PC_5 + - genetic_PC_6 + - genetic_PC_7 + - genetic_PC_8 + - genetic_PC_9 + - genetic_PC_10 + - genetic_PC_11 + - genetic_PC_12 + - genetic_PC_13 + - genetic_PC_14 + - genetic_PC_15 + - genetic_PC_16 + - genetic_PC_17 + - genetic_PC_18 + - genetic_PC_19 + - genetic_PC_20 + +association_testing_data_thresholds: + MAF: "< 1e-3" + CADD_PHRED: "> 5" + +#DeepRVAT model settings +n_repeats: 30 +y_transformation: quantile_transform + +# Results evaluation settings +evaluation: + correction_method: Bonferroni + alpha: 0.05 + +# Subsetting samples for association testing +#sample_files: +# association_testing: association_testing_samples.pkl + +#Additional settings if using the CV pipeline +cv_options: + cv_exp: False + #cv_path: sample_files + #n_folds: 5 + +#Additional settings if using the REGENIE integration +regenie_options: + regenie_exp: False + # gtf_file: gencode.v38.basic.annotation.gtf.gz + # step_1: + # bgen: imputation.bgen + # snplist: imputation.snplist + # bsize: 1000 + # options: + # - "--sample imputation.sample" + # - "--qt" + # step_2: + # bsize: 400 + # options: + # - "--qt" diff --git a/tests/deeprvat/test_data/config/association_testing_pretrained_regenie/expected/deeprvat_config.yaml 
b/tests/deeprvat/test_data/config/association_testing_pretrained_regenie/expected/deeprvat_config.yaml new file mode 100644 index 00000000..ee50f185 --- /dev/null +++ b/tests/deeprvat/test_data/config/association_testing_pretrained_regenie/expected/deeprvat_config.yaml @@ -0,0 +1,256 @@ +association_testing_data: + dataloader_config: + batch_size: 16 + num_workers: 10 + dataset_config: + annotation_file: annotations.parquet + annotations: + - MAF_MB + - MAF + - CADD_PHRED + - CADD_raw + - sift_score + - polyphen_score + - Consequence_splice_acceptor_variant + - Consequence_splice_donor_variant + - Consequence_stop_gained + - Consequence_frameshift_variant + - Consequence_stop_lost + - Consequence_start_lost + - Consequence_inframe_insertion + - Consequence_inframe_deletion + - Consequence_missense_variant + - Consequence_protein_altering_variant + - Consequence_splice_region_variant + - condel_score + - DeepSEA_PC_1 + - DeepSEA_PC_2 + - DeepSEA_PC_3 + - DeepSEA_PC_4 + - DeepSEA_PC_5 + - DeepSEA_PC_6 + - PrimateAI_score + - AbSplice_DNA + - DeepRipe_plus_QKI_lip_hg2 + - DeepRipe_plus_QKI_clip_k5 + - DeepRipe_plus_KHDRBS1_clip_k5 + - DeepRipe_plus_ELAVL1_parclip + - DeepRipe_plus_TARDBP_parclip + - DeepRipe_plus_HNRNPD_parclip + - DeepRipe_plus_MBNL1_parclip + - DeepRipe_plus_QKI_parclip + - SpliceAI_delta_score + - alphamissense + gene_file: protein_coding_genes.parquet + min_common_af: + MAF: 0.001 + phenotype_file: phenotypes.parquet + rare_embedding: + config: + annotations: &id001 + - MAF_MB + - CADD_raw + - sift_score + - polyphen_score + - Consequence_splice_acceptor_variant + - Consequence_splice_donor_variant + - Consequence_stop_gained + - Consequence_frameshift_variant + - Consequence_stop_lost + - Consequence_start_lost + - Consequence_inframe_insertion + - Consequence_inframe_deletion + - Consequence_missense_variant + - Consequence_protein_altering_variant + - Consequence_splice_region_variant + - condel_score + - DeepSEA_PC_1 + - DeepSEA_PC_2 + - 
DeepSEA_PC_3 + - DeepSEA_PC_4 + - DeepSEA_PC_5 + - DeepSEA_PC_6 + - PrimateAI_score + - AbSplice_DNA + - DeepRipe_plus_QKI_lip_hg2 + - DeepRipe_plus_QKI_clip_k5 + - DeepRipe_plus_KHDRBS1_clip_k5 + - DeepRipe_plus_ELAVL1_parclip + - DeepRipe_plus_TARDBP_parclip + - DeepRipe_plus_HNRNPD_parclip + - DeepRipe_plus_MBNL1_parclip + - DeepRipe_plus_QKI_parclip + - SpliceAI_delta_score + - alphamissense + gene_file: protein_coding_genes.parquet + low_memory: true + thresholds: + CADD_PHRED: CADD_PHRED > 5 + MAF: MAF < 1e-3 + verbose: true + type: PaddedAnnotations + use_common_variants: false + use_rare_variants: true + verbose: true + x_phenotypes: &id002 + - age + - age2 + - age_sex + - genetic_sex + - genetic_PC_1 + - genetic_PC_2 + - genetic_PC_3 + - genetic_PC_4 + - genetic_PC_5 + - genetic_PC_6 + - genetic_PC_7 + - genetic_PC_8 + - genetic_PC_9 + - genetic_PC_10 + - genetic_PC_11 + - genetic_PC_12 + - genetic_PC_13 + - genetic_PC_14 + - genetic_PC_15 + - genetic_PC_16 + - genetic_PC_17 + - genetic_PC_18 + - genetic_PC_19 + - genetic_PC_20 + y_transformation: quantile_transform + gt_file: genotypes.h5 + variant_file: variants.parquet +cv_exp: false +deterministic: false +do_scoretest: true +evaluation: + alpha: 0.05 + correction_method: Bonferroni +gtf_file: gencode.v38.basic.annotation.gtf.gz +hyperparameter_optimization: + direction: maximize + n_trials: 1 + sampler: + config: {} + type: TPESampler +model: + checkpoint: combined_agg.pt + config: + activation: LeakyReLU + metrics: + all: + MAE: {} + MSE: {} + PearsonCorrTorch: {} + RSquared: {} + loss: MSE + objective: MSE + objective_mode: min + optimizer: + config: {} + type: AdamW + phi_hidden_dim: 20 + phi_layers: 2 + pool: max + rho_hidden_dim: 10 + rho_layers: 3 + use_sigmoid: true + model_collection: agg_models + type: DeepSet +n_avg_chunks: 1 +n_burden_chunks: 5 +n_regression_chunks: 2 +n_repeats: 30 +phenotypes: + - Apolipoprotein_A + - Apolipoprotein_B + - Calcium +pretrained_model_path: pretrained_models 
+regenie_exp: true +regenie_options: + step_1: + bgen: imputation.bgen + bsize: 1000 + options: + - --sample imputation.sample + - --qt + snplist: imputation.snplist + step_2: + bsize: 400 + options: + - --qt +training: + dataloader_config: + batch_size: 1024 + cache_tensors: true + chunksize: 100 + num_workers: 0 + temp_dir: $TMPDIR/deeprvat_train + drop_n_bags: 0 + min_variant_count: 0 + n_bags: 1 + n_parallel_jobs: 6 + sample_with_replacement: false + train_proportion: 0.8 +training_data: + dataloader_config: + batch_size: 64 + num_workers: 8 + dataset_config: + annotation_file: annotations.parquet + annotations: + - MAF_MB + - MAF + - CADD_PHRED + - CADD_raw + - sift_score + - polyphen_score + - Consequence_splice_acceptor_variant + - Consequence_splice_donor_variant + - Consequence_stop_gained + - Consequence_frameshift_variant + - Consequence_stop_lost + - Consequence_start_lost + - Consequence_inframe_insertion + - Consequence_inframe_deletion + - Consequence_missense_variant + - Consequence_protein_altering_variant + - Consequence_splice_region_variant + - condel_score + - DeepSEA_PC_1 + - DeepSEA_PC_2 + - DeepSEA_PC_3 + - DeepSEA_PC_4 + - DeepSEA_PC_5 + - DeepSEA_PC_6 + - PrimateAI_score + - AbSplice_DNA + - DeepRipe_plus_QKI_lip_hg2 + - DeepRipe_plus_QKI_clip_k5 + - DeepRipe_plus_KHDRBS1_clip_k5 + - DeepRipe_plus_ELAVL1_parclip + - DeepRipe_plus_TARDBP_parclip + - DeepRipe_plus_HNRNPD_parclip + - DeepRipe_plus_MBNL1_parclip + - DeepRipe_plus_QKI_parclip + - SpliceAI_delta_score + - alphamissense + min_common_af: + MAF: 0.01 + phenotype_file: phenotypes.parquet + rare_embedding: + config: + annotations: *id001 + low_memory: true + thresholds: + CADD_PHRED: CADD_PHRED > 5 + MAF: MAF < 1e-2 + verbose: true + type: PaddedAnnotations + use_common_variants: false + use_rare_variants: true + verbose: true + x_phenotypes: *id002 + y_transformation: quantile_transform + gt_file: genotypes.h5 + variant_file: variants.parquet \ No newline at end of file diff --git 
a/tests/deeprvat/test_data/config/association_testing_pretrained_regenie/input/deeprvat_input_pretrained_models_config.yaml b/tests/deeprvat/test_data/config/association_testing_pretrained_regenie/input/deeprvat_input_pretrained_models_config.yaml new file mode 100644 index 00000000..849a4023 --- /dev/null +++ b/tests/deeprvat/test_data/config/association_testing_pretrained_regenie/input/deeprvat_input_pretrained_models_config.yaml @@ -0,0 +1,80 @@ +use_pretrained_models: True +pretrained_model_path : pretrained_models + +#Phenotypes to be used only for Association Testing +phenotypes_for_association_testing: + - Apolipoprotein_A + - Apolipoprotein_B + - Calcium + +#File paths of necessary input files to DeepRVAT +gt_filename: genotypes.h5 +variant_filename: variants.parquet +phenotype_filename: phenotypes.parquet +annotation_filename: annotations.parquet +gene_filename: protein_coding_genes.parquet + +covariates: #x_phenotypes + - age + - age2 + - age_sex + - genetic_sex + - genetic_PC_1 + - genetic_PC_2 + - genetic_PC_3 + - genetic_PC_4 + - genetic_PC_5 + - genetic_PC_6 + - genetic_PC_7 + - genetic_PC_8 + - genetic_PC_9 + - genetic_PC_10 + - genetic_PC_11 + - genetic_PC_12 + - genetic_PC_13 + - genetic_PC_14 + - genetic_PC_15 + - genetic_PC_16 + - genetic_PC_17 + - genetic_PC_18 + - genetic_PC_19 + - genetic_PC_20 + +association_testing_data_thresholds: + MAF: "< 1e-3" + CADD_PHRED: "> 5" + +#DeepRVAT model settings +n_repeats: 30 +y_transformation: quantile_transform + +# Results evaluation settings +evaluation: + correction_method: Bonferroni + alpha: 0.05 + +# Subsetting samples for association testing +#sample_files: +# association_testing: association_testing_samples.pkl + +#Additional settings if using the CV pipeline +cv_options: + cv_exp: False + #cv_path: sample_files + #n_folds: 5 + +#Additional settings if using the REGENIE integration +regenie_options: + regenie_exp: True + gtf_file: gencode.v38.basic.annotation.gtf.gz + step_1: + bgen: 
imputation.bgen + snplist: imputation.snplist + bsize: 1000 + options: + - "--sample imputation.sample" + - "--qt" + step_2: + bsize: 400 + options: + - "--qt" diff --git a/tests/deeprvat/test_data/config/training_association_testing/expected/deeprvat_config.yaml b/tests/deeprvat/test_data/config/training_association_testing/expected/deeprvat_config.yaml new file mode 100644 index 00000000..d338833b --- /dev/null +++ b/tests/deeprvat/test_data/config/training_association_testing/expected/deeprvat_config.yaml @@ -0,0 +1,269 @@ +association_testing_data: + dataloader_config: + batch_size: 16 + num_workers: 10 + dataset_config: + annotation_file: annotations.parquet + annotations: + - MAF_MB + - MAF + - CADD_PHRED + - CADD_raw + - sift_score + - polyphen_score + - Consequence_splice_acceptor_variant + - Consequence_splice_donor_variant + - Consequence_stop_gained + - Consequence_frameshift_variant + - Consequence_stop_lost + - Consequence_start_lost + - Consequence_inframe_insertion + - Consequence_inframe_deletion + - Consequence_missense_variant + - Consequence_protein_altering_variant + - Consequence_splice_region_variant + - condel_score + - DeepSEA_PC_1 + - DeepSEA_PC_2 + - DeepSEA_PC_3 + - DeepSEA_PC_4 + - DeepSEA_PC_5 + - DeepSEA_PC_6 + - PrimateAI_score + - AbSplice_DNA + - DeepRipe_plus_QKI_lip_hg2 + - DeepRipe_plus_QKI_clip_k5 + - DeepRipe_plus_KHDRBS1_clip_k5 + - DeepRipe_plus_ELAVL1_parclip + - DeepRipe_plus_TARDBP_parclip + - DeepRipe_plus_HNRNPD_parclip + - DeepRipe_plus_MBNL1_parclip + - DeepRipe_plus_QKI_parclip + - SpliceAI_delta_score + - alphamissense + gene_file: protein_coding_genes.parquet + min_common_af: + MAF: 0.001 + phenotype_file: phenotypes.parquet + rare_embedding: + config: + annotations: &id001 + - MAF_MB + - CADD_raw + - sift_score + - polyphen_score + - Consequence_splice_acceptor_variant + - Consequence_splice_donor_variant + - Consequence_stop_gained + - Consequence_frameshift_variant + - Consequence_stop_lost + - 
Consequence_start_lost + - Consequence_inframe_insertion + - Consequence_inframe_deletion + - Consequence_missense_variant + - Consequence_protein_altering_variant + - Consequence_splice_region_variant + - condel_score + - DeepSEA_PC_1 + - DeepSEA_PC_2 + - DeepSEA_PC_3 + - DeepSEA_PC_4 + - DeepSEA_PC_5 + - DeepSEA_PC_6 + - PrimateAI_score + - AbSplice_DNA + - DeepRipe_plus_QKI_lip_hg2 + - DeepRipe_plus_QKI_clip_k5 + - DeepRipe_plus_KHDRBS1_clip_k5 + - DeepRipe_plus_ELAVL1_parclip + - DeepRipe_plus_TARDBP_parclip + - DeepRipe_plus_HNRNPD_parclip + - DeepRipe_plus_MBNL1_parclip + - DeepRipe_plus_QKI_parclip + - SpliceAI_delta_score + - alphamissense + gene_file: protein_coding_genes.parquet + low_memory: true + thresholds: + CADD_PHRED: CADD_PHRED > 5 + MAF: MAF < 1e-3 + verbose: true + type: PaddedAnnotations + use_common_variants: false + use_rare_variants: true + verbose: true + x_phenotypes: &id002 + - age + - age2 + - age_sex + - genetic_sex + - genetic_PC_1 + - genetic_PC_2 + - genetic_PC_3 + - genetic_PC_4 + - genetic_PC_5 + - genetic_PC_6 + - genetic_PC_7 + - genetic_PC_8 + - genetic_PC_9 + - genetic_PC_10 + - genetic_PC_11 + - genetic_PC_12 + - genetic_PC_13 + - genetic_PC_14 + - genetic_PC_15 + - genetic_PC_16 + - genetic_PC_17 + - genetic_PC_18 + - genetic_PC_19 + - genetic_PC_20 + y_transformation: quantile_transform + gt_file: genotypes.h5 + variant_file: variants.parquet +baseline_results: + alpha_seed_genes: 0.05 + correction_method: Bonferroni + options: + - base: baseline_results + type: plof/burden + - base: baseline_results + type: missense/burden + - base: baseline_results + type: plof/skat + - base: baseline_results + type: missense/skat +cv_exp: false +deterministic: true +do_scoretest: true +evaluation: + alpha: 0.05 + correction_method: Bonferroni +hyperparameter_optimization: + direction: maximize + n_trials: 1 + sampler: + config: {} + type: TPESampler +model: + checkpoint: combined_agg.pt + config: + activation: LeakyReLU + metrics: + all: 
+ MAE: {} + MSE: {} + PearsonCorrTorch: {} + RSquared: {} + loss: MSE + objective: MSE + objective_mode: min + optimizer: + config: {} + type: AdamW + phi_hidden_dim: 20 + phi_layers: 2 + pool: max + rho_hidden_dim: 10 + rho_layers: 3 + use_sigmoid: true + model_collection: agg_models + type: DeepSet +n_avg_chunks: 1 +n_burden_chunks: 5 +n_regression_chunks: 2 +n_repeats: 1 +phenotypes: + - Apolipoprotein_A + - Apolipoprotein_B + - Calcium +regenie_exp: false +training: + dataloader_config: + batch_size: 1024 + cache_tensors: true + chunksize: 100 + num_workers: 0 + temp_dir: $TMPDIR/deeprvat_train + drop_n_bags: 0 + early_stopping: + min_delta: 1.0e-05 + mode: min + patience: 3 + verbose: true + min_variant_count: 0 + n_bags: 1 + n_parallel_jobs: 6 + phenotypes: + Apolipoprotein_A: {} + Apolipoprotein_B: {} + pl_trainer: + check_val_every_n_epoch: 1 + gpus: 1 + log_every_n_steps: 1 + max_epochs: 1000 + min_epochs: 50 + precision: 16 + sample_with_replacement: false + train_proportion: 0.8 +training_data: + dataloader_config: + batch_size: 64 + num_workers: 8 + dataset_config: + annotation_file: annotations.parquet + annotations: + - MAF_MB + - MAF + - CADD_PHRED + - CADD_raw + - sift_score + - polyphen_score + - Consequence_splice_acceptor_variant + - Consequence_splice_donor_variant + - Consequence_stop_gained + - Consequence_frameshift_variant + - Consequence_stop_lost + - Consequence_start_lost + - Consequence_inframe_insertion + - Consequence_inframe_deletion + - Consequence_missense_variant + - Consequence_protein_altering_variant + - Consequence_splice_region_variant + - condel_score + - DeepSEA_PC_1 + - DeepSEA_PC_2 + - DeepSEA_PC_3 + - DeepSEA_PC_4 + - DeepSEA_PC_5 + - DeepSEA_PC_6 + - PrimateAI_score + - AbSplice_DNA + - DeepRipe_plus_QKI_lip_hg2 + - DeepRipe_plus_QKI_clip_k5 + - DeepRipe_plus_KHDRBS1_clip_k5 + - DeepRipe_plus_ELAVL1_parclip + - DeepRipe_plus_TARDBP_parclip + - DeepRipe_plus_HNRNPD_parclip + - DeepRipe_plus_MBNL1_parclip + - 
DeepRipe_plus_QKI_parclip + - SpliceAI_delta_score + - alphamissense + min_common_af: + MAF: 0.01 + phenotype_file: phenotypes.parquet + rare_embedding: + config: + annotations: *id001 + low_memory: true + thresholds: + CADD_PHRED: CADD_PHRED > 5 + MAF: MAF < 1e-2 + verbose: true + type: PaddedAnnotations + use_common_variants: false + use_rare_variants: true + verbose: true + x_phenotypes: *id002 + y_transformation: quantile_transform + gt_file: genotypes.h5 + variant_file: variants.parquet diff --git a/tests/deeprvat/test_data/config/training_association_testing/input/deeprvat_input_config.yaml b/tests/deeprvat/test_data/config/training_association_testing/input/deeprvat_input_config.yaml new file mode 100644 index 00000000..19d6858e --- /dev/null +++ b/tests/deeprvat/test_data/config/training_association_testing/input/deeprvat_input_config.yaml @@ -0,0 +1,159 @@ + +# Phenotypes to be used only for Association Testing +phenotypes_for_association_testing: + - Apolipoprotein_A + - Apolipoprotein_B + - Calcium + +# Phenotypes to be used only for training +phenotypes_for_training: + - Apolipoprotein_A + - Apolipoprotein_B + +# File paths of necessary input files to DeepRVAT +gt_filename: genotypes.h5 +variant_filename: variants.parquet +phenotype_filename: phenotypes.parquet +annotation_filename: annotations.parquet +gene_filename: protein_coding_genes.parquet + +rare_variant_annotations: + - MAF_MB + - CADD_raw + - sift_score + - polyphen_score + - Consequence_splice_acceptor_variant + - Consequence_splice_donor_variant + - Consequence_stop_gained + - Consequence_frameshift_variant + - Consequence_stop_lost + - Consequence_start_lost + - Consequence_inframe_insertion + - Consequence_inframe_deletion + - Consequence_missense_variant + - Consequence_protein_altering_variant + - Consequence_splice_region_variant + - condel_score + - DeepSEA_PC_1 + - DeepSEA_PC_2 + - DeepSEA_PC_3 + - DeepSEA_PC_4 + - DeepSEA_PC_5 + - DeepSEA_PC_6 + - PrimateAI_score + - AbSplice_DNA + - 
DeepRipe_plus_QKI_lip_hg2 + - DeepRipe_plus_QKI_clip_k5 + - DeepRipe_plus_KHDRBS1_clip_k5 + - DeepRipe_plus_ELAVL1_parclip + - DeepRipe_plus_TARDBP_parclip + - DeepRipe_plus_HNRNPD_parclip + - DeepRipe_plus_MBNL1_parclip + - DeepRipe_plus_QKI_parclip + - SpliceAI_delta_score + - alphamissense + +covariates: #x_phenotypes + - age + - age2 + - age_sex + - genetic_sex + - genetic_PC_1 + - genetic_PC_2 + - genetic_PC_3 + - genetic_PC_4 + - genetic_PC_5 + - genetic_PC_6 + - genetic_PC_7 + - genetic_PC_8 + - genetic_PC_9 + - genetic_PC_10 + - genetic_PC_11 + - genetic_PC_12 + - genetic_PC_13 + - genetic_PC_14 + - genetic_PC_15 + - genetic_PC_16 + - genetic_PC_17 + - genetic_PC_18 + - genetic_PC_19 + - genetic_PC_20 + +association_testing_data_thresholds: + MAF: "< 1e-3" + CADD_PHRED: "> 5" + +training_data_thresholds: + MAF: "< 1e-2" + CADD_PHRED: "> 5" + +# Seed Gene Baseline data settings +seed_gene_results: #baseline_results + result_dirs: + - + base: baseline_results + type: plof/burden + - + base: baseline_results + type: missense/burden + - + base: baseline_results + type: plof/skat + - + base: baseline_results + type: missense/skat + + correction_method: Bonferroni + alpha_seed_genes: 0.05 + +# DeepRVAT training settings +training: + pl_trainer: #PyTorch Lightning trainer settings + gpus: 1 + precision: 16 + min_epochs: 50 + max_epochs: 1000 + log_every_n_steps: 1 + check_val_every_n_epoch: 1 + early_stopping: #PyTorch Lightning Early Stopping Criteria + mode: min + patience: 3 + min_delta: 0.00001 + verbose: True + +# DeepRVAT model settings +n_repeats: 1 +y_transformation: quantile_transform +deterministic: true + +# Results evaluation settings +evaluation: + correction_method: Bonferroni + alpha: 0.05 + +# Subsetting samples for training or association testing +#sample_files: +# training: training_samples.pkl +# association_testing: association_testing_samples.pkl + +# Additional settings if using the CV pipeline +cv_options: + cv_exp: False + #cv_path: 
sample_files + #n_folds: 5 + +# Additional settings if using the REGENIE integration +regenie_options: + regenie_exp: False + # gtf_file: gencode.v38.basic.annotation.gtf.gz + # step_1: + # bgen: imputation.bgen + # snplist: imputation.snplist + # bsize: 1000 + # options: + # - "--sample imputation.sample" + # - "--qt" + # step_2: + # bsize: 400 + # options: + # - "--qt" diff --git a/tests/deeprvat/test_data/config/training_association_testing_cv/expected/deeprvat_config.yaml b/tests/deeprvat/test_data/config/training_association_testing_cv/expected/deeprvat_config.yaml new file mode 100644 index 00000000..3cf29e41 --- /dev/null +++ b/tests/deeprvat/test_data/config/training_association_testing_cv/expected/deeprvat_config.yaml @@ -0,0 +1,273 @@ +association_testing_data: + dataloader_config: + batch_size: 16 + num_workers: 10 + dataset_config: + annotation_file: annotations.parquet + annotations: + - MAF_MB + - MAF + - CADD_PHRED + - CADD_raw + - sift_score + - polyphen_score + - Consequence_splice_acceptor_variant + - Consequence_splice_donor_variant + - Consequence_stop_gained + - Consequence_frameshift_variant + - Consequence_stop_lost + - Consequence_start_lost + - Consequence_inframe_insertion + - Consequence_inframe_deletion + - Consequence_missense_variant + - Consequence_protein_altering_variant + - Consequence_splice_region_variant + - condel_score + - DeepSEA_PC_1 + - DeepSEA_PC_2 + - DeepSEA_PC_3 + - DeepSEA_PC_4 + - DeepSEA_PC_5 + - DeepSEA_PC_6 + - PrimateAI_score + - AbSplice_DNA + - DeepRipe_plus_QKI_lip_hg2 + - DeepRipe_plus_QKI_clip_k5 + - DeepRipe_plus_KHDRBS1_clip_k5 + - DeepRipe_plus_ELAVL1_parclip + - DeepRipe_plus_TARDBP_parclip + - DeepRipe_plus_HNRNPD_parclip + - DeepRipe_plus_MBNL1_parclip + - DeepRipe_plus_QKI_parclip + - SpliceAI_delta_score + - alphamissense + gene_file: protein_coding_genes.parquet + min_common_af: + MAF: 0.001 + phenotype_file: phenotypes.parquet + rare_embedding: + config: + annotations: &id001 + - MAF_MB + - CADD_raw 
+ - sift_score + - polyphen_score + - Consequence_splice_acceptor_variant + - Consequence_splice_donor_variant + - Consequence_stop_gained + - Consequence_frameshift_variant + - Consequence_stop_lost + - Consequence_start_lost + - Consequence_inframe_insertion + - Consequence_inframe_deletion + - Consequence_missense_variant + - Consequence_protein_altering_variant + - Consequence_splice_region_variant + - condel_score + - DeepSEA_PC_1 + - DeepSEA_PC_2 + - DeepSEA_PC_3 + - DeepSEA_PC_4 + - DeepSEA_PC_5 + - DeepSEA_PC_6 + - PrimateAI_score + - AbSplice_DNA + - DeepRipe_plus_QKI_lip_hg2 + - DeepRipe_plus_QKI_clip_k5 + - DeepRipe_plus_KHDRBS1_clip_k5 + - DeepRipe_plus_ELAVL1_parclip + - DeepRipe_plus_TARDBP_parclip + - DeepRipe_plus_HNRNPD_parclip + - DeepRipe_plus_MBNL1_parclip + - DeepRipe_plus_QKI_parclip + - SpliceAI_delta_score + - alphamissense + gene_file: protein_coding_genes.parquet + low_memory: true + thresholds: + CADD_PHRED: CADD_PHRED > 5 + MAF: MAF < 1e-3 + verbose: true + type: PaddedAnnotations + sample_file: association_testing_samples.pkl + use_common_variants: false + use_rare_variants: true + verbose: true + x_phenotypes: &id002 + - age + - age2 + - age_sex + - genetic_sex + - genetic_PC_1 + - genetic_PC_2 + - genetic_PC_3 + - genetic_PC_4 + - genetic_PC_5 + - genetic_PC_6 + - genetic_PC_7 + - genetic_PC_8 + - genetic_PC_9 + - genetic_PC_10 + - genetic_PC_11 + - genetic_PC_12 + - genetic_PC_13 + - genetic_PC_14 + - genetic_PC_15 + - genetic_PC_16 + - genetic_PC_17 + - genetic_PC_18 + - genetic_PC_19 + - genetic_PC_20 + y_transformation: quantile_transform + gt_file: genotypes.h5 + variant_file: variants.parquet +baseline_results: + alpha_seed_genes: 0.05 + correction_method: Bonferroni + options: + - base: baseline_results + type: plof/burden + - base: baseline_results + type: missense/burden + - base: baseline_results + type: plof/skat + - base: baseline_results + type: missense/skat +cv_exp: true +cv_path: sample_files +deterministic: true 
+do_scoretest: true +evaluation: + alpha: 0.05 + correction_method: Bonferroni +hyperparameter_optimization: + direction: maximize + n_trials: 1 + sampler: + config: {} + type: TPESampler +model: + checkpoint: combined_agg.pt + config: + activation: LeakyReLU + metrics: + all: + MAE: {} + MSE: {} + PearsonCorrTorch: {} + RSquared: {} + loss: MSE + objective: MSE + objective_mode: min + optimizer: + config: {} + type: AdamW + phi_hidden_dim: 20 + phi_layers: 2 + pool: max + rho_hidden_dim: 10 + rho_layers: 3 + use_sigmoid: true + model_collection: agg_models + type: DeepSet +n_avg_chunks: 1 +n_burden_chunks: 5 +n_folds: 5 +n_regression_chunks: 2 +n_repeats: 1 +phenotypes: + - Apolipoprotein_A + - Apolipoprotein_B + - Calcium +regenie_exp: false +training: + dataloader_config: + batch_size: 1024 + cache_tensors: true + chunksize: 100 + num_workers: 0 + temp_dir: $TMPDIR/deeprvat_train + drop_n_bags: 0 + early_stopping: + min_delta: 1.0e-05 + mode: min + patience: 3 + verbose: true + min_variant_count: 0 + n_bags: 1 + n_parallel_jobs: 6 + phenotypes: + Apolipoprotein_A: {} + Apolipoprotein_B: {} + pl_trainer: + check_val_every_n_epoch: 1 + gpus: 1 + log_every_n_steps: 1 + max_epochs: 1000 + min_epochs: 50 + precision: 16 + sample_with_replacement: false + train_proportion: 0.8 +training_data: + dataloader_config: + batch_size: 64 + num_workers: 8 + dataset_config: + annotation_file: annotations.parquet + annotations: + - MAF_MB + - MAF + - CADD_PHRED + - CADD_raw + - sift_score + - polyphen_score + - Consequence_splice_acceptor_variant + - Consequence_splice_donor_variant + - Consequence_stop_gained + - Consequence_frameshift_variant + - Consequence_stop_lost + - Consequence_start_lost + - Consequence_inframe_insertion + - Consequence_inframe_deletion + - Consequence_missense_variant + - Consequence_protein_altering_variant + - Consequence_splice_region_variant + - condel_score + - DeepSEA_PC_1 + - DeepSEA_PC_2 + - DeepSEA_PC_3 + - DeepSEA_PC_4 + - DeepSEA_PC_5 + - 
DeepSEA_PC_6 + - PrimateAI_score + - AbSplice_DNA + - DeepRipe_plus_QKI_lip_hg2 + - DeepRipe_plus_QKI_clip_k5 + - DeepRipe_plus_KHDRBS1_clip_k5 + - DeepRipe_plus_ELAVL1_parclip + - DeepRipe_plus_TARDBP_parclip + - DeepRipe_plus_HNRNPD_parclip + - DeepRipe_plus_MBNL1_parclip + - DeepRipe_plus_QKI_parclip + - SpliceAI_delta_score + - alphamissense + min_common_af: + MAF: 0.01 + phenotype_file: phenotypes.parquet + rare_embedding: + config: + annotations: *id001 + low_memory: true + thresholds: + CADD_PHRED: CADD_PHRED > 5 + MAF: MAF < 1e-2 + verbose: true + type: PaddedAnnotations + sample_file: training_samples.pkl + use_common_variants: false + use_rare_variants: true + verbose: true + x_phenotypes: *id002 + y_transformation: quantile_transform + gt_file: genotypes.h5 + variant_file: variants.parquet diff --git a/tests/deeprvat/test_data/config/training_association_testing_cv/input/deeprvat_input_config.yaml b/tests/deeprvat/test_data/config/training_association_testing_cv/input/deeprvat_input_config.yaml new file mode 100644 index 00000000..88a9d124 --- /dev/null +++ b/tests/deeprvat/test_data/config/training_association_testing_cv/input/deeprvat_input_config.yaml @@ -0,0 +1,159 @@ + +# Phenotypes to be used only for Association Testing +phenotypes_for_association_testing: + - Apolipoprotein_A + - Apolipoprotein_B + - Calcium + +# Phenotypes to be used only for training +phenotypes_for_training: + - Apolipoprotein_A + - Apolipoprotein_B + +# File paths of necessary input files to DeepRVAT +gt_filename: genotypes.h5 +variant_filename: variants.parquet +phenotype_filename: phenotypes.parquet +annotation_filename: annotations.parquet +gene_filename: protein_coding_genes.parquet + +rare_variant_annotations: + - MAF_MB + - CADD_raw + - sift_score + - polyphen_score + - Consequence_splice_acceptor_variant + - Consequence_splice_donor_variant + - Consequence_stop_gained + - Consequence_frameshift_variant + - Consequence_stop_lost + - Consequence_start_lost + - 
Consequence_inframe_insertion + - Consequence_inframe_deletion + - Consequence_missense_variant + - Consequence_protein_altering_variant + - Consequence_splice_region_variant + - condel_score + - DeepSEA_PC_1 + - DeepSEA_PC_2 + - DeepSEA_PC_3 + - DeepSEA_PC_4 + - DeepSEA_PC_5 + - DeepSEA_PC_6 + - PrimateAI_score + - AbSplice_DNA + - DeepRipe_plus_QKI_lip_hg2 + - DeepRipe_plus_QKI_clip_k5 + - DeepRipe_plus_KHDRBS1_clip_k5 + - DeepRipe_plus_ELAVL1_parclip + - DeepRipe_plus_TARDBP_parclip + - DeepRipe_plus_HNRNPD_parclip + - DeepRipe_plus_MBNL1_parclip + - DeepRipe_plus_QKI_parclip + - SpliceAI_delta_score + - alphamissense + +covariates: #x_phenotypes + - age + - age2 + - age_sex + - genetic_sex + - genetic_PC_1 + - genetic_PC_2 + - genetic_PC_3 + - genetic_PC_4 + - genetic_PC_5 + - genetic_PC_6 + - genetic_PC_7 + - genetic_PC_8 + - genetic_PC_9 + - genetic_PC_10 + - genetic_PC_11 + - genetic_PC_12 + - genetic_PC_13 + - genetic_PC_14 + - genetic_PC_15 + - genetic_PC_16 + - genetic_PC_17 + - genetic_PC_18 + - genetic_PC_19 + - genetic_PC_20 + +association_testing_data_thresholds: + MAF: "< 1e-3" + CADD_PHRED: "> 5" + +training_data_thresholds: + MAF: "< 1e-2" + CADD_PHRED: "> 5" + +# Seed Gene Baseline data settings +seed_gene_results: #baseline_results + result_dirs: + - + base: baseline_results + type: plof/burden + - + base: baseline_results + type: missense/burden + - + base: baseline_results + type: plof/skat + - + base: baseline_results + type: missense/skat + + correction_method: Bonferroni + alpha_seed_genes: 0.05 + +# DeepRVAT training settings +training: + pl_trainer: #PyTorch Lightning trainer settings + gpus: 1 + precision: 16 + min_epochs: 50 + max_epochs: 1000 + log_every_n_steps: 1 + check_val_every_n_epoch: 1 + early_stopping: #PyTorch Lightning Early Stopping Criteria + mode: min + patience: 3 + min_delta: 0.00001 + verbose: True + +# DeepRVAT model settings +n_repeats: 1 +y_transformation: quantile_transform +deterministic: true + +# Results
evaluation settings +evaluation: + correction_method: Bonferroni + alpha: 0.05 + +# Subsetting samples for training or association testing +sample_files: + training: training_samples.pkl + association_testing: association_testing_samples.pkl + +# Additional settings if using the CV pipeline +cv_options: + cv_exp: True + cv_path: sample_files + n_folds: 5 + +# Additional settings if using the REGENIE integration +regenie_options: + regenie_exp: False + # gtf_file: gencode.v38.basic.annotation.gtf.gz + # step_1: + # bgen: imputation.bgen + # snplist: imputation.snplist + # bsize: 1000 + # options: + # - "--sample imputation.sample" + # - "--qt" + # step_2: + # bsize: 400 + # options: + # - "--qt" diff --git a/tests/deeprvat/test_data/config/training_only/expected/deeprvat_config.yaml b/tests/deeprvat/test_data/config/training_only/expected/deeprvat_config.yaml new file mode 100644 index 00000000..46003763 --- /dev/null +++ b/tests/deeprvat/test_data/config/training_only/expected/deeprvat_config.yaml @@ -0,0 +1,214 @@ +association_testing_data: + dataloader_config: + batch_size: 16 + num_workers: 10 + dataset_config: + gene_file: protein_coding_genes.parquet + min_common_af: + MAF: 0.001 + rare_embedding: + config: + low_memory: true + verbose: true + type: PaddedAnnotations + use_common_variants: false + use_rare_variants: true + verbose: true +baseline_results: + alpha_seed_genes: 0.05 + correction_method: Bonferroni + options: + - base: baseline_results + type: plof/burden + - base: baseline_results + type: missense/burden + - base: baseline_results + type: plof/skat + - base: baseline_results + type: missense/skat +cv_exp: false +deterministic: true +do_scoretest: true +hyperparameter_optimization: + direction: maximize + n_trials: 1 + sampler: + config: {} + type: TPESampler +model: + checkpoint: combined_agg.pt + config: + activation: LeakyReLU + metrics: + all: + MAE: {} + MSE: {} + PearsonCorrTorch: {} + RSquared: {} + loss: MSE + objective: MSE + 
objective_mode: min + optimizer: + config: {} + type: AdamW + phi_hidden_dim: 20 + phi_layers: 2 + pool: max + rho_hidden_dim: 10 + rho_layers: 3 + use_sigmoid: true + model_collection: agg_models + type: DeepSet +n_avg_chunks: 1 +n_burden_chunks: 5 +n_regression_chunks: 2 +n_repeats: 1 +regenie_exp: false +training: + dataloader_config: + batch_size: 1024 + cache_tensors: true + chunksize: 100 + num_workers: 0 + temp_dir: $TMPDIR/deeprvat_train + drop_n_bags: 0 + early_stopping: + min_delta: 1.0e-05 + mode: min + patience: 3 + verbose: true + min_variant_count: 0 + n_bags: 1 + n_parallel_jobs: 6 + phenotypes: + Apolipoprotein_A: {} + Apolipoprotein_B: {} + pl_trainer: + check_val_every_n_epoch: 1 + gpus: 1 + log_every_n_steps: 1 + max_epochs: 1000 + min_epochs: 50 + precision: 16 + sample_with_replacement: false + train_proportion: 0.8 +training_data: + dataloader_config: + batch_size: 64 + num_workers: 8 + dataset_config: + annotation_file: annotations.parquet + annotations: + - MAF_MB + - MAF + - CADD_PHRED + - CADD_raw + - sift_score + - polyphen_score + - Consequence_splice_acceptor_variant + - Consequence_splice_donor_variant + - Consequence_stop_gained + - Consequence_frameshift_variant + - Consequence_stop_lost + - Consequence_start_lost + - Consequence_inframe_insertion + - Consequence_inframe_deletion + - Consequence_missense_variant + - Consequence_protein_altering_variant + - Consequence_splice_region_variant + - condel_score + - DeepSEA_PC_1 + - DeepSEA_PC_2 + - DeepSEA_PC_3 + - DeepSEA_PC_4 + - DeepSEA_PC_5 + - DeepSEA_PC_6 + - PrimateAI_score + - AbSplice_DNA + - DeepRipe_plus_QKI_lip_hg2 + - DeepRipe_plus_QKI_clip_k5 + - DeepRipe_plus_KHDRBS1_clip_k5 + - DeepRipe_plus_ELAVL1_parclip + - DeepRipe_plus_TARDBP_parclip + - DeepRipe_plus_HNRNPD_parclip + - DeepRipe_plus_MBNL1_parclip + - DeepRipe_plus_QKI_parclip + - SpliceAI_delta_score + - alphamissense + min_common_af: + MAF: 0.01 + phenotype_file: phenotypes.parquet + rare_embedding: + config: + 
annotations: + - MAF_MB + - CADD_raw + - sift_score + - polyphen_score + - Consequence_splice_acceptor_variant + - Consequence_splice_donor_variant + - Consequence_stop_gained + - Consequence_frameshift_variant + - Consequence_stop_lost + - Consequence_start_lost + - Consequence_inframe_insertion + - Consequence_inframe_deletion + - Consequence_missense_variant + - Consequence_protein_altering_variant + - Consequence_splice_region_variant + - condel_score + - DeepSEA_PC_1 + - DeepSEA_PC_2 + - DeepSEA_PC_3 + - DeepSEA_PC_4 + - DeepSEA_PC_5 + - DeepSEA_PC_6 + - PrimateAI_score + - AbSplice_DNA + - DeepRipe_plus_QKI_lip_hg2 + - DeepRipe_plus_QKI_clip_k5 + - DeepRipe_plus_KHDRBS1_clip_k5 + - DeepRipe_plus_ELAVL1_parclip + - DeepRipe_plus_TARDBP_parclip + - DeepRipe_plus_HNRNPD_parclip + - DeepRipe_plus_MBNL1_parclip + - DeepRipe_plus_QKI_parclip + - SpliceAI_delta_score + - alphamissense + low_memory: true + thresholds: + CADD_PHRED: CADD_PHRED > 5 + MAF: MAF < 1e-2 + verbose: true + type: PaddedAnnotations + use_common_variants: false + use_rare_variants: true + verbose: true + x_phenotypes: + - age + - age2 + - age_sex + - genetic_sex + - genetic_PC_1 + - genetic_PC_2 + - genetic_PC_3 + - genetic_PC_4 + - genetic_PC_5 + - genetic_PC_6 + - genetic_PC_7 + - genetic_PC_8 + - genetic_PC_9 + - genetic_PC_10 + - genetic_PC_11 + - genetic_PC_12 + - genetic_PC_13 + - genetic_PC_14 + - genetic_PC_15 + - genetic_PC_16 + - genetic_PC_17 + - genetic_PC_18 + - genetic_PC_19 + - genetic_PC_20 + y_transformation: quantile_transform + gt_file: genotypes.h5 + variant_file: variants.parquet diff --git a/tests/deeprvat/test_data/config/training_only/input/deeprvat_input_training_config.yaml b/tests/deeprvat/test_data/config/training_only/input/deeprvat_input_training_config.yaml new file mode 100644 index 00000000..48227520 --- /dev/null +++ b/tests/deeprvat/test_data/config/training_only/input/deeprvat_input_training_config.yaml @@ -0,0 +1,122 @@ +training_only: True + +# Phenotypes 
to be used only for training +phenotypes_for_training: + - Apolipoprotein_A + - Apolipoprotein_B + +# File paths of necessary input files to DeepRVAT +gt_filename: genotypes.h5 +variant_filename: variants.parquet +phenotype_filename: phenotypes.parquet +annotation_filename: annotations.parquet +gene_filename: protein_coding_genes.parquet + +rare_variant_annotations: + - MAF_MB + - CADD_raw + - sift_score + - polyphen_score + - Consequence_splice_acceptor_variant + - Consequence_splice_donor_variant + - Consequence_stop_gained + - Consequence_frameshift_variant + - Consequence_stop_lost + - Consequence_start_lost + - Consequence_inframe_insertion + - Consequence_inframe_deletion + - Consequence_missense_variant + - Consequence_protein_altering_variant + - Consequence_splice_region_variant + - condel_score + - DeepSEA_PC_1 + - DeepSEA_PC_2 + - DeepSEA_PC_3 + - DeepSEA_PC_4 + - DeepSEA_PC_5 + - DeepSEA_PC_6 + - PrimateAI_score + - AbSplice_DNA + - DeepRipe_plus_QKI_lip_hg2 + - DeepRipe_plus_QKI_clip_k5 + - DeepRipe_plus_KHDRBS1_clip_k5 + - DeepRipe_plus_ELAVL1_parclip + - DeepRipe_plus_TARDBP_parclip + - DeepRipe_plus_HNRNPD_parclip + - DeepRipe_plus_MBNL1_parclip + - DeepRipe_plus_QKI_parclip + - SpliceAI_delta_score + - alphamissense + +covariates: #x_phenotypes + - age + - age2 + - age_sex + - genetic_sex + - genetic_PC_1 + - genetic_PC_2 + - genetic_PC_3 + - genetic_PC_4 + - genetic_PC_5 + - genetic_PC_6 + - genetic_PC_7 + - genetic_PC_8 + - genetic_PC_9 + - genetic_PC_10 + - genetic_PC_11 + - genetic_PC_12 + - genetic_PC_13 + - genetic_PC_14 + - genetic_PC_15 + - genetic_PC_16 + - genetic_PC_17 + - genetic_PC_18 + - genetic_PC_19 + - genetic_PC_20 + +training_data_thresholds: + MAF: "< 1e-2" + CADD_PHRED: "> 5" + +# Seed Gene Baseline data settings +seed_gene_results: #baseline_results + result_dirs: + - + base: baseline_results + type: plof/burden + - + base: baseline_results + type: missense/burden + - + base: baseline_results + type: plof/skat + - + base: 
baseline_results + type: missense/skat + + correction_method: Bonferroni + alpha_seed_genes: 0.05 + +# DeepRVAT training settings +training: + pl_trainer: #PyTorch Lightning trainer settings + gpus: 1 + precision: 16 + min_epochs: 50 + max_epochs: 1000 + log_every_n_steps: 1 + check_val_every_n_epoch: 1 + early_stopping: #PyTorch Lightning Early Stopping Criteria + mode: min + patience: 3 + min_delta: 0.00001 + verbose: True + +# DeepRVAT model settings +n_repeats: 1 +y_transformation: quantile_transform +deterministic: true + +# Subsetting samples for training or association testing +#sample_files: +# training: training_samples.pkl