-
Notifications
You must be signed in to change notification settings - Fork 6
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add pytests for configuration file generation
- Loading branch information
Showing
11 changed files
with
1,918 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
import logging | ||
from pprint import pprint | ||
import pandas as pd | ||
import yaml | ||
import pytest | ||
from click.testing import CliRunner | ||
|
||
from pathlib import Path | ||
from deeprvat.deeprvat.config import cli as config_cli | ||
from deeprvat.deeprvat.config import create_main_config, load_yaml | ||
|
||
# Fixture root: <directory containing this test module>/test_data/config.
script_dir = Path(__file__).resolve().parent
tests_data_dir = script_dir.joinpath("test_data", "config")
|
||
@pytest.mark.parametrize(
    "test_data_name_dir, input_config, clobber",
    [
        ("training_only", "deeprvat_input_training_config.yaml", True),
        ("training_association_testing", "deeprvat_input_config.yaml", True),
        ("training_association_testing_cv", "deeprvat_input_config.yaml", True),
        (
            "association_testing_pretrained_regenie",
            "deeprvat_input_pretrained_models_config.yaml",
            True,
        ),
        (
            "association_testing_pretrained",
            "deeprvat_input_pretrained_models_config.yaml",
            True,
        ),
    ],
)
def test_create_main_config(test_data_name_dir, input_config, clobber, tmp_path):
    """Check that ``create_main_config`` reproduces the stored expected config.

    Each scenario directory under ``tests_data_dir`` provides an
    ``input/<input_config>`` file and an ``expected/deeprvat_config.yaml``
    file. The input is passed to ``create_main_config`` together with the
    pytest ``tmp_path`` directory and the ``clobber`` flag; the test then
    requires that ``deeprvat_config.yaml`` exists in ``tmp_path`` and that its
    loaded content equals the loaded expected file.

    Args:
        test_data_name_dir: Name of the scenario directory under
            ``tests_data_dir``.
        input_config: File name of the input config inside the scenario's
            ``input`` directory.
        clobber: Forwarded unchanged as the third positional argument of
            ``create_main_config`` (presumably an overwrite switch - confirm
            against ``create_main_config``).
        tmp_path: pytest-provided temporary directory used as output location.
    """
    scenario_dir = tests_data_dir / test_data_name_dir
    source_yaml = scenario_dir / "input" / input_config
    reference_yaml = scenario_dir / "expected/deeprvat_config.yaml"
    produced_yaml = tmp_path / "deeprvat_config.yaml"

    create_main_config(source_yaml.as_posix(), tmp_path.as_posix(), clobber)

    # The config must have been written before its content is compared.
    assert produced_yaml.exists()

    reference = load_yaml(reference_yaml.as_posix())
    produced = load_yaml(produced_yaml)
    # Equality on the loaded structures compares nested mappings/lists as well.
    assert produced == reference
|
||
|
||
|
243 changes: 243 additions & 0 deletions
243
tests/deeprvat/test_data/config/association_testing_pretrained/expected/deeprvat_config.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,243 @@ | ||
association_testing_data: | ||
dataloader_config: | ||
batch_size: 16 | ||
num_workers: 10 | ||
dataset_config: | ||
annotation_file: annotations.parquet | ||
annotations: | ||
- MAF_MB | ||
- MAF | ||
- CADD_PHRED | ||
- CADD_raw | ||
- sift_score | ||
- polyphen_score | ||
- Consequence_splice_acceptor_variant | ||
- Consequence_splice_donor_variant | ||
- Consequence_stop_gained | ||
- Consequence_frameshift_variant | ||
- Consequence_stop_lost | ||
- Consequence_start_lost | ||
- Consequence_inframe_insertion | ||
- Consequence_inframe_deletion | ||
- Consequence_missense_variant | ||
- Consequence_protein_altering_variant | ||
- Consequence_splice_region_variant | ||
- condel_score | ||
- DeepSEA_PC_1 | ||
- DeepSEA_PC_2 | ||
- DeepSEA_PC_3 | ||
- DeepSEA_PC_4 | ||
- DeepSEA_PC_5 | ||
- DeepSEA_PC_6 | ||
- PrimateAI_score | ||
- AbSplice_DNA | ||
- DeepRipe_plus_QKI_lip_hg2 | ||
- DeepRipe_plus_QKI_clip_k5 | ||
- DeepRipe_plus_KHDRBS1_clip_k5 | ||
- DeepRipe_plus_ELAVL1_parclip | ||
- DeepRipe_plus_TARDBP_parclip | ||
- DeepRipe_plus_HNRNPD_parclip | ||
- DeepRipe_plus_MBNL1_parclip | ||
- DeepRipe_plus_QKI_parclip | ||
- SpliceAI_delta_score | ||
- alphamissense | ||
gene_file: protein_coding_genes.parquet | ||
min_common_af: | ||
MAF: 0.001 | ||
phenotype_file: phenotypes.parquet | ||
rare_embedding: | ||
config: | ||
annotations: &id001 | ||
- MAF_MB | ||
- CADD_raw | ||
- sift_score | ||
- polyphen_score | ||
- Consequence_splice_acceptor_variant | ||
- Consequence_splice_donor_variant | ||
- Consequence_stop_gained | ||
- Consequence_frameshift_variant | ||
- Consequence_stop_lost | ||
- Consequence_start_lost | ||
- Consequence_inframe_insertion | ||
- Consequence_inframe_deletion | ||
- Consequence_missense_variant | ||
- Consequence_protein_altering_variant | ||
- Consequence_splice_region_variant | ||
- condel_score | ||
- DeepSEA_PC_1 | ||
- DeepSEA_PC_2 | ||
- DeepSEA_PC_3 | ||
- DeepSEA_PC_4 | ||
- DeepSEA_PC_5 | ||
- DeepSEA_PC_6 | ||
- PrimateAI_score | ||
- AbSplice_DNA | ||
- DeepRipe_plus_QKI_lip_hg2 | ||
- DeepRipe_plus_QKI_clip_k5 | ||
- DeepRipe_plus_KHDRBS1_clip_k5 | ||
- DeepRipe_plus_ELAVL1_parclip | ||
- DeepRipe_plus_TARDBP_parclip | ||
- DeepRipe_plus_HNRNPD_parclip | ||
- DeepRipe_plus_MBNL1_parclip | ||
- DeepRipe_plus_QKI_parclip | ||
- SpliceAI_delta_score | ||
- alphamissense | ||
gene_file: protein_coding_genes.parquet | ||
low_memory: true | ||
thresholds: | ||
CADD_PHRED: CADD_PHRED > 5 | ||
MAF: MAF < 1e-3 | ||
verbose: true | ||
type: PaddedAnnotations | ||
use_common_variants: false | ||
use_rare_variants: true | ||
verbose: true | ||
x_phenotypes: &id002 | ||
- age | ||
- age2 | ||
- age_sex | ||
- genetic_sex | ||
- genetic_PC_1 | ||
- genetic_PC_2 | ||
- genetic_PC_3 | ||
- genetic_PC_4 | ||
- genetic_PC_5 | ||
- genetic_PC_6 | ||
- genetic_PC_7 | ||
- genetic_PC_8 | ||
- genetic_PC_9 | ||
- genetic_PC_10 | ||
- genetic_PC_11 | ||
- genetic_PC_12 | ||
- genetic_PC_13 | ||
- genetic_PC_14 | ||
- genetic_PC_15 | ||
- genetic_PC_16 | ||
- genetic_PC_17 | ||
- genetic_PC_18 | ||
- genetic_PC_19 | ||
- genetic_PC_20 | ||
y_transformation: quantile_transform | ||
gt_file: genotypes.h5 | ||
variant_file: variants.parquet | ||
cv_exp: false | ||
deterministic: false | ||
do_scoretest: true | ||
evaluation: | ||
alpha: 0.05 | ||
correction_method: Bonferroni | ||
hyperparameter_optimization: | ||
direction: maximize | ||
n_trials: 1 | ||
sampler: | ||
config: {} | ||
type: TPESampler | ||
model: | ||
checkpoint: combined_agg.pt | ||
config: | ||
activation: LeakyReLU | ||
metrics: | ||
all: | ||
MAE: {} | ||
MSE: {} | ||
PearsonCorrTorch: {} | ||
RSquared: {} | ||
loss: MSE | ||
objective: MSE | ||
objective_mode: min | ||
optimizer: | ||
config: {} | ||
type: AdamW | ||
phi_hidden_dim: 20 | ||
phi_layers: 2 | ||
pool: max | ||
rho_hidden_dim: 10 | ||
rho_layers: 3 | ||
use_sigmoid: true | ||
model_collection: agg_models | ||
type: DeepSet | ||
n_avg_chunks: 1 | ||
n_burden_chunks: 5 | ||
n_regression_chunks: 2 | ||
n_repeats: 30 | ||
phenotypes: | ||
- Apolipoprotein_A | ||
- Apolipoprotein_B | ||
- Calcium | ||
pretrained_model_path: pretrained_models | ||
regenie_exp: false | ||
training: | ||
dataloader_config: | ||
batch_size: 1024 | ||
cache_tensors: true | ||
chunksize: 100 | ||
num_workers: 0 | ||
temp_dir: $TMPDIR/deeprvat_train | ||
drop_n_bags: 0 | ||
min_variant_count: 0 | ||
n_bags: 1 | ||
n_parallel_jobs: 6 | ||
sample_with_replacement: false | ||
train_proportion: 0.8 | ||
training_data: | ||
dataloader_config: | ||
batch_size: 64 | ||
num_workers: 8 | ||
dataset_config: | ||
annotation_file: annotations.parquet | ||
annotations: | ||
- MAF_MB | ||
- MAF | ||
- CADD_PHRED | ||
- CADD_raw | ||
- sift_score | ||
- polyphen_score | ||
- Consequence_splice_acceptor_variant | ||
- Consequence_splice_donor_variant | ||
- Consequence_stop_gained | ||
- Consequence_frameshift_variant | ||
- Consequence_stop_lost | ||
- Consequence_start_lost | ||
- Consequence_inframe_insertion | ||
- Consequence_inframe_deletion | ||
- Consequence_missense_variant | ||
- Consequence_protein_altering_variant | ||
- Consequence_splice_region_variant | ||
- condel_score | ||
- DeepSEA_PC_1 | ||
- DeepSEA_PC_2 | ||
- DeepSEA_PC_3 | ||
- DeepSEA_PC_4 | ||
- DeepSEA_PC_5 | ||
- DeepSEA_PC_6 | ||
- PrimateAI_score | ||
- AbSplice_DNA | ||
- DeepRipe_plus_QKI_lip_hg2 | ||
- DeepRipe_plus_QKI_clip_k5 | ||
- DeepRipe_plus_KHDRBS1_clip_k5 | ||
- DeepRipe_plus_ELAVL1_parclip | ||
- DeepRipe_plus_TARDBP_parclip | ||
- DeepRipe_plus_HNRNPD_parclip | ||
- DeepRipe_plus_MBNL1_parclip | ||
- DeepRipe_plus_QKI_parclip | ||
- SpliceAI_delta_score | ||
- alphamissense | ||
min_common_af: | ||
MAF: 0.01 | ||
phenotype_file: phenotypes.parquet | ||
rare_embedding: | ||
config: | ||
annotations: *id001 | ||
low_memory: true | ||
thresholds: | ||
CADD_PHRED: CADD_PHRED > 5 | ||
MAF: MAF < 1e-2 | ||
verbose: true | ||
type: PaddedAnnotations | ||
use_common_variants: false | ||
use_rare_variants: true | ||
verbose: true | ||
x_phenotypes: *id002 | ||
y_transformation: quantile_transform | ||
gt_file: genotypes.h5 | ||
variant_file: variants.parquet |
80 changes: 80 additions & 0 deletions
80
.../config/association_testing_pretrained/input/deeprvat_input_pretrained_models_config.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,80 @@ | ||
use_pretrained_models: True | ||
pretrained_model_path : pretrained_models | ||
|
||
#Phenotypes to be used only for Association Testing | ||
phenotypes_for_association_testing: | ||
- Apolipoprotein_A | ||
- Apolipoprotein_B | ||
- Calcium | ||
|
||
#File paths of necessary input files to DeepRVAT | ||
gt_filename: genotypes.h5 | ||
variant_filename: variants.parquet | ||
phenotype_filename: phenotypes.parquet | ||
annotation_filename: annotations.parquet | ||
gene_filename: protein_coding_genes.parquet | ||
|
||
covariates: # appears as x_phenotypes in the generated deeprvat_config.yaml | ||
- age | ||
- age2 | ||
- age_sex | ||
- genetic_sex | ||
- genetic_PC_1 | ||
- genetic_PC_2 | ||
- genetic_PC_3 | ||
- genetic_PC_4 | ||
- genetic_PC_5 | ||
- genetic_PC_6 | ||
- genetic_PC_7 | ||
- genetic_PC_8 | ||
- genetic_PC_9 | ||
- genetic_PC_10 | ||
- genetic_PC_11 | ||
- genetic_PC_12 | ||
- genetic_PC_13 | ||
- genetic_PC_14 | ||
- genetic_PC_15 | ||
- genetic_PC_16 | ||
- genetic_PC_17 | ||
- genetic_PC_18 | ||
- genetic_PC_19 | ||
- genetic_PC_20 | ||
|
||
association_testing_data_thresholds: | ||
MAF: "< 1e-3" | ||
CADD_PHRED: "> 5" | ||
|
||
#DeepRVAT model settings | ||
n_repeats: 30 | ||
y_transformation: quantile_transform | ||
|
||
# Results evaluation settings | ||
evaluation: | ||
correction_method: Bonferroni | ||
alpha: 0.05 | ||
|
||
# Subsetting samples for association testing | ||
#sample_files: | ||
# association_testing: association_testing_samples.pkl | ||
|
||
#Additional settings if using the CV pipeline | ||
cv_options: | ||
cv_exp: False | ||
#cv_path: sample_files | ||
#n_folds: 5 | ||
|
||
#Additional settings if using the REGENIE integration | ||
regenie_options: | ||
regenie_exp: False | ||
# gtf_file: gencode.v38.basic.annotation.gtf.gz | ||
# step_1: | ||
# bgen: imputation.bgen | ||
# snplist: imputation.snplist | ||
# bsize: 1000 | ||
# options: | ||
# - "--sample imputation.sample" | ||
# - "--qt" | ||
# step_2: | ||
# bsize: 400 | ||
# options: | ||
# - "--qt" |
Oops, something went wrong.