Skip to content

Commit

Permalink
Add pytests for configuration file generation
Browse files Browse the repository at this point in the history
  • Loading branch information
meyerkm committed Sep 27, 2024
1 parent f05688a commit 33d1798
Show file tree
Hide file tree
Showing 11 changed files with 1,918 additions and 0 deletions.
63 changes: 63 additions & 0 deletions tests/deeprvat/test_config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
import logging
from pprint import pprint
import pandas as pd
import yaml
import pytest
from click.testing import CliRunner

from pathlib import Path
from deeprvat.deeprvat.config import cli as config_cli
from deeprvat.deeprvat.config import create_main_config, load_yaml

# Directory containing this test module; fixture paths are resolved relative to it.
script_dir = Path(__file__).resolve().parents[0]
# Root directory of the configuration fixtures used by the tests in this module.
tests_data_dir = script_dir.joinpath("test_data", "config")

@pytest.mark.parametrize(
    "test_data_name_dir, input_config, clobber",
    [
        ("training_only", "deeprvat_input_training_config.yaml", True),
        ("training_association_testing", "deeprvat_input_config.yaml", True),
        ("training_association_testing_cv", "deeprvat_input_config.yaml", True),
        (
            "association_testing_pretrained_regenie",
            "deeprvat_input_pretrained_models_config.yaml",
            True,
        ),
        (
            "association_testing_pretrained",
            "deeprvat_input_pretrained_models_config.yaml",
            True,
        ),
    ],
)
def test_create_main_config(test_data_name_dir, input_config, clobber, tmp_path):
    """Check that ``create_main_config`` reproduces the expected config file.

    For each scenario directory under ``tests_data_dir`` the input config is
    passed to ``create_main_config`` with ``tmp_path`` as the output directory.
    The resulting ``deeprvat_config.yaml`` must exist and, once loaded with
    ``load_yaml``, compare equal to the scenario's expected config.

    Args:
        test_data_name_dir: Name of the scenario directory below
            ``tests_data_dir``.
        input_config: File name of the input config inside the scenario's
            ``input`` directory.
        clobber: Forwarded unchanged as the third positional argument of
            ``create_main_config`` (presumably permits overwriting an
            existing output file -- confirm against its implementation).
        tmp_path: pytest fixture providing a per-test temporary directory.
    """
    scenario_dir = tests_data_dir / test_data_name_dir
    source_config = scenario_dir / "input" / input_config
    reference_config = scenario_dir / "expected/deeprvat_config.yaml"
    produced_config = tmp_path / "deeprvat_config.yaml"

    create_main_config(source_config.as_posix(), tmp_path.as_posix(), clobber)

    # The generator must have written its output into the temporary directory.
    assert produced_config.exists()

    expected_full_config = load_yaml(reference_config.as_posix())
    generated_config = load_yaml(produced_config)

    # Equality on the loaded objects compares the nested structure recursively.
    assert generated_config == expected_full_config



Original file line number Diff line number Diff line change
@@ -0,0 +1,243 @@
association_testing_data:
dataloader_config:
batch_size: 16
num_workers: 10
dataset_config:
annotation_file: annotations.parquet
annotations:
- MAF_MB
- MAF
- CADD_PHRED
- CADD_raw
- sift_score
- polyphen_score
- Consequence_splice_acceptor_variant
- Consequence_splice_donor_variant
- Consequence_stop_gained
- Consequence_frameshift_variant
- Consequence_stop_lost
- Consequence_start_lost
- Consequence_inframe_insertion
- Consequence_inframe_deletion
- Consequence_missense_variant
- Consequence_protein_altering_variant
- Consequence_splice_region_variant
- condel_score
- DeepSEA_PC_1
- DeepSEA_PC_2
- DeepSEA_PC_3
- DeepSEA_PC_4
- DeepSEA_PC_5
- DeepSEA_PC_6
- PrimateAI_score
- AbSplice_DNA
- DeepRipe_plus_QKI_lip_hg2
- DeepRipe_plus_QKI_clip_k5
- DeepRipe_plus_KHDRBS1_clip_k5
- DeepRipe_plus_ELAVL1_parclip
- DeepRipe_plus_TARDBP_parclip
- DeepRipe_plus_HNRNPD_parclip
- DeepRipe_plus_MBNL1_parclip
- DeepRipe_plus_QKI_parclip
- SpliceAI_delta_score
- alphamissense
gene_file: protein_coding_genes.parquet
min_common_af:
MAF: 0.001
phenotype_file: phenotypes.parquet
rare_embedding:
config:
annotations: &id001
- MAF_MB
- CADD_raw
- sift_score
- polyphen_score
- Consequence_splice_acceptor_variant
- Consequence_splice_donor_variant
- Consequence_stop_gained
- Consequence_frameshift_variant
- Consequence_stop_lost
- Consequence_start_lost
- Consequence_inframe_insertion
- Consequence_inframe_deletion
- Consequence_missense_variant
- Consequence_protein_altering_variant
- Consequence_splice_region_variant
- condel_score
- DeepSEA_PC_1
- DeepSEA_PC_2
- DeepSEA_PC_3
- DeepSEA_PC_4
- DeepSEA_PC_5
- DeepSEA_PC_6
- PrimateAI_score
- AbSplice_DNA
- DeepRipe_plus_QKI_lip_hg2
- DeepRipe_plus_QKI_clip_k5
- DeepRipe_plus_KHDRBS1_clip_k5
- DeepRipe_plus_ELAVL1_parclip
- DeepRipe_plus_TARDBP_parclip
- DeepRipe_plus_HNRNPD_parclip
- DeepRipe_plus_MBNL1_parclip
- DeepRipe_plus_QKI_parclip
- SpliceAI_delta_score
- alphamissense
gene_file: protein_coding_genes.parquet
low_memory: true
thresholds:
CADD_PHRED: CADD_PHRED > 5
MAF: MAF < 1e-3
verbose: true
type: PaddedAnnotations
use_common_variants: false
use_rare_variants: true
verbose: true
x_phenotypes: &id002
- age
- age2
- age_sex
- genetic_sex
- genetic_PC_1
- genetic_PC_2
- genetic_PC_3
- genetic_PC_4
- genetic_PC_5
- genetic_PC_6
- genetic_PC_7
- genetic_PC_8
- genetic_PC_9
- genetic_PC_10
- genetic_PC_11
- genetic_PC_12
- genetic_PC_13
- genetic_PC_14
- genetic_PC_15
- genetic_PC_16
- genetic_PC_17
- genetic_PC_18
- genetic_PC_19
- genetic_PC_20
y_transformation: quantile_transform
gt_file: genotypes.h5
variant_file: variants.parquet
cv_exp: false
deterministic: false
do_scoretest: true
evaluation:
alpha: 0.05
correction_method: Bonferroni
hyperparameter_optimization:
direction: maximize
n_trials: 1
sampler:
config: {}
type: TPESampler
model:
checkpoint: combined_agg.pt
config:
activation: LeakyReLU
metrics:
all:
MAE: {}
MSE: {}
PearsonCorrTorch: {}
RSquared: {}
loss: MSE
objective: MSE
objective_mode: min
optimizer:
config: {}
type: AdamW
phi_hidden_dim: 20
phi_layers: 2
pool: max
rho_hidden_dim: 10
rho_layers: 3
use_sigmoid: true
model_collection: agg_models
type: DeepSet
n_avg_chunks: 1
n_burden_chunks: 5
n_regression_chunks: 2
n_repeats: 30
phenotypes:
- Apolipoprotein_A
- Apolipoprotein_B
- Calcium
pretrained_model_path: pretrained_models
regenie_exp: false
training:
dataloader_config:
batch_size: 1024
cache_tensors: true
chunksize: 100
num_workers: 0
temp_dir: $TMPDIR/deeprvat_train
drop_n_bags: 0
min_variant_count: 0
n_bags: 1
n_parallel_jobs: 6
sample_with_replacement: false
train_proportion: 0.8
training_data:
dataloader_config:
batch_size: 64
num_workers: 8
dataset_config:
annotation_file: annotations.parquet
annotations:
- MAF_MB
- MAF
- CADD_PHRED
- CADD_raw
- sift_score
- polyphen_score
- Consequence_splice_acceptor_variant
- Consequence_splice_donor_variant
- Consequence_stop_gained
- Consequence_frameshift_variant
- Consequence_stop_lost
- Consequence_start_lost
- Consequence_inframe_insertion
- Consequence_inframe_deletion
- Consequence_missense_variant
- Consequence_protein_altering_variant
- Consequence_splice_region_variant
- condel_score
- DeepSEA_PC_1
- DeepSEA_PC_2
- DeepSEA_PC_3
- DeepSEA_PC_4
- DeepSEA_PC_5
- DeepSEA_PC_6
- PrimateAI_score
- AbSplice_DNA
- DeepRipe_plus_QKI_lip_hg2
- DeepRipe_plus_QKI_clip_k5
- DeepRipe_plus_KHDRBS1_clip_k5
- DeepRipe_plus_ELAVL1_parclip
- DeepRipe_plus_TARDBP_parclip
- DeepRipe_plus_HNRNPD_parclip
- DeepRipe_plus_MBNL1_parclip
- DeepRipe_plus_QKI_parclip
- SpliceAI_delta_score
- alphamissense
min_common_af:
MAF: 0.01
phenotype_file: phenotypes.parquet
rare_embedding:
config:
annotations: *id001
low_memory: true
thresholds:
CADD_PHRED: CADD_PHRED > 5
MAF: MAF < 1e-2
verbose: true
type: PaddedAnnotations
use_common_variants: false
use_rare_variants: true
verbose: true
x_phenotypes: *id002
y_transformation: quantile_transform
gt_file: genotypes.h5
variant_file: variants.parquet
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
use_pretrained_models: True
pretrained_model_path : pretrained_models

#Phenotypes to be used only for Association Testing
phenotypes_for_association_testing:
- Apolipoprotein_A
- Apolipoprotein_B
- Calcium

# Paths to the input files required by DeepRVAT
gt_filename: genotypes.h5
variant_filename: variants.parquet
phenotype_filename: phenotypes.parquet
annotation_filename: annotations.parquet
gene_filename: protein_coding_genes.parquet

covariates: # corresponds to x_phenotypes in the generated config
- age
- age2
- age_sex
- genetic_sex
- genetic_PC_1
- genetic_PC_2
- genetic_PC_3
- genetic_PC_4
- genetic_PC_5
- genetic_PC_6
- genetic_PC_7
- genetic_PC_8
- genetic_PC_9
- genetic_PC_10
- genetic_PC_11
- genetic_PC_12
- genetic_PC_13
- genetic_PC_14
- genetic_PC_15
- genetic_PC_16
- genetic_PC_17
- genetic_PC_18
- genetic_PC_19
- genetic_PC_20

association_testing_data_thresholds:
MAF: "< 1e-3"
CADD_PHRED: "> 5"

#DeepRVAT model settings
n_repeats: 30
y_transformation: quantile_transform

# Results evaluation settings
evaluation:
correction_method: Bonferroni
alpha: 0.05

# Subsetting samples for association testing
#sample_files:
# association_testing: association_testing_samples.pkl

#Additional settings if using the CV pipeline
cv_options:
cv_exp: False
#cv_path: sample_files
#n_folds: 5

#Additional settings if using the REGENIE integration
regenie_options:
regenie_exp: False
# gtf_file: gencode.v38.basic.annotation.gtf.gz
# step_1:
# bgen: imputation.bgen
# snplist: imputation.snplist
# bsize: 1000
# options:
# - "--sample imputation.sample"
# - "--qt"
# step_2:
# bsize: 400
# options:
# - "--qt"
Loading

0 comments on commit 33d1798

Please sign in to comment.