Skip to content

Commit

Permalink
Feature binary traits (#27)
Browse files Browse the repository at this point in the history
* necessary changes to allow for testing of binary traits

* update config

* fixup! Format Python code with psf/black pull_request

* fix gene file in config

* fix config

* adapting conifg to work with tests

* address comments in pull request

* address comments from pull request

* fixup! Format Python code with psf/black pull_request

* addressing comments from pull request

* addressing comments from pull request

* fixup! Format Python code with psf/black pull_request

* address comments from pull request

* fixup! Format Python code with psf/black pull_request

* adding max_n_markers argument

* require MAF column argument

* fixup! Format Python code with psf/black pull_request

* reformatting

---------

Co-authored-by: PMBio <PMBio@users.noreply.github.com>
  • Loading branch information
HolEv and PMBio authored Oct 13, 2023
1 parent 7215b4e commit 955f535
Show file tree
Hide file tree
Showing 5 changed files with 200 additions and 46 deletions.
15 changes: 14 additions & 1 deletion deeprvat/data/dense_gt.py
Original file line number Diff line number Diff line change
Expand Up @@ -409,7 +409,18 @@ def transform_data(self):
self.phenotype_df[col] = rng.permutation(
self.phenotype_df[col].to_numpy()
)

if len(self.y_phenotypes) > 0:
unique_y_val = self.phenotype_df[self.y_phenotypes[0]].unique()
n_unique_y_val = np.count_nonzero(~np.isnan(unique_y_val))
logger.info(f"unique y values {unique_y_val}")
logger.info(n_unique_y_val)
else:
n_unique_y_val = 0
if n_unique_y_val == 2:
logger.warning(
"Not applying y transformation because y only has two values and seems to be binary"
)
self.y_transformation = None
if self.y_transformation is not None:
if self.y_transformation == "standardize":
logger.debug(" Standardizing target phenotype")
Expand All @@ -425,6 +436,8 @@ def transform_data(self):
)
else:
raise ValueError(f"Unknown y_transformation: {self.y_transformation}")
else:
logger.warning("Not transforming phenotype")

def setup_annotations(
self,
Expand Down
14 changes: 11 additions & 3 deletions deeprvat/deeprvat/associate.py
Original file line number Diff line number Diff line change
Expand Up @@ -496,8 +496,11 @@ def regress_on_gene_scoretest(gene: str, burdens: np.ndarray, model_score):
f"gene {gene}, p-value: {pv}, using saddle instead."
)
pv = model_score.pv_alt_model(burdens, method="saddle")

beta = model_score.coef(burdens)["beta"][0, 0]
# beta only for linear models
try:
beta = model_score.coef(burdens)["beta"][0, 0]
except:
beta = None

genes_params_pvalues = ([], [], [])
genes_params_pvalues[0].append(gene)
Expand Down Expand Up @@ -576,7 +579,12 @@ def regress_(
logger.info(f"X shape: {X.shape}, Y shape: {y.shape}")

# compute null_model for score test
model_score = scoretest.ScoretestNoK(y, X)
if len(np.unique(y)) == 2:
logger.info("Fitting binary model since only found two distinct y values")
model_score = scoretest.ScoretestLogit(y, X)
else:
logger.info("Fitting linear model")
model_score = scoretest.ScoretestNoK(y, X)
genes_betas_pvals = [
regress_on_gene_scoretest(gene, burdens[mask, i], model_score)
for i, gene in tqdm(
Expand Down
18 changes: 16 additions & 2 deletions deeprvat/seed_gene_discovery/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,20 @@ phenotypes:
# - Platelet_crit
# - Platelet_distribution_width
# - Red_blood_cell_erythrocyte_count

# - Body_mass_index_BMI
# - Glucose
# - Vitamin_D
# - Albumin
# - Total_protein
# - Cystatin_C
# - Gamma_glutamyltransferase
# - Alkaline_phosphatase
# - Creatinine
# - Whole_body_fat_free_mass
# - Forced_expiratory_volume_in_1_second_FEV1
# - Glycated_haemoglobin_HbA1c
# - WHR_Body_mass_index_BMI_corrected

variant_types:
- missense
- plof
Expand All @@ -42,7 +55,7 @@ test_config:
neglect_homozygous: False
collapse_method: sum #collapsing method for burde
var_weight_function: beta_maf

min_mac: 10
variant_file: variants.parquet

data:
Expand Down Expand Up @@ -99,3 +112,4 @@ data:
num_workers: 10
#batch_size: 20


95 changes: 72 additions & 23 deletions deeprvat/seed_gene_discovery/seed_gene_discovery.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
import time
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
import copy


import click
import numpy as np
Expand All @@ -18,7 +20,7 @@
from tqdm import tqdm

from deeprvat.data import DenseGTDataset
from seak.scoretest import ScoretestNoK
from seak import scoretest

logging.basicConfig(
format="[%(asctime)s] %(levelname)s:%(name)s: %(message)s",
Expand All @@ -38,6 +40,14 @@ def replace_in_array(arr, old_val, new_val):
return np.where(arr == old_val, new_val, arr)


def get_caf(G):
# get the cumulative allele frequency
ac = G.sum(axis=0) # allele count of each variant
af = ac / (G.shape[0] * 2) # allele frequency of each variant
caf = af.sum()
return caf


# return mask
def save_burdens(GW_list, GW_full_list, split, chunk, out_dir):
burdens_path = Path(f"{out_dir}/burdens")
Expand Down Expand Up @@ -178,7 +188,11 @@ def get_anno(
def call_score(GV, null_model_score, pval_dict, test_type):
# score test
# p-value for the score-test
start_time = time.time()
pv = null_model_score.pv_alt_model(GV)
end_time = time.time()
time_diff = end_time - start_time
pval_dict["time"] = time_diff
logger.info(f"p-value: {pv}")
if pv < 0.0:
logger.warning(
Expand All @@ -195,10 +209,15 @@ def call_score(GV, null_model_score, pval_dict, test_type):
if pv < 1e-3 and test_type == "burden":
logger.info("Computing regression coefficient")
# if gene is quite significant get the regression coefficient + SE
beta = null_model_score.coef(GV)
logger.info(f"Regression coefficient: {beta}")
pval_dict["beta"] = beta["beta"][0, 0]
pval_dict["betaSd"] = np.sqrt(beta["var_beta"][0, 0])
# only works for quantitative traits
try:
beta = null_model_score.coef(GV)
logger.info(f"Regression coefficient: {beta}")
pval_dict["beta"] = beta["beta"][0, 0]
pval_dict["betaSd"] = np.sqrt(beta["var_beta"][0, 0])
except:
pval_dict["beta"] = None
pval_dict["betaSd"] = None
return pval_dict


Expand All @@ -207,13 +226,14 @@ def test_gene(
G_full: spmatrix,
gene: int,
grouped_annotations: pd.DataFrame,
dataset: DenseGTDataset,
Y,
weight_cols: List[str],
null_model_score: ScoretestNoK,
null_model_score: scoretest.ScoretestNoK,
test_config: Dict,
var_type,
test_type,
maf_col,
min_mac,
) -> Dict[str, Any]:
# Find variants present in gene
# Convert sparse genotype to CSC
Expand All @@ -232,11 +252,16 @@ def test_gene(
# GET expected allele count (EAC) as in Karczewski et al. 2022/Genebass
vars_per_sample = np.sum(G, axis=1)
samples_with_variant = vars_per_sample[vars_per_sample > 0].shape[0]
EAC = np.sum(vars_per_sample)
if len(np.unique(Y)) == 2:
n_cases = (Y > 0).sum()
else:
n_cases = Y.shape[0]
EAC = get_caf(G) * n_cases

pval_dict = {}

pval_dict["EAC"] = EAC
pval_dict["n_cases"] = n_cases
pval_dict["gene"] = gene
pval_dict["pval"] = np.nan
pval_dict["EAC_filtered"] = np.nan
Expand All @@ -247,11 +272,11 @@ def test_gene(
pval_dict["time"] = np.nan

var_weight_function = test_config.get("var_weight_function", "sift_polyphen")
max_n_markers = test_config.get("max_n_markers", 5000)
# skips genes with more than max_n_markers qualifying variants

logger.info(f"Using function {var_weight_function} for variant weighting")

# keep backwards compatibility

(
weights,
_,
Expand All @@ -272,12 +297,12 @@ def test_gene(
f"Number of variants after thresholding using threshold {variant_weight_th}: {len(pos)}"
)
pval_dict["n_QV"] = len(pos)

if len(pos) > 0:
pval_dict["markers_after_mac_collapsing"] = len(pos)
if (len(pos) > 0) & (len(pos) < max_n_markers):
G_f = G[:, pos]
EAC_filtered = np.sum(np.sum(G_f, axis=1))
EAC_filtered = EAC = get_caf(G_f) * n_cases
pval_dict["EAC_filtered"] = EAC_filtered

MAC = G_f.sum(axis=0)
count = G_f[G_f == 2].shape[0]

# confirm that variants we include are rare variants
Expand All @@ -303,11 +328,28 @@ def test_gene(
pval_dict["n_cluster"] = GW.shape[1]

### COLLAPSE kernel if doing burden test

collapse_ultra_rare = True
if test_type == "skat":
logger.info("Running Skat test")
GW = GW

if collapse_ultra_rare:
logger.info(f"Max Collapsing variants with MAC <= {min_mac}")
MAC_mask = MAC <= min_mac
if MAC_mask.sum() > 0:
logger.info(f"Number of collapsed positions: {MAC_mask.sum()}")
GW_collapse = copy.deepcopy(GW)
GW_collapse = GW_collapse[:, MAC_mask].max(axis=1).reshape(-1, 1)
GW = GW[:, ~MAC_mask]
GW = np.hstack((GW_collapse, GW))
logger.info(f"GW shape {GW.shape}")
else:
logger.info(
f"No ultra rare variants to collapse ({MAC_mask.sum()})"
)
GW = GW
else:
GW = GW

pval_dict["markers_after_mac_collapsing"] = GW.shape[1]
if test_type == "burden":
collapse_method = test_config.get("collapse_method", "binary")
logger.info(f"Running burden test with collapsing method {collapse_method}")
Expand Down Expand Up @@ -335,14 +377,18 @@ def run_association_(
) -> pd.DataFrame:
# initialize the null models
# ScoretestNoK automatically adds a bias column if not present
null_model_score = ScoretestNoK(Y, X)
if len(np.unique(Y)) == 2:
print("Fitting binary model since only found two distinct y values")
null_model_score = scoretest.ScoretestLogit(Y, X)
else:
null_model_score = scoretest.ScoretestNoK(Y, X)
stats = []
GW_list = {}
GW_full_list = {}
time_list_inner = {}
weight_cols = config.get("weight_cols", [])
logger.info(f"Testing with this config: {config['test_config']}")

min_mac = config["test_config"].get("min_mac", 0)
# Get column with minor allele frequency
annotations = config["data"]["dataset_config"]["annotations"]
maf_col = [
Expand All @@ -360,13 +406,14 @@ def run_association_(
G_full,
gene,
grouped_annotations,
dataset,
Y,
weight_cols,
null_model_score,
config["test_config"],
var_type,
test_type,
maf_col,
min_mac,
)
if persist_burdens:
GW_list[gene] = GW
Expand Down Expand Up @@ -421,7 +468,7 @@ def update_config(
simulated_phenotype_file: str,
variant_type: Optional[str],
rare_maf: Optional[float],
maf_column: Optional[str],
maf_column: str,
new_config_file: str,
):
with open(old_config_file) as f:
Expand All @@ -431,6 +478,7 @@ def update_config(
config["data"]["dataset_config"][
"sim_phenotype_file"
] = simulated_phenotype_file
logger.info(f"Reading MAF column from column {maf_column}")

if phenotype is not None:
config["data"]["dataset_config"]["y_phenotypes"] = [phenotype]
Expand Down Expand Up @@ -645,9 +693,10 @@ def run_association(
exploded_annotations = (
dataset.annotation_df.query("id in @all_variants")
.explode("gene_ids")
.reset_index()
.drop_duplicates()
) # row can be duplicated if a variant is assigned to a gene multiple times

.set_index("id")
)
grouped_annotations = exploded_annotations.groupby("gene_ids")
gene_ids = pd.read_parquet(dataset.gene_file, columns=["id"])["id"].to_list()
gene_ids = list(
Expand Down
Loading

0 comments on commit 955f535

Please sign in to comment.