Skip to content

Commit

Permalink
Merge branch 'feature-binary-traits' of https://github.com/PMBio/deep…
Browse files Browse the repository at this point in the history
…rvat into feature-binary-traits
  • Loading branch information
HolEv committed Oct 10, 2023
2 parents 2f7b846 + dfd821d commit da5c6d7
Show file tree
Hide file tree
Showing 3 changed files with 41 additions and 30 deletions.
9 changes: 6 additions & 3 deletions deeprvat/data/dense_gt.py
Original file line number Diff line number Diff line change
Expand Up @@ -412,12 +412,14 @@ def transform_data(self):
if len(self.y_phenotypes) > 0:
unique_y_val = self.phenotype_df[self.y_phenotypes[0]].unique()
n_unique_y_val = np.count_nonzero(~np.isnan(unique_y_val))
logger.info(f'unique y values {unique_y_val}')
logger.info(f"unique y values {unique_y_val}")
logger.info(n_unique_y_val)
else:
n_unique_y_val = 0
if n_unique_y_val == 2:
logger.info('Not applying y transformation because y only has two values and seems to be binary')
logger.info(
"Not applying y transformation because y only has two values and seems to be binary"
)
self.y_transformation = None
if self.y_transformation is not None:
if self.y_transformation == "standardize":
Expand All @@ -435,7 +437,8 @@ def transform_data(self):
else:
raise ValueError(f"Unknown y_transformation: {self.y_transformation}")
else:
logger.info('Not tranforming phenotype')
logger.info("Not tranforming phenotype")

def setup_annotations(
self,
annotation_file: Optional[str],
Expand Down
8 changes: 5 additions & 3 deletions deeprvat/deeprvat/associate.py
Original file line number Diff line number Diff line change
Expand Up @@ -496,7 +496,7 @@ def regress_on_gene_scoretest(gene: str, burdens: np.ndarray, model_score):
f"gene {gene}, p-value: {pv}, using saddle instead."
)
pv = model_score.pv_alt_model(burdens, method="saddle")
#beta only for linear models
# beta only for linear models
try:
beta = model_score.coef(burdens)["beta"][0, 0]
except:
Expand Down Expand Up @@ -580,10 +580,12 @@ def regress_(

# compute null_model for score test
if len(np.unique(y)) == 2:
logger.warning('Fitting binary model since only found two distinct y values')
logger.warning(
"Fitting binary model since only found two distinct y values"
)
model_score = scoretest.ScoretestLogit(y, X)
else:
logger.warning('Fitting linear model')
logger.warning("Fitting linear model")
model_score = scoretest.ScoretestNoK(y, X)
genes_betas_pvals = [
regress_on_gene_scoretest(gene, burdens[mask, i], model_score)
Expand Down
54 changes: 30 additions & 24 deletions deeprvat/seed_gene_discovery/seed_gene_discovery.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,13 +38,16 @@ class GotNone(Exception):

def replace_in_array(arr, old_val, new_val):
return np.where(arr == old_val, new_val, arr)


def get_caf(G):
#get the cumulative allele frequency
ac = G.sum(axis = 0) #allele count of each variant
af = ac/(G.shape[0] * 2) #allele frequency of each variant
# get the cumulative allele frequency
ac = G.sum(axis=0) # allele count of each variant
af = ac / (G.shape[0] * 2) # allele frequency of each variant
caf = af.sum()
return caf


# return mask
def save_burdens(GW_list, GW_full_list, split, chunk, out_dir):
burdens_path = Path(f"{out_dir}/burdens")
Expand Down Expand Up @@ -189,7 +192,7 @@ def call_score(GV, null_model_score, pval_dict, test_type):
pv = null_model_score.pv_alt_model(GV)
end_time = time.time()
time_diff = end_time - start_time
pval_dict['time'] = time_diff
pval_dict["time"] = time_diff
logger.info(f"p-value: {pv}")
if pv < 0.0:
logger.warning(
Expand Down Expand Up @@ -230,7 +233,7 @@ def test_gene(
var_type,
test_type,
maf_col,
min_mac
min_mac,
) -> Dict[str, Any]:
# Find variants present in gene
# Convert sparse genotype to CSC
Expand All @@ -252,7 +255,7 @@ def test_gene(
if len(np.unique(Y)) == 2:
n_cases = (Y > 0).sum()
else:
n_cases = Y.shape[0]
n_cases = Y.shape[0]
EAC = get_caf(G) * n_cases

pval_dict = {}
Expand Down Expand Up @@ -295,13 +298,13 @@ def test_gene(
)
pval_dict["n_QV"] = len(pos)
pval_dict["markers_after_mac_collapsing"] = len(pos)
if ((len(pos) > 0) & (len(pos) < 5000)):
#there is only one gene with > 5000 missense variants (Titin, 10k missense)
if (len(pos) > 0) & (len(pos) < 5000):
# there is only one gene with > 5000 missense variants (Titin, 10k missense)
# which always causes a super large memory demand therefore we exclude it
G_f = G[:, pos]
EAC_filtered = EAC = get_caf(G_f) * n_cases
pval_dict["EAC_filtered"] = EAC_filtered
MAC = G_f.sum(axis = 0)
MAC = G_f.sum(axis=0)
count = G_f[G_f == 2].shape[0]

# confirm that variants we include are rare variants
Expand Down Expand Up @@ -331,17 +334,19 @@ def test_gene(
if test_type == "skat":
logger.info("Running Skat test")
if collapse_ultra_rare:
logger.info(f'Max Collapsing variants with MAC <= {min_mac}')
logger.info(f"Max Collapsing variants with MAC <= {min_mac}")
MAC_mask = MAC <= min_mac
if MAC_mask.sum() > 0:
logger.info(f'Number of collapsed positions: {MAC_mask.sum()}')
logger.info(f"Number of collapsed positions: {MAC_mask.sum()}")
GW_collapse = copy.deepcopy(GW)
GW_collapse = GW_collapse[:,MAC_mask].max(axis = 1).reshape(-1, 1)
GW = GW[:,~MAC_mask]
GW_collapse = GW_collapse[:, MAC_mask].max(axis=1).reshape(-1, 1)
GW = GW[:, ~MAC_mask]
GW = np.hstack((GW_collapse, GW))
logger.info(f'GW shape {GW.shape}')
logger.info(f"GW shape {GW.shape}")
else:
logger.info(f'No ultra rare variants to collapse ({MAC_mask.sum()})')
logger.info(
f"No ultra rare variants to collapse ({MAC_mask.sum()})"
)
GW = GW
else:
GW = GW
Expand All @@ -357,7 +362,7 @@ def test_gene(
else:
GW = GW_full = np.zeros(G.shape[0])

return pval_dict, GW, GW_full
return pval_dict, GW, GW_full


def run_association_(
Expand All @@ -375,7 +380,7 @@ def run_association_(
# initialize the null models
# ScoretestNoK automatically adds a bias column if not present
if len(np.unique(Y)) == 2:
print('Fitting binary model since only found two distinct y values')
print("Fitting binary model since only found two distinct y values")
null_model_score = scoretest.ScoretestLogit(Y, X)
else:
null_model_score = scoretest.ScoretestNoK(Y, X)
Expand All @@ -385,7 +390,7 @@ def run_association_(
time_list_inner = {}
weight_cols = config.get("weight_cols", [])
logger.info(f"Testing with this config: {config['test_config']}")
min_mac = config['test_config'].get('min_mac', 0)
min_mac = config["test_config"].get("min_mac", 0)
# Get column with minor allele frequency
annotations = config["data"]["dataset_config"]["annotations"]
maf_col = [
Expand All @@ -410,7 +415,7 @@ def run_association_(
var_type,
test_type,
maf_col,
min_mac
min_mac,
)
if persist_burdens:
GW_list[gene] = GW
Expand Down Expand Up @@ -476,8 +481,8 @@ def update_config(
"sim_phenotype_file"
] = simulated_phenotype_file
if maf_column is None:
annotations = config['data']["dataset_config"]['annotations']
af_pattern = re.compile(r'.*(_MAF|_AF|MAF)\b')
annotations = config["data"]["dataset_config"]["annotations"]
af_pattern = re.compile(r".*(_MAF|_AF|MAF)\b")
rare_maf_col = [s for s in annotations if af_pattern.match(s)]
assert len(rare_maf_col) == 1
maf_column = rare_maf_col[0]
Expand Down Expand Up @@ -694,10 +699,11 @@ def run_association(
logger.info("Grouping variants by gene")
exploded_annotations = (
dataset.annotation_df.query("id in @all_variants")
.explode('gene_ids').reset_index()
.explode("gene_ids")
.reset_index()
.drop_duplicates()
.set_index('id')
)
.set_index("id")
)
grouped_annotations = exploded_annotations.groupby("gene_ids")
gene_ids = pd.read_parquet(dataset.gene_file, columns=["id"])["id"].to_list()
gene_ids = list(
Expand Down

0 comments on commit da5c6d7

Please sign in to comment.