diff --git a/deeprvat/data/dense_gt.py b/deeprvat/data/dense_gt.py index bf4a7339..d437e251 100644 --- a/deeprvat/data/dense_gt.py +++ b/deeprvat/data/dense_gt.py @@ -412,12 +412,14 @@ def transform_data(self): if len(self.y_phenotypes) > 0: unique_y_val = self.phenotype_df[self.y_phenotypes[0]].unique() n_unique_y_val = np.count_nonzero(~np.isnan(unique_y_val)) - logger.info(f'unique y values {unique_y_val}') + logger.info(f"unique y values {unique_y_val}") logger.info(n_unique_y_val) else: n_unique_y_val = 0 if n_unique_y_val == 2: - logger.info('Not applying y transformation because y only has two values and seems to be binary') + logger.info( + "Not applying y transformation because y only has two values and seems to be binary" + ) self.y_transformation = None if self.y_transformation is not None: if self.y_transformation == "standardize": @@ -435,7 +437,8 @@ def transform_data(self): else: raise ValueError(f"Unknown y_transformation: {self.y_transformation}") else: - logger.info('Not tranforming phenotype') + logger.info("Not tranforming phenotype") + def setup_annotations( self, annotation_file: Optional[str], diff --git a/deeprvat/deeprvat/associate.py b/deeprvat/deeprvat/associate.py index 142a97e0..5e5e845c 100644 --- a/deeprvat/deeprvat/associate.py +++ b/deeprvat/deeprvat/associate.py @@ -496,7 +496,7 @@ def regress_on_gene_scoretest(gene: str, burdens: np.ndarray, model_score): f"gene {gene}, p-value: {pv}, using saddle instead." ) pv = model_score.pv_alt_model(burdens, method="saddle") - #beta only for linear models + # beta only for linear models try: beta = model_score.coef(burdens)["beta"][0, 0] except: @@ -580,10 +580,12 @@ def regress_( # compute null_model for score test if len(np.unique(y)) == 2: - logger.warning('Fitting binary model since only found two distinct y values') + logger.warning( + "Fitting binary model since only found two distinct y values" + ) model_score = scoretest.ScoretestLogit(y, X) else: - logger.warning('Fitting linear model') + logger.warning("Fitting linear model") model_score = scoretest.ScoretestNoK(y, X) genes_betas_pvals = [ regress_on_gene_scoretest(gene, burdens[mask, i], model_score) diff --git a/deeprvat/seed_gene_discovery/seed_gene_discovery.py b/deeprvat/seed_gene_discovery/seed_gene_discovery.py index 180a3da8..974635d7 100644 --- a/deeprvat/seed_gene_discovery/seed_gene_discovery.py +++ b/deeprvat/seed_gene_discovery/seed_gene_discovery.py @@ -38,13 +38,16 @@ class GotNone(Exception): def replace_in_array(arr, old_val, new_val): return np.where(arr == old_val, new_val, arr) + + def get_caf(G): - #get the cumulative allele frequency - ac = G.sum(axis = 0) #allele count of each variant - af = ac/(G.shape[0] * 2) #allele frequency of each variant + # get the cumulative allele frequency + ac = G.sum(axis=0) # allele count of each variant + af = ac / (G.shape[0] * 2) # allele frequency of each variant caf = af.sum() return caf + # return mask def save_burdens(GW_list, GW_full_list, split, chunk, out_dir): burdens_path = Path(f"{out_dir}/burdens") @@ -189,7 +192,7 @@ def call_score(GV, null_model_score, pval_dict, test_type): pv = null_model_score.pv_alt_model(GV) end_time = time.time() time_diff = end_time - start_time - pval_dict['time'] = time_diff + pval_dict["time"] = time_diff logger.info(f"p-value: {pv}") if pv < 0.0: logger.warning( @@ -230,7 +233,7 @@ def test_gene( var_type, test_type, maf_col, - min_mac + min_mac, ) -> Dict[str, Any]: # Find variants present in gene # Convert sparse genotype to CSC @@ -252,7 +255,7 @@ def test_gene( if len(np.unique(Y)) == 2: n_cases = (Y > 0).sum() else: - n_cases = Y.shape[0] + n_cases = Y.shape[0] EAC = get_caf(G) * n_cases pval_dict = {} @@ -295,13 +298,13 @@ def test_gene( ) pval_dict["n_QV"] = len(pos) pval_dict["markers_after_mac_collapsing"] = len(pos) - if ((len(pos) > 0) & (len(pos) < 5000)): - #there is only one gene with > 5000 missense variants (Titin, 10k missense) + if (len(pos) > 0) & (len(pos) < 5000): + # there is only one gene with > 5000 missense variants (Titin, 10k missense) # which always causes a super large memory demand therefore we exclude it G_f = G[:, pos] EAC_filtered = EAC = get_caf(G_f) * n_cases pval_dict["EAC_filtered"] = EAC_filtered - MAC = G_f.sum(axis = 0) + MAC = G_f.sum(axis=0) count = G_f[G_f == 2].shape[0] # confirm that variants we include are rare variants @@ -331,17 +334,19 @@ def test_gene( if test_type == "skat": logger.info("Running Skat test") if collapse_ultra_rare: - logger.info(f'Max Collapsing variants with MAC <= {min_mac}') + logger.info(f"Max Collapsing variants with MAC <= {min_mac}") MAC_mask = MAC <= min_mac if MAC_mask.sum() > 0: - logger.info(f'Number of collapsed positions: {MAC_mask.sum()}') + logger.info(f"Number of collapsed positions: {MAC_mask.sum()}") GW_collapse = copy.deepcopy(GW) - GW_collapse = GW_collapse[:,MAC_mask].max(axis = 1).reshape(-1, 1) - GW = GW[:,~MAC_mask] + GW_collapse = GW_collapse[:, MAC_mask].max(axis=1).reshape(-1, 1) + GW = GW[:, ~MAC_mask] GW = np.hstack((GW_collapse, GW)) - logger.info(f'GW shape {GW.shape}') + logger.info(f"GW shape {GW.shape}") else: - logger.info(f'No ultra rare variants to collapse ({MAC_mask.sum()})') + logger.info( + f"No ultra rare variants to collapse ({MAC_mask.sum()})" + ) GW = GW else: GW = GW @@ -357,7 +362,7 @@ def test_gene( else: GW = GW_full = np.zeros(G.shape[0]) - return pval_dict, GW, GW_full + return pval_dict, GW, GW_full def run_association_( @@ -375,7 +380,7 @@ def run_association_( # initialize the null models # ScoretestNoK automatically adds a bias column if not present if len(np.unique(Y)) == 2: - print('Fitting binary model since only found two distinct y values') + print("Fitting binary model since only found two distinct y values") null_model_score = scoretest.ScoretestLogit(Y, X) else: null_model_score = scoretest.ScoretestNoK(Y, X) @@ -385,7 +390,7 @@ def run_association_( time_list_inner = {} weight_cols = config.get("weight_cols", []) logger.info(f"Testing with this config: {config['test_config']}") - min_mac = config['test_config'].get('min_mac', 0) + min_mac = config["test_config"].get("min_mac", 0) # Get column with minor allele frequency annotations = config["data"]["dataset_config"]["annotations"] maf_col = [ @@ -410,7 +415,7 @@ def run_association_( var_type, test_type, maf_col, - min_mac + min_mac, ) if persist_burdens: GW_list[gene] = GW @@ -476,8 +481,8 @@ def update_config( "sim_phenotype_file" ] = simulated_phenotype_file if maf_column is None: - annotations = config['data']["dataset_config"]['annotations'] - af_pattern = re.compile(r'.*(_MAF|_AF|MAF)\b') + annotations = config["data"]["dataset_config"]["annotations"] + af_pattern = re.compile(r".*(_MAF|_AF|MAF)\b") rare_maf_col = [s for s in annotations if af_pattern.match(s)] assert len(rare_maf_col) == 1 maf_column = rare_maf_col[0] @@ -694,10 +699,11 @@ def run_association( logger.info("Grouping variants by gene") exploded_annotations = ( dataset.annotation_df.query("id in @all_variants") - .explode('gene_ids').reset_index() + .explode("gene_ids") + .reset_index() .drop_duplicates() - .set_index('id') - ) + .set_index("id") + ) grouped_annotations = exploded_annotations.groupby("gene_ids") gene_ids = pd.read_parquet(dataset.gene_file, columns=["id"])["id"].to_list() gene_ids = list(