From a0c1583241bfc008b288962eb4b8c2db05e71003 Mon Sep 17 00:00:00 2001 From: Eva Holtkamp Date: Fri, 23 Feb 2024 11:28:39 +0100 Subject: [PATCH] integrating suggested PR changes --- deeprvat/data/dense_gt.py | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/deeprvat/data/dense_gt.py b/deeprvat/data/dense_gt.py index e30d7545..a7acc1ca 100644 --- a/deeprvat/data/dense_gt.py +++ b/deeprvat/data/dense_gt.py @@ -288,10 +288,8 @@ def setup_phenotypes( ): logger.debug("Reading phenotype dataframe") self.phenotype_df = pd.read_parquet(phenotype_file, engine="pyarrow") - gt_file = h5py.File( - self.gt_filename, "r" - ) # TODO change this to using with open - samples_gt = gt_file["samples"][:] + with h5py.File(self.gt_filename, "r") as f: + samples_gt = f["samples"][:] samples_gt = np.array([item.decode("utf-8") for item in samples_gt]) if self.check_samples: self.samples_gt = samples_gt @@ -311,14 +309,17 @@ def setup_phenotypes( samples_to_keep = pickle.load(f) samples_to_keep = np.array(samples_to_keep) logger.info(f"Number of samples in sample file: {len(samples_to_keep)}") - samples_to_keep = np.array( + shared_samples = np.array( list(set(samples_to_keep).intersection(set(samples_phenotype_df))) ) + if len(shared_samples) < len(samples_to_keep): + logger.warning('Some samples from the sample file were not found in the data') + samples_to_keep = shared_samples logger.info( f"Number of samples in sample file and in phenotype_df: {len(samples_to_keep)}" ) else: - logger.info("Using all samples in phenotyp df") + logger.info("Using all samples in phenotype df") samples_to_keep = copy.deepcopy(samples_phenotype_df) logger.info("Removing samples that are not in genotype file") @@ -563,11 +564,7 @@ def setup_annotations( self.gene_specific_anno = self.annotation_df["gene_id"].dtype != np.dtype( "O" ) - - self.gene_specific_anno = self.annotation_df["gene_id"].dtype != np.dtype( - "O" - ) - + if type(annotation_aggregation) == str: 
self.annotation_aggregation = AGGREGATIONS.get( annotation_aggregation, annotation_aggregation