From a0c1583241bfc008b288962eb4b8c2db05e71003 Mon Sep 17 00:00:00 2001
From: Eva Holtkamp <eva.holtkamp@gmx.de>
Date: Fri, 23 Feb 2024 11:28:39 +0100
Subject: [PATCH] integrating suggested PR changes

---
 deeprvat/data/dense_gt.py | 19 ++++++++-----------
 1 file changed, 8 insertions(+), 11 deletions(-)

diff --git a/deeprvat/data/dense_gt.py b/deeprvat/data/dense_gt.py
index e30d7545..a7acc1ca 100644
--- a/deeprvat/data/dense_gt.py
+++ b/deeprvat/data/dense_gt.py
@@ -288,10 +288,8 @@ def setup_phenotypes(
     ):
         logger.debug("Reading phenotype dataframe")
         self.phenotype_df = pd.read_parquet(phenotype_file, engine="pyarrow")
-        gt_file = h5py.File(
-            self.gt_filename, "r"
-        )  # TODO change this to using with open
-        samples_gt = gt_file["samples"][:]
+        with h5py.File(self.gt_filename, "r") as f:
+            samples_gt = f["samples"][:]
         samples_gt = np.array([item.decode("utf-8") for item in samples_gt])
         if self.check_samples:
             self.samples_gt = samples_gt
@@ -311,14 +309,17 @@ def setup_phenotypes(
                 samples_to_keep = pickle.load(f)
             samples_to_keep = np.array(samples_to_keep)
             logger.info(f"Number of samples in sample file: {len(samples_to_keep)}")
-            samples_to_keep = np.array(
+            shared_samples = np.array(
                 list(set(samples_to_keep).intersection(set(samples_phenotype_df)))
             )
+            if len(shared_samples) < len(samples_to_keep):
+                logger.warning('Some samples from the sample file were not found in the data')
+            samples_to_keep = shared_samples
             logger.info(
                 f"Number of samples in sample file and in phenotype_df: {len(samples_to_keep)}"
             )
         else:
-            logger.info("Using all samples in phenotyp df")
+            logger.info("Using all samples in phenotype df")
             samples_to_keep = copy.deepcopy(samples_phenotype_df)
 
         logger.info("Removing samples that are not in genotype file")
@@ -563,11 +564,7 @@ def setup_annotations(
             self.gene_specific_anno = self.annotation_df["gene_id"].dtype != np.dtype(
                 "O"
             )
-
-            self.gene_specific_anno = self.annotation_df["gene_id"].dtype != np.dtype(
-                "O"
-            )
-
+
             if type(annotation_aggregation) == str:
                 self.annotation_aggregation = AGGREGATIONS.get(
                     annotation_aggregation, annotation_aggregation