monarch-initiative · lnrekerle · Oct 31, 2023 · Oct 10, 2023 · Oct 13, 2023 · Oct 18, 2023
diff --git a/notebooks/KBG/KBG_Martinez_PMID_36446582_RunGenoPhenoCorr.ipynb b/notebooks/KBG/KBG_Martinez_PMID_36446582_RunGenoPhenoCorr.ipynb
diff --git a/src/genophenocorr/model/_cohort.py b/src/genophenocorr/model/_cohort.py
@@ -89,7 +89,7 @@ def __hash__(self) -> int:
 class Cohort(typing.Sized):
 
     @staticmethod
-    def from_patients(members: typing.Sequence[Patient]):
+    def from_patients(members: typing.Sequence[Patient], include_patients_with_no_HPO: bool = False):
         """
         Create a cohort from a sequence of patients.
 
@@ -99,17 +99,21 @@ def from_patients(members: typing.Sequence[Patient]):
         cohort_variants, cohort_phenotypes, cohort_proteins = set(), set(), set()  # , cohort_proteins
         var_counts, pheno_count, prot_counts = Counter(), Counter(), Counter()  # , prot_counts
         members = set(members)
+        excluded_members = []
         for patient in members:
+            if len(patient.phenotypes) == 0 and not include_patients_with_no_HPO:
+                excluded_members.append(patient)
+                continue
+            cohort_phenotypes.update(patient.phenotypes)
             cohort_variants.update(patient.variants)
             var_counts.update([var.variant_coordinates.variant_key for var in patient.variants])
-            cohort_phenotypes.update(patient.phenotypes)
             pheno_count.update([pheno.identifier.value for pheno in patient.phenotypes if pheno.observed == True])
             cohort_proteins.update(patient.proteins)
             prot_counts.update([prot.protein_id for prot in patient.proteins])
         all_counts = {'patients': len(members), 'variants': var_counts, 'phenotypes': pheno_count,
                       'proteins': prot_counts}  # 'proteins':prot_counts
         return Cohort(members, cohort_phenotypes, cohort_variants, cohort_proteins,
-                      all_counts)  # cohort_proteins, all_counts
+                      all_counts, excluded_members)  # cohort_proteins, all_counts
 
     """This class creates a collection of patients and makes it easier to determine overlapping diseases, 
     phenotypes, variants, and proteins among the patients. If a list of JSON files is given, it will
@@ -130,7 +134,7 @@ def from_patients(members: typing.Sequence[Patient]):
         list_data_by_tx(transcript:Optional[string]): A list and count of all the variants effects found for all transcripts or a given transcript if transcript is not None.
     """
 
-    def __init__(self, patient_set: typing.Set[Patient], phenotype_set, variant_set, protein_set, counts_dict,
+    def __init__(self, patient_set: typing.Set[Patient], phenotype_set, variant_set, protein_set, counts_dict, excluded_members,
                  recessive=False):
         """Constructs all necessary attributes for a Cohort object
 
@@ -151,6 +155,7 @@ def __init__(self, patient_set: typing.Set[Patient], phenotype_set, variant_set,
         self._protein_set = protein_set
         self._variant_set = variant_set
         self._all_counts_dict = counts_dict
+        self._excluded_members = excluded_members
         self._recessive = recessive
 
     @property
@@ -196,6 +201,10 @@ def all_transcripts(self):
             all_trans.update([trans.transcript_id for trans in var.tx_annotations])
         return all_trans
 
+    @property
+    def all_excluded_patients(self):
+        return self._excluded_members
+
     @property
     def total_patient_count(self):
         """
@@ -261,5 +270,31 @@ def list_data_by_tx(self, transcript=None):
             del var_type_dict[tx_id]
         return var_type_dict
 
+    def get_excluded_ids(self):
+        return [ex.patient_id for ex in self.all_excluded_patients]
+
+    def get_excluded_count(self):
+        return len(self.all_excluded_patients)
+
+    def get_protein_features_affected(self, transcript):
+        all_features = Counter()
+        protein_set = set()
+        var_coords = []
+        for var in self.all_variants:
+            for tx in var.tx_annotations:
+                if tx.transcript_id == transcript:
+                    protein_set.add(tx.protein_affected)
+                    if tx.protein_effect_location is None or tx.protein_effect_location[0] is None or tx.protein_effect_location[1] is None:
+                        continue
+                    else:
+                        var_coords.append(tx.protein_effect_location)
+        if len(protein_set) != 1:
+            raise ValueError(f"Found more than 1 protein: {protein_set}")
+        else:
+            protein = list(protein_set)[0][0]
+        for pair in var_coords:
+            all_features.update(list(protein.get_features_variant_overlaps(pair[0], pair[1])))
+        return all_features
+
     def __len__(self) -> int:
         return len(self._patient_set)
diff --git a/src/genophenocorr/model/_protein.py b/src/genophenocorr/model/_protein.py
@@ -7,6 +7,7 @@
 from .genome import Region
 
 
+
 class FeatureInfo:
     """A class that represents a protein feature
     (e.g. a repeated sequence given the name "ANK 1" in protein "Ankyrin repeat domain-containing protein 11")
@@ -141,6 +142,9 @@ def feature_type(self) -> FeatureType:
         """
         return self._type
 
+    def to_string(self) -> str:
+        return f"{self.feature_type.name}-{self.info.name}-{self.info.region}"
+
     def __str__(self) -> str:
         return f"SimpleProteinFeature(type={self.feature_type}, " \
                f"info={str(self.info)})"
@@ -242,6 +246,18 @@ def motifs(self) -> typing.Iterable[ProteinFeature]:
         """
         return filter(lambda f: f.feature_type == FeatureType.MOTIF, self.protein_features)
 
+    def get_features_variant_overlaps(self, var_start: int, var_end: int) -> typing.Sequence[ProteinFeature]:
+        affected_features = set()
+        for feat in self.protein_features:
+            if feat.info.start is None or feat.info.end is None:
+                print(f"{feat.info.name} has no start and end info")
+                continue
+            if feat.info.start <= var_start <= feat.info.end:
+                affected_features.add(feat.to_string())
+            elif feat.info.start <= var_end <= feat.info.end:
+                affected_features.add(feat.to_string())
+        return affected_features 
+
     def __str__(self) -> str:
         return f"ProteinMetadata(id={self.protein_id}, " \
                f"label={self.label}, " \

diff --git a/src/genophenocorr/model/_variant_effects.py b/src/genophenocorr/model/_variant_effects.py
@@ -74,3 +74,11 @@ def curie(self) -> str:
 
     def __str__(self) -> str:
         return self.name.lower()
+
+    def __eq__(self, other) -> bool:
+        return isinstance(other, VariantEffect) \
+            and self.value == other.value \
+            and self.name == other.name 
+
+    def __hash__(self) -> int:
+        return hash((self.value, self.name))
diff --git a/src/genophenocorr/preprocessing/_phenopacket.py b/src/genophenocorr/preprocessing/_phenopacket.py
@@ -172,7 +172,7 @@ def _add_phenotypes(self, pp: Phenopacket) -> typing.Sequence[Phenotype]:
         for hpo_id in pp.phenotypic_features:
             hpo_id_list.append((hpo_id.type.id, not hpo_id.excluded))
         if len(hpo_id_list) == 0:
-            self._logger.warning(f'Expected at least one HPO term per patient, but received none for patient {pp.id}')
+            #self._logger.warning(f'Expected at least one HPO term per patient, but received none for patient {pp.id}')
             return []
         return self._phenotype_creator.create_phenotype(hpo_id_list)
 
@@ -193,7 +193,8 @@ def _add_protein_data(self, variants: typing.Sequence[Variant]) -> typing.Collec
 
 
 def load_phenopacket_folder(pp_directory: str,
-                            patient_creator: PhenopacketPatientCreator) -> Cohort:
+                            patient_creator: PhenopacketPatientCreator,
+                            include_patients_with_no_HPO: bool = False) -> Cohort:
     """
     Creates a Patient object for each phenopacket formatted JSON file in the given directory `pp_directory`.
 
@@ -215,7 +216,7 @@ def load_phenopacket_folder(pp_directory: str,
     patients = [patient_creator.create_patient(pp) for pp in pps]
 
     # create cohort from patients
-    return Cohort.from_patients(patients)
+    return Cohort.from_patients(patients, include_patients_with_no_HPO)
 
 
 def _load_phenopacket_dir(pp_dir: str) -> typing.Sequence[Phenopacket]:

diff --git a/src/genophenocorr/preprocessing/_uniprot.py b/src/genophenocorr/preprocessing/_uniprot.py
@@ -20,6 +20,11 @@ def __init__(self):
         """Constructs all necessary attributes for a UniprotProteinMetadataService object
         """
         self._logger = logging.getLogger(__name__)
+        self._logger.setLevel(logging.INFO)
+        handler = logging.FileHandler(f"{__name__}.log", mode='w')
+        formatter = logging.Formatter("%(name)s %(asctime)s %(levelname)s %(message)s")
+        handler.setFormatter(formatter)
+        self._logger.addHandler(handler)
         self._url = 'https://rest.uniprot.org/uniprotkb/search?query=(%s)AND(reviewed:true)&fields=accession,id,' \
                     'gene_names,gene_primary,protein_name,ft_domain,ft_motif,ft_region,ft_repeat,xref_refseq'
 
@@ -42,11 +47,10 @@ def annotate(self, protein_id: str) -> typing.Sequence[ProteinMetadata]:
             return []
         protein_list = []
         for protein in results:
-            verify = False
+            unis = []
             for uni in protein['uniProtKBCrossReferences']:
-                if uni['id'] == protein_id:
-                    verify = True
-            if verify:
+                unis.append(uni['id'])
+            if protein_id in unis:
                 try:
                     protein_name = protein['proteinDescription']['recommendedName']['fullName']['value']
                 except KeyError:
@@ -64,8 +68,8 @@ def annotate(self, protein_id: str) -> typing.Sequence[ProteinMetadata]:
                     self._logger.warning(f"No features for {protein_id}")
                 protein_list.append(ProteinMetadata(protein_id, protein_name, all_features_list))
             else:
-                self._logger.warning(f"ID {protein_id} did not match")
-        self._logger.warning(f'Protein ID {protein_id} got {len(protein_list)} results')
-
+                self._logger.warning(f"UniProt did not return a protein ID that matches the ID we searched for: {protein_id} not in {unis}")
+        if len(protein_list) > 1:
+            self._logger.info(f'UniProt found {len(protein_list)} results for ID {protein_id}')
         # TODO - DD would like to discuss an example when there are >1 items in this list.
         return protein_list
diff --git a/src/genophenocorr/preprocessing/_vep.py b/src/genophenocorr/preprocessing/_vep.py
@@ -101,7 +101,7 @@ def _process_item(self, item) -> typing.Optional[TranscriptAnnotation]:
         if not self._include_computational_txs and not trans_id.startswith('NM_'):
             # Skipping a computational transcript
             return None
-        is_preferred = True if 'canonical' in item and item['canonical'] else False
+        is_preferred = True if ('canonical' in item and item['canonical'] == 1) else False
         hgvsc_id = item.get('hgvsc')
         var_effects = []
         consequences = item.get('consequence_terms')
@@ -140,7 +140,7 @@ def _query_vep(self, variant_coordinates: VariantCoordinates) -> dict:
         api_url = self._url % (verify_start_end_coordinates(variant_coordinates))
         r = requests.get(api_url, headers={'Content-Type': 'application/json'})
         if not r.ok:
-            self._logging.error(f"Expected a result but got an Error for variant: {variant_coordinates}")
+            self._logging.error(f"Expected a result but got an Error for variant: {variant_coordinates.variant_key}")
             r.raise_for_status()
         results = r.json()
         if not isinstance(results, list):

diff --git a/src/genophenocorr/view/_cohort.py b/src/genophenocorr/view/_cohort.py
@@ -130,17 +130,20 @@ def variants_table(self, cohort, preferred_transcript, min_count=2) -> str:
         all_variant_counter = {x[0]:x[1] for x in all_variant_tuple_list}
         for variant in cohort.all_variants:
             var_count = all_variant_counter[variant.variant_string]
-            targets = [txa for txa in variant.tx_annotations if txa.transcript_id == "NM_001318852.2"]
+            targets = [txa for txa in variant.tx_annotations if txa.transcript_id == preferred_transcript]
             if len(targets) == 1:
                 target_txa = targets[0]
-                hgvsc_id = target_txa.hgvsc_id
+                if target_txa.hgvsc_id is not None:
+                    hgvsc_id = target_txa.hgvsc_id
+                else:
+                    hgvsc_id = "NA"
                 # split out the variant
                 fields = hgvsc_id.split(":")
                 if len(fields) == 2:
                     hgvs = fields[1]
                 else:
                     hgvs = hgvsc_id
-                effect_tuple = target_txa.variant_effects
+                effect_tuple = [var_eff.name for var_eff in target_txa.variant_effects]
                 variant_count_d[hgvs] = var_count
                 variant_to_effect_d[hgvs] = effect_tuple[0] # for simplicity, just display first effect
                 variant_to_key[hgvs] = variant.variant_string
@@ -160,7 +163,7 @@ def variants_table(self, cohort, preferred_transcript, min_count=2) -> str:
         for var in sorted_vars:
             items = []
             var_count = variant_count_d.get(var)
-            print(f"{var} - {var_count}")
+            #print(f"{var} - {var_count}")
             if var_count >= min_count: 
                 variant_key = variant_to_key.get(var)
                 items.append(var)
@@ -177,3 +180,123 @@ def variants_table(self, cohort, preferred_transcript, min_count=2) -> str:
             rows.append(f"{var_str}.</p>")
         rows.append("<p>Use the entry in the \"Key\" column to investigate whether specific variants display genotype-phenotype correlations</p>")
         return "\n".join(rows)
+
+
+    def cohort_summary_table(self, cohort, min_count=1) -> str:
+        """
+        Generate HTML code designed to be displayed on a Jupyter notebook using ipython/display/HTML
+        Show the number of annotations per HPO terms. Provide an explanation.
+
+        :param cohort: A cohort of patients to be analyzed
+        :type cohort: Cohort
+        :param min_count: Minimum number of annotations to be displayed in the table
+        :type min_count: int
+        :returns: HTML code for display
+        """
+        if not isinstance(cohort, Cohort):
+            raise ValueError(f"cohort argument must be a Cohort object but we got a {type(cohort)} object")
+        rows = list()
+        rows.append(f"<style>\n{CSS_CODE}</style>\n")
+        rows.append("<table>")
+        header_items = ["Item", "Description"]
+        rows.append(CohortViewer.html_row(header_items))
+        n_individuals = cohort.total_patient_count
+        n_unique_hpo = len(cohort.all_phenotypes)
+        n_unique_variants = len(cohort.all_variants)
+        n_excluded_patients = cohort.get_excluded_count()
+        excluded_patients = cohort.get_excluded_ids()
+
+        cap = "Description of the cohort. "
+        if n_excluded_patients > 0:
+            cap =  cap + f"{n_excluded_patients} individuals were removed from the cohort because they had no HPO terms."
+
+        capt = f"<caption>{cap}</caption>"
+        rows.append(capt)
+
+        rows.append(CohortViewer.html_row(["Total Individuals", str(n_individuals)]))
+        #TODO: Add Diseases
+        if n_excluded_patients > 0:
+            rows.append(CohortViewer.html_row(["Excluded Individuals", f"{str(n_excluded_patients)}: {';'.join(excluded_patients)}"]))
+        rows.append(CohortViewer.html_row(["Total Unique HPO Terms", str(n_unique_hpo)]))
+        rows.append(CohortViewer.html_row(["Total Unique Variants", str(n_unique_variants)]))
+
+        rows.append("</table>")
+
+        return "\n".join(rows)
+
+
+    def protein_features_table(self, cohort, preferred_transcript, min_count=2) -> str:
+        """
+        Generate HTML code designed to be displayed on a Jupyter notebook using ipython/display/HTML
+        Show genotype-phenotype correlation tests that could be run.
+
+        :param cohort: A cohort of patients to be analyzed
+        :type cohort: Cohort
+        :param min_count: Minimum number of annotations to be displayed in the table
+        :type min_count: int
+        :returns: HTML code for display
+        """
+        if not isinstance(cohort, Cohort):
+            raise ValueError(f"cohort argument must be a Cohort object but we got a {type(cohort)} object")
+        rows = list()
+        # Get a list of variants for the preferred transcript
+        variant_count_d = defaultdict(int)
+        variant_to_effect_d = defaultdict()
+        variant_to_key = defaultdict()
+        # key, variant string, value int
+        all_variant_tuple_list = cohort.list_all_variants()
+        if not isinstance(all_variant_tuple_list, list):
+            raise ValueError(f"all_variant_tuple_list is not a list but is a {type(all_variant_tuple_list)}")
+        all_variant_counter = {x[0]:x[1] for x in all_variant_tuple_list}
+        for variant in cohort.all_variants:
+            var_count = all_variant_counter[variant.variant_string]
+            targets = [txa for txa in variant.tx_annotations if txa.transcript_id == preferred_transcript]
+            if len(targets) == 1:
+                target_txa = targets[0]
+                if target_txa.hgvsc_id is not None:
+                    hgvsc_id = target_txa.hgvsc_id
+                else:
+                    hgvsc_id = "NA"
+                # split out the variant
+                fields = hgvsc_id.split(":")
+                if len(fields) == 2:
+                    hgvs = fields[1]
+                else:
+                    hgvs = hgvsc_id
+                effect_tuple = [var_eff.name for var_eff in target_txa.variant_effects]
+                variant_count_d[hgvs] = var_count
+                variant_to_effect_d[hgvs] = effect_tuple[0] # for simplicity, just display first effect
+                variant_to_key[hgvs] = variant.variant_string
+            else:
+                print(f"[WARN] could not identify a single variant for target transcript (got {len(targets)}), variant {variant.variant_string}")
+                # could not find an entry for our transcript, so just show the genomic coordinates
+                variant_count_d[variant.variant_string] = var_count
+                variant_to_key[variant.variant_string] = variant.variant_string
+        # sort the variants by count and put variants below mininum count for display into a separate set
+        sorted_vars = sorted(variant_count_d.items(), key=lambda x:x[1], reverse=True) # sort descending by values
+        sorted_vars = [x[0] for x in sorted_vars] # take the first item from each sorted tuple
+        below_threshold_vars = set()
+        rows.append(f"<style>\n{CSS_CODE}</style>\n")
+        rows.append("<table>")
+        header_items = ["Variant", "Effect", "Count", "Key"]
+        rows.append(CohortViewer.html_row(header_items))
+        for var in sorted_vars:
+            items = []
+            var_count = variant_count_d.get(var)
+            #print(f"{var} - {var_count}")
+            if var_count >= min_count: 
+                variant_key = variant_to_key.get(var)
+                items.append(var)
+                items.append(variant_to_effect_d.get(var, "n/a"))
+                items.append(str(var_count))
+                items.append(variant_key)
+                rows.append(CohortViewer.html_row(items))
+            else:
+                below_threshold_vars.add(var)
+        rows.append("</table>")
+        if len(below_threshold_vars) > 0:
+            var_str = "; ".join(below_threshold_vars)
+            rows.append(f"<p>Additionally, the following variants were observed {min_count-1} or fewer times: ")
+            rows.append(f"{var_str}.</p>")
+        rows.append("<p>Use the entry in the \"Key\" column to investigate whether specific variants display genotype-phenotype correlations</p>")
+        return "\n".join(rows)