Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Created Cohort Viewer that shows Excluded Patients #85

Merged
merged 11 commits into from
Oct 31, 2023
2,804 changes: 1,429 additions & 1,375 deletions notebooks/KBG/KBG_Martinez_PMID_36446582_RunGenoPhenoCorr.ipynb

Large diffs are not rendered by default.

43 changes: 39 additions & 4 deletions src/genophenocorr/model/_cohort.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ def __hash__(self) -> int:
class Cohort(typing.Sized):

@staticmethod
def from_patients(members: typing.Sequence[Patient]):
def from_patients(members: typing.Sequence[Patient], include_patients_with_no_HPO: bool = False):
"""
Create a cohort from a sequence of patients.

Expand All @@ -99,17 +99,21 @@ def from_patients(members: typing.Sequence[Patient]):
cohort_variants, cohort_phenotypes, cohort_proteins = set(), set(), set() # , cohort_proteins
var_counts, pheno_count, prot_counts = Counter(), Counter(), Counter() # , prot_counts
members = set(members)
excluded_members = []
for patient in members:
if len(patient.phenotypes) == 0 and not include_patients_with_no_HPO:
excluded_members.append(patient)
continue
cohort_phenotypes.update(patient.phenotypes)
cohort_variants.update(patient.variants)
var_counts.update([var.variant_coordinates.variant_key for var in patient.variants])
cohort_phenotypes.update(patient.phenotypes)
pheno_count.update([pheno.identifier.value for pheno in patient.phenotypes if pheno.observed == True])
cohort_proteins.update(patient.proteins)
prot_counts.update([prot.protein_id for prot in patient.proteins])
all_counts = {'patients': len(members), 'variants': var_counts, 'phenotypes': pheno_count,
'proteins': prot_counts} # 'proteins':prot_counts
return Cohort(members, cohort_phenotypes, cohort_variants, cohort_proteins,
all_counts) # cohort_proteins, all_counts
all_counts, excluded_members) # cohort_proteins, all_counts

"""This class creates a collection of patients and makes it easier to determine overlapping diseases,
phenotypes, variants, and proteins among the patients. If a list of JSON files is given, it will
Expand All @@ -130,7 +134,7 @@ def from_patients(members: typing.Sequence[Patient]):
list_data_by_tx(transcript:Optional[string]): A list and count of all the variants effects found for all transcripts or a given transcript if transcript is not None.
"""

def __init__(self, patient_set: typing.Set[Patient], phenotype_set, variant_set, protein_set, counts_dict,
def __init__(self, patient_set: typing.Set[Patient], phenotype_set, variant_set, protein_set, counts_dict, excluded_members,
recessive=False):
"""Constructs all necessary attributes for a Cohort object

Expand All @@ -151,6 +155,7 @@ def __init__(self, patient_set: typing.Set[Patient], phenotype_set, variant_set,
self._protein_set = protein_set
self._variant_set = variant_set
self._all_counts_dict = counts_dict
self._excluded_members = excluded_members
self._recessive = recessive

@property
Expand Down Expand Up @@ -196,6 +201,10 @@ def all_transcripts(self):
all_trans.update([trans.transcript_id for trans in var.tx_annotations])
return all_trans

@property
def all_excluded_patients(self):
return self._excluded_members

@property
def total_patient_count(self):
"""
Expand Down Expand Up @@ -261,5 +270,31 @@ def list_data_by_tx(self, transcript=None):
del var_type_dict[tx_id]
return var_type_dict

def get_excluded_ids(self):
return [ex.patient_id for ex in self.all_excluded_patients]

def get_excluded_count(self):
return len(self.all_excluded_patients)

def get_protein_features_affected(self, transcript):
all_features = Counter()
protein_set = set()
var_coords = []
for var in self.all_variants:
for tx in var.tx_annotations:
if tx.transcript_id == transcript:
protein_set.add(tx.protein_affected)
if tx.protein_effect_location is None or tx.protein_effect_location[0] is None or tx.protein_effect_location[1] is None:
continue
else:
var_coords.append(tx.protein_effect_location)
if len(protein_set) != 1:
raise ValueError(f"Found more than 1 protein: {protein_set}")
else:
protein = list(protein_set)[0][0]
for pair in var_coords:
all_features.update(list(protein.get_features_variant_overlaps(pair[0], pair[1])))
return all_features

def __len__(self) -> int:
return len(self._patient_set)
16 changes: 16 additions & 0 deletions src/genophenocorr/model/_protein.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from .genome import Region



class FeatureInfo:
"""A class that represents a protein feature
(e.g. a repeated sequence given the name "ANK 1" in protein "Ankyrin repeat domain-containing protein 11")
Expand Down Expand Up @@ -141,6 +142,9 @@ def feature_type(self) -> FeatureType:
"""
return self._type

def to_string(self) -> str:
ielis marked this conversation as resolved.
Show resolved Hide resolved
return f"{self.feature_type.name}-{self.info.name}-{self.info.region}"

def __str__(self) -> str:
return f"SimpleProteinFeature(type={self.feature_type}, " \
f"info={str(self.info)})"
Expand Down Expand Up @@ -242,6 +246,18 @@ def motifs(self) -> typing.Iterable[ProteinFeature]:
"""
return filter(lambda f: f.feature_type == FeatureType.MOTIF, self.protein_features)

def get_features_variant_overlaps(self, var_start: int, var_end: int) -> typing.Sequence[ProteinFeature]:
affected_features = set()
for feat in self.protein_features:
if feat.info.start is None or feat.info.end is None:
lnrekerle marked this conversation as resolved.
Show resolved Hide resolved
print(f"{feat.info.name} has no start and end info")
continue
if feat.info.start <= var_start <= feat.info.end:
affected_features.add(feat.to_string())
ielis marked this conversation as resolved.
Show resolved Hide resolved
elif feat.info.start <= var_end <= feat.info.end:
affected_features.add(feat.to_string())
return affected_features
ielis marked this conversation as resolved.
Show resolved Hide resolved

def __str__(self) -> str:
return f"ProteinMetadata(id={self.protein_id}, " \
f"label={self.label}, " \
Expand Down
8 changes: 8 additions & 0 deletions src/genophenocorr/model/_variant_effects.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,3 +74,11 @@ def curie(self) -> str:

def __str__(self) -> str:
return self.name.lower()

ielis marked this conversation as resolved.
Show resolved Hide resolved
def __eq__(self, other) -> bool:
return isinstance(other, VariantEffect) \
and self.value == other.value \
and self.name == other.name

def __hash__(self) -> int:
return hash((self.value, self.name))
7 changes: 4 additions & 3 deletions src/genophenocorr/preprocessing/_phenopacket.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,7 @@ def _add_phenotypes(self, pp: Phenopacket) -> typing.Sequence[Phenotype]:
for hpo_id in pp.phenotypic_features:
hpo_id_list.append((hpo_id.type.id, not hpo_id.excluded))
if len(hpo_id_list) == 0:
self._logger.warning(f'Expected at least one HPO term per patient, but received none for patient {pp.id}')
#self._logger.warning(f'Expected at least one HPO term per patient, but received none for patient {pp.id}')
ielis marked this conversation as resolved.
Show resolved Hide resolved
return []
return self._phenotype_creator.create_phenotype(hpo_id_list)

Expand All @@ -193,7 +193,8 @@ def _add_protein_data(self, variants: typing.Sequence[Variant]) -> typing.Collec


def load_phenopacket_folder(pp_directory: str,
patient_creator: PhenopacketPatientCreator) -> Cohort:
patient_creator: PhenopacketPatientCreator,
include_patients_with_no_HPO: bool = False) -> Cohort:
"""
Creates a Patient object for each phenopacket formatted JSON file in the given directory `pp_directory`.

Expand All @@ -215,7 +216,7 @@ def load_phenopacket_folder(pp_directory: str,
patients = [patient_creator.create_patient(pp) for pp in pps]

# create cohort from patients
return Cohort.from_patients(patients)
return Cohort.from_patients(patients, include_patients_with_no_HPO)


def _load_phenopacket_dir(pp_dir: str) -> typing.Sequence[Phenopacket]:
Expand Down
18 changes: 11 additions & 7 deletions src/genophenocorr/preprocessing/_uniprot.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,11 @@ def __init__(self):
"""Constructs all necessary attributes for a UniprotProteinMetadataService object
"""
self._logger = logging.getLogger(__name__)
self._logger.setLevel(logging.INFO)
ielis marked this conversation as resolved.
Show resolved Hide resolved
handler = logging.FileHandler(f"{__name__}.log", mode='w')
formatter = logging.Formatter("%(name)s %(asctime)s %(levelname)s %(message)s")
handler.setFormatter(formatter)
self._logger.addHandler(handler)
self._url = 'https://rest.uniprot.org/uniprotkb/search?query=(%s)AND(reviewed:true)&fields=accession,id,' \
'gene_names,gene_primary,protein_name,ft_domain,ft_motif,ft_region,ft_repeat,xref_refseq'

Expand All @@ -42,11 +47,10 @@ def annotate(self, protein_id: str) -> typing.Sequence[ProteinMetadata]:
return []
protein_list = []
for protein in results:
verify = False
unis = []
for uni in protein['uniProtKBCrossReferences']:
if uni['id'] == protein_id:
verify = True
if verify:
unis.append(uni['id'])
if protein_id in unis:
try:
protein_name = protein['proteinDescription']['recommendedName']['fullName']['value']
except KeyError:
Expand All @@ -64,8 +68,8 @@ def annotate(self, protein_id: str) -> typing.Sequence[ProteinMetadata]:
self._logger.warning(f"No features for {protein_id}")
protein_list.append(ProteinMetadata(protein_id, protein_name, all_features_list))
else:
self._logger.warning(f"ID {protein_id} did not match")
self._logger.warning(f'Protein ID {protein_id} got {len(protein_list)} results')

self._logger.warning(f"UniProt did not return a protein ID that matches the ID we searched for: {protein_id} not in {unis}")
ielis marked this conversation as resolved.
Show resolved Hide resolved
if len(protein_list) > 1:
ielis marked this conversation as resolved.
Show resolved Hide resolved
self._logger.info(f'UniProt found {len(protein_list)} results for ID {protein_id}')
# TODO - DD would like to discuss an example when there are >1 items in this list.
return protein_list
4 changes: 2 additions & 2 deletions src/genophenocorr/preprocessing/_vep.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ def _process_item(self, item) -> typing.Optional[TranscriptAnnotation]:
if not self._include_computational_txs and not trans_id.startswith('NM_'):
# Skipping a computational transcript
return None
is_preferred = True if 'canonical' in item and item['canonical'] else False
is_preferred = True if ('canonical' in item and item['canonical'] == 1) else False
hgvsc_id = item.get('hgvsc')
var_effects = []
consequences = item.get('consequence_terms')
Expand Down Expand Up @@ -140,7 +140,7 @@ def _query_vep(self, variant_coordinates: VariantCoordinates) -> dict:
api_url = self._url % (verify_start_end_coordinates(variant_coordinates))
r = requests.get(api_url, headers={'Content-Type': 'application/json'})
if not r.ok:
self._logging.error(f"Expected a result but got an Error for variant: {variant_coordinates}")
self._logging.error(f"Expected a result but got an Error for variant: {variant_coordinates.variant_key}")
r.raise_for_status()
results = r.json()
if not isinstance(results, list):
Expand Down
131 changes: 127 additions & 4 deletions src/genophenocorr/view/_cohort.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,17 +130,20 @@ def variants_table(self, cohort, preferred_transcript, min_count=2) -> str:
all_variant_counter = {x[0]:x[1] for x in all_variant_tuple_list}
for variant in cohort.all_variants:
var_count = all_variant_counter[variant.variant_string]
targets = [txa for txa in variant.tx_annotations if txa.transcript_id == "NM_001318852.2"]
targets = [txa for txa in variant.tx_annotations if txa.transcript_id == preferred_transcript]
if len(targets) == 1:
target_txa = targets[0]
hgvsc_id = target_txa.hgvsc_id
if target_txa.hgvsc_id is not None:
hgvsc_id = target_txa.hgvsc_id
else:
hgvsc_id = "NA"
# split out the variant
fields = hgvsc_id.split(":")
if len(fields) == 2:
hgvs = fields[1]
else:
hgvs = hgvsc_id
effect_tuple = target_txa.variant_effects
effect_tuple = [var_eff.name for var_eff in target_txa.variant_effects]
variant_count_d[hgvs] = var_count
variant_to_effect_d[hgvs] = effect_tuple[0] # for simplicity, just display first effect
variant_to_key[hgvs] = variant.variant_string
Expand All @@ -160,7 +163,7 @@ def variants_table(self, cohort, preferred_transcript, min_count=2) -> str:
for var in sorted_vars:
items = []
var_count = variant_count_d.get(var)
print(f"{var} - {var_count}")
#print(f"{var} - {var_count}")
if var_count >= min_count:
variant_key = variant_to_key.get(var)
items.append(var)
Expand All @@ -177,3 +180,123 @@ def variants_table(self, cohort, preferred_transcript, min_count=2) -> str:
rows.append(f"{var_str}.</p>")
rows.append("<p>Use the entry in the \"Key\" column to investigate whether specific variants display genotype-phenotype correlations</p>")
return "\n".join(rows)


def cohort_summary_table(self, cohort, min_count=1) -> str:
"""
Generate HTML code designed to be displayed on a Jupyter notebook using ipython/display/HTML
Show the number of annotations per HPO terms. Provide an explanation.

:param cohort: A cohort of patients to be analyzed
:type cohort: Cohort
:param min_count: Minimum number of annotations to be displayed in the table
:type min_count: int
:returns: HTML code for display
"""
if not isinstance(cohort, Cohort):
raise ValueError(f"cohort argument must be a Cohort object but we got a {type(cohort)} object")
rows = list()
rows.append(f"<style>\n{CSS_CODE}</style>\n")
rows.append("<table>")
header_items = ["Item", "Description"]
rows.append(CohortViewer.html_row(header_items))
n_individuals = cohort.total_patient_count
n_unique_hpo = len(cohort.all_phenotypes)
n_unique_variants = len(cohort.all_variants)
n_excluded_patients = cohort.get_excluded_count()
excluded_patients = cohort.get_excluded_ids()

cap = "Description of the cohort. "
if n_excluded_patients > 0:
cap = cap + f"{n_excluded_patients} individuals were removed from the cohort because they had no HPO terms."

capt = f"<caption>{cap}</caption>"
rows.append(capt)

rows.append(CohortViewer.html_row(["Total Individuals", str(n_individuals)]))
#TODO: Add Diseases
if n_excluded_patients > 0:
rows.append(CohortViewer.html_row(["Excluded Individuals", f"{str(n_excluded_patients)}: {';'.join(excluded_patients)}"]))
rows.append(CohortViewer.html_row(["Total Unique HPO Terms", str(n_unique_hpo)]))
rows.append(CohortViewer.html_row(["Total Unique Variants", str(n_unique_variants)]))

rows.append("</table>")

return "\n".join(rows)


def protein_features_table(self, cohort, preferred_transcript, min_count=2) -> str:
"""
Generate HTML code designed to be displayed on a Jupyter notebook using ipython/display/HTML
Show genotype-phenotype correlation tests that could be run.

:param cohort: A cohort of patients to be analyzed
:type cohort: Cohort
:param min_count: Minimum number of annotations to be displayed in the table
:type min_count: int
:returns: HTML code for display
"""
if not isinstance(cohort, Cohort):
raise ValueError(f"cohort argument must be a Cohort object but we got a {type(cohort)} object")
rows = list()
# Get a list of variants for the preferred transcript
variant_count_d = defaultdict(int)
variant_to_effect_d = defaultdict()
variant_to_key = defaultdict()
# key, variant string, value int
all_variant_tuple_list = cohort.list_all_variants()
if not isinstance(all_variant_tuple_list, list):
raise ValueError(f"all_variant_tuple_list is not a list but is a {type(all_variant_tuple_list)}")
all_variant_counter = {x[0]:x[1] for x in all_variant_tuple_list}
for variant in cohort.all_variants:
var_count = all_variant_counter[variant.variant_string]
targets = [txa for txa in variant.tx_annotations if txa.transcript_id == preferred_transcript]
if len(targets) == 1:
target_txa = targets[0]
if target_txa.hgvsc_id is not None:
hgvsc_id = target_txa.hgvsc_id
else:
hgvsc_id = "NA"
# split out the variant
fields = hgvsc_id.split(":")
if len(fields) == 2:
hgvs = fields[1]
else:
hgvs = hgvsc_id
effect_tuple = [var_eff.name for var_eff in target_txa.variant_effects]
variant_count_d[hgvs] = var_count
variant_to_effect_d[hgvs] = effect_tuple[0] # for simplicity, just display first effect
variant_to_key[hgvs] = variant.variant_string
else:
print(f"[WARN] could not identify a single variant for target transcript (got {len(targets)}), variant {variant.variant_string}")
# could not find an entry for our transcript, so just show the genomic coordinates
variant_count_d[variant.variant_string] = var_count
variant_to_key[variant.variant_string] = variant.variant_string
# sort the variants by count and put variants below mininum count for display into a separate set
sorted_vars = sorted(variant_count_d.items(), key=lambda x:x[1], reverse=True) # sort descending by values
sorted_vars = [x[0] for x in sorted_vars] # take the first item from each sorted tuple
below_threshold_vars = set()
rows.append(f"<style>\n{CSS_CODE}</style>\n")
rows.append("<table>")
header_items = ["Variant", "Effect", "Count", "Key"]
rows.append(CohortViewer.html_row(header_items))
for var in sorted_vars:
items = []
var_count = variant_count_d.get(var)
#print(f"{var} - {var_count}")
if var_count >= min_count:
variant_key = variant_to_key.get(var)
items.append(var)
items.append(variant_to_effect_d.get(var, "n/a"))
items.append(str(var_count))
items.append(variant_key)
rows.append(CohortViewer.html_row(items))
else:
below_threshold_vars.add(var)
rows.append("</table>")
if len(below_threshold_vars) > 0:
var_str = "; ".join(below_threshold_vars)
rows.append(f"<p>Additionally, the following variants were observed {min_count-1} or fewer times: ")
rows.append(f"{var_str}.</p>")
rows.append("<p>Use the entry in the \"Key\" column to investigate whether specific variants display genotype-phenotype correlations</p>")
return "\n".join(rows)