From 9edc345225d31e9cca86db724e2f788299ff613a Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Wed, 4 Sep 2024 16:09:44 +0200 Subject: [PATCH 01/44] Next development iteration `0.3.1.dev0`. --- docs/conf.py | 2 +- src/gpsea/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 6233764c..436b59e1 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -62,7 +62,7 @@ # The short X.Y version. version = u'0.3' # The full version, including alpha/beta/rc tags. -release = u'0.3.0' +release = u'0.3.1.dev0' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/src/gpsea/__init__.py b/src/gpsea/__init__.py index 402fc6cc..327b371c 100644 --- a/src/gpsea/__init__.py +++ b/src/gpsea/__init__.py @@ -2,4 +2,4 @@ GPSEA is a library for analyzing genotype-phenotype correlations in cohorts of rare disease patients. """ -__version__ = "0.3.0" +__version__ = "0.3.1.dev0" From fc72e79c677e7629d8b29571ccfbd256f7eecd87 Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Thu, 5 Sep 2024 11:11:23 +0200 Subject: [PATCH 02/44] Ignore `dev` folder. --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 8ad720ca..777fd2ad 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ # Cache with transcript/protein pickle files .gpsea_cache +dev/ # Byte-compiled / optimized / DLL files __pycache__/ From f6ac3dfa6935d8a116beaf77d24819349f8ebea8 Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Thu, 5 Sep 2024 11:14:15 +0200 Subject: [PATCH 03/44] We do not need `gpsea.data` module anymore. --- src/gpsea/data/__init__.py | 3 - src/gpsea/data/_toy.py | 314 ------------------------------------- 2 files changed, 317 deletions(-) delete mode 100644 src/gpsea/data/__init__.py delete mode 100644 src/gpsea/data/_toy.py diff --git a/src/gpsea/data/__init__.py b/src/gpsea/data/__init__.py deleted file mode 100644 index e0372ba7..00000000 --- a/src/gpsea/data/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from ._toy import get_toy_cohort - -__all__ = ['get_toy_cohort'] diff --git a/src/gpsea/data/_toy.py b/src/gpsea/data/_toy.py deleted file mode 100644 index 24a3b9c4..00000000 --- a/src/gpsea/data/_toy.py +++ /dev/null @@ -1,314 +0,0 @@ -from hpotk import TermId - -from gpsea.model import * -from gpsea.model.genome import Contig, GenomicRegion, Strand - -CONTIG = Contig("1", "GB_ACC", "REFSEQ_NAME", "UCSC_NAME", 1_000) - - -def make_region(start: int, end: int) -> GenomicRegion: - return GenomicRegion(CONTIG, start, end, Strand.POSITIVE) - - -def get_toy_cohort() -> Cohort: - # TODO - Lauren - implement sample patients from the terms below - # - Arachnodactyly HP:0001166 - # - Focal clonic seizure HP:0002266 - # - Perimembranous ventricular septal defect HP:0011682 - # - Hepatosplenomegaly HP:0001433 - # - Tubularization of Bowman capsule HP:0032648 - # - Intercostal muscle weakness HP:0004878 - # - Enuresis nocturna HP:0010677 - # - Spasticity HP:0001257 - # - Chronic pancreatitis HP:0006280 - - arachnodactyly_T = Phenotype(TermId.from_curie("HP:0001166"), True) - focal_clonic_seizure_T = Phenotype(TermId.from_curie("HP:0002266"), True) - seizure_T = Phenotype(TermId.from_curie("HP:0001250"), True) - spasticity_T = Phenotype(TermId.from_curie("HP:0001257"), True) - arachnodactyly_F = Phenotype(TermId.from_curie("HP:0001166"), False) - focal_clonic_seizure_F = Phenotype(TermId.from_curie("HP:0002266"), False) - seizure_F = Phenotype(TermId.from_curie("HP:0001250"), False) - spasticity_F = Phenotype(TermId.from_curie("HP:0001257"), False) - - Disease_T = Disease(TermId.from_curie("OMIM:001234"), "Test present disease", True) - Disease_F = Disease(TermId.from_curie("OMIM:001234"), "Test absent disease", False) - - snv = Variant.create_variant_from_scratch( - VariantCoordinates(make_region(280, 281), "A", "G", 0), - "FakeGene", - "NM_1234.5", - "NM_1234.5:c.180A>G", - False, - [VariantEffect.MISSENSE_VARIANT], - [1], - "NP_09876.5", - "NP_09876.5:p.Unk200", - 60, - 60, - Genotypes.from_mapping( - { - SampleLabels("A"): Genotype.HETEROZYGOUS, - SampleLabels("B"): Genotype.HETEROZYGOUS, - SampleLabels("C"): Genotype.HOMOZYGOUS_ALTERNATE, - SampleLabels("D"): Genotype.HETEROZYGOUS, - SampleLabels("E"): Genotype.HETEROZYGOUS, - SampleLabels("G"): Genotype.HETEROZYGOUS, - SampleLabels("J"): Genotype.HETEROZYGOUS, - SampleLabels("K"): Genotype.HETEROZYGOUS, - SampleLabels("M"): Genotype.HETEROZYGOUS, - SampleLabels("N"): Genotype.HOMOZYGOUS_ALTERNATE, - SampleLabels("P"): Genotype.HETEROZYGOUS, - SampleLabels("Q"): Genotype.HOMOZYGOUS_ALTERNATE, - SampleLabels("R"): Genotype.HETEROZYGOUS, - SampleLabels("T"): Genotype.HETEROZYGOUS, - SampleLabels("V"): Genotype.HETEROZYGOUS, - SampleLabels("Y"): Genotype.HETEROZYGOUS, - } - ), - ) - deletion = Variant.create_variant_from_scratch( - VariantCoordinates(make_region(360, 363), "TTC", "T", -2), - "FakeGene", - "NM_1234.5", - "NM_1234.5:c.261_263del", - False, - [VariantEffect.FRAMESHIFT_VARIANT], - [2], - "NP_09876.5", - "NP_09876.5:p.Unk200", - 86, - 87, - Genotypes.from_mapping( - { - SampleLabels("D"): Genotype.HETEROZYGOUS, - SampleLabels("F"): Genotype.HETEROZYGOUS, - SampleLabels("G"): Genotype.HETEROZYGOUS, - SampleLabels("H"): Genotype.HETEROZYGOUS, - SampleLabels("I"): Genotype.HOMOZYGOUS_ALTERNATE, - SampleLabels("L"): Genotype.HETEROZYGOUS, - SampleLabels("O"): Genotype.HETEROZYGOUS, - SampleLabels("R"): Genotype.HETEROZYGOUS, - SampleLabels("S"): Genotype.HOMOZYGOUS_ALTERNATE, - SampleLabels("U"): Genotype.HETEROZYGOUS, - SampleLabels("W"): Genotype.HETEROZYGOUS, - SampleLabels("X"): Genotype.HETEROZYGOUS, - SampleLabels("Z"): Genotype.HETEROZYGOUS, - } - ), - ) - het_dup = Variant.create_variant_from_scratch( - VariantCoordinates(make_region(175, 176), "T", "TG", 1), - "FakeGene", - "NM_1234.5", - "NM_1234.5:c.75A>G", - False, - [VariantEffect.FRAMESHIFT_VARIANT], - [1], - "NP_09876.5", - "NP_09876.5:p.Unk200", - 25, - 25, - Genotypes.empty(), - ) # Not used in the patients below, hence `empty()`. - hom_dup = Variant.create_variant_from_scratch( - VariantCoordinates(make_region(175, 176), "T", "TG", 1), - "FakeGene", - "NM_1234.5", - "NM_1234.5:c.75A>G", - False, - [VariantEffect.FRAMESHIFT_VARIANT], - [1], - "NP_09876.5", - "NP_09876.5:p.Unk200", - 25, - 25, - Genotypes.empty(), - ) # Not used in the patients below, hence `empty()`. - - patients = ( - Patient.from_raw_parts( - SampleLabels("A"), - sex=None, - phenotypes=(arachnodactyly_T, spasticity_F, seizure_T), - variants=[snv], - diseases=[Disease_T], - ), - Patient.from_raw_parts( - SampleLabels("B"), - sex=None, - phenotypes=(arachnodactyly_T, seizure_T, spasticity_T), - variants=[snv], - diseases=[Disease_T], - ), - Patient.from_raw_parts( - SampleLabels("C"), - sex=None, - phenotypes=(arachnodactyly_F, spasticity_T, seizure_T), - variants=[snv], - diseases=[Disease_T], - ), - Patient.from_raw_parts( - SampleLabels("D"), - sex=None, - phenotypes=(arachnodactyly_T, spasticity_T, seizure_T), - variants=[snv, deletion], - diseases=[Disease_T], - ), - Patient.from_raw_parts( - SampleLabels("E"), - sex=None, - phenotypes=(arachnodactyly_T, spasticity_T, seizure_F), - variants=[snv], - diseases=[Disease_T], - ), - Patient.from_raw_parts( - SampleLabels("F"), - sex=None, - phenotypes=(arachnodactyly_F, spasticity_F, seizure_T), - variants=[deletion], - diseases=[Disease_T], - ), - Patient.from_raw_parts( - SampleLabels("G"), - sex=None, - phenotypes=(arachnodactyly_T, seizure_T, spasticity_T), - variants=[snv, deletion], - diseases=[Disease_T], - ), - Patient.from_raw_parts( - SampleLabels("H"), - sex=None, - phenotypes=(arachnodactyly_T, seizure_T, spasticity_F), - variants=[deletion], - diseases=[Disease_T], - ), - Patient.from_raw_parts( - SampleLabels("I"), - sex=None, - phenotypes=(arachnodactyly_F, spasticity_F, seizure_T), - variants=[deletion], - diseases=[Disease_T], - ), - Patient.from_raw_parts( - SampleLabels("J"), - sex=None, - phenotypes=(arachnodactyly_T, seizure_T, spasticity_T), - variants=[snv], - diseases=[Disease_T], - ), - Patient.from_raw_parts( - SampleLabels("K"), - sex=None, - phenotypes=(arachnodactyly_F, spasticity_T, seizure_T), - variants=[snv], - diseases=[Disease_T], - ), - Patient.from_raw_parts( - SampleLabels("L"), - sex=None, - phenotypes=(arachnodactyly_F, seizure_F, spasticity_F), - variants=[deletion], - diseases=[Disease_T], - ), - Patient.from_raw_parts( - SampleLabels("M"), - sex=None, - phenotypes=(arachnodactyly_T, seizure_F, spasticity_T), - variants=[snv], - diseases=[Disease_T], - ), - Patient.from_raw_parts( - SampleLabels("N"), - sex=None, - phenotypes=(arachnodactyly_F, seizure_T, spasticity_F), - variants=[snv], - diseases=[Disease_T], - ), - Patient.from_raw_parts( - SampleLabels("O"), - sex=None, - phenotypes=(arachnodactyly_F, seizure_F, spasticity_T), - variants=[deletion], - diseases=[Disease_T], - ), - Patient.from_raw_parts( - SampleLabels("P"), - sex=None, - phenotypes=(arachnodactyly_T, seizure_T, spasticity_F), - variants=[snv], - diseases=[Disease_T], - ), - Patient.from_raw_parts( - SampleLabels("Q"), - sex=None, - phenotypes=(arachnodactyly_T, seizure_F, spasticity_F), - variants=[snv], - diseases=[Disease_T], - ), - Patient.from_raw_parts( - SampleLabels("R"), - sex=None, - phenotypes=(arachnodactyly_T, seizure_T, spasticity_F), - variants=[snv, deletion], - diseases=[Disease_T], - ), - Patient.from_raw_parts( - SampleLabels("S"), - sex=None, - phenotypes=(arachnodactyly_F, seizure_T, spasticity_T), - variants=[deletion], - diseases=[Disease_T], - ), - Patient.from_raw_parts( - SampleLabels("T"), - sex=None, - phenotypes=(arachnodactyly_T, seizure_F, spasticity_T), - variants=[snv], - diseases=[Disease_T], - ), - Patient.from_raw_parts( - SampleLabels("U"), - sex=None, - phenotypes=(arachnodactyly_F, seizure_T, spasticity_T), - variants=[deletion], - diseases=[Disease_T], - ), - Patient.from_raw_parts( - SampleLabels("V"), - sex=None, - phenotypes=(arachnodactyly_T, seizure_T, spasticity_T), - variants=[snv], - diseases=[Disease_T], - ), - Patient.from_raw_parts( - SampleLabels("W"), - sex=None, - phenotypes=(arachnodactyly_F, seizure_T, spasticity_T), - variants=[deletion], - diseases=[Disease_T], - ), - Patient.from_raw_parts( - SampleLabels("X"), - sex=None, - phenotypes=(arachnodactyly_F, seizure_T, spasticity_T), - variants=[deletion], - diseases=[Disease_T], - ), - Patient.from_raw_parts( - SampleLabels("Y"), - sex=None, - phenotypes=(arachnodactyly_T, seizure_T, spasticity_T), - variants=[snv], - diseases=[Disease_T], - ), - Patient.from_raw_parts( - SampleLabels("Z"), - sex=None, - phenotypes=(arachnodactyly_F, seizure_T, spasticity_T), - variants=[deletion], - diseases=[Disease_T], - ), - ) - - return Cohort.from_patients(patients) From e4a8b3b759779581c6c3e69c31d2ee3fc06a3087 Mon Sep 17 00:00:00 2001 From: Lauren Rekerle Date: Thu, 5 Sep 2024 10:30:34 -0500 Subject: [PATCH 04/44] updated tqdm to print files/sec instead of it/sec --- src/gpsea/preprocessing/_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gpsea/preprocessing/_config.py b/src/gpsea/preprocessing/_config.py index 32d8b255..fc23fa81 100644 --- a/src/gpsea/preprocessing/_config.py +++ b/src/gpsea/preprocessing/_config.py @@ -455,7 +455,7 @@ def load_phenopackets( # Turn phenopackets into a cohort using the cohort creator. # Keep track of the progress by wrapping the list of phenopackets # with TQDM 😎 - cohort_iter = tqdm(phenopackets, desc="Patients Created", file=sys.stdout) + cohort_iter = tqdm(phenopackets, desc="Patients Created", file=sys.stdout, unit="files") notepad = cohort_creator.prepare_notepad("Phenopackets") cohort = cohort_creator.process(cohort_iter, notepad) From 3819244f331929955ce0a898a170599fbfc467c0 Mon Sep 17 00:00:00 2001 From: Peter Robinson Date: Fri, 6 Sep 2024 13:33:19 +0200 Subject: [PATCH 05/44] updating documentation (WIP) --- README.md | 3 +- docs/index.rst | 4 +- docs/user-guide/index.rst | 9 +- docs/user-guide/predicates.rst | 466 +----------------- docs/user-guide/{ => predicates}/devries.rst | 0 .../predicates/diagnosis_predicate.rst | 21 + .../predicates/filtering_predicate.rst | 56 +++ .../predicates/genotype_predicates.rst | 33 ++ .../predicates/groups_predicate.rst | 41 ++ docs/user-guide/predicates/hpo_predicate.rst | 44 ++ .../predicates/male_female_predicate.rst | 21 + .../mode_of_inheritance_predicate.rst | 97 ++++ .../{ => predicates}/phenotype_predicates.rst | 1 + .../predicates/variant_predicates.rst | 168 +++++++ docs/user-guide/report/tbx5_frameshift.csv | 2 +- docs/user-guide/stats.rst | 2 +- .../predicate/genotype/_gt_predicates.py | 2 +- 17 files changed, 511 insertions(+), 459 deletions(-) rename docs/user-guide/{ => predicates}/devries.rst (100%) create mode 100644 docs/user-guide/predicates/diagnosis_predicate.rst create mode 100644 docs/user-guide/predicates/filtering_predicate.rst create mode 100644 docs/user-guide/predicates/genotype_predicates.rst create mode 100644 docs/user-guide/predicates/groups_predicate.rst create mode 100644 docs/user-guide/predicates/hpo_predicate.rst create mode 100644 docs/user-guide/predicates/male_female_predicate.rst create mode 100644 docs/user-guide/predicates/mode_of_inheritance_predicate.rst rename docs/user-guide/{ => predicates}/phenotype_predicates.rst (94%) create mode 100644 docs/user-guide/predicates/variant_predicates.rst diff --git a/README.md b/README.md index 2a498eb4..449314b9 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,8 @@ ![PyPi downloads](https://img.shields.io/pypi/dm/gpsea.svg?label=Pypi%20downloads) ![PyPI - Python Version](https://img.shields.io/pypi/pyversions/gpsea) -GPSEA is a Python library for discovery of genotype-phenotype associations. +GPSEA (Genotypes and Ghenotypes - Statistical Evaluation of Associations, pronounced "G"-"P"-"C") is a Python package designed to support genotype-phenotype correlation analysis. + See the [Tutorial](https://monarch-initiative.github.io/gpsea/stable/tutorial.html) and a comprehensive [User guide](https://monarch-initiative.github.io/gpsea/stable/user-guide/index.html) diff --git a/docs/index.rst b/docs/index.rst index f9478fa2..b70bf9c2 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -10,8 +10,8 @@ A key question in biology and human genetics concerns the relationships between genetics, the focus is generally placed on the study of whether specific disease-causing alleles are associated with specific phenotypic manifestations of the disease. -`GPSEA` (genotypes and phenotypes - study and evaluation of associations) is a Python package designed to support genotype-phenotype correlation analysis. -We pronounce GPSEA as "G"-"P"-"C". The input to `GPSEA` is a collection of `Global Alliance for Genomics and Health (GA4GH) Phenopackets `_. +`GPSEA` (Genotypes and Ghenotypes - Statistical Evaluation of Associations, pronounced "G"-"P"-"C") is a Python package designed to support genotype-phenotype correlation analysis. +The input to `GPSEA` is a collection of `Global Alliance for Genomics and Health (GA4GH) Phenopackets `_. `gpsea` ingests data from these phenopackets and performs analysis of the correlation of specific variants, variant types (e.g., missense vs. premature termination codon), or variant location in protein motifs or other features. The phenotypic abnormalities are represented by `Human Phenotype Ontology (HPO) `_ terms. diff --git a/docs/user-guide/index.rst b/docs/user-guide/index.rst index f3e0569a..08097f40 100644 --- a/docs/user-guide/index.rst +++ b/docs/user-guide/index.rst @@ -4,7 +4,13 @@ User guide ========== -TODO - write high level overview and bridge to individual sections. +GPSEA allows users to perform many different kinds of genotype-phenotype correlation (GPCs) analysis. See the :ref:`tutorial` for an introduction. +In general, the analysis will include steps for data input, exploration of the cohort to assist in generating hypotheses about potential GPCs to test, +corresponding choice of genotype and phenotype predicates to perform the test, choice of statistical test, and approach to multiple-testing correction (mtc). +The pages shown in the table of contents provide more information about each step. + + + .. toctree:: :maxdepth: 1 @@ -13,7 +19,6 @@ TODO - write high level overview and bridge to individual sections. input-data exploratory predicates - phenotype_predicates stats mtc glossary diff --git a/docs/user-guide/predicates.rst b/docs/user-guide/predicates.rst index 785c3ef0..ed63f9d3 100644 --- a/docs/user-guide/predicates.rst +++ b/docs/user-guide/predicates.rst @@ -4,484 +4,48 @@ Predicates ========== -Searching for genotype-phenotype associations usually requires to partition -the individuals into several discrete groups to allow testing for the inter-group differences. +Searching for genotype-phenotype associations usually requires that +the individuals are partitioned into two or more discrete groups to allow testing for the inter-group differences. GPSEA reflects these requirements with its predicate API. -Perhaps unsurprisingly, a predicate must be capable of partitioning the individuals into two or more groups. + + +A **predicate** must be capable of partitioning the individuals into two or more groups. The groups must be *exclusive* - each individual must be assigned at most into one group. -Moreover, the groups should be *exhaustive* and cover maximum of the possible states. +In general, it is desirable that the groups cover all or at least the majority of the cohort being analyzed to maximize statistical power. However, the predicate is allowed to return `None` if the individual cannot be assigned. -In result, the individual will be omitted from the downstream analysis. +As a result, the individual will be omitted from the downstream analysis. -Predicates serve both *genotype* and *phenotype* prongs of the analysis. +Predicates serve both the *genotype* and *phenotype* prongs of the analysis. Genotype predicates (:class:`~gpsea.analysis.predicate.genotype.GenotypePolyPredicate`) assign the :class:`~gpsea.model.Patient` into a group (mostly) based on the variant information, while the phenotype predicates (:class:`~gpsea.analysis.predicate.phenotype.PhenotypePolyPredicate`) use the HPO terms to assign a group. -It is literally impossible to use GPSEA without the predicates -because all analyses need at least one predicate (typically a *genotype* predicate). -Luckily, the purpose of this guide is to show all that is to know about predicates. -We will first discuss the genotype predicates and end with phenotype predicates. +All GPSEA analyses need at least one predicate (typically a *genotype* predicate) and many require both *genotype* and *phenotype* predicates. +The following pages provide more information. .. _genotype-predicates: -******************* -Genotype predicates -******************* - -A genotype predicate seeks to divide the individuals along an axis that is orthogonal to phenotypes. -Typically, this includes using the genotype data, such as presence of a missense variant -in a heterozygous genotype. However, other categorical variables, -such as diagnoses (TODO - add link to disease predicate) or cluster ids can also be used. - -The genotype predicates test the individual for a presence of variants that meet certain inclusion criteria. -The testing is done in two steps. First, we count the alleles -of the matching variants and then we interpret the count, possibly including factors -such as the expected mode of inheritance and sex, to assign the individual into a group. -Finding the matching variants is what -the :class:`~gpsea.analysis.predicate.genotype.VariantPredicate` is all about. - - -TODO: wordsmith -We must first create the variant predicate and then wrap it in genotype predicate. - - -Variant predicates -================== - -GPSEA uses the :class:`~gpsea.analysis.predicate.genotype.VariantPredicate` class -to test if a :class:`~gpsea.model.Variant` meets the inclusion criteria. -The variant predicate can leverage multiple primary data: - -+------------------------+-------------------------------------------------------------------------------------------------+ -| Primary data source | Example | -+========================+=================================================================================================+ -| Allele | the variant being a deletion or a single nucleotide variant (SNV) | -+------------------------+-------------------------------------------------------------------------------------------------+ -| Genome | overlaps of a target genomic region | -+------------------------+-------------------------------------------------------------------------------------------------+ -| Functional annotation | variant is predicted to lead to a missense change or affect an exon of certain transcript | -+------------------------+-------------------------------------------------------------------------------------------------+ -| Protein data | variant is located in a region encoding a protein domain, protein feature type | -+------------------------+-------------------------------------------------------------------------------------------------+ - -which demands a considerable amount of flexibility for creating the predicate. - -As a rule of thumb, the predicates for testing basic conditions are available off the shelf, -and they can be used as building block for testing for more complex conditions, -such as testing if the variant is "a missense or synonymous variant located in exon 6 of transcript `NM_013275.6`". - -Let's demonstrate this on few examples. -We will load a cohort of 19 subjects with variants in *RERE*, -leading to `Holt-Oram syndrome MIM:142900 `_. -The the clinical signs and symptoms of the subjects were encoded into HPO terms -along with the pathogenic *RERE* variant. - -Let's load the phenopackets, as previously described in greater detail the :ref:`input-data` section. -Briefly, we first load HPO: - ->>> import hpotk ->>> store = hpotk.configure_ontology_store() ->>> hpo = store.load_minimal_hpo(release='v2024-07-01') - -then, we configure the cohort creator: - ->>> from gpsea.preprocessing import configure_caching_cohort_creator ->>> cohort_creator = configure_caching_cohort_creator(hpo) - -which we use to create a :class:`~gpsea.model.Cohort` from a bunch of phenopackets -from the release `0.1.18` of `Phenopacket Store `_. -We will load 19 individuals with mutations in *RERE* gene: - ->>> from ppktstore.registry import configure_phenopacket_registry ->>> registry = configure_phenopacket_registry() ->>> with registry.open_phenopacket_store(release='0.1.18') as ps: -... phenopackets = tuple(ps.iter_cohort_phenopackets('RERE')) ->>> len(phenopackets) -19 - -and we will convert the phenopacket into a :class:`~gpsea.model.Cohort`: - ->>> from gpsea.preprocessing import load_phenopackets ->>> cohort, _ = load_phenopackets(phenopackets, cohort_creator) # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE -Patients Created: ... - -To demonstrate the predicate API, we will use the variant ``1_8358231_8358231_T_C`` that corresponds -to a pathogenic variant `VCV000522858.5 `_ -that replaces the histidine encoded by the 1435th codon of `NM_001042681.2` with arginine: ``NM_001042681.2(RERE):c.4304A>G (p.His1435Arg)``. - ->>> variant_key_of_interest = '1_8358231_8358231_T_C' ->>> variant = cohort.get_variant_by_key(variant_key_of_interest) - -Building blocks ---------------- - -We can check that the variant overlaps with *RERE*: - ->>> from gpsea.analysis.predicate.genotype import VariantPredicates ->>> gene = VariantPredicates.gene('RERE') ->>> gene.test(variant) -True - -it overlaps with the *MANE* transcript: - ->>> rere_mane_tx_id = 'NM_001042681.2' ->>> tx = VariantPredicates.transcript(rere_mane_tx_id) ->>> tx.test(variant) -True - -it in fact overlaps with the exon 20: - ->>> exon20 = VariantPredicates.exon(exon=20, tx_id=rere_mane_tx_id) ->>> exon20.test(variant) -True - -and leads to a missense mutation with respect to the MANE transcript: - ->>> from gpsea.model import VariantEffect ->>> missense = VariantPredicates.variant_effect(VariantEffect.MISSENSE_VARIANT, tx_id=rere_mane_tx_id) ->>> missense.test(variant) -True - -See :class:`~gpsea.analysis.predicate.genotype.VariantPredicates` -for more info on the predicates available off the shelf. - - -Complex conditions ------------------- - -We can combine the building blocks to test for more elaborate conditions. -For instance, we can test if the variant meets *any* or several conditions: ->>> nonsense = VariantPredicates.variant_effect(VariantEffect.STOP_GAINED, tx_id=rere_mane_tx_id) ->>> missense_or_nonsense = missense | nonsense ->>> missense_or_nonsense.test(variant) -True -or *all* conditions: +.. toctree:: + :maxdepth: 1 + :caption: Contents: ->>> missense_and_exon20 = missense & exon20 ->>> missense_and_exon20.test(variant) -True + predicates/phenotype_predicates + predicates/genotype_predicates -The `VariantPredicate` overloads Python ``&`` (AND) and ``|`` (OR) operators to build a compound predicate from lower level building blocks. -Therefore, there is nothing that prevents us to combine the predicates into multi-level tests, -such as testing if the variant is a *"chromosomal deletion" or a deletion which removes at least 50 bp*: - ->>> from gpsea.model import VariantClass ->>> chromosomal_deletion = "SO:1000029" ->>> predicate = VariantPredicates.structural_type(chromosomal_deletion) | (VariantPredicates.variant_class(VariantClass.DEL) & VariantPredicates.change_length("<=", -50)) ->>> predicate.get_question() -'(structural type is SO:1000029 OR (variant class is DEL AND change length <= -50))' - - -Inverting conditions --------------------- - -Sometimes we may want to test the variant for a condition that must *not* be met. -For instance, we may want to test if the variant is a deletion -that is *not* predicted to shift the transcript reading frame. -One of doing this would be to build a compound predicates -for all variant effects except of :class:`~gpsea.model.VariantEffect.FRAMESHIFT_VARIANT`: - ->>> non_frameshift_effects = ( -... VariantEffect.SYNONYMOUS_VARIANT, VariantEffect.MISSENSE_VARIANT, VariantEffect.INTRON_VARIANT, -... # and many more effects.. -... ) ->>> non_frameshift_predicate = VariantPredicates.all(VariantPredicates.variant_effect(eff, tx_id=rere_mane_tx_id) for eff in non_frameshift_effects) - -However, this is clearly tedious and it would be much better implemented -by a simple logical not of a predicate for a frameshift variant effect. - -To support this, `VariantPredicate` implements *logical inversion* -which corresponds to Python's ``~`` operator (tilde), to wrap -the underlying predicate and to invert its test result. - -This is how we can use the predicate inversion to build the predicate for non-frameshift deletions: - ->>> non_frameshift_del = ~VariantPredicates.variant_effect(VariantEffect.FRAMESHIFT_VARIANT, tx_id=rere_mane_tx_id) & VariantPredicates.variant_class(VariantClass.DEL) ->>> non_frameshift_del.get_question() -'(NOT FRAMESHIFT_VARIANT on NM_001042681.2 AND variant class is DEL)' - -Note the presence of a tilde ``~`` before the variant effect predicate and resulting ``NOT`` in the predicate question. - -The variant predicate offers a flexible API for testing if variants meet a condition. -However, the genotype phenotype correlations are done on the individual level -and the variant predicates are used as a component of the genotype predicate. -The next sections show how to use variant predicates to assign individuals into groups. - - -Mode of inheritance predicate -============================= - -The :class:`~gpsea.analysis.predicate.genotype.ModeOfInheritancePredicate` -assigns the individual into a group based on the number of alleles -that match a condition specified by a :class:`~gpsea.analysis.predicate.genotype.VariantPredicate`. -The :class:`~gpsea.analysis.predicate.genotype.ModeOfInheritancePredicate` supports -the following Mendelian modes of inheritance (MoI): - - -+-----------------------+-----------------------------------+------------------+------------------------+ -| Mode of inheritance | Sex | Allele count | Genotype category | -+=======================+===================================+==================+========================+ -| Autosomal dominant | `*` | 0 | `HOM_REF` | -+ +-----------------------------------+------------------+------------------------+ -| | `*` | 1 | `HET` | -+ +-----------------------------------+------------------+------------------------+ -| | `*` | :math:`\ge 2` | ``None`` | -+-----------------------+-----------------------------------+------------------+------------------------+ -| Autosomal recessive | `*` | 0 | `HOM_REF` | -+ +-----------------------------------+------------------+------------------------+ -| | `*` | 1 | `HET` | -+ +-----------------------------------+------------------+------------------------+ -| | `*` | 2 | `BIALLELIC_ALT` | -+ +-----------------------------------+------------------+------------------------+ -| | `*` | :math:`\ge 3` | ``None`` | -+-----------------------+-----------------------------------+------------------+------------------------+ -| X-linked dominant | `*` | 0 | `HOM_REF` | -+ +-----------------------------------+------------------+------------------------+ -| | `*` | 1 | `HET` | -+ +-----------------------------------+------------------+------------------------+ -| | `*` | :math:`\ge 2` | ``None`` | -+-----------------------+-----------------------------------+------------------+------------------------+ -| X-linked recessive | `*` | 0 | `HOM_REF` | -+ +-----------------------------------+------------------+------------------------+ -| | :class:`~gpsea.model.Sex.FEMALE` | 1 | `HET` | -+ + +------------------+------------------------+ -| | | 2 | `BIALLELIC_ALT` | -+ + +------------------+------------------------+ -| | | :math:`\ge 3` | ``None`` | -+ +-----------------------------------+------------------+------------------------+ -| | :class:`~gpsea.model.Sex.MALE` | 1 | `HEMI` | -+ + +------------------+------------------------+ -| | | :math:`\ge 2` | ``None`` | -+-----------------------+-----------------------------------+------------------+------------------------+ - -.. note:: - - `BIALLELIC_ALT` includes both homozygous and compound heterozygous genotypes. - -Clinical judgment should be used to choose the MoI for the cohort analysis. -Then a predicate for the desired MoI can be created by one of -:class:`~gpsea.analysis.predicate.genotype.ModeOfInheritancePredicate` static constructors: - -* :func:`~gpsea.analysis.predicate.genotype.ModeOfInheritancePredicate.autosomal_dominant` -* :func:`~gpsea.analysis.predicate.genotype.ModeOfInheritancePredicate.autosomal_recessive` -* :func:`~gpsea.analysis.predicate.genotype.ModeOfInheritancePredicate.x_dominant` -* :func:`~gpsea.analysis.predicate.genotype.ModeOfInheritancePredicate.x_recessive` - -All constructors take an instance -of :class:`~gpsea.analysis.predicate.genotype.VariantPredicate` as an argument. - - -Example -------- - -Here we show seting up a predicate for grouping individuals based on -having a variant that leads to a frameshift or to a stop gain to a fictional transcript ``NM_1234.5`` -to test differences between the genotypes of a disease with an autosomal recessive MoI. - -First, we set up a :class:`~gpsea.analysis.predicate.genotype.VariantPredicate` -for testing if a variant meets the condition: - ->>> from gpsea.model import VariantEffect ->>> from gpsea.analysis.predicate.genotype import VariantPredicates ->>> tx_id = 'NM_1234.5' ->>> is_frameshift_or_stop_gain = VariantPredicates.variant_effect(VariantEffect.FRAMESHIFT_VARIANT, tx_id) \ -... | VariantPredicates.variant_effect(VariantEffect.STOP_GAINED, tx_id) ->>> is_frameshift_or_stop_gain.get_question() -'(FRAMESHIFT_VARIANT on NM_1234.5 OR STOP_GAINED on NM_1234.5)' - -Next, we use :class:`~gpsea.analysis.predicate.genotype.ModeOfInheritancePredicate.autosomal_recessive` -for assigning a patient into a genotype group: - ->>> from gpsea.analysis.predicate.genotype import ModeOfInheritancePredicate ->>> gt_predicate = ModeOfInheritancePredicate.autosomal_recessive(is_frameshift_or_stop_gain) ->>> gt_predicate.display_question() -'Which genotype group does the patient fit in: HOM_REF, HET, BIALLELIC_ALT' - -The `gt_predicate` can be used in downstream analysis, such as in :class: - - -.. _filtering-predicate: - -Filtering predicate -=================== - -Sometimes a predicate can bin individuals into more genotype groups than necessary and there may be need -to consider only a subset of the groups. A `GenotypePolyPredicate` -created by :class:`~gpsea.analysis.predicate.genotype.filtering_predicate` can retain only a subset -of the target categorizations of interest. - -Example -------- - -Let's suppose we want test the genotype-phenotype association between variants -that lead to frameshift or a stop gain in a fictional transcript `NM_1234.5`, -and we are specifically interested in comparing the heterozygous variants -in a biallelic alternative allele genotypes (homozygous alternate and compound heterozygous). - -First, we set up a :class:`~gpsea.analysis.predicate.genotype.VariantPredicate` -for testing if a variant introduces a premature stop codon or leads to the shift of the reading frame: - ->>> from gpsea.model import VariantEffect ->>> from gpsea.analysis.predicate.genotype import VariantPredicates ->>> tx_id = 'NM_1234.5' ->>> is_frameshift_or_stop_gain = VariantPredicates.variant_effect(VariantEffect.FRAMESHIFT_VARIANT, tx_id) \ -... | VariantPredicates.variant_effect(VariantEffect.STOP_GAINED, tx_id) ->>> is_frameshift_or_stop_gain.get_question() -'(FRAMESHIFT_VARIANT on NM_1234.5 OR STOP_GAINED on NM_1234.5)' - -Then, we create :class:`~gpsea.analysis.predicate.genotype.ModeOfInheritancePredicate.autosomal_recessive` -to bin according to a genotype group: - ->>> from gpsea.analysis.predicate.genotype import ModeOfInheritancePredicate ->>> gt_predicate = ModeOfInheritancePredicate.autosomal_recessive(is_frameshift_or_stop_gain) ->>> gt_predicate.display_question() -'Which genotype group does the patient fit in: HOM_REF, HET, BIALLELIC_ALT' - -We see that the `gt_predicate` bins the patients into three groups: - ->>> cats = gt_predicate.get_categorizations() ->>> cats -(Categorization(category=HOM_REF), Categorization(category=HET), Categorization(category=BIALLELIC_ALT)) - -We wrap the categorizations of interest along with the `gt_predicate` by the `filtering_predicate` function, -and we will get a :class:`~gpsea.analysis.predicate.genotype.GenotypePolyPredicate` -that includes only the categories of interest: - ->>> from gpsea.analysis.predicate.genotype import filtering_predicate ->>> fgt_predicate = filtering_predicate( -... predicate=gt_predicate, -... targets=(cats[1], cats[2]), -... ) ->>> fgt_predicate.display_question() -'Which genotype group does the patient fit in: HET, BIALLELIC_ALT' .. _groups-predicate: -Groups predicate -================ - -Sometimes, all we want is to compare if there is a difference between individuals -who include one or more alleles of variant $X$ vs. individuals with variants $Y$, -vs. individuals with variants $Z$, where $X$, $Y$ and $Z$ are variant predicates. -We can do this with a *groups* predicate. - -The :func:`~gpsea.analysis.predicate.genotype.groups_predicate` -takes *n* variant predicates and *n* group labels, and it will assign the patients -into the respective groups if one or more matching allele is found. -However, only one predicate is allowed to return a non-zero allele count. -Otherwise, the patient is assigned with ``None`` and excluded from the analysis. - -Example -------- - -Here we show how to build a :class:`~gpsea.analysis.predicate.genotype.GenotypePolyPredicate` -for testing if the individual has at least one missense vs. frameshift vs. synonymous variant. - ->>> from gpsea.model import VariantEffect ->>> from gpsea.analysis.predicate.genotype import VariantPredicates, groups_predicate ->>> tx_id = 'NM_1234.5' ->>> gt_predicate = groups_predicate( -... predicates=( -... VariantPredicates.variant_effect(VariantEffect.MISSENSE_VARIANT, tx_id), -... VariantPredicates.variant_effect(VariantEffect.FRAMESHIFT_VARIANT, tx_id), -... VariantPredicates.variant_effect(VariantEffect.SYNONYMOUS_VARIANT, tx_id), -... ), -... group_names=('Missense', 'Frameshift', 'Synonymous'), -... ) ->>> gt_predicate.display_question() -'Genotype group: Missense, Frameshift, Synonymous' - - -.. _sex-predicate: - -Partition by the sex of the individual -====================================== - -It is easy to investigate the phenotypic differences between females and males. -The :func:`~gpsea.analysis.predicate.genotype.sex_predicate` provides a predicate -for partitioning based on the sex of the individual: - ->>> from gpsea.analysis.predicate.genotype import sex_predicate ->>> gt_predicate = sex_predicate() ->>> gt_predicate.display_question() -'Sex of the individual: FEMALE, MALE' - -The individuals with :class:`~gpsea.model.Sex.UNKNOWN_SEX` will be omitted from the analysis. - - -.. _diagnosis-predicate: - -Partition by a diagnosis -======================== - -It is also possible to bin the individuals based on a diagnosis. -The :func:`~gpsea.analysis.predicate.genotype.diagnosis_predicate` -prepares a genotype predicate for assigning an individual into a diagnosis group: - ->>> from gpsea.analysis.predicate.genotype import diagnosis_predicate ->>> gt_predicate = diagnosis_predicate( -... diagnoses=('OMIM:154700', 'OMIM:129600'), -... labels=('Marfan syndrome', 'Ectopia lentis, familial'), -... ) ->>> gt_predicate.display_question() -'What disease was diagnosed: OMIM:154700, OMIM:129600' - -Note, an individual must match only one diagnosis group. Any individuals labeled with two or more diagnoses -(e.g. an individual with both *Marfan syndrome* and *Ectopia lentis, familial*) -will be automatically omitted from the analysis. .. _phenotype-predicates: -******************** -Phenotype predicates -******************** - -The phenotype predicate assigns the individual into a group with respect to tested phenotype. -Typically, the phenotype corresponds to a clinical sign or symptom encoded into an HPO term. - - -Propagating phenotype predicate -=============================== - -When testing for presence or absence of an HPO term, the propagating phenotype predicate -leverages the :ref:`true-path-rule` to take advantage of the HPO hierarchy. -In result, an individual annotated with a term is implicitly annotated with all its ancestors. -For instance, an individual annotated with `Ectopia lentis `_ -is also annotated with `Abnormal lens morphology `_, -`Abnormal anterior eye segment morphology `_, -`Abnormal eye morphology `_, ... - -Similarly, all descendants of a term, whose presence was specifically excluded in an individual, -are implicitly excluded. - -:class:`~gpsea.analysis.predicate.phenotype.PropagatingPhenotypePredicate` implements this logic. - -Example -------- - -Here we show how to set up :class:`~gpsea.analysis.predicate.phenotype.PropagatingPhenotypePredicate` -to test for a presence of `Abnormal lens morphology `_. - - ->>> from gpsea.analysis.predicate.phenotype import PropagatingPhenotypePredicate ->>> query = hpotk.TermId.from_curie('HP:0000517') ->>> pheno_predicate = PropagatingPhenotypePredicate( -... hpo=hpo, -... query=query, -... ) ->>> pheno_predicate.display_question() -'Is Abnormal lens morphology present in the patient: Yes, No' - -TODO: explain ``missing_implies_phenotype_excluded`` Predicates for all cohort phenotypes diff --git a/docs/user-guide/devries.rst b/docs/user-guide/predicates/devries.rst similarity index 100% rename from docs/user-guide/devries.rst rename to docs/user-guide/predicates/devries.rst diff --git a/docs/user-guide/predicates/diagnosis_predicate.rst b/docs/user-guide/predicates/diagnosis_predicate.rst new file mode 100644 index 00000000..6ce811ab --- /dev/null +++ b/docs/user-guide/predicates/diagnosis_predicate.rst @@ -0,0 +1,21 @@ +.. _diagnosis_predicate: + +======================== +Partition by a diagnosis +======================== + +It is also possible to bin the individuals based on a diagnosis. +The :func:`~gpsea.analysis.predicate.genotype.diagnosis_predicate` +prepares a genotype predicate for assigning an individual into a diagnosis group: + +>>> from gpsea.analysis.predicate.genotype import diagnosis_predicate +>>> gt_predicate = diagnosis_predicate( +... diagnoses=('OMIM:154700', 'OMIM:129600'), +... labels=('Marfan syndrome', 'Ectopia lentis, familial'), +... ) +>>> gt_predicate.display_question() +'What disease was diagnosed: OMIM:154700, OMIM:129600' + +Note, an individual must match only one diagnosis group. Any individuals labeled with two or more diagnoses +(e.g. an individual with both *Marfan syndrome* and *Ectopia lentis, familial*) +will be automatically omitted from the analysis. \ No newline at end of file diff --git a/docs/user-guide/predicates/filtering_predicate.rst b/docs/user-guide/predicates/filtering_predicate.rst new file mode 100644 index 00000000..2d47e1d3 --- /dev/null +++ b/docs/user-guide/predicates/filtering_predicate.rst @@ -0,0 +1,56 @@ +.. _filtering-predicate: + + +=================== +Filtering predicate +=================== + +Sometimes a predicate can bin individuals into more genotype groups than necessary and there may be need +to consider only a subset of the groups. A `GenotypePolyPredicate` +created by :class:`~gpsea.analysis.predicate.genotype.filtering_predicate` can retain only a subset +of the target categorizations of interest. + +Example +------- + +Let's suppose we want test the genotype-phenotype association between variants +that lead to frameshift or a stop gain in a fictional transcript `NM_1234.5`, +and we are specifically interested in comparing the heterozygous variants +in a biallelic alternative allele genotypes (homozygous alternate and compound heterozygous). + +First, we set up a :class:`~gpsea.analysis.predicate.genotype.VariantPredicate` +for testing if a variant introduces a premature stop codon or leads to the shift of the reading frame: + +>>> from gpsea.model import VariantEffect +>>> from gpsea.analysis.predicate.genotype import VariantPredicates +>>> tx_id = 'NM_1234.5' +>>> is_frameshift_or_stop_gain = VariantPredicates.variant_effect(VariantEffect.FRAMESHIFT_VARIANT, tx_id) \ +... | VariantPredicates.variant_effect(VariantEffect.STOP_GAINED, tx_id) +>>> is_frameshift_or_stop_gain.get_question() +'(FRAMESHIFT_VARIANT on NM_1234.5 OR STOP_GAINED on NM_1234.5)' + +Then, we create :class:`~gpsea.analysis.predicate.genotype.ModeOfInheritancePredicate.autosomal_recessive` +to bin according to a genotype group: + +>>> from gpsea.analysis.predicate.genotype import ModeOfInheritancePredicate +>>> gt_predicate = ModeOfInheritancePredicate.autosomal_recessive(is_frameshift_or_stop_gain) +>>> gt_predicate.display_question() +'What is the genotype group?: HOM_REF, HET, BIALLELIC_ALT' + +We see that the `gt_predicate` bins the patients into three groups: + +>>> cats = gt_predicate.get_categorizations() +>>> cats +(Categorization(category=HOM_REF), Categorization(category=HET), Categorization(category=BIALLELIC_ALT)) + +We wrap the categorizations of interest along with the `gt_predicate` by the `filtering_predicate` function, +and we will get a :class:`~gpsea.analysis.predicate.genotype.GenotypePolyPredicate` +that includes only the categories of interest: + +>>> from gpsea.analysis.predicate.genotype import filtering_predicate +>>> fgt_predicate = filtering_predicate( +... predicate=gt_predicate, +... targets=(cats[1], cats[2]), +... ) +>>> fgt_predicate.display_question() +'What is the genotype group?: HET, BIALLELIC_ALT' \ No newline at end of file diff --git a/docs/user-guide/predicates/genotype_predicates.rst b/docs/user-guide/predicates/genotype_predicates.rst new file mode 100644 index 00000000..8a725b6e --- /dev/null +++ b/docs/user-guide/predicates/genotype_predicates.rst @@ -0,0 +1,33 @@ +.. _genotype_predicates: + +=================== +Genotype Predicates +=================== + + +A genotype predicate seeks to divide the individuals along an axis that is orthogonal to phenotypes. +Typically, this includes using the genotype data, such as presence of a missense variant +in a heterozygous genotype. However, other categorical variables, +such as diagnoses (TODO - add link to disease predicate) or cluster ids can also be used. + +The genotype predicates test the individual for a presence of variants that meet certain inclusion criteria. +The testing is done in two steps. First, we count the alleles +of the matching variants and then we interpret the count, possibly including factors +such as the expected mode of inheritance and sex, to assign the individual into a group. +Finding the matching variants is what +the :class:`~gpsea.analysis.predicate.genotype.VariantPredicate` is all about. + + +.. toctree:: + :maxdepth: 1 + :caption: Contents: + + variant_predicates + mode_of_inheritance_predicate + filtering_predicate + male_female_predicate + diagnosis_predicate + groups_predicate + + + diff --git a/docs/user-guide/predicates/groups_predicate.rst b/docs/user-guide/predicates/groups_predicate.rst new file mode 100644 index 00000000..23f178ac --- /dev/null +++ b/docs/user-guide/predicates/groups_predicate.rst @@ -0,0 +1,41 @@ +.. _groups_predicate: + +================ +Groups Predicate +================ + + + +Sometimes, all we want is to compare if there is a difference between individuals +who include one or more alleles of variant $X$ vs. individuals with variants $Y$, +vs. individuals with variants $Z$, where $X$, $Y$ and $Z$ are variant predicates. +We can do this with a *groups* predicate. + +The :func:`~gpsea.analysis.predicate.genotype.groups_predicate` +takes *n* variant predicates and *n* group labels, and it will assign the patients +into the respective groups if one or more matching allele is found. +However, only one predicate is allowed to return a non-zero allele count. +Otherwise, the patient is assigned with ``None`` and excluded from the analysis. + +Example +------- + +Here we show how to build a :class:`~gpsea.analysis.predicate.genotype.GenotypePolyPredicate` +for testing if the individual has at least one missense vs. frameshift vs. synonymous variant. + +>>> from gpsea.model import VariantEffect +>>> from gpsea.analysis.predicate.genotype import VariantPredicates, groups_predicate +>>> tx_id = 'NM_1234.5' +>>> gt_predicate = groups_predicate( +... predicates=( +... VariantPredicates.variant_effect(VariantEffect.MISSENSE_VARIANT, tx_id), +... VariantPredicates.variant_effect(VariantEffect.FRAMESHIFT_VARIANT, tx_id), +... VariantPredicates.variant_effect(VariantEffect.SYNONYMOUS_VARIANT, tx_id), +... ), +... group_names=('Missense', 'Frameshift', 'Synonymous'), +... ) +>>> gt_predicate.display_question() +'Genotype group: Missense, Frameshift, Synonymous' + + + diff --git a/docs/user-guide/predicates/hpo_predicate.rst b/docs/user-guide/predicates/hpo_predicate.rst new file mode 100644 index 00000000..fada58ec --- /dev/null +++ b/docs/user-guide/predicates/hpo_predicate.rst @@ -0,0 +1,44 @@ +.. _hpo_predicate: + + +Propagating phenotype predicate +=============================== + +When testing for presence or absence of an HPO term, the propagating phenotype predicate +leverages the :ref:`true-path-rule` to take advantage of the HPO hierarchy. +In result, an individual annotated with a term is implicitly annotated with all its ancestors. +For instance, an individual annotated with `Ectopia lentis `_ +is also annotated with `Abnormal lens morphology `_, +`Abnormal anterior eye segment morphology `_, +`Abnormal eye morphology `_, ... + +Similarly, all descendants of a term, whose presence was specifically excluded in an individual, +are implicitly excluded. + +:class:`~gpsea.analysis.predicate.phenotype.PropagatingPhenotypePredicate` implements this logic. + +Example +------- + +Here we show how to set up :class:`~gpsea.analysis.predicate.phenotype.PropagatingPhenotypePredicate` +to test for a presence of `Abnormal lens morphology `_. + + +>>> from gpsea.analysis.predicate.phenotype import HpoPredicate +>>> query = hpotk.TermId.from_curie('HP:0000517') +>>> pheno_predicate = HpoPredicate( +... hpo=hpo, +... query=query, +... ) +>>> pheno_predicate.display_question() +'Is Abnormal lens morphology present in the patient: Yes, No' + + + +missing_implies_phenotype_excluded +---------------------------------- + +In many cases, published reports of clinical data about individuals with rare diseases describes phenotypic features that were observed, but do not +provide a comprehensive list of features that were explicitly excluded. By default, GPSEA will only include features that are recorded as observed or excluded in a phenopacket. +Setting this argument to True will cause "n/a" entries to be set to "excluded". We provide this option for exploration but do not recommend its use for the +final analysis unless the assumption behind it is known to be true. \ No newline at end of file diff --git a/docs/user-guide/predicates/male_female_predicate.rst b/docs/user-guide/predicates/male_female_predicate.rst new file mode 100644 index 00000000..8f16c96b --- /dev/null +++ b/docs/user-guide/predicates/male_female_predicate.rst @@ -0,0 +1,21 @@ +.. _male_female_predicate: + +Partition by the sex of the individual +====================================== + +It is easy to investigate the phenotypic differences between females and males. +The :func:`~gpsea.analysis.predicate.genotype.sex_predicate` provides a predicate +for partitioning based on the sex of the individual: + +>>> from gpsea.analysis.predicate.genotype import sex_predicate +>>> gt_predicate = sex_predicate() +>>> gt_predicate.display_question() +'Sex of the individual: FEMALE, MALE' + +The individuals with :class:`~gpsea.model.Sex.UNKNOWN_SEX` will be omitted from the analysis. + +Note that we have implemented this predicate as a genotype predicate, because it is used in +place of other genotype predicates. Currently, it is not possible to compare the distribution of genotypes across sexes. + + + diff --git a/docs/user-guide/predicates/mode_of_inheritance_predicate.rst b/docs/user-guide/predicates/mode_of_inheritance_predicate.rst new file mode 100644 index 00000000..a00570d0 --- /dev/null +++ b/docs/user-guide/predicates/mode_of_inheritance_predicate.rst @@ -0,0 +1,97 @@ +.. _mode_of_inheritance_predicate: + +============================== +Mode of Inheritance Predicates +============================== + +There are five basic modes of inheritance for single-gene diseases: autosomal dominant, autosomal recessive, X-linked dominant, X-linked recessive, and mitochondrial (See +`Understanding Genetics, Appendix B `_). + + +The :class:`~gpsea.analysis.predicate.genotype.ModeOfInheritancePredicate` +assigns the individual into a group based on the number of alleles +that match a condition specified by a :class:`~gpsea.analysis.predicate.genotype.VariantPredicate`. +The :class:`~gpsea.analysis.predicate.genotype.ModeOfInheritancePredicate` supports +the following Mendelian modes of inheritance (MoI): + + ++-----------------------+-----------------------------------+------------------+------------------------+ +| Mode of inheritance | Sex | Allele count | Genotype category | ++=======================+===================================+==================+========================+ +| Autosomal dominant | `*` | 0 | `HOM_REF` | ++ +-----------------------------------+------------------+------------------------+ +| | `*` | 1 | `HET` | ++ +-----------------------------------+------------------+------------------------+ +| | `*` | :math:`\ge 2` | ``None`` | ++-----------------------+-----------------------------------+------------------+------------------------+ +| Autosomal recessive | `*` | 0 | `HOM_REF` | ++ +-----------------------------------+------------------+------------------------+ +| | `*` | 1 | `HET` | ++ +-----------------------------------+------------------+------------------------+ +| | `*` | 2 | `BIALLELIC_ALT` | ++ +-----------------------------------+------------------+------------------------+ +| | `*` | :math:`\ge 3` | ``None`` | ++-----------------------+-----------------------------------+------------------+------------------------+ +| X-linked dominant | `*` | 0 | `HOM_REF` | ++ +-----------------------------------+------------------+------------------------+ +| | `*` | 1 | `HET` | ++ +-----------------------------------+------------------+------------------------+ +| | `*` | :math:`\ge 2` | ``None`` | ++-----------------------+-----------------------------------+------------------+------------------------+ +| X-linked recessive | `*` | 0 | `HOM_REF` | ++ +-----------------------------------+------------------+------------------------+ +| | :class:`~gpsea.model.Sex.FEMALE` | 1 | `HET` | ++ + +------------------+------------------------+ +| | | 2 | `BIALLELIC_ALT` | ++ + +------------------+------------------------+ +| | | :math:`\ge 3` | ``None`` | ++ +-----------------------------------+------------------+------------------------+ +| | :class:`~gpsea.model.Sex.MALE` | 1 | `HEMI` | ++ + +------------------+------------------------+ +| | | :math:`\ge 2` | ``None`` | ++-----------------------+-----------------------------------+------------------+------------------------+ + +.. note:: + + `BIALLELIC_ALT` includes both homozygous and compound heterozygous genotypes. + +Clinical judgment should be used to choose the MoI for the cohort analysis. +Then a predicate for the desired MoI can be created by one of +:class:`~gpsea.analysis.predicate.genotype.ModeOfInheritancePredicate` static constructors: + +* :func:`~gpsea.analysis.predicate.genotype.ModeOfInheritancePredicate.autosomal_dominant` +* :func:`~gpsea.analysis.predicate.genotype.ModeOfInheritancePredicate.autosomal_recessive` +* :func:`~gpsea.analysis.predicate.genotype.ModeOfInheritancePredicate.x_dominant` +* :func:`~gpsea.analysis.predicate.genotype.ModeOfInheritancePredicate.x_recessive` + +All constructors take an instance +of :class:`~gpsea.analysis.predicate.genotype.VariantPredicate` as an argument. + + +Example +------- + +Here we show seting up a predicate for grouping individuals based on +having a variant that leads to a frameshift or to a stop gain to a fictional transcript ``NM_1234.5`` +to test differences between the genotypes of a disease with an autosomal recessive MoI. + +First, we set up a :class:`~gpsea.analysis.predicate.genotype.VariantPredicate` +for testing if a variant meets the condition: + +>>> from gpsea.model import VariantEffect +>>> from gpsea.analysis.predicate.genotype import VariantPredicates +>>> tx_id = 'NM_1234.5' +>>> is_frameshift_or_stop_gain = VariantPredicates.variant_effect(VariantEffect.FRAMESHIFT_VARIANT, tx_id) \ +... | VariantPredicates.variant_effect(VariantEffect.STOP_GAINED, tx_id) +>>> is_frameshift_or_stop_gain.get_question() +'(FRAMESHIFT_VARIANT on NM_1234.5 OR STOP_GAINED on NM_1234.5)' + +Next, we use :class:`~gpsea.analysis.predicate.genotype.ModeOfInheritancePredicate.autosomal_recessive` +for assigning a patient into a genotype group: + +>>> from gpsea.analysis.predicate.genotype import ModeOfInheritancePredicate +>>> gt_predicate = ModeOfInheritancePredicate.autosomal_recessive(is_frameshift_or_stop_gain) +>>> gt_predicate.display_question() +'What is the genotype group?: HOM_REF, HET, BIALLELIC_ALT' + +The `gt_predicate` can be used in downstream analysis, such as in :class: diff --git a/docs/user-guide/phenotype_predicates.rst b/docs/user-guide/predicates/phenotype_predicates.rst similarity index 94% rename from docs/user-guide/phenotype_predicates.rst rename to docs/user-guide/predicates/phenotype_predicates.rst index 86572657..b8991a29 100644 --- a/docs/user-guide/phenotype_predicates.rst +++ b/docs/user-guide/predicates/phenotype_predicates.rst @@ -15,4 +15,5 @@ TODO -- separate explanations for HPO (Fisher), scores (Mann Whitney) and surviv :maxdepth: 1 :caption: Contents: + hpo_predicate devries diff --git a/docs/user-guide/predicates/variant_predicates.rst b/docs/user-guide/predicates/variant_predicates.rst new file mode 100644 index 00000000..cce6130c --- /dev/null +++ b/docs/user-guide/predicates/variant_predicates.rst @@ -0,0 +1,168 @@ +.. _variant_predicates: + +================== +Variant Predicates +================== + + +GPSEA uses the :class:`~gpsea.analysis.predicate.genotype.VariantPredicate` class +to test if a :class:`~gpsea.model.Variant` meets the inclusion criteria. +The variant predicate can leverage multiple primary data: + ++------------------------+-------------------------------------------------------------------------------------------------+ +| Primary data source | Example | ++========================+=================================================================================================+ +| Allele | the variant being a deletion or a single nucleotide variant (SNV) | ++------------------------+-------------------------------------------------------------------------------------------------+ +| Genome | overlaps of a target genomic region | ++------------------------+-------------------------------------------------------------------------------------------------+ +| Functional annotation | variant is predicted to lead to a missense change or affect an exon of certain transcript | ++------------------------+-------------------------------------------------------------------------------------------------+ +| Protein data | variant is located in a region encoding a protein domain, protein feature type | ++------------------------+-------------------------------------------------------------------------------------------------+ + +which demands a considerable amount of flexibility for creating the predicate. + +As a rule of thumb, the predicates for testing basic conditions are available off the shelf, +and they can be used as building block for testing for more complex conditions, +such as testing if the variant is "a missense or synonymous variant located in exon 6 of transcript `NM_013275.6`". + +Let's demonstrate this on few examples. +We will load a cohort of 19 subjects with variants in *RERE*, +leading to `Holt-Oram syndrome MIM:142900 `_. +The the clinical signs and symptoms of the subjects were encoded into HPO terms +along with the pathogenic *RERE* variant. + +Let's load the phenopackets, as previously described in greater detail the :ref:`input-data` section. +Briefly, we first load HPO: + +>>> import hpotk +>>> store = hpotk.configure_ontology_store() +>>> hpo = store.load_minimal_hpo(release='v2024-07-01') + +then, we configure the cohort creator: + +>>> from gpsea.preprocessing import configure_caching_cohort_creator +>>> cohort_creator = configure_caching_cohort_creator(hpo) + +which we use to create a :class:`~gpsea.model.Cohort` from a bunch of phenopackets +from the release `0.1.18` of `Phenopacket Store `_. +We will load 19 individuals with mutations in *RERE* gene: + +>>> from ppktstore.registry import configure_phenopacket_registry +>>> registry = configure_phenopacket_registry() +>>> with registry.open_phenopacket_store(release='0.1.18') as ps: +... phenopackets = tuple(ps.iter_cohort_phenopackets('RERE')) +>>> len(phenopackets) +19 + +and we will convert the phenopacket into a :class:`~gpsea.model.Cohort`: + +>>> from gpsea.preprocessing import load_phenopackets +>>> cohort, _ = load_phenopackets(phenopackets, cohort_creator) # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE +Patients Created: ... + +To demonstrate the predicate API, we will use the variant ``1_8358231_8358231_T_C`` that corresponds +to a pathogenic variant `VCV000522858.5 `_ +that replaces the histidine encoded by the 1435th codon of `NM_001042681.2` with arginine: ``NM_001042681.2(RERE):c.4304A>G (p.His1435Arg)``. + +>>> variant_key_of_interest = '1_8358231_8358231_T_C' +>>> variant = cohort.get_variant_by_key(variant_key_of_interest) + +Building blocks +--------------- + +We can check that the variant overlaps with *RERE*: + +>>> from gpsea.analysis.predicate.genotype import VariantPredicates +>>> gene = VariantPredicates.gene('RERE') +>>> gene.test(variant) +True + +it overlaps with the *MANE* transcript: + +>>> rere_mane_tx_id = 'NM_001042681.2' +>>> tx = VariantPredicates.transcript(rere_mane_tx_id) +>>> tx.test(variant) +True + +it in fact overlaps with the exon 20: + +>>> exon20 = VariantPredicates.exon(exon=20, tx_id=rere_mane_tx_id) +>>> exon20.test(variant) +True + +and leads to a missense mutation with respect to the MANE transcript: + +>>> from gpsea.model import VariantEffect +>>> missense = VariantPredicates.variant_effect(VariantEffect.MISSENSE_VARIANT, tx_id=rere_mane_tx_id) +>>> missense.test(variant) +True + +See :class:`~gpsea.analysis.predicate.genotype.VariantPredicates` +for more info on the predicates available off the shelf. + + +Complex conditions +------------------ + +We can combine the building blocks to test for more elaborate conditions. +For instance, we can test if the variant meets *any* or several conditions: + +>>> nonsense = VariantPredicates.variant_effect(VariantEffect.STOP_GAINED, tx_id=rere_mane_tx_id) +>>> missense_or_nonsense = missense | nonsense +>>> missense_or_nonsense.test(variant) +True + +or *all* conditions: + +>>> missense_and_exon20 = missense & exon20 +>>> missense_and_exon20.test(variant) +True + +The `VariantPredicate` overloads Python ``&`` (AND) and ``|`` (OR) operators to build a compound predicate from lower level building blocks. + +Therefore, there is nothing that prevents us to combine the predicates into multi-level tests, +such as testing if the variant is a *"chromosomal deletion" or a deletion which removes at least 50 bp*: + +>>> from gpsea.model import VariantClass +>>> chromosomal_deletion = "SO:1000029" +>>> predicate = VariantPredicates.structural_type(chromosomal_deletion) | (VariantPredicates.variant_class(VariantClass.DEL) & VariantPredicates.change_length("<=", -50)) +>>> predicate.get_question() +'(structural type is SO:1000029 OR (variant class is DEL AND change length <= -50))' + + +Inverting conditions +-------------------- + +Sometimes we may want to test the variant for a condition that must *not* be met. +For instance, we may want to test if the variant is a deletion +that is *not* predicted to shift the transcript reading frame. +One of doing this would be to build a compound predicates +for all variant effects except of :class:`~gpsea.model.VariantEffect.FRAMESHIFT_VARIANT`: + +>>> non_frameshift_effects = ( +... VariantEffect.SYNONYMOUS_VARIANT, VariantEffect.MISSENSE_VARIANT, VariantEffect.INTRON_VARIANT, +... # and many more effects.. +... ) +>>> non_frameshift_predicate = VariantPredicates.all(VariantPredicates.variant_effect(eff, tx_id=rere_mane_tx_id) for eff in non_frameshift_effects) + +However, this is clearly tedious and it would be much better implemented +by a simple logical not of a predicate for a frameshift variant effect. + +To support this, `VariantPredicate` implements *logical inversion* +which corresponds to Python's ``~`` operator (tilde), to wrap +the underlying predicate and to invert its test result. + +This is how we can use the predicate inversion to build the predicate for non-frameshift deletions: + +>>> non_frameshift_del = ~VariantPredicates.variant_effect(VariantEffect.FRAMESHIFT_VARIANT, tx_id=rere_mane_tx_id) & VariantPredicates.variant_class(VariantClass.DEL) +>>> non_frameshift_del.get_question() +'(NOT FRAMESHIFT_VARIANT on NM_001042681.2 AND variant class is DEL)' + +Note the presence of a tilde ``~`` before the variant effect predicate and resulting ``NOT`` in the predicate question. + +The variant predicate offers a flexible API for testing if variants meet a condition. +However, the genotype phenotype correlations are done on the individual level +and the variant predicates are used as a component of the genotype predicate. +The next sections show how to use variant predicates to assign individuals into groups. diff --git a/docs/user-guide/report/tbx5_frameshift.csv b/docs/user-guide/report/tbx5_frameshift.csv index a96700cf..81b92d32 100644 --- a/docs/user-guide/report/tbx5_frameshift.csv +++ b/docs/user-guide/report/tbx5_frameshift.csv @@ -1,4 +1,4 @@ -"Which genotype group does the patient fit in: HOM_REF, HET",HOM_REF,HOM_REF,HET,HET,, +"What is the genotype group?: HOM_REF, HET",HOM_REF,HOM_REF,HET,HET,, ,Count,Percent,Count,Percent,Corrected p values,p values Ventricular septal defect [HP:0001629],42/71,59%,19/19,100%,0.00411275392326226,0.00024192670136836825 Abnormal atrioventricular conduction [HP:0005150],1/23,4%,3/3,100%,0.01307692307692308,0.0015384615384615387 diff --git a/docs/user-guide/stats.rst b/docs/user-guide/stats.rst index 55757ef3..4454e596 100644 --- a/docs/user-guide/stats.rst +++ b/docs/user-guide/stats.rst @@ -139,7 +139,7 @@ we expect the autosomal dominant mode of inheritance: >>> from gpsea.analysis.predicate.genotype import ModeOfInheritancePredicate >>> gt_predicate = ModeOfInheritancePredicate.autosomal_dominant(is_frameshift) >>> gt_predicate.display_question() -'Which genotype group does the patient fit in: HOM_REF, HET' +'What is the genotype group?: HOM_REF, HET' `gt_predicate` will assign the patients with no frameshift variant allele into `HOM_REF` group and the patients with one frameshift allele will be assigned into `HET` group. diff --git a/src/gpsea/analysis/predicate/genotype/_gt_predicates.py b/src/gpsea/analysis/predicate/genotype/_gt_predicates.py index 3a930127..c4c82b5f 100644 --- a/src/gpsea/analysis/predicate/genotype/_gt_predicates.py +++ b/src/gpsea/analysis/predicate/genotype/_gt_predicates.py @@ -605,7 +605,7 @@ def __init__( ) if issues: raise ValueError("Cannot create predicate: {}".format(", ".join(issues))) - self._question = "Which genotype group does the patient fit in" + self._question = "What is the genotype group?" def get_categorizations(self) -> typing.Sequence[Categorization]: return self._categorizations From 40f3d917c7ab568300cb80f0dccee03d921bfe51 Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Fri, 6 Sep 2024 14:34:00 +0200 Subject: [PATCH 06/44] Fix some typos, use `HPOPredicate`. --- README.md | 2 +- docs/index.rst | 2 +- docs/report/tbx5_frameshift_vs_missense.csv | 12 ++-- docs/user-guide/predicates.rst | 31 ---------- .../predicates/diagnosis_predicate.rst | 2 +- .../predicates/filtering_predicate.rst | 4 +- .../predicates/genotype_predicates.rst | 2 +- .../predicates/groups_predicate.rst | 6 +- docs/user-guide/predicates/hpo_predicate.rst | 59 ++++++++++++++++--- .../predicates/male_female_predicate.rst | 2 +- .../mode_of_inheritance_predicate.rst | 6 +- docs/user-guide/report/tbx5_frameshift.csv | 10 ++-- .../report/tbx5_frameshift.mtc_report.html | 4 +- docs/user-guide/stats.rst | 2 +- .../predicate/genotype/_gt_predicates.py | 5 +- .../analysis/predicate/phenotype/__init__.py | 4 +- .../analysis/predicate/phenotype/_pheno.py | 8 ++- .../analysis/predicate/phenotype/_util.py | 4 +- tests/analysis/test_mtc_filter.py | 4 +- tests/conftest.py | 12 ++-- tests/test_predicates.py | 8 +-- 21 files changed, 101 insertions(+), 88 deletions(-) diff --git a/README.md b/README.md index 449314b9..c4b00093 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ ![PyPi downloads](https://img.shields.io/pypi/dm/gpsea.svg?label=Pypi%20downloads) ![PyPI - Python Version](https://img.shields.io/pypi/pyversions/gpsea) -GPSEA (Genotypes and Ghenotypes - Statistical Evaluation of Associations, pronounced "G"-"P"-"C") is a Python package designed to support genotype-phenotype correlation analysis. +GPSEA (Genotypes and Phenotypes - Statistical Evaluation of Associations, pronounced "G"-"P"-"C") is a Python package designed to support genotype-phenotype correlation analysis. See the [Tutorial](https://monarch-initiative.github.io/gpsea/stable/tutorial.html) diff --git a/docs/index.rst b/docs/index.rst index b70bf9c2..5bcadd38 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -10,7 +10,7 @@ A key question in biology and human genetics concerns the relationships between genetics, the focus is generally placed on the study of whether specific disease-causing alleles are associated with specific phenotypic manifestations of the disease. -`GPSEA` (Genotypes and Ghenotypes - Statistical Evaluation of Associations, pronounced "G"-"P"-"C") is a Python package designed to support genotype-phenotype correlation analysis. +`GPSEA` (Genotypes and Phenotypes - Statistical Evaluation of Associations, pronounced "G"-"P"-"C") is a Python package designed to support genotype-phenotype correlation analysis. The input to `GPSEA` is a collection of `Global Alliance for Genomics and Health (GA4GH) Phenopackets `_. `gpsea` ingests data from these phenopackets and performs analysis of the correlation of specific variants, variant types (e.g., missense vs. premature termination codon), or variant location in protein motifs or other features. diff --git a/docs/report/tbx5_frameshift_vs_missense.csv b/docs/report/tbx5_frameshift_vs_missense.csv index 0bc1e5c6..4ab9175a 100644 --- a/docs/report/tbx5_frameshift_vs_missense.csv +++ b/docs/report/tbx5_frameshift_vs_missense.csv @@ -1,4 +1,4 @@ -"Genotype group: Missense, Frameshift",Missense,Missense,Frameshift,Frameshift,, +Genotype group,Missense,Missense,Frameshift,Frameshift,, ,Count,Percent,Count,Percent,Corrected p values,p values Ventricular septal defect [HP:0001629],31/60,52%,19/19,100%,0.0009552459156234353,5.6190936213143254e-05 Abnormal atrioventricular conduction [HP:0005150],0/22,0%,3/3,100%,0.003695652173913043,0.00043478260869565214 @@ -13,10 +13,10 @@ Muscular ventricular septal defect [HP:0011623],6/59,10%,6/25,24%,0.286867598598 Pulmonary arterial hypertension [HP:0002092],4/6,67%,0/2,0%,0.6623376623376622,0.42857142857142855 Hypoplasia of the ulna [HP:0003022],1/12,8%,2/10,20%,0.8095238095238093,0.5714285714285713 Hypoplasia of the radius [HP:0002984],30/62,48%,6/14,43%,1.0,0.7735491022101784 -Short thumb [HP:0009778],11/41,27%,8/30,27%,1.0,1.0 -Absent radius [HP:0003974],7/32,22%,6/25,24%,1.0,1.0 Short humerus [HP:0005792],7/17,41%,4/9,44%,1.0,1.0 +Short thumb [HP:0009778],11/41,27%,8/30,27%,1.0,1.0 Atrial septal defect [HP:0001631],42/44,95%,20/20,100%,1.0,1.0 -Abnormal ventricular septum morphology [HP:0010438],31/31,100%,19/19,100%,, -Abnormal cardiac ventricle morphology [HP:0001713],31/31,100%,19/19,100%,, -Abnormal heart morphology [HP:0001627],62/62,100%,30/30,100%,, +Absent radius [HP:0003974],7/32,22%,6/25,24%,1.0,1.0 +Aplasia/Hypoplasia of the thumb [HP:0009601],20/20,100%,19/19,100%,, +Aplasia/Hypoplasia of fingers [HP:0006265],22/22,100%,19/19,100%,, +Aplasia/hypoplasia involving bones of the hand [HP:0005927],22/22,100%,19/19,100%,, diff --git a/docs/user-guide/predicates.rst b/docs/user-guide/predicates.rst index ed63f9d3..bfd533b7 100644 --- a/docs/user-guide/predicates.rst +++ b/docs/user-guide/predicates.rst @@ -25,8 +25,6 @@ use the HPO terms to assign a group. All GPSEA analyses need at least one predicate (typically a *genotype* predicate) and many require both *genotype* and *phenotype* predicates. The following pages provide more information. -.. _genotype-predicates: - .. toctree:: @@ -37,35 +35,6 @@ The following pages provide more information. predicates/genotype_predicates - - -.. _groups-predicate: - - - -.. _phenotype-predicates: - - - - -Predicates for all cohort phenotypes -==================================== - -Constructing phenotype predicates for all HPO terms of a cohort sounds a bit tedious. -The :func:`~gpsea.analysis.predicate.phenotype.prepare_predicates_for_terms_of_interest` -function cuts down the tedium: - ->>> from gpsea.analysis.predicate.phenotype import prepare_predicates_for_terms_of_interest ->>> pheno_predicates = prepare_predicates_for_terms_of_interest( -... cohort=cohort, -... hpo=hpo, -... ) ->>> len(pheno_predicates) -301 - -and prepares predicates for testing 301 HPO terms of the *RERE* cohort. - - ******* Gallery ******* diff --git a/docs/user-guide/predicates/diagnosis_predicate.rst b/docs/user-guide/predicates/diagnosis_predicate.rst index 6ce811ab..386af20a 100644 --- a/docs/user-guide/predicates/diagnosis_predicate.rst +++ b/docs/user-guide/predicates/diagnosis_predicate.rst @@ -1,4 +1,4 @@ -.. _diagnosis_predicate: +.. _diagnosis-predicate: ======================== Partition by a diagnosis diff --git a/docs/user-guide/predicates/filtering_predicate.rst b/docs/user-guide/predicates/filtering_predicate.rst index 2d47e1d3..a5acef31 100644 --- a/docs/user-guide/predicates/filtering_predicate.rst +++ b/docs/user-guide/predicates/filtering_predicate.rst @@ -35,7 +35,7 @@ to bin according to a genotype group: >>> from gpsea.analysis.predicate.genotype import ModeOfInheritancePredicate >>> gt_predicate = ModeOfInheritancePredicate.autosomal_recessive(is_frameshift_or_stop_gain) >>> gt_predicate.display_question() -'What is the genotype group?: HOM_REF, HET, BIALLELIC_ALT' +'What is the genotype group: HOM_REF, HET, BIALLELIC_ALT' We see that the `gt_predicate` bins the patients into three groups: @@ -53,4 +53,4 @@ that includes only the categories of interest: ... targets=(cats[1], cats[2]), ... ) >>> fgt_predicate.display_question() -'What is the genotype group?: HET, BIALLELIC_ALT' \ No newline at end of file +'What is the genotype group: HET, BIALLELIC_ALT' \ No newline at end of file diff --git a/docs/user-guide/predicates/genotype_predicates.rst b/docs/user-guide/predicates/genotype_predicates.rst index 8a725b6e..70fe391d 100644 --- a/docs/user-guide/predicates/genotype_predicates.rst +++ b/docs/user-guide/predicates/genotype_predicates.rst @@ -1,4 +1,4 @@ -.. _genotype_predicates: +.. _genotype-predicates: =================== Genotype Predicates diff --git a/docs/user-guide/predicates/groups_predicate.rst b/docs/user-guide/predicates/groups_predicate.rst index 23f178ac..a6142897 100644 --- a/docs/user-guide/predicates/groups_predicate.rst +++ b/docs/user-guide/predicates/groups_predicate.rst @@ -1,4 +1,4 @@ -.. _groups_predicate: +.. _groups-predicate: ================ Groups Predicate @@ -7,8 +7,8 @@ Groups Predicate Sometimes, all we want is to compare if there is a difference between individuals -who include one or more alleles of variant $X$ vs. individuals with variants $Y$, -vs. individuals with variants $Z$, where $X$, $Y$ and $Z$ are variant predicates. +who include one or more alleles of variant `X` vs. individuals with variants `Y`, +vs. individuals with variants `Z`, where `X`, `Y` and `Z` are variant predicates. We can do this with a *groups* predicate. The :func:`~gpsea.analysis.predicate.genotype.groups_predicate` diff --git a/docs/user-guide/predicates/hpo_predicate.rst b/docs/user-guide/predicates/hpo_predicate.rst index fada58ec..eaca60dc 100644 --- a/docs/user-guide/predicates/hpo_predicate.rst +++ b/docs/user-guide/predicates/hpo_predicate.rst @@ -1,10 +1,10 @@ -.. _hpo_predicate: +.. _hpo-predicate: -Propagating phenotype predicate -=============================== +HPO predicate +============= -When testing for presence or absence of an HPO term, the propagating phenotype predicate +When testing for presence or absence of an HPO term, the :class:`~gpsea.analysis.predicate.phenotype.HpoPredicate` leverages the :ref:`true-path-rule` to take advantage of the HPO hierarchy. In result, an individual annotated with a term is implicitly annotated with all its ancestors. For instance, an individual annotated with `Ectopia lentis `_ @@ -15,14 +15,19 @@ is also annotated with `Abnormal lens morphology `_. +We need to load :class:`~hpotk.MinimalOntology` with HPO data to access the HPO hierarchy: + +>>> import hpotk +>>> store = hpotk.configure_ontology_store() +>>> hpo = store.load_minimal_hpo(release='v2024-07-01') + +and now we can set up a predicate to test for presence of *Abnormal lens morphology*: >>> from gpsea.analysis.predicate.phenotype import HpoPredicate >>> query = hpotk.TermId.from_curie('HP:0000517') @@ -41,4 +46,42 @@ missing_implies_phenotype_excluded In many cases, published reports of clinical data about individuals with rare diseases describes phenotypic features that were observed, but do not provide a comprehensive list of features that were explicitly excluded. By default, GPSEA will only include features that are recorded as observed or excluded in a phenopacket. Setting this argument to True will cause "n/a" entries to be set to "excluded". We provide this option for exploration but do not recommend its use for the -final analysis unless the assumption behind it is known to be true. \ No newline at end of file +final analysis unless the assumption behind it is known to be true. + + + +Predicates for all cohort phenotypes +==================================== + +Constructing phenotype predicates for all HPO terms of a cohort sounds a bit tedious. +The :func:`~gpsea.analysis.predicate.phenotype.prepare_predicates_for_terms_of_interest` +function cuts down the tedium. + +For a given phenopacket collection (e.g. 156 patients with mutations in *WWOX* gene included in Phenopacket Store version `0.1.18`) + +>>> from ppktstore.registry import configure_phenopacket_registry +>>> registry = configure_phenopacket_registry() +>>> with registry.open_phenopacket_store(release='0.1.18') as ps: +... phenopackets = tuple(ps.iter_cohort_phenopackets('TBX5')) +>>> len(phenopackets) +156 + +processed into a cohort + +>>> from gpsea.preprocessing import configure_caching_cohort_creator, load_phenopackets +>>> cohort_creator = configure_caching_cohort_creator(hpo) +>>> cohort, _ = load_phenopackets(phenopackets, cohort_creator) # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE +Patients Created: ... + + +we can create HPO predicates for testing all 260 HPO terms used in the cohort + +>>> from gpsea.analysis.predicate.phenotype import prepare_predicates_for_terms_of_interest +>>> pheno_predicates = prepare_predicates_for_terms_of_interest( +... cohort=cohort, +... hpo=hpo, +... ) +>>> len(pheno_predicates) +260 + +and subject the predicates into further analysis, such as :class:`~gpsea.analysis.pcats.HpoTermAnalysis`. diff --git a/docs/user-guide/predicates/male_female_predicate.rst b/docs/user-guide/predicates/male_female_predicate.rst index 8f16c96b..3bed237e 100644 --- a/docs/user-guide/predicates/male_female_predicate.rst +++ b/docs/user-guide/predicates/male_female_predicate.rst @@ -1,4 +1,4 @@ -.. _male_female_predicate: +.. _male-female-predicate: Partition by the sex of the individual ====================================== diff --git a/docs/user-guide/predicates/mode_of_inheritance_predicate.rst b/docs/user-guide/predicates/mode_of_inheritance_predicate.rst index a00570d0..2efb4d6b 100644 --- a/docs/user-guide/predicates/mode_of_inheritance_predicate.rst +++ b/docs/user-guide/predicates/mode_of_inheritance_predicate.rst @@ -1,4 +1,4 @@ -.. _mode_of_inheritance_predicate: +.. _mode-of-inheritance-predicate: ============================== Mode of Inheritance Predicates @@ -92,6 +92,6 @@ for assigning a patient into a genotype group: >>> from gpsea.analysis.predicate.genotype import ModeOfInheritancePredicate >>> gt_predicate = ModeOfInheritancePredicate.autosomal_recessive(is_frameshift_or_stop_gain) >>> gt_predicate.display_question() -'What is the genotype group?: HOM_REF, HET, BIALLELIC_ALT' +'What is the genotype group: HOM_REF, HET, BIALLELIC_ALT' -The `gt_predicate` can be used in downstream analysis, such as in :class: +The `gt_predicate` can be used in downstream analysis, such as in :class:`~gpsea.analysis.pcats.HpoTermAnalysis`. diff --git a/docs/user-guide/report/tbx5_frameshift.csv b/docs/user-guide/report/tbx5_frameshift.csv index 81b92d32..4851c198 100644 --- a/docs/user-guide/report/tbx5_frameshift.csv +++ b/docs/user-guide/report/tbx5_frameshift.csv @@ -1,4 +1,4 @@ -"What is the genotype group?: HOM_REF, HET",HOM_REF,HOM_REF,HET,HET,, +What is the genotype group,HOM_REF,HOM_REF,HET,HET,, ,Count,Percent,Count,Percent,Corrected p values,p values Ventricular septal defect [HP:0001629],42/71,59%,19/19,100%,0.00411275392326226,0.00024192670136836825 Abnormal atrioventricular conduction [HP:0005150],1/23,4%,3/3,100%,0.01307692307692308,0.0015384615384615387 @@ -13,10 +13,10 @@ Muscular ventricular septal defect [HP:0011623],8/84,10%,6/25,24%,0.144002047919 Pulmonary arterial hypertension [HP:0002092],8/14,57%,0/2,0%,0.6899307928951143,0.4666666666666667 Short thumb [HP:0009778],25/69,36%,8/30,27%,0.6899307928951143,0.48700997145537483 Absent radius [HP:0003974],9/43,21%,6/25,24%,1.0,0.7703831604944444 -Atrial septal defect [HP:0001631],63/65,97%,20/20,100%,1.0,1.0 Hypoplasia of the radius [HP:0002984],34/75,45%,6/14,43%,1.0,1.0 Hypoplasia of the ulna [HP:0003022],3/17,18%,2/10,20%,1.0,1.0 Short humerus [HP:0005792],8/21,38%,4/9,44%,1.0,1.0 -Abnormal atrial septum morphology [HP:0011994],64/64,100%,20/20,100%,, -Abnormal cardiac septum morphology [HP:0001671],89/89,100%,28/28,100%,, -Abnormal heart morphology [HP:0001627],89/89,100%,30/30,100%,, +Atrial septal defect [HP:0001631],63/65,97%,20/20,100%,1.0,1.0 +Aplasia/Hypoplasia of the thumb [HP:0009601],40/40,100%,19/19,100%,, +Aplasia/Hypoplasia of fingers [HP:0006265],44/44,100%,19/19,100%,, +Aplasia/hypoplasia involving bones of the hand [HP:0005927],44/44,100%,19/19,100%,, diff --git a/docs/user-guide/report/tbx5_frameshift.mtc_report.html b/docs/user-guide/report/tbx5_frameshift.mtc_report.html index fbc4a9de..3d00300a 100644 --- a/docs/user-guide/report/tbx5_frameshift.mtc_report.html +++ b/docs/user-guide/report/tbx5_frameshift.mtc_report.html @@ -103,14 +103,14 @@

Phenotype testing report

TODO - Skipping term with only 5 observations (not powered for 2x2) + Skipping term with maximum frequency that was less than threshold 0.2 10 TODO - Skipping term with maximum frequency that was less than threshold 0.2 + Skipping term with only 5 observations (not powered for 2x2) 10 diff --git a/docs/user-guide/stats.rst b/docs/user-guide/stats.rst index 4454e596..59ad7604 100644 --- a/docs/user-guide/stats.rst +++ b/docs/user-guide/stats.rst @@ -139,7 +139,7 @@ we expect the autosomal dominant mode of inheritance: >>> from gpsea.analysis.predicate.genotype import ModeOfInheritancePredicate >>> gt_predicate = ModeOfInheritancePredicate.autosomal_dominant(is_frameshift) >>> gt_predicate.display_question() -'What is the genotype group?: HOM_REF, HET' +'What is the genotype group: HOM_REF, HET' `gt_predicate` will assign the patients with no frameshift variant allele into `HOM_REF` group and the patients with one frameshift allele will be assigned into `HET` group. diff --git a/src/gpsea/analysis/predicate/genotype/_gt_predicates.py b/src/gpsea/analysis/predicate/genotype/_gt_predicates.py index c4c82b5f..76eebcc5 100644 --- a/src/gpsea/analysis/predicate/genotype/_gt_predicates.py +++ b/src/gpsea/analysis/predicate/genotype/_gt_predicates.py @@ -1,4 +1,3 @@ -from cProfile import label import dataclasses import enum import typing @@ -605,7 +604,7 @@ def __init__( ) if issues: raise ValueError("Cannot create predicate: {}".format(", ".join(issues))) - self._question = "What is the genotype group?" + self._question = "What is the genotype group" def get_categorizations(self) -> typing.Sequence[Categorization]: return self._categorizations @@ -727,7 +726,7 @@ def sex_predicate() -> GenotypePolyPredicate: """ Get a genotype predicate for categorizing patients by their :class:`~gpsea.model.Sex`. - See the :ref:`sex-predicate` section for an example. + See the :ref:`male-female-predicate` section for an example. """ return INSTANCE diff --git a/src/gpsea/analysis/predicate/phenotype/__init__.py b/src/gpsea/analysis/predicate/phenotype/__init__.py index 2efa1e55..a70f954b 100644 --- a/src/gpsea/analysis/predicate/phenotype/__init__.py +++ b/src/gpsea/analysis/predicate/phenotype/__init__.py @@ -6,13 +6,13 @@ or using the phenotype features encoded into HPO terms (:class:`PropagatingPhenotypePredicate`). """ -from ._pheno import PhenotypePolyPredicate, PropagatingPhenotypePredicate +from ._pheno import PhenotypePolyPredicate, HpoPredicate from ._pheno import DiseasePresencePredicate from ._pheno import PhenotypeCategorization, P from ._util import prepare_predicates_for_terms_of_interest, prepare_hpo_terms_of_interest __all__ = [ - 'PhenotypePolyPredicate', 'PropagatingPhenotypePredicate', + 'PhenotypePolyPredicate', 'HpoPredicate', 'DiseasePresencePredicate', 'PhenotypeCategorization', 'P', 'prepare_predicates_for_terms_of_interest', 'prepare_hpo_terms_of_interest', diff --git a/src/gpsea/analysis/predicate/phenotype/_pheno.py b/src/gpsea/analysis/predicate/phenotype/_pheno.py index 3b2d9a7b..4cb8a804 100644 --- a/src/gpsea/analysis/predicate/phenotype/_pheno.py +++ b/src/gpsea/analysis/predicate/phenotype/_pheno.py @@ -92,13 +92,15 @@ def present_phenotype_category(self) -> PatientCategory: return self.present_phenotype_categorization.category -class PropagatingPhenotypePredicate(PhenotypePolyPredicate[hpotk.TermId]): +class HpoPredicate(PhenotypePolyPredicate[hpotk.TermId]): """ - `PropagatingPhenotypePredicate` tests if a patient is annotated with an HPO term. + `HpoPredicate` tests if a patient is annotated with an HPO term. Note, `query` must be a term of the provided `hpo`! - :param hpo: HPO object + See :ref:`hpo-predicate` section for an example usage. + + :param hpo: HPO ontology :param query: the HPO term to test :param missing_implies_phenotype_excluded: `True` if lack of an explicit annotation implies term's absence`. """ diff --git a/src/gpsea/analysis/predicate/phenotype/_util.py b/src/gpsea/analysis/predicate/phenotype/_util.py index ac73dd02..1a0e33d5 100644 --- a/src/gpsea/analysis/predicate/phenotype/_util.py +++ b/src/gpsea/analysis/predicate/phenotype/_util.py @@ -4,7 +4,7 @@ import hpotk -from ._pheno import PhenotypePolyPredicate, PropagatingPhenotypePredicate +from ._pheno import PhenotypePolyPredicate, HpoPredicate from gpsea.model import Patient @@ -26,7 +26,7 @@ def prepare_predicates_for_terms_of_interest( (either directly or indirectly) for the term to be included in the analysis. """ return tuple( - PropagatingPhenotypePredicate( + HpoPredicate( hpo=hpo, query=term, missing_implies_phenotype_excluded=missing_implies_excluded, diff --git a/tests/analysis/test_mtc_filter.py b/tests/analysis/test_mtc_filter.py index d904cfe4..02710efd 100644 --- a/tests/analysis/test_mtc_filter.py +++ b/tests/analysis/test_mtc_filter.py @@ -8,7 +8,7 @@ from gpsea.analysis.mtc_filter import HpoMtcFilter, SpecifiedTermsMtcFilter from gpsea.analysis.predicate.genotype import GenotypePolyPredicate -from gpsea.analysis.predicate.phenotype import PhenotypePolyPredicate, PropagatingPhenotypePredicate +from gpsea.analysis.predicate.phenotype import PhenotypePolyPredicate, HpoPredicate from gpsea.analysis.pcats import apply_predicates_on_patients from gpsea.model import Cohort @@ -55,7 +55,7 @@ def ph_predicate( For the purpose of testing counts, let's pretend the counts were created by this predicate. """ - return PropagatingPhenotypePredicate( + return HpoPredicate( hpo=hpo, query=hpotk.TermId.from_curie("HP:0001250"), # Seizure missing_implies_phenotype_excluded=False, diff --git a/tests/conftest.py b/tests/conftest.py index b66b55e7..26683724 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -6,7 +6,7 @@ import pytest from gpsea.analysis.predicate.genotype import GenotypePolyPredicate, VariantPredicates, boolean_predicate -from gpsea.analysis.predicate.phenotype import PhenotypePolyPredicate, PropagatingPhenotypePredicate +from gpsea.analysis.predicate.phenotype import PhenotypePolyPredicate, HpoPredicate from gpsea.io import GpseaJSONEncoder, GpseaJSONDecoder from gpsea.model import * from gpsea.model.genome import GRCh38, GenomicRegion, Region, Strand, GenomeBuild @@ -128,23 +128,23 @@ def suox_pheno_predicates( Note, these are just a *SUBSET* of all phenotypes that can be tested for in the *SUOX* cohort. """ return ( - PropagatingPhenotypePredicate( + HpoPredicate( hpo=hpo, query=hpotk.TermId.from_curie('HP:0001250'), # Seizure ), - PropagatingPhenotypePredicate( + HpoPredicate( hpo=hpo, query=hpotk.TermId.from_curie('HP:0001083'), # Ectopia lentis ), - PropagatingPhenotypePredicate( + HpoPredicate( hpo=hpo, query=hpotk.TermId.from_curie('HP:0032350'), # Sulfocysteinuria ), - PropagatingPhenotypePredicate( + HpoPredicate( hpo=hpo, query=hpotk.TermId.from_curie('HP:0012758'), # Neurodevelopmental delay ), - PropagatingPhenotypePredicate( + HpoPredicate( hpo=hpo, query=hpotk.TermId.from_curie('HP:0001276'), # Hypertonia ), diff --git a/tests/test_predicates.py b/tests/test_predicates.py index c87ce0ad..467a5df1 100644 --- a/tests/test_predicates.py +++ b/tests/test_predicates.py @@ -2,7 +2,7 @@ import pytest from gpsea.analysis.predicate import PatientCategory, PatientCategories -from gpsea.analysis.predicate.phenotype import PropagatingPhenotypePredicate, DiseasePresencePredicate +from gpsea.analysis.predicate.phenotype import HpoPredicate, DiseasePresencePredicate from gpsea.analysis.predicate.genotype import * from gpsea.model import Cohort, Patient, FeatureType, VariantEffect from gpsea.model.genome import Region @@ -16,7 +16,7 @@ def find_patient(pat_id: str, cohort: Cohort) -> Patient: raise ValueError(f'Could not find patient {pat_id}') -class TestPropagatingPhenotypeBooleanPredicate: +class TestHpoPredicate: @pytest.mark.parametrize('curie, patient_id, expected', # Patient "HetSingleVar" has Phenotypes: @@ -46,7 +46,7 @@ def test_phenotype_predicate__present_or_excluded( ): patient = find_patient(patient_id, toy_cohort) term_id = hpotk.TermId.from_curie(curie) - predicate = PropagatingPhenotypePredicate(hpo=hpo, query=term_id) + predicate = HpoPredicate(hpo=hpo, query=term_id) actual = predicate.test(patient) assert actual.phenotype == term_id @@ -60,7 +60,7 @@ def test_phenotype_predicate__unknown( # Not Measured and not Observed - 'HP:0006280', # Chronic pancreatitis patient = find_patient('HetSingleVar', toy_cohort) term_id = hpotk.TermId.from_curie('HP:0006280') - predicate = PropagatingPhenotypePredicate(hpo=hpo, query=term_id) + predicate = HpoPredicate(hpo=hpo, query=term_id) actual = predicate.test(patient) assert actual is None From 37ddf801eb027ca177c49cf0263e22a40c17adb2 Mon Sep 17 00:00:00 2001 From: Peter Robinson Date: Fri, 6 Sep 2024 17:10:10 +0200 Subject: [PATCH 07/44] all variants viewable --- src/gpsea/view/_all_variant_viewable.py | 119 +++++++++++++++ src/gpsea/view/templates/all_variants.html | 170 +++++++++++++++++++++ 2 files changed, 289 insertions(+) create mode 100644 src/gpsea/view/_all_variant_viewable.py create mode 100644 src/gpsea/view/templates/all_variants.html diff --git a/src/gpsea/view/_all_variant_viewable.py b/src/gpsea/view/_all_variant_viewable.py new file mode 100644 index 00000000..ffe5c943 --- /dev/null +++ b/src/gpsea/view/_all_variant_viewable.py @@ -0,0 +1,119 @@ +import typing + +from hpotk import MinimalOntology +from jinja2 import Environment, PackageLoader +from collections import namedtuple, defaultdict + +from gpsea.model import Cohort +from gpsea.model._variant import Variant +from ._formatter import VariantFormatter + + +ToDisplay = namedtuple('ToDisplay', ['hgvs_cdna', 'hgvsp', 'variant_effects']) + +VariantData = namedtuple('VariantData', ['variant_key', 'hgvs_cdna', 'hgvsp', 'chromosomal_description', 'variant_effects'] ) + +class AllVariantViewable: + """ + Class to create a viewable object that is uses a Jinja2 template to create an HTML element + for display in the Jupyter notebook. + """ + + def __init__( + self, + transcript_id: str + ): + """ + Args: + hpo(MinimalOntology): An HPO ontology object from hpo-toolkit + top_phenotype_count(int): Maximum number of HPO terms to display in the HTML table (default: 10) + top_variant_count(int): Maximum number of variants to display in the HTML table (default: 10) + """ + environment = Environment(loader=PackageLoader('gpsea.view', 'templates')) + self._cohort_template = environment.get_template("all_variants.html") + self._var_formatter = VariantFormatter(transcript_id) + + def process( + self, + cohort: Cohort, + only_hgvs: bool = True + ) -> str: + """ + Create an HTML that should be shown with display(HTML(..)) of the ipython package. + + Args: + cohort (Cohort): The cohort being analyzed in the current Notebook + transcript_id (str): the transcript that we map variants onto + + Returns: + str: an HTML string with parameterized template for rendering + """ + context = self._prepare_context(cohort, transcript_id=self._transcript_id, only_hgvs=only_hgvs) + return self._cohort_template.render(context) + + def _prepare_context( + self, + cohort: Cohort, + transcript_id: typing.Optional[str], + only_hgvs + ) -> typing.Mapping[str, typing.Any]: + variant_count_dictionary = defaultdict() + for var in cohort.all_variants(): + vdata = self._get_variant_data(var, only_hgvs) + variant_count_dictionary[vdata] += 1 + variant_counts = list() + for var_data, count in variant_count_dictionary.items(): + variant_counts.append( + { + "variant_key": var_data.variant_key, + "variant": var_data.hgvs_cdna, + "variant_name": var_data.hgvs_cdna, + "protein_name": var_data.hgvsp, + "variant_effects": var_data.variant_effects, + "count": count, + } + ) + # The following dictionary is used by the Jinja2 HTML template + return { + "variant_count_list": variant_counts, + "total_unique_allele_count": len(variant_counts) + } + + + def _get_variant_description( + self, + variant: Variant, + only_hgvs: bool + ) -> VariantData: + """ + Get user-friendly strings (e.g., HGVS for our target transcript) to match to the chromosomal strings + Args: + cohort (Cohort): The cohort being analyzed in the current Notebook + transcript_id (str): the transcript that we map variants onto + only_hgvs (bool): do not show the transcript ID part of the HGVS annotation, just the annotation. + + Returns: + typing.Mapping[str, ToDisplay]: key: variant key, value: namedtuple(display (e.g. HGVS) string of variant, hgvsp protein string of variant) + """ + variant_key = variant.variant_info.variant_key + display = self._var_formatter.format_as_string(variant) + tx_annotation = variant.get_tx_anno_by_tx_id(self._transcript_id) + if tx_annotation is not None: + hgvsp = tx_annotation.hgvsp + var_effects = [var_eff.name for var_eff in tx_annotation.variant_effects] + else: + hgvsp = None + var_effects = None + if only_hgvs: + # do not show the transcript id + fields_dna = display.split(":") + fields_ps = hgvsp.split(":") if hgvsp is not None else [None] + if len(fields_dna) > 1: + display = fields_dna[1] + else: + display = fields_dna[0] + if len(fields_ps) > 1: + hgvsp = fields_ps[1] + else: + hgvsp = fields_ps[0] + return VariantData(variant_key=variant_key, hgvs_cdna = fields_dna, hgvsp = hgvsp, chromosomal_description=None, variant_effects=var_effects) diff --git a/src/gpsea/view/templates/all_variants.html b/src/gpsea/view/templates/all_variants.html new file mode 100644 index 00000000..0bbe94e5 --- /dev/null +++ b/src/gpsea/view/templates/all_variants.html @@ -0,0 +1,170 @@ + + + + + Cohort + + + + +

GPSEA cohort analysis: All variant alleles

+ + + + + + + "variant": var_data.hgvs_cdna, + "variant_name": var_data.hgvs_cdna, + "protein_name": var_data.hgvsp, + "variant_effects": var_data.variant_effects, + "count": count, + + + + + + + + + + + {% for vdata in variant_counts %} + + + + + + + + {% endfor %} + +
+

Variant alleles

+ A total of {{ total_unique_allele_count }} unique alleles were identified in the cohort. +
Varianthgvs_cdnaproteinEffectscount
{{ vdata.variant }}{{ vdata.variant_name }}{{ vdata.protein_name }}{{ vdata.variant_effects }}{{ vdata.count }}
+ + + + + + + + + + + + {% for var_count in var_counts %} + + + + + + + + {% endfor %} + +
+

Top {{top_var_count}} Variants

+ Variants are shown according to {{ transcript_id }}. A total of {{ unique_variant_count}} unique variants were identified in the cohort. +
CountVariant keyVariant NameProtein VariantVariant Class
{{ var_count.Count }}{{ var_count.variant }}{{ var_count.variant_name }}{{ var_count.protein_name }}{{ var_count.variant_effects }}
+ + + + + + + + + {% for disease_count in disease_counts %} + + + + + + {% endfor %} + +
+

Diseases

+
Disease NameDisease IDAnnotation Count
{{ disease_count.disease_name }}{{ disease_count.disease_id }}{{ disease_count.count }}
+ {% if has_transcript > 0 %} + + + + + + + + {% for var_effect in var_effects_list %} + + + + + {% endfor %} + +
+

Variant categories for {{ transcript_id }}

+
Variant effectAnnotation Count
{{ var_effect.effect }}{{ var_effect.count }}
+ {% else %} +

Call this function with transcript to see table with variant effect counts.

+ {% endif %} + + + + From ac1422a3b9cbc1f3f38d55b39f55ffea7756a470 Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Sat, 7 Sep 2024 20:54:51 +0200 Subject: [PATCH 08/44] Keep track of phenotype predicates in the results. --- src/gpsea/analysis/pcats/_impl.py | 23 ++++++++----------- .../predicate/genotype/_gt_predicates.py | 3 ++- .../analysis/predicate/phenotype/_pheno.py | 16 +++++++++++++ tests/view/test_stats.py | 14 ++++++++--- 4 files changed, 39 insertions(+), 17 deletions(-) diff --git a/src/gpsea/analysis/pcats/_impl.py b/src/gpsea/analysis/pcats/_impl.py index e8045d2d..acc4b921 100644 --- a/src/gpsea/analysis/pcats/_impl.py +++ b/src/gpsea/analysis/pcats/_impl.py @@ -392,14 +392,14 @@ class BaseMultiPhenotypeAnalysisResult(typing.Generic[P], MultiPhenotypeAnalysis def __init__( self, - phenotypes: typing.Sequence[P], + pheno_predicates: typing.Iterable[PhenotypePolyPredicate[P]], n_usable: typing.Sequence[int], all_counts: typing.Sequence[pd.DataFrame], pvals: typing.Sequence[float], corrected_pvals: typing.Optional[typing.Sequence[float]], gt_predicate: GenotypePolyPredicate, ): - self._phenotypes = tuple(phenotypes) + self._pheno_predicates = tuple(pheno_predicates) self._n_usable = tuple(n_usable) self._all_counts = tuple(all_counts) self._pvals = tuple(pvals) @@ -411,7 +411,7 @@ def __init__( @property def phenotypes(self) -> typing.Sequence[P]: - return self._phenotypes + return tuple(p.phenotype for p in self._pheno_predicates) @property def n_usable(self) -> typing.Sequence[int]: @@ -431,7 +431,7 @@ def corrected_pvals(self) -> typing.Optional[typing.Sequence[float]]: def __eq__(self, other): return isinstance(other, BaseMultiPhenotypeAnalysisResult) \ - and self._phenotypes == other._phenotypes \ + and self._pheno_predicates == other._pheno_predicates \ and self._n_usable == other._n_usable \ and self._all_counts == other._all_counts \ and self._pvals == other._pvals \ @@ -441,7 +441,7 @@ def __eq__(self, other): def __hash__(self): # NOTE: if a field is added, the hash method of the subclasses must be updated as well. return hash(( - self._phenotypes, self._n_usable, self._all_counts, + self._pheno_predicates, self._n_usable, self._all_counts, self._pvals, self._corrected_pvals, self._gt_predicate, )) @@ -474,10 +474,8 @@ def _compute_result( else: corrected_pvals = self._apply_mtc(pvals=pvals) - phenotypes = tuple(p.phenotype for p in pheno_predicates) - return BaseMultiPhenotypeAnalysisResult( - phenotypes=phenotypes, + pheno_predicates=pheno_predicates, n_usable=n_usable, all_counts=all_counts, pvals=pvals, @@ -496,7 +494,7 @@ class HpoTermAnalysisResult(BaseMultiPhenotypeAnalysisResult[hpotk.TermId]): def __init__( self, - phenotypes: typing.Sequence[hpotk.TermId], + pheno_predicates: typing.Iterable[PhenotypePolyPredicate[hpotk.TermId]], n_usable: typing.Sequence[int], all_counts: typing.Sequence[pd.DataFrame], pvals: typing.Sequence[float], @@ -507,7 +505,7 @@ def __init__( mtc_name: typing.Optional[str], ): super().__init__( - phenotypes=phenotypes, + pheno_predicates=pheno_predicates, n_usable=n_usable, all_counts=all_counts, pvals=pvals, @@ -549,7 +547,7 @@ def __eq__(self, other): def __hash__(self): return hash(( - self._phenotypes, self._n_usable, self._all_counts, + self._pheno_predicates, self._n_usable, self._all_counts, self._pvals, self._corrected_pvals, self._gt_predicate, self._mtc_filter_name, self._mtc_filter_results, self._mtc_name, )) @@ -590,7 +588,6 @@ def _compute_result( pheno_predicates = tuple(pheno_predicates) if len(pheno_predicates) == 0: raise ValueError("No phenotype predicates were provided") - phenotypes = tuple(p.phenotype for p in pheno_predicates) # 1 - Count the patients n_usable, all_counts = apply_predicates_on_patients( @@ -625,7 +622,7 @@ def _compute_result( corrected_pvals[mtc_mask] = self._apply_mtc(pvals=pvals[mtc_mask]) return HpoTermAnalysisResult( - phenotypes=phenotypes, + pheno_predicates=pheno_predicates, n_usable=n_usable, all_counts=all_counts, pvals=pvals, diff --git a/src/gpsea/analysis/predicate/genotype/_gt_predicates.py b/src/gpsea/analysis/predicate/genotype/_gt_predicates.py index 3a930127..7c4ddcfd 100644 --- a/src/gpsea/analysis/predicate/genotype/_gt_predicates.py +++ b/src/gpsea/analysis/predicate/genotype/_gt_predicates.py @@ -1,4 +1,3 @@ -from cProfile import label import dataclasses import enum import typing @@ -14,6 +13,8 @@ from ._api import VariantPredicate from ._counter import AlleleCounter +# TODO: implement __hash__, __eq__ on predicates + class AlleleCountingGenotypeBooleanPredicate(GenotypePolyPredicate): # NOT PART OF THE PUBLIC API diff --git a/src/gpsea/analysis/predicate/phenotype/_pheno.py b/src/gpsea/analysis/predicate/phenotype/_pheno.py index 3b2d9a7b..7d8a9a53 100644 --- a/src/gpsea/analysis/predicate/phenotype/_pheno.py +++ b/src/gpsea/analysis/predicate/phenotype/_pheno.py @@ -195,6 +195,15 @@ def test( return None + def __eq__(self, value: object) -> bool: + return isinstance(value, PropagatingPhenotypePredicate) \ + and self._hpo.version == value._hpo.version \ + and self._query == value._query \ + and self._missing_implies_phenotype_excluded == value._missing_implies_phenotype_excluded + + def __hash__(self) -> int: + return hash((self._hpo.version, self._query, self._missing_implies_phenotype_excluded)) + def __repr__(self): return f"PropagatingPhenotypeBooleanPredicate(query={self._query})" @@ -262,5 +271,12 @@ def test( return self._diagnosis_excluded + def __eq__(self, value: object) -> bool: + return isinstance(value, DiseasePresencePredicate) \ + and self._query == value._query + + def __hash__(self) -> int: + return hash((self._query,)) + def __repr__(self): return f"DiseasePresencePredicate(query={self._query})" diff --git a/tests/view/test_stats.py b/tests/view/test_stats.py index 691f4f6a..d38556f6 100644 --- a/tests/view/test_stats.py +++ b/tests/view/test_stats.py @@ -7,6 +7,7 @@ from gpsea.analysis.pcats import HpoTermAnalysisResult from gpsea.analysis.predicate import PatientCategories from gpsea.analysis.predicate.genotype import GenotypePolyPredicate +from gpsea.analysis.predicate.phenotype import PropagatingPhenotypePredicate from gpsea.analysis.mtc_filter import PhenotypeMtcResult from gpsea.view import MtcStatsViewer @@ -16,12 +17,19 @@ class TestStatsViewable: @pytest.fixture(scope='class') def hpo_term_analysis_result( self, + hpo: hpotk.MinimalOntology, suox_gt_predicate: GenotypePolyPredicate, ) -> HpoTermAnalysisResult: return HpoTermAnalysisResult( - phenotypes=( - hpotk.TermId.from_curie('HP:0001166'), # Arachnodactyly - hpotk.TermId.from_curie('HP:0001250'), # Seizure + pheno_predicates=( + PropagatingPhenotypePredicate( + hpo=hpo, + query=hpotk.TermId.from_curie('HP:0001166'), # Arachnodactyly + ), + PropagatingPhenotypePredicate( + hpo=hpo, + query=hpotk.TermId.from_curie('HP:0001250'), # Seizure + ), ), n_usable=(40, 20), all_counts=( From ddc35c3940c81ef4d45aa0458582c847770c44a1 Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Sat, 7 Sep 2024 23:12:43 +0200 Subject: [PATCH 09/44] Create `HpoTermAnalysisResultFormatter`. --- src/gpsea/analysis/pcats/_impl.py | 111 ++---------------- src/gpsea/view/__init__.py | 2 + src/gpsea/view/_phenotype_analysis.py | 91 ++++++++++++++ tests/conftest.py | 58 ++++++++- .../view/test_hpo_phenotype_result_viewer.py | 27 +++++ 5 files changed, 189 insertions(+), 100 deletions(-) create mode 100644 src/gpsea/view/_phenotype_analysis.py create mode 100644 tests/view/test_hpo_phenotype_result_viewer.py diff --git a/src/gpsea/analysis/pcats/_impl.py b/src/gpsea/analysis/pcats/_impl.py index acc4b921..c7eb6a61 100644 --- a/src/gpsea/analysis/pcats/_impl.py +++ b/src/gpsea/analysis/pcats/_impl.py @@ -13,7 +13,6 @@ from gpsea.model import Patient from gpsea.analysis.pcats.stats import CountStatistic -from ..predicate import PatientCategory from ..predicate.genotype import GenotypePolyPredicate from ..predicate.phenotype import P, PhenotypePolyPredicate from ..mtc_filter import PhenotypeMtcFilter, PhenotypeMtcResult @@ -101,14 +100,6 @@ class MultiPhenotypeAnalysisResult(typing.Generic[P], metaclass=abc.ABCMeta): with the order determined by the phenotype order. """ - @property - @abc.abstractmethod - def phenotypes(self) -> typing.Sequence[P]: - """ - Get the tested phenotypes. - """ - pass - @property @abc.abstractmethod def n_usable(self) -> typing.Sequence[int]: @@ -165,95 +156,6 @@ def total_tests(self) -> int: """ return sum(1 for pval in self.pvals if not math.isnan(pval)) - def summarize( - self, - hpo: hpotk.MinimalOntology, - target_phenotype_category: PatientCategory, - ) -> pd.DataFrame: - """ - Create a data frame with summary of the genotype phenotype analysis. - - The *rows* of the frame correspond to the analyzed HPO terms. - - The columns of the data frame have `Count` and `Percentage` per used genotype predicate. - - **Example** - - If we use :class:`~gpsea.analysis.predicate.genotype.VariantEffectPredicate` - which can compare phenotype with and without a missense variant, we will have a data frame - that looks like this:: - - MISSENSE_VARIANT on `NM_1234.5` No Yes - Count Percent Count Percent Corrected p value p value - Arachnodactyly [HP:0001166] 1/10 10% 13/16 81% 0.020299 0.000781 - Abnormality of the musculature [HP:0003011] 6/6 100% 11/11 100% 1.000000 1.000000 - Abnormal nervous system physiology [HP:0012638] 9/9 100% 15/15 100% 1.000000 1.000000 - ... ... ... ... ... ... ... - """ - - # TODO: this should be plotted by a formatter. - # Row index: a list of tested HPO terms - pheno_idx = pd.Index(self.phenotypes) - # Column index: multiindex of counts and percentages for all genotype predicate groups - gt_idx = pd.MultiIndex.from_product( - # TODO: fix the below - iterables=(self._gt_predicate.get_categories(), ("Count", "Percent")), - names=(self._gt_predicate.get_question_base(), None), - ) - - # We'll fill this frame with data - df = pd.DataFrame(index=pheno_idx, columns=gt_idx) - - for phenotype, count in zip(self.phenotypes, self.all_counts): - gt_totals = ( - count.sum() - ) # Sum across the phenotype categories (collapse the rows). - for gt_cat in count.columns: - cnt = count.loc[target_phenotype_category, gt_cat] - total = gt_totals[gt_cat] - df.loc[phenotype, (gt_cat, "Count")] = f"{cnt}/{total}" - pct = 0 if total == 0 else round(cnt * 100 / total) - df.loc[phenotype, (gt_cat, "Percent")] = f"{pct}%" - - # Add columns with p values and corrected p values (if present) - p_val_col_name = "p values" - corrected_p_val_col_name = "Corrected p values" - if self.corrected_pvals is not None: - df.insert( - df.shape[1], ("", corrected_p_val_col_name), self.corrected_pvals - ) - df.insert(df.shape[1], ("", p_val_col_name), self.pvals) - - # Format the index values: `HP:0001250` -> `Seizure [HP:0001250]` if the index members are HPO terms - # or just use the term ID CURIE otherwise (e.g. `OMIM:123000`). - labeled_idx = df.index.map( - lambda term_id: MultiPhenotypeAnalysisResult._format_term_id(hpo, term_id) - ) - - # Last, sort by corrected p value or just p value - df = df.set_index(labeled_idx) - if self.corrected_pvals is not None: - return df.sort_values( - by=[("", corrected_p_val_col_name), ("", p_val_col_name)] - ) - else: - return df.sort_values(by=("", p_val_col_name)) - - @staticmethod - def _format_term_id( - hpo: hpotk.MinimalOntology, - term_id: hpotk.TermId, - ) -> str: - """ - Format a `term_id` as a `str`. HPO term ID is formatted as ` []` whereas other term IDs - are formatted as CURIEs (e.g. `OMIM:123000`). - """ - if term_id.prefix == "HP": - term_name = hpo.get_term_name(term_id) - return f"{term_name} [{term_id.value}]" - else: - return term_id.value - class MultiPhenotypeAnalysis(typing.Generic[P], metaclass=abc.ABCMeta): @@ -377,7 +279,8 @@ def _compute_nominal_pvals( def _apply_mtc( self, pvals: typing.Sequence[float], - ) -> typing.Optional[typing.Sequence[float]]: + ) -> typing.Sequence[float]: + assert self._mtc_correction is not None _, corrected_pvals, _, _ = multitest.multipletests( pvals=pvals, alpha=self._mtc_alpha, @@ -409,6 +312,16 @@ def __init__( assert isinstance(gt_predicate, GenotypePolyPredicate) self._gt_predicate = gt_predicate + @property + def gt_predicate(self) -> GenotypePolyPredicate: + return self._gt_predicate + + @property + def pheno_predicates( + self, + ) -> typing.Sequence[PhenotypePolyPredicate[P]]: + return self._pheno_predicates + @property def phenotypes(self) -> typing.Sequence[P]: return tuple(p.phenotype for p in self._pheno_predicates) diff --git a/src/gpsea/view/__init__.py b/src/gpsea/view/__init__.py index 6eb13dd6..e236f802 100644 --- a/src/gpsea/view/__init__.py +++ b/src/gpsea/view/__init__.py @@ -1,5 +1,6 @@ from ._cohort import CohortViewable from ._disease import DiseaseViewable +from ._phenotype_analysis import HpoTermAnalysisResultFormatter from ._protein_viewer import ProteinViewable from ._protein_visualizable import ProteinVisualizable from ._stats import MtcStatsViewer @@ -12,6 +13,7 @@ 'ProteinVisualizer', 'ProteinVisualizable', 'ProteinViewable', 'DiseaseViewable', 'MtcStatsViewer', + 'HpoTermAnalysisResultFormatter', 'VariantTranscriptVisualizer', 'VariantFormatter', ] diff --git a/src/gpsea/view/_phenotype_analysis.py b/src/gpsea/view/_phenotype_analysis.py new file mode 100644 index 00000000..26cec71b --- /dev/null +++ b/src/gpsea/view/_phenotype_analysis.py @@ -0,0 +1,91 @@ +import hpotk +import pandas as pd + +from gpsea.analysis.pcats import HpoTermAnalysisResult + + +class HpoTermAnalysisResultFormatter: + """ + `HpoTermAnalysisResultFormatter` presents the :class:`~gpsea.analysis.pcats.HpoTermAnalysisResult` + in a form suitable for humans. + + This includes preparing a dataframe with counts, frequencies, and p values for the tested HPO terms. + """ + + def __init__( + self, + hpo: hpotk.MinimalOntology, + ): + self._hpo = hpo + + def make_summary_dataframe( + self, + result: HpoTermAnalysisResult, + ) -> pd.DataFrame: + """ + Format the `result` into a dataframe for human consumption. + """ + assert isinstance(result, HpoTermAnalysisResult) + + # Row index: a list of tested HPO terms + pheno_idx = pd.Index(result.phenotypes) + # Column index: multiindex of counts and percentages for all genotype predicate groups + gt_idx = pd.MultiIndex.from_product( + iterables=(result.gt_predicate.get_categories(), ("Count", "Percent")), + names=(result.gt_predicate.get_question_base(), None), + ) + + # We'll fill this frame with data + df = pd.DataFrame(index=pheno_idx, columns=gt_idx) + + for ph_predicate, count in zip(result.pheno_predicates, result.all_counts): + # Sum across the phenotype categories (collapse the rows). + gt_totals = count.sum() + + for gt_cat in count.columns: + cnt = count.loc[ph_predicate.present_phenotype_category, gt_cat] + total = gt_totals[gt_cat] + df.loc[ph_predicate.phenotype, (gt_cat, "Count")] = f"{cnt}/{total}" + pct = 0 if total == 0 else round(cnt * 100 / total) + df.loc[ph_predicate.phenotype, (gt_cat, "Percent")] = f"{pct}%" + + # Add columns with p values and corrected p values (if present) + p_val_col_name = "p values" + corrected_p_val_col_name = "Corrected p values" + if result.corrected_pvals is not None: + df.insert( + df.shape[1], ("", corrected_p_val_col_name), result.corrected_pvals + ) + df.insert(df.shape[1], ("", p_val_col_name), result.pvals) + + # Format the index values: `HP:0001250` -> `Seizure [HP:0001250]` if the index members are HPO terms + # or just use the term ID CURIE otherwise (e.g. `OMIM:123000`). + labeled_idx = df.index.map( + lambda term_id: HpoTermAnalysisResultFormatter._format_term_id( + self._hpo, term_id + ) + ) + + # Last, sort by corrected p value or just p value + df = df.set_index(labeled_idx) + if result.corrected_pvals is not None: + return df.sort_values( + by=[("", corrected_p_val_col_name), ("", p_val_col_name)] + ) + else: + return df.sort_values(by=("", p_val_col_name)) + + @staticmethod + def _format_term_id( + hpo: hpotk.MinimalOntology, + term_id: hpotk.TermId, + ) -> str: + """ + Format a `term_id` as a `str`. HPO term ID is formatted as ` []` whereas other term IDs + are formatted as CURIEs (e.g. `OMIM:123000`). + """ + if term_id.prefix == "HP": + term_name = hpo.get_term_name(term_id) + return f"{term_name} [{term_id.value}]" + else: + return term_id.value diff --git a/tests/conftest.py b/tests/conftest.py index b66b55e7..488e0869 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -3,11 +3,15 @@ import typing import hpotk +import numpy as np +import pandas as pd import pytest +from gpsea.analysis.mtc_filter import PhenotypeMtcResult +from gpsea.analysis.pcats import HpoTermAnalysisResult from gpsea.analysis.predicate.genotype import GenotypePolyPredicate, VariantPredicates, boolean_predicate from gpsea.analysis.predicate.phenotype import PhenotypePolyPredicate, PropagatingPhenotypePredicate -from gpsea.io import GpseaJSONEncoder, GpseaJSONDecoder +from gpsea.io import GpseaJSONDecoder from gpsea.model import * from gpsea.model.genome import GRCh38, GenomicRegion, Region, Strand, GenomeBuild from ._protein_test_service import ProteinTestMetadataService @@ -32,6 +36,7 @@ def pytest_collection_modifyitems(config, items): if "online" in item.keywords: item.add_marker(skip_online) + @pytest.fixture(scope='session') def fpath_project_dir(fpath_test_dir: str) -> str: """ @@ -151,6 +156,57 @@ def suox_pheno_predicates( ) +@pytest.fixture +def hpo_result( + suox_pheno_predicates: typing.Sequence[PhenotypePolyPredicate[hpotk.TermId]], + suox_gt_predicate: GenotypePolyPredicate, +) -> HpoTermAnalysisResult: + return HpoTermAnalysisResult( + pheno_predicates=suox_pheno_predicates, + n_usable=(40, 20, 30, 10, 100), + all_counts=tuple( + make_count_df(count, suox_gt_predicate, ph_pred) + for count, ph_pred in zip( + [ + (10, 5, 15, 10), + (5, 2, 8, 5), + (10, 5, 5, 10), + (2, 3, 2, 3), + (10, 25, 35, 30), + ], + suox_pheno_predicates, + ) + ), + pvals=(0.01, 0.04, 0.3, 0.2, 0.7), + corrected_pvals=(0.04, 0.1, 0.7, 0.5, 1.0), + gt_predicate=suox_gt_predicate, + mtc_filter_name="MTC filter name", + mtc_filter_results=( + PhenotypeMtcResult.ok(), + PhenotypeMtcResult.ok(), + PhenotypeMtcResult.ok(), + PhenotypeMtcResult.ok(), + PhenotypeMtcResult.ok(), + ), + mtc_name="MTC name", + ) + + +def make_count_df( + counts: typing.Tuple[int, int, int, int], + gt_predicate: GenotypePolyPredicate, + ph_predicate: PhenotypePolyPredicate[hpotk.TermId], +) -> pd.DataFrame: + rows = tuple(ph_predicate.get_categories()) + cols = tuple(gt_predicate.get_categories()) + data = np.array(counts).reshape((len(rows), len(cols))) + return pd.DataFrame( + data=data, + index=pd.Index(rows), + columns=pd.Index(cols), + ) + + @pytest.fixture(scope='session') def fpath_suox_tx_coordinates(fpath_test_data_dir: str) -> str: suox_mane_tx_id = 'NM_001032386.2' diff --git a/tests/view/test_hpo_phenotype_result_viewer.py b/tests/view/test_hpo_phenotype_result_viewer.py new file mode 100644 index 00000000..71122eb9 --- /dev/null +++ b/tests/view/test_hpo_phenotype_result_viewer.py @@ -0,0 +1,27 @@ +import hpotk +import pytest + +from gpsea.analysis.pcats import HpoTermAnalysisResult +from gpsea.view import HpoTermAnalysisResultFormatter + + +class TestHpoTermAnalysisResultFormatter: + + @pytest.fixture + def formatter( + self, + hpo: hpotk.MinimalOntology, + ) -> HpoTermAnalysisResultFormatter: + return HpoTermAnalysisResultFormatter( + hpo=hpo, + ) + + @pytest.mark.skip("Only for manual control") + def test_summarize( + self, + formatter: HpoTermAnalysisResultFormatter, + hpo_result: HpoTermAnalysisResult, + ): + df = formatter.make_summary_dataframe(result=hpo_result) + + print(df) From db445a5d138ef67f75fba5772c17b27ab40324e4 Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Sat, 7 Sep 2024 23:31:48 +0200 Subject: [PATCH 10/44] Use the HPO analysis result formatter in the docs. --- docs/report/tbx5_frameshift_vs_missense.csv | 8 ++++---- docs/tutorial.rst | 5 +++-- docs/user-guide/report/tbx5_frameshift.csv | 2 +- docs/user-guide/stats.rst | 5 +++-- 4 files changed, 11 insertions(+), 9 deletions(-) diff --git a/docs/report/tbx5_frameshift_vs_missense.csv b/docs/report/tbx5_frameshift_vs_missense.csv index 0bc1e5c6..772f062f 100644 --- a/docs/report/tbx5_frameshift_vs_missense.csv +++ b/docs/report/tbx5_frameshift_vs_missense.csv @@ -1,4 +1,4 @@ -"Genotype group: Missense, Frameshift",Missense,Missense,Frameshift,Frameshift,, +Genotype group,Missense,Missense,Frameshift,Frameshift,, ,Count,Percent,Count,Percent,Corrected p values,p values Ventricular septal defect [HP:0001629],31/60,52%,19/19,100%,0.0009552459156234353,5.6190936213143254e-05 Abnormal atrioventricular conduction [HP:0005150],0/22,0%,3/3,100%,0.003695652173913043,0.00043478260869565214 @@ -13,10 +13,10 @@ Muscular ventricular septal defect [HP:0011623],6/59,10%,6/25,24%,0.286867598598 Pulmonary arterial hypertension [HP:0002092],4/6,67%,0/2,0%,0.6623376623376622,0.42857142857142855 Hypoplasia of the ulna [HP:0003022],1/12,8%,2/10,20%,0.8095238095238093,0.5714285714285713 Hypoplasia of the radius [HP:0002984],30/62,48%,6/14,43%,1.0,0.7735491022101784 +Atrial septal defect [HP:0001631],42/44,95%,20/20,100%,1.0,1.0 Short thumb [HP:0009778],11/41,27%,8/30,27%,1.0,1.0 Absent radius [HP:0003974],7/32,22%,6/25,24%,1.0,1.0 Short humerus [HP:0005792],7/17,41%,4/9,44%,1.0,1.0 -Atrial septal defect [HP:0001631],42/44,95%,20/20,100%,1.0,1.0 -Abnormal ventricular septum morphology [HP:0010438],31/31,100%,19/19,100%,, -Abnormal cardiac ventricle morphology [HP:0001713],31/31,100%,19/19,100%,, +Abnormal atrial septum morphology [HP:0011994],43/43,100%,20/20,100%,, +Abnormal cardiac septum morphology [HP:0001671],62/62,100%,28/28,100%,, Abnormal heart morphology [HP:0001627],62/62,100%,30/30,100%,, diff --git a/docs/tutorial.rst b/docs/tutorial.rst index 0a583b39..d51d3868 100644 --- a/docs/tutorial.rst +++ b/docs/tutorial.rst @@ -268,8 +268,9 @@ by exploring the phenotype MTC filtering report. and these are the top 20 HPO terms ordered by the p value corrected with the Benjamini-Hochberg procedure: ->>> from gpsea.analysis.predicate import PatientCategories ->>> summary_df = result.summarize(hpo, PatientCategories.YES) +>>> from gpsea.view import HpoTermAnalysisResultFormatter +>>> formatter = HpoTermAnalysisResultFormatter(hpo) +>>> summary_df = formatter.make_summary_dataframe(result) >>> summary_df.head(20).to_csv('docs/report/tbx5_frameshift_vs_missense.csv') # doctest: +SKIP .. csv-table:: *TBX5* frameshift vs missense diff --git a/docs/user-guide/report/tbx5_frameshift.csv b/docs/user-guide/report/tbx5_frameshift.csv index a96700cf..1f78c1c1 100644 --- a/docs/user-guide/report/tbx5_frameshift.csv +++ b/docs/user-guide/report/tbx5_frameshift.csv @@ -1,4 +1,4 @@ -"Which genotype group does the patient fit in: HOM_REF, HET",HOM_REF,HOM_REF,HET,HET,, +Which genotype group does the patient fit in,HOM_REF,HOM_REF,HET,HET,, ,Count,Percent,Count,Percent,Corrected p values,p values Ventricular septal defect [HP:0001629],42/71,59%,19/19,100%,0.00411275392326226,0.00024192670136836825 Abnormal atrioventricular conduction [HP:0005150],1/23,4%,3/3,100%,0.01307692307692308,0.0015384615384615387 diff --git a/docs/user-guide/stats.rst b/docs/user-guide/stats.rst index 55757ef3..c2587050 100644 --- a/docs/user-guide/stats.rst +++ b/docs/user-guide/stats.rst @@ -254,8 +254,9 @@ Last, let's explore the associations. The results include a table with all teste ordered by the corrected p value (Benjamini-Hochberg FDR). Here we show the top 20 table rows: ->>> from gpsea.analysis.predicate import PatientCategories ->>> summary_df = result.summarize(hpo, PatientCategories.YES) +>>> from gpsea.view import HpoTermAnalysisResultFormatter +>>> formatter = HpoTermAnalysisResultFormatter(hpo) +>>> summary_df = formatter.make_summary_dataframe(result) >>> summary_df.head(20).to_csv('docs/user-guide/report/tbx5_frameshift.csv') # doctest: +SKIP .. csv-table:: *TBX5* frameshift vs rest From 81846c1b2f0d7b3f1bb42e67866d1797b2f641d7 Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Mon, 9 Sep 2024 13:50:20 +0200 Subject: [PATCH 11/44] Rename `HpoTermAnalysisResultFormatter` to `HpoTermAnalysisResultViewer`. --- docs/tutorial.rst | 6 +++--- docs/user-guide/stats.rst | 6 +++--- src/gpsea/view/__init__.py | 4 ++-- src/gpsea/view/_phenotype_analysis.py | 6 +++--- tests/view/test_hpo_phenotype_result_viewer.py | 8 ++++---- 5 files changed, 15 insertions(+), 15 deletions(-) diff --git a/docs/tutorial.rst b/docs/tutorial.rst index d51d3868..1f20fb8f 100644 --- a/docs/tutorial.rst +++ b/docs/tutorial.rst @@ -268,9 +268,9 @@ by exploring the phenotype MTC filtering report. and these are the top 20 HPO terms ordered by the p value corrected with the Benjamini-Hochberg procedure: ->>> from gpsea.view import HpoTermAnalysisResultFormatter ->>> formatter = HpoTermAnalysisResultFormatter(hpo) ->>> summary_df = formatter.make_summary_dataframe(result) +>>> from gpsea.view import HpoTermAnalysisResultViewer +>>> result_viewer = HpoTermAnalysisResultViewer(hpo) +>>> summary_df = result_viewer.make_summary_dataframe(result) >>> summary_df.head(20).to_csv('docs/report/tbx5_frameshift_vs_missense.csv') # doctest: +SKIP .. csv-table:: *TBX5* frameshift vs missense diff --git a/docs/user-guide/stats.rst b/docs/user-guide/stats.rst index c2587050..47b02162 100644 --- a/docs/user-guide/stats.rst +++ b/docs/user-guide/stats.rst @@ -254,9 +254,9 @@ Last, let's explore the associations. The results include a table with all teste ordered by the corrected p value (Benjamini-Hochberg FDR). Here we show the top 20 table rows: ->>> from gpsea.view import HpoTermAnalysisResultFormatter ->>> formatter = HpoTermAnalysisResultFormatter(hpo) ->>> summary_df = formatter.make_summary_dataframe(result) +>>> from gpsea.view import HpoTermAnalysisResultViewer +>>> result_viewer = HpoTermAnalysisResultViewer(hpo) +>>> summary_df = result_viewer.make_summary_dataframe(result) >>> summary_df.head(20).to_csv('docs/user-guide/report/tbx5_frameshift.csv') # doctest: +SKIP .. csv-table:: *TBX5* frameshift vs rest diff --git a/src/gpsea/view/__init__.py b/src/gpsea/view/__init__.py index e236f802..b49ccfe6 100644 --- a/src/gpsea/view/__init__.py +++ b/src/gpsea/view/__init__.py @@ -1,6 +1,6 @@ from ._cohort import CohortViewable from ._disease import DiseaseViewable -from ._phenotype_analysis import HpoTermAnalysisResultFormatter +from ._phenotype_analysis import HpoTermAnalysisResultViewer from ._protein_viewer import ProteinViewable from ._protein_visualizable import ProteinVisualizable from ._stats import MtcStatsViewer @@ -13,7 +13,7 @@ 'ProteinVisualizer', 'ProteinVisualizable', 'ProteinViewable', 'DiseaseViewable', 'MtcStatsViewer', - 'HpoTermAnalysisResultFormatter', + 'HpoTermAnalysisResultViewer', 'VariantTranscriptVisualizer', 'VariantFormatter', ] diff --git a/src/gpsea/view/_phenotype_analysis.py b/src/gpsea/view/_phenotype_analysis.py index 26cec71b..197643b6 100644 --- a/src/gpsea/view/_phenotype_analysis.py +++ b/src/gpsea/view/_phenotype_analysis.py @@ -4,9 +4,9 @@ from gpsea.analysis.pcats import HpoTermAnalysisResult -class HpoTermAnalysisResultFormatter: +class HpoTermAnalysisResultViewer: """ - `HpoTermAnalysisResultFormatter` presents the :class:`~gpsea.analysis.pcats.HpoTermAnalysisResult` + `HpoTermAnalysisResultViewer` presents the :class:`~gpsea.analysis.pcats.HpoTermAnalysisResult` in a form suitable for humans. This includes preparing a dataframe with counts, frequencies, and p values for the tested HPO terms. @@ -61,7 +61,7 @@ def make_summary_dataframe( # Format the index values: `HP:0001250` -> `Seizure [HP:0001250]` if the index members are HPO terms # or just use the term ID CURIE otherwise (e.g. `OMIM:123000`). labeled_idx = df.index.map( - lambda term_id: HpoTermAnalysisResultFormatter._format_term_id( + lambda term_id: HpoTermAnalysisResultViewer._format_term_id( self._hpo, term_id ) ) diff --git a/tests/view/test_hpo_phenotype_result_viewer.py b/tests/view/test_hpo_phenotype_result_viewer.py index 71122eb9..09d36b80 100644 --- a/tests/view/test_hpo_phenotype_result_viewer.py +++ b/tests/view/test_hpo_phenotype_result_viewer.py @@ -2,7 +2,7 @@ import pytest from gpsea.analysis.pcats import HpoTermAnalysisResult -from gpsea.view import HpoTermAnalysisResultFormatter +from gpsea.view import HpoTermAnalysisResultViewer class TestHpoTermAnalysisResultFormatter: @@ -11,15 +11,15 @@ class TestHpoTermAnalysisResultFormatter: def formatter( self, hpo: hpotk.MinimalOntology, - ) -> HpoTermAnalysisResultFormatter: - return HpoTermAnalysisResultFormatter( + ) -> HpoTermAnalysisResultViewer: + return HpoTermAnalysisResultViewer( hpo=hpo, ) @pytest.mark.skip("Only for manual control") def test_summarize( self, - formatter: HpoTermAnalysisResultFormatter, + formatter: HpoTermAnalysisResultViewer, hpo_result: HpoTermAnalysisResult, ): df = formatter.make_summary_dataframe(result=hpo_result) From e4cc40a157bb59065234b797991a3b5017ae0097 Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Mon, 9 Sep 2024 13:59:50 +0200 Subject: [PATCH 12/44] .. but let's make it a function. --- docs/tutorial.rst | 5 +- docs/user-guide/stats.rst | 5 +- src/gpsea/view/__init__.py | 4 +- src/gpsea/view/_phenotype_analysis.py | 127 ++++++++---------- .../view/test_hpo_phenotype_result_viewer.py | 29 ++-- 5 files changed, 69 insertions(+), 101 deletions(-) diff --git a/docs/tutorial.rst b/docs/tutorial.rst index 1f20fb8f..dcecee15 100644 --- a/docs/tutorial.rst +++ b/docs/tutorial.rst @@ -268,9 +268,8 @@ by exploring the phenotype MTC filtering report. and these are the top 20 HPO terms ordered by the p value corrected with the Benjamini-Hochberg procedure: ->>> from gpsea.view import HpoTermAnalysisResultViewer ->>> result_viewer = HpoTermAnalysisResultViewer(hpo) ->>> summary_df = result_viewer.make_summary_dataframe(result) +>>> from gpsea.view import summarize_hpo_analysis +>>> summary_df = summarize_hpo_analysis(hpo, result) >>> summary_df.head(20).to_csv('docs/report/tbx5_frameshift_vs_missense.csv') # doctest: +SKIP .. csv-table:: *TBX5* frameshift vs missense diff --git a/docs/user-guide/stats.rst b/docs/user-guide/stats.rst index 47b02162..13e5be53 100644 --- a/docs/user-guide/stats.rst +++ b/docs/user-guide/stats.rst @@ -254,9 +254,8 @@ Last, let's explore the associations. The results include a table with all teste ordered by the corrected p value (Benjamini-Hochberg FDR). Here we show the top 20 table rows: ->>> from gpsea.view import HpoTermAnalysisResultViewer ->>> result_viewer = HpoTermAnalysisResultViewer(hpo) ->>> summary_df = result_viewer.make_summary_dataframe(result) +>>> from gpsea.view import summarize_hpo_analysis +>>> summary_df = summarize_hpo_analysis(hpo, result) >>> summary_df.head(20).to_csv('docs/user-guide/report/tbx5_frameshift.csv') # doctest: +SKIP .. csv-table:: *TBX5* frameshift vs rest diff --git a/src/gpsea/view/__init__.py b/src/gpsea/view/__init__.py index b49ccfe6..7bb12e8e 100644 --- a/src/gpsea/view/__init__.py +++ b/src/gpsea/view/__init__.py @@ -1,6 +1,6 @@ from ._cohort import CohortViewable from ._disease import DiseaseViewable -from ._phenotype_analysis import HpoTermAnalysisResultViewer +from ._phenotype_analysis import summarize_hpo_analysis from ._protein_viewer import ProteinViewable from ._protein_visualizable import ProteinVisualizable from ._stats import MtcStatsViewer @@ -13,7 +13,7 @@ 'ProteinVisualizer', 'ProteinVisualizable', 'ProteinViewable', 'DiseaseViewable', 'MtcStatsViewer', - 'HpoTermAnalysisResultViewer', + 'summarize_hpo_analysis', 'VariantTranscriptVisualizer', 'VariantFormatter', ] diff --git a/src/gpsea/view/_phenotype_analysis.py b/src/gpsea/view/_phenotype_analysis.py index 197643b6..75b65866 100644 --- a/src/gpsea/view/_phenotype_analysis.py +++ b/src/gpsea/view/_phenotype_analysis.py @@ -4,88 +4,69 @@ from gpsea.analysis.pcats import HpoTermAnalysisResult -class HpoTermAnalysisResultViewer: +def summarize_hpo_analysis( + hpo: hpotk.MinimalOntology, + result: HpoTermAnalysisResult, +) -> pd.DataFrame: """ - `HpoTermAnalysisResultViewer` presents the :class:`~gpsea.analysis.pcats.HpoTermAnalysisResult` - in a form suitable for humans. + Create a dataframe with counts, frequencies, and p values for the tested HPO terms. - This includes preparing a dataframe with counts, frequencies, and p values for the tested HPO terms. + :param hpo: HPO data. + :param result: the HPO term analysis results to show. """ + assert isinstance(result, HpoTermAnalysisResult) - def __init__( - self, - hpo: hpotk.MinimalOntology, - ): - self._hpo = hpo + # Row index: a list of tested HPO terms + pheno_idx = pd.Index(result.phenotypes) + # Column index: multiindex of counts and percentages for all genotype predicate groups + gt_idx = pd.MultiIndex.from_product( + iterables=(result.gt_predicate.get_categories(), ("Count", "Percent")), + names=(result.gt_predicate.get_question_base(), None), + ) - def make_summary_dataframe( - self, - result: HpoTermAnalysisResult, - ) -> pd.DataFrame: - """ - Format the `result` into a dataframe for human consumption. - """ - assert isinstance(result, HpoTermAnalysisResult) + # We'll fill this frame with data + df = pd.DataFrame(index=pheno_idx, columns=gt_idx) - # Row index: a list of tested HPO terms - pheno_idx = pd.Index(result.phenotypes) - # Column index: multiindex of counts and percentages for all genotype predicate groups - gt_idx = pd.MultiIndex.from_product( - iterables=(result.gt_predicate.get_categories(), ("Count", "Percent")), - names=(result.gt_predicate.get_question_base(), None), - ) + for ph_predicate, count in zip(result.pheno_predicates, result.all_counts): + # Sum across the phenotype categories (collapse the rows). + gt_totals = count.sum() - # We'll fill this frame with data - df = pd.DataFrame(index=pheno_idx, columns=gt_idx) + for gt_cat in count.columns: + cnt = count.loc[ph_predicate.present_phenotype_category, gt_cat] + total = gt_totals[gt_cat] + df.loc[ph_predicate.phenotype, (gt_cat, "Count")] = f"{cnt}/{total}" + pct = 0 if total == 0 else round(cnt * 100 / total) + df.loc[ph_predicate.phenotype, (gt_cat, "Percent")] = f"{pct}%" - for ph_predicate, count in zip(result.pheno_predicates, result.all_counts): - # Sum across the phenotype categories (collapse the rows). - gt_totals = count.sum() - - for gt_cat in count.columns: - cnt = count.loc[ph_predicate.present_phenotype_category, gt_cat] - total = gt_totals[gt_cat] - df.loc[ph_predicate.phenotype, (gt_cat, "Count")] = f"{cnt}/{total}" - pct = 0 if total == 0 else round(cnt * 100 / total) - df.loc[ph_predicate.phenotype, (gt_cat, "Percent")] = f"{pct}%" + # Add columns with p values and corrected p values (if present) + p_val_col_name = "p values" + corrected_p_val_col_name = "Corrected p values" + if result.corrected_pvals is not None: + df.insert(df.shape[1], ("", corrected_p_val_col_name), result.corrected_pvals) + df.insert(df.shape[1], ("", p_val_col_name), result.pvals) - # Add columns with p values and corrected p values (if present) - p_val_col_name = "p values" - corrected_p_val_col_name = "Corrected p values" - if result.corrected_pvals is not None: - df.insert( - df.shape[1], ("", corrected_p_val_col_name), result.corrected_pvals - ) - df.insert(df.shape[1], ("", p_val_col_name), result.pvals) + # Format the index values: `HP:0001250` -> `Seizure [HP:0001250]` if the index members are HPO terms + # or just use the term ID CURIE otherwise (e.g. `OMIM:123000`). + labeled_idx = df.index.map(lambda term_id: format_term_id(hpo, term_id)) - # Format the index values: `HP:0001250` -> `Seizure [HP:0001250]` if the index members are HPO terms - # or just use the term ID CURIE otherwise (e.g. `OMIM:123000`). - labeled_idx = df.index.map( - lambda term_id: HpoTermAnalysisResultViewer._format_term_id( - self._hpo, term_id - ) - ) + # Last, sort by corrected p value or just p value + df = df.set_index(labeled_idx) + if result.corrected_pvals is not None: + return df.sort_values(by=[("", corrected_p_val_col_name), ("", p_val_col_name)]) + else: + return df.sort_values(by=("", p_val_col_name)) - # Last, sort by corrected p value or just p value - df = df.set_index(labeled_idx) - if result.corrected_pvals is not None: - return df.sort_values( - by=[("", corrected_p_val_col_name), ("", p_val_col_name)] - ) - else: - return df.sort_values(by=("", p_val_col_name)) - @staticmethod - def _format_term_id( - hpo: hpotk.MinimalOntology, - term_id: hpotk.TermId, - ) -> str: - """ - Format a `term_id` as a `str`. HPO term ID is formatted as ` []` whereas other term IDs - are formatted as CURIEs (e.g. `OMIM:123000`). - """ - if term_id.prefix == "HP": - term_name = hpo.get_term_name(term_id) - return f"{term_name} [{term_id.value}]" - else: - return term_id.value +def format_term_id( + hpo: hpotk.MinimalOntology, + term_id: hpotk.TermId, +) -> str: + """ + Format a `term_id` as a `str`. HPO term ID is formatted as ` []` whereas other term IDs + are formatted as CURIEs (e.g. `OMIM:123000`). + """ + if term_id.prefix == "HP": + term_name = hpo.get_term_name(term_id) + return f"{term_name} [{term_id.value}]" + else: + return term_id.value diff --git a/tests/view/test_hpo_phenotype_result_viewer.py b/tests/view/test_hpo_phenotype_result_viewer.py index 09d36b80..12e0df74 100644 --- a/tests/view/test_hpo_phenotype_result_viewer.py +++ b/tests/view/test_hpo_phenotype_result_viewer.py @@ -2,26 +2,15 @@ import pytest from gpsea.analysis.pcats import HpoTermAnalysisResult -from gpsea.view import HpoTermAnalysisResultViewer +from gpsea.view import summarize_hpo_analysis -class TestHpoTermAnalysisResultFormatter: +@pytest.mark.skip("Only for manual control") +def test_summarize( + self, + hpo: hpotk.MinimalOntology, + hpo_result: HpoTermAnalysisResult, +): + df = summarize_hpo_analysis(hpo=hpo, result=hpo_result) - @pytest.fixture - def formatter( - self, - hpo: hpotk.MinimalOntology, - ) -> HpoTermAnalysisResultViewer: - return HpoTermAnalysisResultViewer( - hpo=hpo, - ) - - @pytest.mark.skip("Only for manual control") - def test_summarize( - self, - formatter: HpoTermAnalysisResultViewer, - hpo_result: HpoTermAnalysisResult, - ): - df = formatter.make_summary_dataframe(result=hpo_result) - - print(df) + print(df) From 9e75d23b1838083713850ada0488805652a1aea6 Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Mon, 9 Sep 2024 14:42:33 +0200 Subject: [PATCH 13/44] Fix import error. --- tests/view/test_stats.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/view/test_stats.py b/tests/view/test_stats.py index d38556f6..d57b3ae2 100644 --- a/tests/view/test_stats.py +++ b/tests/view/test_stats.py @@ -7,7 +7,7 @@ from gpsea.analysis.pcats import HpoTermAnalysisResult from gpsea.analysis.predicate import PatientCategories from gpsea.analysis.predicate.genotype import GenotypePolyPredicate -from gpsea.analysis.predicate.phenotype import PropagatingPhenotypePredicate +from gpsea.analysis.predicate.phenotype import HpoPredicate from gpsea.analysis.mtc_filter import PhenotypeMtcResult from gpsea.view import MtcStatsViewer @@ -22,11 +22,11 @@ def hpo_term_analysis_result( ) -> HpoTermAnalysisResult: return HpoTermAnalysisResult( pheno_predicates=( - PropagatingPhenotypePredicate( + HpoPredicate( hpo=hpo, query=hpotk.TermId.from_curie('HP:0001166'), # Arachnodactyly ), - PropagatingPhenotypePredicate( + HpoPredicate( hpo=hpo, query=hpotk.TermId.from_curie('HP:0001250'), # Seizure ), From 1d4ebd1b52700cad772c5c321584e33791806724 Mon Sep 17 00:00:00 2001 From: Peter Robinson Date: Mon, 9 Sep 2024 15:14:32 +0200 Subject: [PATCH 14/44] viewable for all variants --- src/gpsea/model/_variant_effects.py | 65 +++++++++++++++- src/gpsea/view/__init__.py | 2 + src/gpsea/view/_all_variant_viewable.py | 89 ++++++++++++++-------- src/gpsea/view/templates/all_variants.html | 75 +++--------------- 4 files changed, 136 insertions(+), 95 deletions(-) diff --git a/src/gpsea/model/_variant_effects.py b/src/gpsea/model/_variant_effects.py index e15452e0..d104f178 100644 --- a/src/gpsea/model/_variant_effects.py +++ b/src/gpsea/model/_variant_effects.py @@ -1,5 +1,6 @@ from enum import Enum - +import typing +from hpotk import TermId class VariantEffect(Enum): """ @@ -62,6 +63,68 @@ class VariantEffect(Enum): INTERGENIC_VARIANT = "SO:0001628" SEQUENCE_VARIANT = "SO:0001060" + @staticmethod + def to_display(veffect: "VariantEffect") -> str: + effect_to_display = { + VariantEffect.TRANSCRIPT_ABLATION : "transcript ablation", + VariantEffect.SPLICE_ACCEPTOR_VARIANT : "splice acceptor", + VariantEffect.SPLICE_DONOR_VARIANT : "splice donor", + VariantEffect.STOP_GAINED : "stop gained", + VariantEffect.FRAMESHIFT_VARIANT : "frameshift", + VariantEffect.STOP_LOST : "stop lost", + VariantEffect.START_LOST : "start lost", + VariantEffect.TRANSCRIPT_AMPLIFICATION : "transcript amplification", + VariantEffect.INFRAME_INSERTION : "inframe insertion", + VariantEffect.INFRAME_DELETION : "inframe deletion", + VariantEffect. MISSENSE_VARIANT : "missense", + VariantEffect. PROTEIN_ALTERING_VARIANT : "protein altering", + VariantEffect.SPLICE_REGION_VARIANT : "splice region", + VariantEffect.SPLICE_DONOR_5TH_BASE_VARIANT : "splice donor 5th base", + VariantEffect.SPLICE_DONOR_REGION_VARIANT : "splice donor", + VariantEffect. SPLICE_POLYPYRIMIDINE_TRACT_VARIANT : "splice polypyrimidine", + VariantEffect.INCOMPLETE_TERMINAL_CODON_VARIANT : "incomplete terminal codon", + VariantEffect.START_RETAINED_VARIANT : "start retained", + VariantEffect.STOP_RETAINED_VARIANT : "stop retainined", + VariantEffect.SYNONYMOUS_VARIANT : "synonymous", + VariantEffect.CODING_SEQUENCE_VARIANT : "coding sequence", + VariantEffect. MATURE_MIRNA_VARIANT : "mature miRNA", + VariantEffect. FIVE_PRIME_UTR_VARIANT : "5UTR", + VariantEffect. THREE_PRIME_UTR_VARIANT : "3UTR", + VariantEffect. NON_CODING_TRANSCRIPT_EXON_VARIANT : "non-coding transcript exon", + VariantEffect.INTRON_VARIANT : "intronic", + VariantEffect. NMD_TRANSCRIPT_VARIANT : "NMD transcript", + VariantEffect.NON_CODING_TRANSCRIPT_VARIANT : "non-coding transcript", + VariantEffect.UPSTREAM_GENE_VARIANT : "upstream of gene", + VariantEffect. DOWNSTREAM_GENE_VARIANT : "downstream of gene", + VariantEffect.TFBS_ABLATION : "TFBS ablation", + VariantEffect. TFBS_AMPLIFICATION : "TFBS amplification", + VariantEffect.TF_BINDING_SITE_VARIANT : "TFBS binding site", + VariantEffect.REGULATORY_REGION_ABLATION : "regulatory region ablation", + VariantEffect. REGULATORY_REGION_AMPLIFICATION : "regulatory region amplification", + VariantEffect.FEATURE_ELONGATION : "feature elongation", + VariantEffect.REGULATORY_REGION_VARIANT : "regulatory region", + VariantEffect.FEATURE_TRUNCATION : "feature truncation", + VariantEffect.INTERGENIC_VARIANT : "intergenic", + VariantEffect.SEQUENCE_VARIANT : "sequence variant" + } + return effect_to_display.get(veffect, "n/a") + + @staticmethod + def structural_so_id_to_display(so_term: typing.Union[TermId, str]) -> str: + """ + get a string to represent a Sequence Ontology (SO) term identifier + """ + if isinstance(so_term, TermId): + so_term = so_term.value + so_id_to_display = {"SO:1000029": "chromosomal deletion", + "SO:1000037": "chromosomal duplication", + "SO:1000030": "chromosomal_inversion", + "SO:1000044": "chromosomal_translocation", + } + return so_id_to_display.get(so_term, "n/a") + + + def __init__(self, curie: str): self._curie = curie diff --git a/src/gpsea/view/__init__.py b/src/gpsea/view/__init__.py index 6eb13dd6..95507abc 100644 --- a/src/gpsea/view/__init__.py +++ b/src/gpsea/view/__init__.py @@ -1,4 +1,5 @@ from ._cohort import CohortViewable +from ._all_variant_viewable import AllVariantViewable from ._disease import DiseaseViewable from ._protein_viewer import ProteinViewable from ._protein_visualizable import ProteinVisualizable @@ -8,6 +9,7 @@ from ._formatter import VariantFormatter __all__ = [ + 'AllVariantViewable', 'CohortViewable', 'ProteinVisualizer', 'ProteinVisualizable', 'ProteinViewable', 'DiseaseViewable', diff --git a/src/gpsea/view/_all_variant_viewable.py b/src/gpsea/view/_all_variant_viewable.py index ffe5c943..a941c58e 100644 --- a/src/gpsea/view/_all_variant_viewable.py +++ b/src/gpsea/view/_all_variant_viewable.py @@ -1,17 +1,16 @@ import typing -from hpotk import MinimalOntology from jinja2 import Environment, PackageLoader from collections import namedtuple, defaultdict from gpsea.model import Cohort -from gpsea.model._variant import Variant +from gpsea.model._variant import Variant, VariantEffect from ._formatter import VariantFormatter ToDisplay = namedtuple('ToDisplay', ['hgvs_cdna', 'hgvsp', 'variant_effects']) -VariantData = namedtuple('VariantData', ['variant_key', 'hgvs_cdna', 'hgvsp', 'chromosomal_description', 'variant_effects'] ) +VariantData = namedtuple('VariantData', ['variant_key', 'hgvs_cdna', 'hgvsp', 'variant_effects'] ) class AllVariantViewable: """ @@ -25,13 +24,14 @@ def __init__( ): """ Args: - hpo(MinimalOntology): An HPO ontology object from hpo-toolkit - top_phenotype_count(int): Maximum number of HPO terms to display in the HTML table (default: 10) - top_variant_count(int): Maximum number of variants to display in the HTML table (default: 10) + transcript_id(str): The transcript identifier (Usually, the MANE RefSeq transcript, that should start with "NM_") """ environment = Environment(loader=PackageLoader('gpsea.view', 'templates')) self._cohort_template = environment.get_template("all_variants.html") self._var_formatter = VariantFormatter(transcript_id) + if not transcript_id.startswith("NM"): + print(f"[WARNING] Non-RefSeq transcript id: {transcript_id}") + self._transcript_id = transcript_id def process( self, @@ -57,30 +57,51 @@ def _prepare_context( transcript_id: typing.Optional[str], only_hgvs ) -> typing.Mapping[str, typing.Any]: - variant_count_dictionary = defaultdict() + variant_count_dictionary = defaultdict(int) + variant_effect_count_dictionary = defaultdict(int) + variant_key_to_variant = dict() for var in cohort.all_variants(): + var_key = var.variant_info.variant_key vdata = self._get_variant_data(var, only_hgvs) - variant_count_dictionary[vdata] += 1 + variant_key_to_variant[var_key] = vdata + variant_count_dictionary[var_key] += 1 + for v_eff in vdata.variant_effects: + variant_effect_count_dictionary[v_eff] += 1 variant_counts = list() - for var_data, count in variant_count_dictionary.items(): + variant_effect_counts = list() + for var_key, count in variant_count_dictionary.items(): + var_data = variant_key_to_variant[var_key] variant_counts.append( { "variant_key": var_data.variant_key, "variant": var_data.hgvs_cdna, "variant_name": var_data.hgvs_cdna, "protein_name": var_data.hgvsp, - "variant_effects": var_data.variant_effects, + "variant_effects": ", ".join(var_data.variant_effects), "count": count, } ) + for v_effect, count in variant_effect_count_dictionary.items(): + variant_effect_counts.append( + { + "effect": v_effect, + "count": count + } + ) + print(f"variants eee {len(variant_effect_counts)}") + variant_counts = sorted(variant_counts, key=lambda row: row.get("count"), reverse=True) + variant_effect_counts = sorted(variant_effect_counts, key=lambda row: row.get("count"), reverse=True) + # The following dictionary is used by the Jinja2 HTML template return { + "has_transcript": False, "variant_count_list": variant_counts, + "variant_effect_count_list": variant_effect_counts, "total_unique_allele_count": len(variant_counts) } - def _get_variant_description( + def _get_variant_data( self, variant: Variant, only_hgvs: bool @@ -96,24 +117,32 @@ def _get_variant_description( typing.Mapping[str, ToDisplay]: key: variant key, value: namedtuple(display (e.g. HGVS) string of variant, hgvsp protein string of variant) """ variant_key = variant.variant_info.variant_key - display = self._var_formatter.format_as_string(variant) - tx_annotation = variant.get_tx_anno_by_tx_id(self._transcript_id) - if tx_annotation is not None: - hgvsp = tx_annotation.hgvsp - var_effects = [var_eff.name for var_eff in tx_annotation.variant_effects] + if variant.variant_info.has_sv_info(): + sv_info = variant.variant_info.sv_info + gene_symbol = sv_info.gene_symbol + display = f"SV involving {gene_symbol}" + effect = VariantEffect.structural_so_id_to_display(so_term=sv_info.structural_type) + return VariantData(variant_key=variant_key, hgvs_cdna = display, hgvsp = "p.?", variant_effects=[effect]) else: - hgvsp = None - var_effects = None - if only_hgvs: - # do not show the transcript id - fields_dna = display.split(":") - fields_ps = hgvsp.split(":") if hgvsp is not None else [None] - if len(fields_dna) > 1: - display = fields_dna[1] - else: - display = fields_dna[0] - if len(fields_ps) > 1: - hgvsp = fields_ps[1] + variant_key = variant.variant_info.variant_key + display = self._var_formatter.format_as_string(variant) + tx_annotation = variant.get_tx_anno_by_tx_id(self._transcript_id) + if tx_annotation is not None: + hgvsp = tx_annotation.hgvsp + var_effects = [VariantEffect.to_display(var_eff) for var_eff in tx_annotation.variant_effects] else: - hgvsp = fields_ps[0] - return VariantData(variant_key=variant_key, hgvs_cdna = fields_dna, hgvsp = hgvsp, chromosomal_description=None, variant_effects=var_effects) + hgvsp = None + var_effects = [] + if only_hgvs: + # do not show the transcript id + fields_dna = display.split(":") + fields_ps = hgvsp.split(":") if hgvsp is not None else [None] + if len(fields_dna) > 1: + display_hgvs_cDNA = fields_dna[1] + else: + display_hgvs_cDNA = fields_dna[0] + if len(fields_ps) > 1: + hgvsp = fields_ps[1] + else: + hgvsp = fields_ps[0] + return VariantData(variant_key=variant_key, hgvs_cdna = display_hgvs_cDNA, hgvsp = hgvsp, variant_effects=var_effects) diff --git a/src/gpsea/view/templates/all_variants.html b/src/gpsea/view/templates/all_variants.html index 0bbe94e5..9640bd1a 100644 --- a/src/gpsea/view/templates/all_variants.html +++ b/src/gpsea/view/templates/all_variants.html @@ -73,24 +73,17 @@

Variant alleles

- "variant": var_data.hgvs_cdna, - "variant_name": var_data.hgvs_cdna, - "protein_name": var_data.hgvsp, - "variant_effects": var_data.variant_effects, - "count": count, - - - Variant - hgvs_cdna - protein + Variant key + Variant (cDNA) + Variant (protein) Effects - count + Count - {% for vdata in variant_counts %} + {% for vdata in variant_count_list %} - {{ vdata.variant }} + {{ vdata.variant_key }} {{ vdata.variant_name }} {{ vdata.protein_name }} {{ vdata.variant_effects }} @@ -99,61 +92,17 @@

Variant alleles

{% endfor %} - - - - - - - - - - - - {% for var_count in var_counts %} - - - - - - - - {% endfor %} - -
-

Top {{top_var_count}} Variants

- Variants are shown according to {{ transcript_id }}. A total of {{ unique_variant_count}} unique variants were identified in the cohort. -
CountVariant keyVariant NameProtein VariantVariant Class
{{ var_count.Count }}{{ var_count.variant }}{{ var_count.variant_name }}{{ var_count.protein_name }}{{ var_count.variant_effects }}
- - - - - - - - - {% for disease_count in disease_counts %} - - - - - - {% endfor %} - -
-

Diseases

-
Disease NameDisease IDAnnotation Count
{{ disease_count.disease_name }}{{ disease_count.disease_id }}{{ disease_count.count }}
- {% if has_transcript > 0 %} - + From 1697fb70c32552e909c8db10fd3a9dc1d1d1be5d Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Mon, 9 Sep 2024 21:06:52 +0200 Subject: [PATCH 15/44] Test phenotype predicate utility methods. --- tests/analysis/predicate/test_phenotype.py | 31 ++++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 tests/analysis/predicate/test_phenotype.py diff --git a/tests/analysis/predicate/test_phenotype.py b/tests/analysis/predicate/test_phenotype.py new file mode 100644 index 00000000..609417a9 --- /dev/null +++ b/tests/analysis/predicate/test_phenotype.py @@ -0,0 +1,31 @@ +import hpotk + +from gpsea.model import Cohort + +from gpsea.analysis.predicate.phenotype import PhenotypePolyPredicate +from gpsea.analysis.predicate.phenotype import prepare_hpo_terms_of_interest, prepare_predicates_for_terms_of_interest + + +def test_prepare_hpo_terms_of_interest( + suox_cohort: Cohort, + hpo: hpotk.MinimalOntology, +): + terms = prepare_hpo_terms_of_interest( + cohort=suox_cohort, + hpo=hpo, + ) + + assert len(terms) == 66 + + +def test_prepare_predicates_for_terms_of_interest( + suox_cohort: Cohort, + hpo: hpotk.MinimalOntology, +): + predicates = prepare_predicates_for_terms_of_interest( + cohort=suox_cohort, + hpo=hpo, + ) + + assert len(predicates) == 66 + assert all(isinstance(p, PhenotypePolyPredicate) for p in predicates) From 381fd548e5c87bc2f4303430afacb8c390446ee2 Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Mon, 9 Sep 2024 21:33:55 +0200 Subject: [PATCH 16/44] Refactor, rename to `CohortVariantViewer` and add a test. --- src/gpsea/model/_variant_effects.py | 148 +++++++++++------- src/gpsea/view/__init__.py | 4 +- ..._viewable.py => _cohort_variant_viewer.py} | 58 ++++--- tests/conftest.py | 13 +- tests/view/test_variant_viewer.py | 16 ++ 5 files changed, 149 insertions(+), 90 deletions(-) rename src/gpsea/view/{_all_variant_viewable.py => _cohort_variant_viewer.py} (76%) create mode 100644 tests/view/test_variant_viewer.py diff --git a/src/gpsea/model/_variant_effects.py b/src/gpsea/model/_variant_effects.py index d104f178..632d56a4 100644 --- a/src/gpsea/model/_variant_effects.py +++ b/src/gpsea/model/_variant_effects.py @@ -1,8 +1,10 @@ -from enum import Enum +import enum import typing -from hpotk import TermId -class VariantEffect(Enum): +import hpotk + + +class VariantEffect(enum.Enum): """ `VariantEffect` represents consequences of a variant on transcript that are supported by GPSEA. @@ -48,7 +50,7 @@ class VariantEffect(Enum): THREE_PRIME_UTR_VARIANT = "SO:0001624" NON_CODING_TRANSCRIPT_EXON_VARIANT = "SO:0001792" INTRON_VARIANT = "SO:0001627" - NMD_TRANSCRIPT_VARIANT = "SO:0001621", + NMD_TRANSCRIPT_VARIANT = "SO:0001621" NON_CODING_TRANSCRIPT_VARIANT = "SO:0001619" UPSTREAM_GENE_VARIANT = "SO:0001631" DOWNSTREAM_GENE_VARIANT = "SO:0001632" @@ -63,67 +65,42 @@ class VariantEffect(Enum): INTERGENIC_VARIANT = "SO:0001628" SEQUENCE_VARIANT = "SO:0001060" + def to_display(self) -> str: + """ + Get a concise name of the variant effect that is suitable for showing to humans. + + Example + ^^^^^^^ + + >>> from gpsea.model import VariantEffect + >>> VariantEffect.MISSENSE_VARIANT + 'missense' + >>> VariantEffect.SPLICE_DONOR_5TH_BASE_VARIANT + 'splice donor 5th base' + + :returns: a `str` with the name or `'n/a'` if the variant effect was not assigned a concise name. + """ + return effect_to_display.get(self, "n/a") + @staticmethod - def to_display(veffect: "VariantEffect") -> str: - effect_to_display = { - VariantEffect.TRANSCRIPT_ABLATION : "transcript ablation", - VariantEffect.SPLICE_ACCEPTOR_VARIANT : "splice acceptor", - VariantEffect.SPLICE_DONOR_VARIANT : "splice donor", - VariantEffect.STOP_GAINED : "stop gained", - VariantEffect.FRAMESHIFT_VARIANT : "frameshift", - VariantEffect.STOP_LOST : "stop lost", - VariantEffect.START_LOST : "start lost", - VariantEffect.TRANSCRIPT_AMPLIFICATION : "transcript amplification", - VariantEffect.INFRAME_INSERTION : "inframe insertion", - VariantEffect.INFRAME_DELETION : "inframe deletion", - VariantEffect. MISSENSE_VARIANT : "missense", - VariantEffect. PROTEIN_ALTERING_VARIANT : "protein altering", - VariantEffect.SPLICE_REGION_VARIANT : "splice region", - VariantEffect.SPLICE_DONOR_5TH_BASE_VARIANT : "splice donor 5th base", - VariantEffect.SPLICE_DONOR_REGION_VARIANT : "splice donor", - VariantEffect. SPLICE_POLYPYRIMIDINE_TRACT_VARIANT : "splice polypyrimidine", - VariantEffect.INCOMPLETE_TERMINAL_CODON_VARIANT : "incomplete terminal codon", - VariantEffect.START_RETAINED_VARIANT : "start retained", - VariantEffect.STOP_RETAINED_VARIANT : "stop retainined", - VariantEffect.SYNONYMOUS_VARIANT : "synonymous", - VariantEffect.CODING_SEQUENCE_VARIANT : "coding sequence", - VariantEffect. MATURE_MIRNA_VARIANT : "mature miRNA", - VariantEffect. FIVE_PRIME_UTR_VARIANT : "5UTR", - VariantEffect. THREE_PRIME_UTR_VARIANT : "3UTR", - VariantEffect. NON_CODING_TRANSCRIPT_EXON_VARIANT : "non-coding transcript exon", - VariantEffect.INTRON_VARIANT : "intronic", - VariantEffect. NMD_TRANSCRIPT_VARIANT : "NMD transcript", - VariantEffect.NON_CODING_TRANSCRIPT_VARIANT : "non-coding transcript", - VariantEffect.UPSTREAM_GENE_VARIANT : "upstream of gene", - VariantEffect. DOWNSTREAM_GENE_VARIANT : "downstream of gene", - VariantEffect.TFBS_ABLATION : "TFBS ablation", - VariantEffect. TFBS_AMPLIFICATION : "TFBS amplification", - VariantEffect.TF_BINDING_SITE_VARIANT : "TFBS binding site", - VariantEffect.REGULATORY_REGION_ABLATION : "regulatory region ablation", - VariantEffect. REGULATORY_REGION_AMPLIFICATION : "regulatory region amplification", - VariantEffect.FEATURE_ELONGATION : "feature elongation", - VariantEffect.REGULATORY_REGION_VARIANT : "regulatory region", - VariantEffect.FEATURE_TRUNCATION : "feature truncation", - VariantEffect.INTERGENIC_VARIANT : "intergenic", - VariantEffect.SEQUENCE_VARIANT : "sequence variant" - } - return effect_to_display.get(veffect, "n/a") - - @staticmethod - def structural_so_id_to_display(so_term: typing.Union[TermId, str]) -> str: + def structural_so_id_to_display(so_term: typing.Union[hpotk.TermId, str]) -> str: """ - get a string to represent a Sequence Ontology (SO) term identifier + Get a `str` with a concise name for representing a Sequence Ontology (SO) term identifier. + + Example + ^^^^^^^ + + >>> from gpsea.model import VariantEffect + >>> VariantEffect.structural_so_id_to_display('SO:1000029') + 'chromosomal deletion' + + :param so_term: a CURIE `str` or a :class:`~hpotk.TermId` with the query SO term. + :returns: a `str` with the concise name for the SO term or `'n/a'` if a name has not been assigned yet. """ - if isinstance(so_term, TermId): + if isinstance(so_term, hpotk.TermId): so_term = so_term.value - so_id_to_display = {"SO:1000029": "chromosomal deletion", - "SO:1000037": "chromosomal duplication", - "SO:1000030": "chromosomal_inversion", - "SO:1000044": "chromosomal_translocation", - } - return so_id_to_display.get(so_term, "n/a") - + return so_id_to_display.get(so_term, "n/a") def __init__(self, curie: str): self._curie = curie @@ -138,3 +115,54 @@ def curie(self) -> str: def __str__(self) -> str: return self.name.lower() + + +effect_to_display = { + VariantEffect.TRANSCRIPT_ABLATION: "transcript ablation", + VariantEffect.SPLICE_ACCEPTOR_VARIANT: "splice acceptor", + VariantEffect.SPLICE_DONOR_VARIANT: "splice donor", + VariantEffect.STOP_GAINED: "stop gained", + VariantEffect.FRAMESHIFT_VARIANT: "frameshift", + VariantEffect.STOP_LOST: "stop lost", + VariantEffect.START_LOST: "start lost", + VariantEffect.TRANSCRIPT_AMPLIFICATION: "transcript amplification", + VariantEffect.INFRAME_INSERTION: "inframe insertion", + VariantEffect.INFRAME_DELETION: "inframe deletion", + VariantEffect.MISSENSE_VARIANT: "missense", + VariantEffect.PROTEIN_ALTERING_VARIANT: "protein altering", + VariantEffect.SPLICE_REGION_VARIANT: "splice region", + VariantEffect.SPLICE_DONOR_5TH_BASE_VARIANT: "splice donor 5th base", + VariantEffect.SPLICE_DONOR_REGION_VARIANT: "splice donor", + VariantEffect.SPLICE_POLYPYRIMIDINE_TRACT_VARIANT: "splice polypyrimidine", + VariantEffect.INCOMPLETE_TERMINAL_CODON_VARIANT: "incomplete terminal codon", + VariantEffect.START_RETAINED_VARIANT: "start retained", + VariantEffect.STOP_RETAINED_VARIANT: "stop retainined", + VariantEffect.SYNONYMOUS_VARIANT: "synonymous", + VariantEffect.CODING_SEQUENCE_VARIANT: "coding sequence", + VariantEffect.MATURE_MIRNA_VARIANT: "mature miRNA", + VariantEffect.FIVE_PRIME_UTR_VARIANT: "5UTR", + VariantEffect.THREE_PRIME_UTR_VARIANT: "3UTR", + VariantEffect.NON_CODING_TRANSCRIPT_EXON_VARIANT: "non-coding transcript exon", + VariantEffect.INTRON_VARIANT: "intronic", + VariantEffect.NMD_TRANSCRIPT_VARIANT: "NMD transcript", + VariantEffect.NON_CODING_TRANSCRIPT_VARIANT: "non-coding transcript", + VariantEffect.UPSTREAM_GENE_VARIANT: "upstream of gene", + VariantEffect.DOWNSTREAM_GENE_VARIANT: "downstream of gene", + VariantEffect.TFBS_ABLATION: "TFBS ablation", + VariantEffect.TFBS_AMPLIFICATION: "TFBS amplification", + VariantEffect.TF_BINDING_SITE_VARIANT: "TFBS binding site", + VariantEffect.REGULATORY_REGION_ABLATION: "regulatory region ablation", + VariantEffect.REGULATORY_REGION_AMPLIFICATION: "regulatory region amplification", + VariantEffect.FEATURE_ELONGATION: "feature elongation", + VariantEffect.REGULATORY_REGION_VARIANT: "regulatory region", + VariantEffect.FEATURE_TRUNCATION: "feature truncation", + VariantEffect.INTERGENIC_VARIANT: "intergenic", + VariantEffect.SEQUENCE_VARIANT: "sequence variant", +} + +so_id_to_display = { + "SO:1000029": "chromosomal deletion", + "SO:1000037": "chromosomal duplication", + "SO:1000030": "chromosomal_inversion", + "SO:1000044": "chromosomal_translocation", +} diff --git a/src/gpsea/view/__init__.py b/src/gpsea/view/__init__.py index 74d580e0..d6a6c22e 100644 --- a/src/gpsea/view/__init__.py +++ b/src/gpsea/view/__init__.py @@ -1,5 +1,5 @@ from ._cohort import CohortViewable -from ._all_variant_viewable import AllVariantViewable +from ._cohort_variant_viewer import CohortVariantViewer from ._disease import DiseaseViewable from ._phenotype_analysis import summarize_hpo_analysis from ._protein_viewer import ProteinViewable @@ -10,7 +10,7 @@ from ._formatter import VariantFormatter __all__ = [ - 'AllVariantViewable', + 'CohortVariantViewer', 'CohortViewable', 'ProteinVisualizer', 'ProteinVisualizable', 'ProteinViewable', 'DiseaseViewable', diff --git a/src/gpsea/view/_all_variant_viewable.py b/src/gpsea/view/_cohort_variant_viewer.py similarity index 76% rename from src/gpsea/view/_all_variant_viewable.py rename to src/gpsea/view/_cohort_variant_viewer.py index a941c58e..737a2c20 100644 --- a/src/gpsea/view/_all_variant_viewable.py +++ b/src/gpsea/view/_cohort_variant_viewer.py @@ -3,24 +3,25 @@ from jinja2 import Environment, PackageLoader from collections import namedtuple, defaultdict -from gpsea.model import Cohort -from gpsea.model._variant import Variant, VariantEffect +from gpsea.model import Cohort, Variant, VariantEffect from ._formatter import VariantFormatter ToDisplay = namedtuple('ToDisplay', ['hgvs_cdna', 'hgvsp', 'variant_effects']) -VariantData = namedtuple('VariantData', ['variant_key', 'hgvs_cdna', 'hgvsp', 'variant_effects'] ) +VariantData = namedtuple('VariantData', ['variant_key', 'hgvs_cdna', 'hgvsp', 'variant_effects']) -class AllVariantViewable: + +class CohortVariantViewer: """ - Class to create a viewable object that is uses a Jinja2 template to create an HTML element - for display in the Jupyter notebook. + `AllVariantViewer` creates an HTML report with the cohort variants. + + The report can be either written into an HTML file or displayed in a Jupyter notebook. """ def __init__( - self, - transcript_id: str + self, + transcript_id: str ): """ Args: @@ -39,23 +40,22 @@ def process( only_hgvs: bool = True ) -> str: """ - Create an HTML that should be shown with display(HTML(..)) of the ipython package. + Create an HTML that should be shown with ``display(HTML(..))`` of the ipython package. Args: - cohort (Cohort): The cohort being analyzed in the current Notebook - transcript_id (str): the transcript that we map variants onto + cohort (Cohort): The cohort being analyzed in the current notebook. + only_hgvs (bool): Do not show the transcript ID part of the HGVS annotation, just the annotation. Returns: str: an HTML string with parameterized template for rendering """ - context = self._prepare_context(cohort, transcript_id=self._transcript_id, only_hgvs=only_hgvs) + context = self._prepare_context(cohort, only_hgvs=only_hgvs) return self._cohort_template.render(context) def _prepare_context( self, cohort: Cohort, - transcript_id: typing.Optional[str], - only_hgvs + only_hgvs: bool, ) -> typing.Mapping[str, typing.Any]: variant_count_dictionary = defaultdict(int) variant_effect_count_dictionary = defaultdict(int) @@ -74,9 +74,9 @@ def _prepare_context( variant_counts.append( { "variant_key": var_data.variant_key, - "variant": var_data.hgvs_cdna, - "variant_name": var_data.hgvs_cdna, - "protein_name": var_data.hgvsp, + "variant": var_data.hgvs_cdna, + "variant_name": var_data.hgvs_cdna, + "protein_name": var_data.hgvsp, "variant_effects": ", ".join(var_data.variant_effects), "count": count, } @@ -88,7 +88,6 @@ def _prepare_context( "count": count } ) - print(f"variants eee {len(variant_effect_counts)}") variant_counts = sorted(variant_counts, key=lambda row: row.get("count"), reverse=True) variant_effect_counts = sorted(variant_effect_counts, key=lambda row: row.get("count"), reverse=True) @@ -100,7 +99,6 @@ def _prepare_context( "total_unique_allele_count": len(variant_counts) } - def _get_variant_data( self, variant: Variant, @@ -109,12 +107,11 @@ def _get_variant_data( """ Get user-friendly strings (e.g., HGVS for our target transcript) to match to the chromosomal strings Args: - cohort (Cohort): The cohort being analyzed in the current Notebook - transcript_id (str): the transcript that we map variants onto + variant (Variant): The variant to be formatted. only_hgvs (bool): do not show the transcript ID part of the HGVS annotation, just the annotation. Returns: - typing.Mapping[str, ToDisplay]: key: variant key, value: namedtuple(display (e.g. HGVS) string of variant, hgvsp protein string of variant) + VariantData: a named tuple with variant data formatted for human consumption. """ variant_key = variant.variant_info.variant_key if variant.variant_info.has_sv_info(): @@ -122,7 +119,12 @@ def _get_variant_data( gene_symbol = sv_info.gene_symbol display = f"SV involving {gene_symbol}" effect = VariantEffect.structural_so_id_to_display(so_term=sv_info.structural_type) - return VariantData(variant_key=variant_key, hgvs_cdna = display, hgvsp = "p.?", variant_effects=[effect]) + return VariantData( + variant_key=variant_key, + hgvs_cdna=display, + hgvsp="p.?", + variant_effects=[effect], + ) else: variant_key = variant.variant_info.variant_key display = self._var_formatter.format_as_string(variant) @@ -136,13 +138,19 @@ def _get_variant_data( if only_hgvs: # do not show the transcript id fields_dna = display.split(":") - fields_ps = hgvsp.split(":") if hgvsp is not None else [None] if len(fields_dna) > 1: display_hgvs_cDNA = fields_dna[1] else: display_hgvs_cDNA = fields_dna[0] + + fields_ps = hgvsp.split(":") if hgvsp is not None else (None,) if len(fields_ps) > 1: hgvsp = fields_ps[1] else: hgvsp = fields_ps[0] - return VariantData(variant_key=variant_key, hgvs_cdna = display_hgvs_cDNA, hgvsp = hgvsp, variant_effects=var_effects) + return VariantData( + variant_key=variant_key, + hgvs_cdna=display_hgvs_cDNA, + hgvsp=hgvsp, + variant_effects=var_effects, + ) diff --git a/tests/conftest.py b/tests/conftest.py index 64ed2e81..907b7d83 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -111,12 +111,19 @@ def suox_cohort( @pytest.fixture(scope='session') -def suox_gt_predicate() -> GenotypePolyPredicate: +def suox_mane_tx_id() -> str: + return 'NM_001032386.2' + + +@pytest.fixture(scope='session') +def suox_gt_predicate( + suox_mane_tx_id: str, +) -> GenotypePolyPredicate: # To bin the patients to a group with >1 MISSENSE variant or 0 MISSENSE variants. - suox_mane_tx_id = 'NM_001032386.2' + return boolean_predicate( variant_predicate=VariantPredicates.variant_effect( - effect=VariantEffect.MISSENSE_VARIANT, + effect=VariantEffect.MISSENSE_VARIANT, tx_id=suox_mane_tx_id ) ) diff --git a/tests/view/test_variant_viewer.py b/tests/view/test_variant_viewer.py new file mode 100644 index 00000000..169940da --- /dev/null +++ b/tests/view/test_variant_viewer.py @@ -0,0 +1,16 @@ +import pytest + +from gpsea.model import Cohort +from gpsea.view import CohortVariantViewer + + +@pytest.mark.skip("For manual run only") +def test_viewer( + suox_mane_tx_id: str, + suox_cohort: Cohort, +): + viewer = CohortVariantViewer(transcript_id=suox_mane_tx_id) + html = viewer.process(suox_cohort) + + with open("all_variants.html", "w") as fh: + fh.write(html) From f0edb42dba0d26db376e84e8a9c5083c1c27cb8a Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Mon, 9 Sep 2024 21:47:49 +0200 Subject: [PATCH 17/44] Include the `CohortVariantViewer` in the tutorial. --- docs/report/tbx5_all_variants.html | 583 +++++++++++++++++++++++ docs/tutorial.rst | 33 +- src/gpsea/view/_cohort_variant_viewer.py | 14 +- tests/view/test_variant_viewer.py | 2 +- 4 files changed, 624 insertions(+), 8 deletions(-) create mode 100644 docs/report/tbx5_all_variants.html diff --git a/docs/report/tbx5_all_variants.html b/docs/report/tbx5_all_variants.html new file mode 100644 index 00000000..a9618162 --- /dev/null +++ b/docs/report/tbx5_all_variants.html @@ -0,0 +1,583 @@ + + + + + Cohort + + + + +

GPSEA cohort analysis: All variant alleles

+ + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+

Variant alleles

+ A total of 53 unique alleles were identified in the cohort. +
Variant keyVariant (cDNA)Variant (protein)EffectsCount
12_114385521_114385521_C_Tc.710G>Ap.Arg237Glnmissense22
12_114401830_114401830_C_Tc.238G>Ap.Gly80Argmissense20
12_114385563_114385563_G_Ac.668C>Tp.Thr223Metmissense8
12_114398675_114398675_G_Tc.408C>Ap.Tyr136Terstop gained6
12_114399514_114399514_A_Cc.361T>Gp.Trp121Glymissense, splice region5
12_114403792_114403792_C_CGc.106_107insCp.Ser36ThrfsTer25frameshift5
12_114398682_114398682_C_CGc.400dupp.Arg134ProfsTer49frameshift5
12_114385474_114385474_A_Gc.755+2T>CNonesplice donor4
12_114398656_114398656_C_CGc.426dupp.Ala143ArgfsTer40frameshift4
12_114366360_114366360_C_Tc.787G>Ap.Val263Metmissense4
12_114385522_114385522_G_Ac.709C>Tp.Arg237Trpmissense4
12_114403798_114403798_G_GCc.100dupp.Ala34GlyfsTer27frameshift4
12_114399613_114399613_T_Ac.262A>Tp.Lys88Terstop gained3
12_114401827_114401827_T_Ac.241A>Tp.Arg81Trpmissense, splice region3
12_114366312_114366312_G_Ac.835C>Tp.Arg279Terstop gained3
12_114366366_114366366_T_Ac.781A>Tp.Ser261Cysmissense3
12_114403798_114403799_GC_Gc.100delp.Ala34ProfsTer32frameshift3
12_114385475_114385475_C_Tc.755+1G>ANonesplice donor3
12_114401853_114401853_G_Tc.215C>Ap.Thr72Lysmissense3
12_114385521_114385521_C_Gc.710G>Cp.Arg237Promissense2
12_114398568_114398568_C_Ac.510+5G>TNonesplice donor 5th base, intronic2
12_114366207_114366208_GC_Gc.939delp.Gln315ArgfsTer79frameshift2
12_114366274_114366274_G_Tc.873C>Ap.Tyr291Terstop gained2
12_114366267_114366267_C_Ac.880G>Tp.Glu294Terstop gained2
12_114398578_114398579_CA_Cc.504delp.Phe168LeufsTer6frameshift2
12_114394762_114394763_CA_Cc.641delp.Val214GlyfsTer12frameshift2
12_114398666_114398667_TG_Tc.416delp.Pro139GlnfsTer11frameshift2
12_114403754_114403754_G_Tc.145C>Ap.Gln49Lysmissense, splice region2
12_114385550_114385550_A_AATTATTCTCAGc.680_681insCTGAGAATAATp.Ile227_Glu228insTerinframe insertion, stop retainined2
12_114366348_114366349_CT_Cc.798delp.Val267TrpfsTer127frameshift1
12_114399559_114399559_T_Cc.316A>Gp.Ile106Valmissense1
12_114356064_114356065_TA_Tc.1024delp.Tyr342ThrfsTer52frameshift1
12_114355755_114355756_TG_Tc.1333delp.His445MetfsTer137frameshift1
12_114398632_114398632_G_Ac.451C>Tp.Gln151Terstop gained1
12_114401907_114401907_A_Gc.161T>Cp.Ile54Thrmissense1
12_114355723_114355723_G_Ac.1366C>Tp.Gln456Terstop gained1
12_114398602_114398602_T_Gc.481A>Cp.Thr161Promissense1
12_114398708_114398709_GC_Gc.374delp.Gly125AlafsTer25frameshift1
12_114385553_114385553_C_Ac.678G>Tp.Lys226Asnmissense1
12_114399633_114399633_C_Gc.243-1G>CNonesplice acceptor1
12_114401873_114401874_TA_Tc.194delp.Leu65GlnfsTer10frameshift1
12_114399594_114399594_A_Cc.281T>Gp.Leu94Argmissense1
12_114399622_114399622_G_Tc.253C>Ap.Pro85Thrmissense1
12_114394743_114394746_TGTG_Tc.658_660delp.His220delinframe deletion1
12_114394820_114394820_C_Gc.584G>Cp.Gly195Alamissense1
12_114401846_114401846_C_Gc.222G>Cp.Met74Ilemissense1
12_114366241_114366242_CT_Cc.905delp.Gln302ArgfsTer92frameshift1
12_114355784_114355785_CA_Cc.1304delp.Leu435ArgfsTer147frameshift1
12_114398626_114398627_CG_Cc.456delp.Val153SerfsTer21frameshift1
12_114403859_114403859_G_Tc.40C>Ap.Pro14Thrmissense1
12_114399625_114399629_ACATC_Ac.246_249delp.Met83PhefsTer6frameshift1
12_114394817_114394817_G_Cc.587C>Gp.Ser196Terstop gained1
12_114401921_114401921_C_Gc.148-1G>CNonesplice acceptor1
+ + + + + \ No newline at end of file diff --git a/docs/tutorial.rst b/docs/tutorial.rst index dcecee15..687969ca 100644 --- a/docs/tutorial.rst +++ b/docs/tutorial.rst @@ -103,7 +103,13 @@ We loaded the patient data into a `cohort` which is ready for the next steps. Explore cohort ^^^^^^^^^^^^^^ -We can now explore the cohort to see how many patients are included. +GPSEA helps with gaining insight into the cohort by providing + + +Show cohort summary +------------------- + +The summary report provides an overview about the HPO terms, variants, diseases, and variant effects that occurr most frequently: >>> from gpsea.view import CohortViewable >>> viewer = CohortViewable(hpo) @@ -121,6 +127,10 @@ We can now explore the cohort to see how many patients are included. from IPython.display import HTML, display display(HTML(report)) + +Plot distribution of variants with respect to the protein sequence +------------------------------------------------------------------ + Now we can show the distribution of variants with respect to the encoded protein. We first obtain `tx_coordinates` (:class:`~gpsea.model.TranscriptCoordinates`) and `protein_meta` (:class:`~gpsea.model.ProteinMetadata`) @@ -154,6 +164,27 @@ and we follow with plotting the diagram of the mutations on the protein: :width: 600px +.. _show-cohort-variants: + +Summarize all variant alleles +----------------------------- + +We can prepare a table of all variant alleles that occurr in the cohort. +Each table row corresponds to a single allele and lists the variant key, +the predicted effect on the transcript (*cDNA*) and protein of interest, +the variant effects, and the number of patients who present +with one or more variant alleles (*Count*): + +>>> from gpsea.view import CohortVariantViewer +>>> viewer = CohortVariantViewer(tx_id=tx_id) +>>> report = viewer.process(cohort=cohort) +>>> with open('docs/report/tbx5_all_variants.html', 'w') as fh: # doctest: +SKIP +... _ = fh.write(report) + +.. raw:: html + :file: report/tbx5_all_variants.html + + Prepare genotype and phenotype predicates ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/src/gpsea/view/_cohort_variant_viewer.py b/src/gpsea/view/_cohort_variant_viewer.py index 737a2c20..016c8cf9 100644 --- a/src/gpsea/view/_cohort_variant_viewer.py +++ b/src/gpsea/view/_cohort_variant_viewer.py @@ -17,22 +17,24 @@ class CohortVariantViewer: `AllVariantViewer` creates an HTML report with the cohort variants. The report can be either written into an HTML file or displayed in a Jupyter notebook. + + See :ref:show-cohort-variants: for an example usage. """ def __init__( self, - transcript_id: str + tx_id: str ): """ Args: - transcript_id(str): The transcript identifier (Usually, the MANE RefSeq transcript, that should start with "NM_") + tx_id (str): The transcript identifier (Usually, the MANE RefSeq transcript, that should start with "NM_") """ environment = Environment(loader=PackageLoader('gpsea.view', 'templates')) self._cohort_template = environment.get_template("all_variants.html") - self._var_formatter = VariantFormatter(transcript_id) - if not transcript_id.startswith("NM"): - print(f"[WARNING] Non-RefSeq transcript id: {transcript_id}") - self._transcript_id = transcript_id + self._var_formatter = VariantFormatter(tx_id) + if not tx_id.startswith("NM"): + print(f"[WARNING] Non-RefSeq transcript id: {tx_id}") + self._transcript_id = tx_id def process( self, diff --git a/tests/view/test_variant_viewer.py b/tests/view/test_variant_viewer.py index 169940da..31fe12bb 100644 --- a/tests/view/test_variant_viewer.py +++ b/tests/view/test_variant_viewer.py @@ -9,7 +9,7 @@ def test_viewer( suox_mane_tx_id: str, suox_cohort: Cohort, ): - viewer = CohortVariantViewer(transcript_id=suox_mane_tx_id) + viewer = CohortVariantViewer(tx_id=suox_mane_tx_id) html = viewer.process(suox_cohort) with open("all_variants.html", "w") as fh: From 8ccfed60e4ab4b4a6c80f7bc3c31228d49c07703 Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Mon, 9 Sep 2024 21:51:42 +0200 Subject: [PATCH 18/44] Fix doctest, tweak pydoc. --- src/gpsea/model/_variant_effects.py | 4 ++-- src/gpsea/view/_cohort_variant_viewer.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/gpsea/model/_variant_effects.py b/src/gpsea/model/_variant_effects.py index 632d56a4..9567592d 100644 --- a/src/gpsea/model/_variant_effects.py +++ b/src/gpsea/model/_variant_effects.py @@ -73,9 +73,9 @@ def to_display(self) -> str: ^^^^^^^ >>> from gpsea.model import VariantEffect - >>> VariantEffect.MISSENSE_VARIANT + >>> VariantEffect.MISSENSE_VARIANT.to_display() 'missense' - >>> VariantEffect.SPLICE_DONOR_5TH_BASE_VARIANT + >>> VariantEffect.SPLICE_DONOR_5TH_BASE_VARIANT.to_display() 'splice donor 5th base' :returns: a `str` with the name or `'n/a'` if the variant effect was not assigned a concise name. diff --git a/src/gpsea/view/_cohort_variant_viewer.py b/src/gpsea/view/_cohort_variant_viewer.py index 016c8cf9..0933ce02 100644 --- a/src/gpsea/view/_cohort_variant_viewer.py +++ b/src/gpsea/view/_cohort_variant_viewer.py @@ -14,7 +14,7 @@ class CohortVariantViewer: """ - `AllVariantViewer` creates an HTML report with the cohort variants. + `CohortVariantViewer` creates an HTML report with the cohort variants. The report can be either written into an HTML file or displayed in a Jupyter notebook. @@ -133,7 +133,7 @@ def _get_variant_data( tx_annotation = variant.get_tx_anno_by_tx_id(self._transcript_id) if tx_annotation is not None: hgvsp = tx_annotation.hgvsp - var_effects = [VariantEffect.to_display(var_eff) for var_eff in tx_annotation.variant_effects] + var_effects = [var_eff.to_display() for var_eff in tx_annotation.variant_effects] else: hgvsp = None var_effects = [] From af675e5c3c2eb715aa8a161976ae9cc2c649db26 Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Tue, 10 Sep 2024 15:56:40 +0200 Subject: [PATCH 19/44] Add codes to reasons for filtering out an HPO term in MTC filter. Do not show untested HPO terms in the summary frame. Fixes #228 --- docs/report/tbx5_frameshift_vs_missense.csv | 26 +- ...bx5_frameshift_vs_missense.mtc_report.html | 77 ++---- docs/tutorial.rst | 10 +- docs/user-guide/mtc.rst | 126 +++++----- docs/user-guide/report/tbx5_frameshift.csv | 28 +-- .../report/tbx5_frameshift.mtc_report.html | 81 ++---- docs/user-guide/stats.rst | 9 +- src/gpsea/analysis/mtc_filter/_impl.py | 238 +++++++++++++----- src/gpsea/view/_phenotype_analysis.py | 8 +- src/gpsea/view/_stats.py | 17 +- src/gpsea/view/templates/stats.html | 5 +- tests/analysis/test_mtc_filter.py | 1 - tests/view/test_stats.py | 2 +- 13 files changed, 338 insertions(+), 290 deletions(-) diff --git a/docs/report/tbx5_frameshift_vs_missense.csv b/docs/report/tbx5_frameshift_vs_missense.csv index 772f062f..76666c2a 100644 --- a/docs/report/tbx5_frameshift_vs_missense.csv +++ b/docs/report/tbx5_frameshift_vs_missense.csv @@ -1,22 +1,18 @@ Genotype group,Missense,Missense,Frameshift,Frameshift,, ,Count,Percent,Count,Percent,Corrected p values,p values -Ventricular septal defect [HP:0001629],31/60,52%,19/19,100%,0.0009552459156234353,5.6190936213143254e-05 -Abnormal atrioventricular conduction [HP:0005150],0/22,0%,3/3,100%,0.003695652173913043,0.00043478260869565214 -Atrioventricular block [HP:0001678],0/22,0%,2/2,100%,0.015398550724637682,0.0036231884057971015 -Heart block [HP:0012722],0/22,0%,2/2,100%,0.015398550724637682,0.0036231884057971015 -Absent thumb [HP:0009777],12/71,17%,14/31,45%,0.0191369345329502,0.005628510156750059 -Patent ductus arteriosus [HP:0001643],3/37,8%,2/2,100%,0.038236617183985605,0.01349527665317139 -Triphalangeal thumb [HP:0001199],13/72,18%,13/32,41%,0.062175372424826694,0.02560162393963452 -Cardiac conduction abnormality [HP:0031546],14/36,39%,3/3,100%,0.15811357916621074,0.07440639019586388 -Secundum atrial septal defect [HP:0001684],14/35,40%,4/22,18%,0.2690764879148444,0.1424522583078588 -Muscular ventricular septal defect [HP:0011623],6/59,10%,6/25,24%,0.2868675985983051,0.1687456462342971 -Pulmonary arterial hypertension [HP:0002092],4/6,67%,0/2,0%,0.6623376623376622,0.42857142857142855 -Hypoplasia of the ulna [HP:0003022],1/12,8%,2/10,20%,0.8095238095238093,0.5714285714285713 +Ventricular septal defect [HP:0001629],31/60,52%,19/19,100%,0.0008990549794102921,5.6190936213143254e-05 +Abnormal atrioventricular conduction [HP:0005150],0/22,0%,3/3,100%,0.003478260869565217,0.00043478260869565214 +Atrioventricular block [HP:0001678],0/22,0%,2/2,100%,0.01932367149758454,0.0036231884057971015 +Absent thumb [HP:0009777],12/71,17%,14/31,45%,0.022514040627000236,0.005628510156750059 +Patent ductus arteriosus [HP:0001643],3/37,8%,2/2,100%,0.04318488529014845,0.01349527665317139 +Triphalangeal thumb [HP:0001199],13/72,18%,13/32,41%,0.06827099717235872,0.02560162393963452 +Cardiac conduction abnormality [HP:0031546],14/36,39%,3/3,100%,0.17007174901911745,0.07440639019586388 +Secundum atrial septal defect [HP:0001684],14/35,40%,4/22,18%,0.2849045166157176,0.1424522583078588 +Muscular ventricular septal defect [HP:0011623],6/59,10%,6/25,24%,0.29999225997208373,0.1687456462342971 +Pulmonary arterial hypertension [HP:0002092],4/6,67%,0/2,0%,0.6857142857142857,0.42857142857142855 +Hypoplasia of the ulna [HP:0003022],1/12,8%,2/10,20%,0.831168831168831,0.5714285714285713 Hypoplasia of the radius [HP:0002984],30/62,48%,6/14,43%,1.0,0.7735491022101784 Atrial septal defect [HP:0001631],42/44,95%,20/20,100%,1.0,1.0 Short thumb [HP:0009778],11/41,27%,8/30,27%,1.0,1.0 Absent radius [HP:0003974],7/32,22%,6/25,24%,1.0,1.0 Short humerus [HP:0005792],7/17,41%,4/9,44%,1.0,1.0 -Abnormal atrial septum morphology [HP:0011994],43/43,100%,20/20,100%,, -Abnormal cardiac septum morphology [HP:0001671],62/62,100%,28/28,100%,, -Abnormal heart morphology [HP:0001627],62/62,100%,30/30,100%,, diff --git a/docs/report/tbx5_frameshift_vs_missense.mtc_report.html b/docs/report/tbx5_frameshift_vs_missense.mtc_report.html index b6ac1747..14f949c3 100644 --- a/docs/report/tbx5_frameshift_vs_missense.mtc_report.html +++ b/docs/report/tbx5_frameshift_vs_missense.mtc_report.html @@ -48,9 +48,9 @@

Phenotype testing report

Phenotype MTC filter: HPO MTC filter

Multiple testing correction: fdr_bh

-

Performed statistical tests for 17 out of the total of 260 HPO terms.

+

Performed statistical tests for 16 out of the total of 260 HPO terms.

- + @@ -59,80 +59,45 @@

Phenotype testing report

- - + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + - - - - + + + - - - - + + + - - - - + + + - - - - + + + - - - - + + + diff --git a/docs/tutorial.rst b/docs/tutorial.rst index dcecee15..8917ce46 100644 --- a/docs/tutorial.rst +++ b/docs/tutorial.rst @@ -246,15 +246,15 @@ Now we can perform the analysis and investigate the results. ... pheno_predicates=pheno_predicates, ... ) >>> result.total_tests -17 +16 -We only tested 1y HPO terms. This is despite the individuals being collectively annotated with +We only tested 16 HPO terms. This is despite the individuals being collectively annotated with 260 direct and indirect HPO terms >>> len(result.phenotypes) 260 -We can show the reasoning behind *not* testing 243 (`260 - 17`) HPO terms +We can show the reasoning behind *not* testing 244 (`260 - 16`) HPO terms by exploring the phenotype MTC filtering report. >>> from gpsea.view import MtcStatsViewer @@ -266,11 +266,11 @@ by exploring the phenotype MTC filtering report. .. raw:: html :file: report/tbx5_frameshift_vs_missense.mtc_report.html -and these are the top 20 HPO terms ordered by the p value corrected with the Benjamini-Hochberg procedure: +and these are the tested HPO terms ordered by the p value corrected with the Benjamini-Hochberg procedure: >>> from gpsea.view import summarize_hpo_analysis >>> summary_df = summarize_hpo_analysis(hpo, result) ->>> summary_df.head(20).to_csv('docs/report/tbx5_frameshift_vs_missense.csv') # doctest: +SKIP +>>> summary_df.to_csv('docs/report/tbx5_frameshift_vs_missense.csv') # doctest: +SKIP .. csv-table:: *TBX5* frameshift vs missense :file: report/tbx5_frameshift_vs_missense.csv diff --git a/docs/user-guide/mtc.rst b/docs/user-guide/mtc.rst index 070539d1..b01c95d1 100644 --- a/docs/user-guide/mtc.rst +++ b/docs/user-guide/mtc.rst @@ -194,60 +194,74 @@ The constructor takes a threshold as an argument (e.g. 20% in the example above) and the method's logic is made up of 8 individual heuristics designed to skip testing the HPO terms that are unlikely to yield significant or interesting results: -#. Skip terms that occur very rarely - The ``term_frequency_threshold`` determines the mininum proportion of individuals - with direct or indirect annotation by the HPO term to test. - We check each of the genotype groups (e.g., MISSENSE vs. not-MISSENSE), and we only retain a term for testing - if the proportion of individuals in at least one genotype group is greater than or equal to ``term_frequency_threshold``. - - This is because of our assumption that even if there is statistical significance, - if a term is only seen in (for example) 7% of individuals in the MISSENSE group and 2% in the not-MISSENSE group, - the term is unlikely to be of great interest because it is rare. - -#. Skip terms if no cell has more than one count - In a related heuristic, we skip terms if no genotype group has more than one count. - This is not completely redundant with the previous condition, - because some terms may have a small number of total observations. - -#. Skip terms if all counts are identical to counts for a child term - Let's say a term such as - `Posterior polar cataract (HP:0001115) `_ - was observed in 7 of 11 individuals with MISSENSE variants - and in 3 of 8 individuals with NONSENSE variants. - If we find the same patient counts (7 of 11 and 3 of 8) in the parent term - `Polar cataract HP:0010696 `_, - then we choose to not test the parent term. - - This is because the more specific an HPO term is, - the more information it has (the more interesting the correlation would be if it exists), - and the result of the Fisher Exact test for *Polar cataract* - would be exactly the same as for *Posterior polar cataract*. - -#. Skip terms if genotypes have same HPO proportions - If both (or all) of the genotype groups have the same proportion of individuals - observed to be annotated to an HPO term, e.g., both are 50%, then skip the term, - because it is not possible that the Fisher exact test will return a significant result. - -#. Skip terms if there are no HPO observations in a group - If one of the genotype groups has neither observed nor excluded observations for an HPO term, skip it. - -#. Skipping terms that are not descendents of `Phenotypic abnormality (HP:0000118) `_ - The HPO has a number of other branches that describe modes of inheritance, - past medical history, and clinical modifiers. - We do not think it makes much sense to test for enrichment of these terms, - and so they are filtered out. - -#. Skipping "general" level terms - All the direct children of the root phenotype term - `Phenotypic abnormality (HP:0000118) `_ are skipped, - because of the assumption that if there is a valid signal, - it will derive from one of the more specific descendents. - - For instance, `Abnormality of the nervous system (HP:0000707) `_ - is a child of *Phenotypic abnormality*, and this assumption implies - that if there is a signal from the nervous system, - it will lead to at least one of the descendents of - *Abnormality of the nervous system* being significant. - - See :ref:`general-hpo-terms` section for details. ++------------+-------------------+--------------------------------------------------------------------------------------------+ +| Code | Name | Description | ++------------+-------------------+--------------------------------------------------------------------------------------------+ +| `HMF01` | Skip terms that | The ``term_frequency_threshold`` determines the mininum proportion of individuals | +| | occur very rarely | with direct or indirect annotation by the HPO term to test. | +| | | We check each of the genotype groups (e.g., MISSENSE vs. not-MISSENSE), | +| | | and we only retain a term for testing if the proportion of individuals | +| | | in at least one genotype group is greater than | +| | | or equal to ``term_frequency_threshold``. | +| | | This is because of our assumption that even if there is statistical significance, | +| | | if a term is only seen in (for example) 7% of individuals | +| | | in the MISSENSE group and 2% in the not-MISSENSE group, | +| | | the term is unlikely to be of great interest because it is rare. | ++------------+-------------------+--------------------------------------------------------------------------------------------+ +| `HMF02` | Skip terms if | In a related heuristic, we skip terms if no genotype group has more | +| | no cell has more | than one count. This is not completely redundant with the previous condition, | +| | than one count | because some terms may have a small number of total observations. | ++------------+-------------------+--------------------------------------------------------------------------------------------+ +| `HMF03` | Skip terms if | Let's say a term such as | +| | all counts are | `Posterior polar cataract (HP:0001115) `_ | +| | identical | was observed in 7 of 11 individuals with MISSENSE variants | +| | to counts | and in 3 of 8 individuals with NONSENSE variants. | +| | for a child | If we find the same patient counts (7 of 11 and 3 of 8) in the parent term | +| | term | `Polar cataract HP:0010696 `_, | +| | | then we choose to not test the parent term. | +| | | | +| | | This is because the more specific an HPO term is, | +| | | the more information it has (the more interesting the correlation would be if it exists), | +| | | and the result of a test, such as the Fisher Exact test, would be exactly the same | +| | | for *Polar cataract* as for *Posterior polar cataract*. | ++------------+-------------------+--------------------------------------------------------------------------------------------+ +| `HMF04` | Skip terms if | If both (or all) of the genotype groups have the same proportion of individuals | +| | genotypes have | observed to be annotated to an HPO term, e.g., both are 50%, then skip the term, | +| | same HPO | because it is not possible that the Fisher exact test will return a significant result. | +| | proportions | | ++------------+-------------------+--------------------------------------------------------------------------------------------+ +| `HMF05` | Skip terms if | If one of the genotype groups has neither observed nor excluded observations | +| | there are no | for an HPO term, skip it. | +| | HPO observations | | +| | in a group | | ++------------+-------------------+--------------------------------------------------------------------------------------------+ +| `HMF06` | Skip term if | If the individuals are binned into 2 phenotype groups and 2 genotype groups (2x2) | +| | underpowered | and the total count of patients in all genotype-phenotype groups is less than 7, | +| | for 2x2 or 2x3 | or into 2 phenotype groups and 3 genotype groups (2x3) and the total count of patients | +| | analysis | is less than 6, then there is a lack even of the nominal statistical power | +| | | and the counts can never be significant. | ++------------+-------------------+--------------------------------------------------------------------------------------------+ +| `HMF07` | Skipping terms | The HPO has a number of other branches that describe modes of inheritance, | +| | that are not | past medical history, and clinical modifiers. | +| | descendents of | We do not think it makes much sense to test for enrichment of these terms, | +| | *Phenotypic* | so, all terms that are not descendants of | +| | *abnormality* | `Phenotypic abnormality `_ are filtered out. | +| | | | ++------------+-------------------+--------------------------------------------------------------------------------------------+ +| `HMF08` | Skipping | All the direct children of the root phenotype term | +| | "general" | `Phenotypic abnormality (HP:0000118) `_ | +| | level terms | are skipped, because of the assumption that if there is a valid signal, | +| | | it will derive from one of the more specific descendents. | +| | | | +| | | For instance, | +| | |`Abnormality of the nervous system `_ | +| | | (HP:0000707) is a child of *Phenotypic abnormality*, and this assumption implies | +| | | that if there is a signal from the nervous system, | +| | | it will lead to at least one of the descendents of | +| | | *Abnormality of the nervous system* being significant. | +| | | | +| | | See :ref:`general-hpo-terms` section for details. | +| | | | ++------------+-------------------+--------------------------------------------------------------------------------------------+ + diff --git a/docs/user-guide/report/tbx5_frameshift.csv b/docs/user-guide/report/tbx5_frameshift.csv index 4851c198..39c76f0b 100644 --- a/docs/user-guide/report/tbx5_frameshift.csv +++ b/docs/user-guide/report/tbx5_frameshift.csv @@ -1,22 +1,18 @@ What is the genotype group,HOM_REF,HOM_REF,HET,HET,, ,Count,Percent,Count,Percent,Corrected p values,p values -Ventricular septal defect [HP:0001629],42/71,59%,19/19,100%,0.00411275392326226,0.00024192670136836825 -Abnormal atrioventricular conduction [HP:0005150],1/23,4%,3/3,100%,0.01307692307692308,0.0015384615384615387 -Absent thumb [HP:0009777],18/100,18%,14/31,45%,0.021044138590779585,0.003713671516019927 -Atrioventricular block [HP:0001678],1/23,4%,2/2,100%,0.034,0.01 -Heart block [HP:0012722],1/23,4%,2/2,100%,0.034,0.01 -Patent ductus arteriosus [HP:0001643],6/40,15%,2/2,100%,0.09214092140921408,0.03252032520325203 -Secundum atrial septal defect [HP:0001684],23/55,42%,4/22,18%,0.1440020479198931,0.06544319142266644 -Triphalangeal thumb [HP:0001199],23/99,23%,13/32,41%,0.1440020479198931,0.06932119159387057 -Cardiac conduction abnormality [HP:0031546],15/37,41%,3/3,100%,0.1440020479198931,0.08259109311740892 -Muscular ventricular septal defect [HP:0011623],8/84,10%,6/25,24%,0.1440020479198931,0.08470708701170182 -Pulmonary arterial hypertension [HP:0002092],8/14,57%,0/2,0%,0.6899307928951143,0.4666666666666667 -Short thumb [HP:0009778],25/69,36%,8/30,27%,0.6899307928951143,0.48700997145537483 +Ventricular septal defect [HP:0001629],42/71,59%,19/19,100%,0.003870827221893892,0.00024192670136836825 +Abnormal atrioventricular conduction [HP:0005150],1/23,4%,3/3,100%,0.01230769230769231,0.0015384615384615387 +Absent thumb [HP:0009777],18/100,18%,14/31,45%,0.01980624808543961,0.003713671516019927 +Atrioventricular block [HP:0001678],1/23,4%,2/2,100%,0.04,0.01 +Patent ductus arteriosus [HP:0001643],6/40,15%,2/2,100%,0.10406504065040649,0.03252032520325203 +Secundum atrial septal defect [HP:0001684],23/55,42%,4/22,18%,0.15059037690969213,0.06544319142266644 +Triphalangeal thumb [HP:0001199],23/99,23%,13/32,41%,0.15059037690969213,0.06932119159387057 +Cardiac conduction abnormality [HP:0031546],15/37,41%,3/3,100%,0.15059037690969213,0.08259109311740892 +Muscular ventricular septal defect [HP:0011623],8/84,10%,6/25,24%,0.15059037690969213,0.08470708701170182 +Pulmonary arterial hypertension [HP:0002092],8/14,57%,0/2,0%,0.708378140298727,0.4666666666666667 +Short thumb [HP:0009778],25/69,36%,8/30,27%,0.708378140298727,0.48700997145537483 Absent radius [HP:0003974],9/43,21%,6/25,24%,1.0,0.7703831604944444 +Atrial septal defect [HP:0001631],63/65,97%,20/20,100%,1.0,1.0 Hypoplasia of the radius [HP:0002984],34/75,45%,6/14,43%,1.0,1.0 Hypoplasia of the ulna [HP:0003022],3/17,18%,2/10,20%,1.0,1.0 Short humerus [HP:0005792],8/21,38%,4/9,44%,1.0,1.0 -Atrial septal defect [HP:0001631],63/65,97%,20/20,100%,1.0,1.0 -Aplasia/Hypoplasia of the thumb [HP:0009601],40/40,100%,19/19,100%,, -Aplasia/Hypoplasia of fingers [HP:0006265],44/44,100%,19/19,100%,, -Aplasia/hypoplasia involving bones of the hand [HP:0005927],44/44,100%,19/19,100%,, diff --git a/docs/user-guide/report/tbx5_frameshift.mtc_report.html b/docs/user-guide/report/tbx5_frameshift.mtc_report.html index 3d00300a..242057f9 100644 --- a/docs/user-guide/report/tbx5_frameshift.mtc_report.html +++ b/docs/user-guide/report/tbx5_frameshift.mtc_report.html @@ -48,9 +48,9 @@

Phenotype testing report

Phenotype MTC filter: HPO MTC filter

Multiple testing correction: fdr_bh

-

Performed statistical tests for 17 out of the total of 260 HPO terms.

+

Performed statistical tests for 16 out of the total of 260 HPO terms.

Using HPO MTC filter, 243 term(s) were omitted from statistical analysis.Using HPO MTC filter, 244 term(s) were omitted from statistical analysis.
Code
TODOHMF01 Skipping term with maximum frequency that was less than threshold 0.2 51
TODOSkipping general term44
TODOSkipping term because all genotypes have same HPO observed proportions41
TODOSkipping term with only 2 observations (not powered for 2x2)26
TODOSkipping term with only 1 observations (not powered for 2x2)24
TODOSkipping term with only 3 observations (not powered for 2x2)22HMF02Skipping term because no genotype has more than one observed HPO count3
TODOSkipping term with only 4 observations (not powered for 2x2)14HMF03Skipping term because of a child term with the same individual counts1
TODOSkipping term with only 6 observations (not powered for 2x2)12HMF04Skipping term because all genotypes have same HPO observed proportions41
TODOSkipping term with only 5 observations (not powered for 2x2)4HMF05Skipping term because one genotype had zero observations2
TODOSkipping term because no genotype has more than one observed HPO count3HMF06Skipping term with less than 7 observations (not powered for 2x2)102
TODOSkipping term because one genotype had zero observations2HMF08Skipping general term44
- + @@ -59,80 +59,45 @@

Phenotype testing report

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + - - - - + + + - - - - + + + - - - - + + + - - - - + + + - - - - + + + - - - - + + + diff --git a/docs/user-guide/stats.rst b/docs/user-guide/stats.rst index 5132b4b5..991fbd34 100644 --- a/docs/user-guide/stats.rst +++ b/docs/user-guide/stats.rst @@ -230,10 +230,10 @@ We can now execute the analysis: >>> len(result.phenotypes) 260 >>> result.total_tests -17 +16 -Thanks to Phenotype MTC filter, we only tested 17 out of 260 terms. +Thanks to Phenotype MTC filter, we only tested 16 out of 260 terms. We can learn more by showing the MTC filter report: >>> from gpsea.view import MtcStatsViewer @@ -251,12 +251,11 @@ Genotype phenotype associations ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Last, let's explore the associations. The results include a table with all tested HPO terms -ordered by the corrected p value (Benjamini-Hochberg FDR). -Here we show the top 20 table rows: +ordered by the corrected p value (Benjamini-Hochberg FDR): >>> from gpsea.view import summarize_hpo_analysis >>> summary_df = summarize_hpo_analysis(hpo, result) ->>> summary_df.head(20).to_csv('docs/user-guide/report/tbx5_frameshift.csv') # doctest: +SKIP +>>> summary_df.to_csv('docs/user-guide/report/tbx5_frameshift.csv') # doctest: +SKIP .. csv-table:: *TBX5* frameshift vs rest :file: report/tbx5_frameshift.csv diff --git a/src/gpsea/analysis/mtc_filter/_impl.py b/src/gpsea/analysis/mtc_filter/_impl.py index c52d10b9..332a53a2 100644 --- a/src/gpsea/analysis/mtc_filter/_impl.py +++ b/src/gpsea/analysis/mtc_filter/_impl.py @@ -1,10 +1,10 @@ import abc +import dataclasses import typing from collections import deque import hpotk -from matplotlib import axis import numpy as np import pandas as pd @@ -12,51 +12,83 @@ from ..predicate.phenotype import PhenotypePolyPredicate, P +@dataclasses.dataclass(eq=True, frozen=True) +class PhenotypeMtcIssue: + """ + The container for data available regarding the reason why a phenotype was filtered out. + """ + + code: str + """ + A `str` with a unique code of the issue. + """ + + reason: str + """ + A human-friendly explanation of the issue. + """ + + class PhenotypeMtcResult: """ `PhenotypeMtcResult` represents a result of :class:`PhenotypeMtcFilter` for a single phenotype. The phenotype can either pass the filter, in order to be included in the downstream analysis (:meth:`is_passed`) - of be filtered out (:meth:`is_filtered_out`) in which case :attr:`reason` must be available. + of be filtered out (:meth:`is_filtered_out`) in which case :attr:`mtc_issue` with more context + regarding culprit must be available. """ @staticmethod def ok() -> "PhenotypeMtcResult": # A singleton would be nice here... - return PhenotypeMtcResult(status=True, reason=None) + return PhenotypeMtcResult(status=True, issue=None) @staticmethod - def fail(reason: str) -> "PhenotypeMtcResult": - return PhenotypeMtcResult(status=False, reason=reason) + def fail(code: str, reason: str) -> "PhenotypeMtcResult": + issue = PhenotypeMtcIssue(code=code, reason=reason) + return PhenotypeMtcResult(status=False, issue=issue) def __init__( self, status: bool, - reason: typing.Optional[str], + issue: typing.Optional[PhenotypeMtcIssue], ): + assert isinstance(status, bool) + if status: + assert issue is None, '`issue` must NOT be provided if the HPO term passed the MTC filter' + else: + assert issue is not None, '`issue` must be provided if the HPO term failed the MTC filter' + self._status = status - self._reason = reason + self._issue = issue def is_passed(self) -> bool: return self._status def is_filtered_out(self) -> bool: return not self._status - + + @property + def mtc_issue(self) -> typing.Optional[PhenotypeMtcIssue]: + return self._issue + @property def reason(self) -> typing.Optional[str]: - return self._reason + if self.mtc_issue is None: + return None + else: + return self.mtc_issue.reason def __eq__(self, value: object) -> bool: return isinstance(value, PhenotypeMtcResult) \ and self._status == value._status \ - and self._reason == value._reason + and self._issue == value._issue def __hash__(self) -> int: - return hash((self._status, self._reason)) + return hash((self._status, self._issue)) def __str__(self) -> str: - return f'PhenotypeMtcResult(status={self._status}, reason={self._reason})' + return f'PhenotypeMtcResult(status={self._status}, issue={self._issue})' def __repr__(self) -> str: return str(self) @@ -71,6 +103,14 @@ class PhenotypeMtcFilter(typing.Generic[P], metaclass=abc.ABCMeta): Therefore, the expected input asks for :class:`hpotk.TermId` items. For instance, `n_usable` is a mapping from an *HPO term* to an `int` with the count of the patients categorized according to the HPO term. + + :attr:`PhenotypeMtcFilter.OK` is returned for HPO terms that pass MTC filtering + and *should* be included in the analysis. + """ + + OK = PhenotypeMtcResult.ok() + """ + The MTC result for the phenotypes that pass the filtering and *should* be included in the analysis. """ @abc.abstractmethod @@ -91,6 +131,13 @@ def filter( """ pass + @abc.abstractmethod + def possible_results(self) -> typing.Collection[PhenotypeMtcResult]: + """ + Return all possible result types which the `PhenotypeMtcFilter` can produce. + """ + pass + @abc.abstractmethod def filter_method_name(self) -> str: """ @@ -106,9 +153,6 @@ class UseAllTermsMtcFilter(PhenotypeMtcFilter[typing.Any]): See :ref:`use-all-terms-strategy` section for more info. """ - def __init__(self): - self._ok = PhenotypeMtcResult.ok() - def filter( self, gt_predicate: GenotypePolyPredicate, @@ -116,7 +160,10 @@ def filter( counts: typing.Sequence[pd.DataFrame], ) -> typing.Sequence[PhenotypeMtcResult]: # Always OK! 😏 - return tuple(self._ok for _ in ph_predicates) + return tuple(PhenotypeMtcFilter.OK for _ in ph_predicates) + + def possible_results(self) -> typing.Collection[PhenotypeMtcResult]: + return (PhenotypeMtcFilter.OK,) def filter_method_name(self) -> str: return "All HPO terms" @@ -133,6 +180,11 @@ class SpecifiedTermsMtcFilter(PhenotypeMtcFilter[hpotk.TermId]): See :ref:`specify-terms-strategy` section for more info. """ + + NON_SPECIFIED_TERM = PhenotypeMtcResult.fail(code="ST1", reason="Non-specified term") + """ + The MTC filtering result returned when an HPO term does not belong among the selection of terms to be tested. + """ def __init__( self, @@ -142,8 +194,6 @@ def __init__( Args: terms_to_test: an iterable of TermIds representing the terms to test """ - self._ok = PhenotypeMtcResult.ok() - self._fail = PhenotypeMtcResult.fail("Non-specified term") self._terms_to_test_set = set(terms_to_test) def filter( @@ -155,10 +205,16 @@ def filter( results = [] for predicate in ph_predicates: if predicate.phenotype in self._terms_to_test_set: - results.append(self._ok) + results.append(SpecifiedTermsMtcFilter.OK) else: - results.append(self._fail) + results.append(SpecifiedTermsMtcFilter.NON_SPECIFIED_TERM) return tuple(results) + + def possible_results(self) -> typing.Collection[PhenotypeMtcResult]: + return ( + PhenotypeMtcFilter.OK, + SpecifiedTermsMtcFilter.NON_SPECIFIED_TERM, + ) def filter_method_name(self) -> str: return "Specified terms MTC filter" @@ -174,16 +230,21 @@ class HpoMtcFilter(PhenotypeMtcFilter[hpotk.TermId]): We recommend creating an instance using the :func:`default_filter` static factory method. """ - # TODO: this has been here before. Is it still valid? - # 2. Do not perform a test if the counts in the genotype categories do not even have nominal statistical power - # for i in range(2,6): - # for j in range(2,6): - # # create a table with the most extreme possible counts. If this is not significant, then - # # other counts also cannot be - # two_by_two_table = [[i, 0], [0, j]] - # oddsr, p = scipy.stats.fisher_exact(two_by_two_table, alternative='two-sided') - # This code reveals that (2,4), (4,2), (2,3), (3,3), (2,2) and (3,2) are not powered so we can always skip them - # ## TODO -- similar calculate for 3x2 + NO_GENOTYPE_HAS_MORE_THAN_ONE_HPO = PhenotypeMtcResult.fail( + "HMF02", "Skipping term because no genotype has more than one observed HPO count", + ) + SAME_COUNT_AS_THE_ONLY_CHILD = PhenotypeMtcResult.fail( + "HMF03", "Skipping term because of a child term with the same individual counts", + ) + SKIPPING_SAME_OBSERVED_PROPORTIONS = PhenotypeMtcResult.fail( + "HMF04", + "Skipping term because all genotypes have same HPO observed proportions", + ) + SKIPPING_SINCE_ONE_GENOTYPE_HAD_ZERO_OBSERVATIONS = PhenotypeMtcResult.fail( + "HMF05", "Skipping term because one genotype had zero observations" + ) + SKIPPING_NON_PHENOTYPE_TERM = PhenotypeMtcResult.fail("HMF07", "Skipping non phenotype term") + SKIPPING_GENERAL_TERM = PhenotypeMtcResult.fail("HMF08", "Skipping general term") @staticmethod def default_filter( @@ -262,13 +323,40 @@ def __init__( """ self._hpo = hpo self._hpo_term_frequency_filter = term_frequency_threshold - # The following numbers of total observations in the genotype groups can never be significant, - # so we just skip them see above explanation - self._powerless_set = {(2, 4), (4, 2), (2, 3), (3, 3), (2, 2), (3, 2)} - # thus if the total count is 6 or less, there is no power - CAN WE SIMPLIFY? self._general_hpo_terms = set(general_hpo_terms) - self._ok = PhenotypeMtcResult.ok() + + self._below_frequency_threshold = PhenotypeMtcResult.fail( + code="HMF01", + reason="Skipping term with maximum frequency that was" + f" less than threshold {self._hpo_term_frequency_filter}", + ) + + # Do not perform a test if the counts in the genotype categories do not even have nominal statistical power + # The following numbers of total observations in the genotype groups can never be significant, + # so we just skip them. + # ``` + # for i in range(2,6): + # for j in range(2,6): + # # create a table with the most extreme possible counts. If this is not significant, then + # # other counts also cannot be + # two_by_two_table = [[i, 0], [0, j]] + # oddsr, p = scipy.stats.fisher_exact(two_by_two_table, alternative='two-sided') + # ``` + # This code reveals that (2,4), (4,2), (2,3), (3,3), (2,2) and (3,2) are not powered so we can always skip them + # Thus if the total count is 6 or less, there is no power. + self._min_observations_for_2_by_2 = 7 + self._not_powered_for_2_by_2 = PhenotypeMtcResult.fail( + code="HMF06", + reason=f"Skipping term with less than {self._min_observations_for_2_by_2} observations" + " (not powered for 2x2)", + ) + self._min_observations_for_2_by_3 = 6 + self._not_powered_for_2_by_3 = PhenotypeMtcResult.fail( + code="HMF06", + reason=f"Skipping term with less than {self._min_observations_for_2_by_3} observations" + " (not powered for 2x3)", + ) def filter( self, @@ -290,46 +378,39 @@ def filter( continue if term_id in self._general_hpo_terms: - results[idx] = PhenotypeMtcResult.fail("Skipping general term") + results[idx] = HpoMtcFilter.SKIPPING_GENERAL_TERM continue if not self._hpo.graph.is_ancestor_of( hpotk.constants.hpo.base.PHENOTYPIC_ABNORMALITY, term_id ): - results[idx] = PhenotypeMtcResult.fail("Skipping non phenotype term") + results[idx] = HpoMtcFilter.SKIPPING_NON_PHENOTYPE_TERM continue - # get total number of observations - # if term_id not in all_counts: - # reason_for_filtering_out["Skipping non-target term"] += 1 - # continue - ph_predicate = ph_predicates[idx] contingency_matrix = counts[idx] - total = contingency_matrix.sum().sum() + max_freq = HpoMtcFilter.get_maximum_group_observed_HPO_frequency( contingency_matrix, ph_predicate=ph_predicate, ) if max_freq < self._hpo_term_frequency_filter: - reason = ( - "Skipping term with maximum frequency " - f"that was less than threshold {self._hpo_term_frequency_filter}" - ) - results[idx] = PhenotypeMtcResult.fail(reason) + results[idx] = self._below_frequency_threshold continue - if contingency_matrix.shape == (2, 2) and total < 7: - reason = f"Skipping term with only {total} observations (not powered for 2x2)" - results[idx] = PhenotypeMtcResult.fail(reason) + total_count = contingency_matrix.sum().sum() + if contingency_matrix.shape == (2, 2) and total_count < self._min_observations_for_2_by_2: + results[idx] = self._not_powered_for_2_by_2 + continue + elif contingency_matrix.shape == (2, 3) and total_count < self._min_observations_for_2_by_3: + results[idx] = self._not_powered_for_2_by_3 continue if not HpoMtcFilter.some_cell_has_greater_than_one_count( counts=contingency_matrix, ph_predicate=ph_predicate, ): - reason = "Skipping term because no genotype has more than one observed HPO count" - results[idx] = PhenotypeMtcResult.fail(reason) + results[idx] = HpoMtcFilter.NO_GENOTYPE_HAS_MORE_THAN_ONE_HPO continue elif HpoMtcFilter.genotypes_have_same_hpo_proportions( @@ -337,29 +418,40 @@ def filter( gt_predicate=gt_predicate, ph_predicate=ph_predicate, ): - reason = "Skipping term because all genotypes have same HPO observed proportions" - results[idx] = PhenotypeMtcResult.fail(reason) + results[idx] = HpoMtcFilter.SKIPPING_SAME_OBSERVED_PROPORTIONS continue elif HpoMtcFilter.one_genotype_has_zero_hpo_observations( counts=contingency_matrix, gt_predicate=gt_predicate, ): - reason = "Skipping term because one genotype had zero observations" - results[idx] = PhenotypeMtcResult.fail(reason) + results[idx] = HpoMtcFilter.SKIPPING_SINCE_ONE_GENOTYPE_HAD_ZERO_OBSERVATIONS continue - # TODO: the code below actually did not work. - # for child_term_id in self._hpo.graph.get_children(term_id): - # if child_term_id in tested_counts_pf: - # if tested_counts_pf[child_term_id].equals(contingency_matrix): - # # TODO: should we make the match fuzzier by adding a tolerance instead of the exact matches? - # reason = "Child term with same counts previously tested" - # reason_for_filtering_out[reason] += 1 - # continue - + # Check if the term has exactly one child with a very similar number of individuals + # in the genotype and phenotype groups. + child_contingency_matrix = None + for child in self._hpo.graph.get_children(term_id): + if child in p_to_idx: + child_idx = p_to_idx[child] + if child_contingency_matrix is None: + child_contingency_matrix = counts[child_idx] + else: + # Found 2nd child of this term. + # We always want to test a term with + # >1 children in the target set. + child_contingency_matrix = None + break + + if child_contingency_matrix is not None: + if (contingency_matrix - child_contingency_matrix).abs().max(axis=None) < 1: + # Do not test if the count is exactly the same to the counts in the only child term. + results[idx] = HpoMtcFilter.SAME_COUNT_AS_THE_ONLY_CHILD + continue + + # ## # The term should be tested if we get here. - results[idx] = self._ok + results[idx] = PhenotypeMtcFilter.OK # There should be no `None` elements in `results` at this point. if any(r is None for r in results): @@ -375,6 +467,18 @@ def filter( # Ignoring the type hint, because we checked the type match above. return tuple(results) # type: ignore + def possible_results(self) -> typing.Collection[PhenotypeMtcResult]: + return ( + PhenotypeMtcFilter.OK, + HpoMtcFilter.SKIPPING_GENERAL_TERM, + HpoMtcFilter.SKIPPING_NON_PHENOTYPE_TERM, + self._below_frequency_threshold, + self._not_powered_for_2_by_2, + HpoMtcFilter.NO_GENOTYPE_HAS_MORE_THAN_ONE_HPO, + HpoMtcFilter.SKIPPING_SAME_OBSERVED_PROPORTIONS, + HpoMtcFilter.SKIPPING_SINCE_ONE_GENOTYPE_HAD_ZERO_OBSERVATIONS, + ) + def filter_method_name(self) -> str: return "HPO MTC filter" diff --git a/src/gpsea/view/_phenotype_analysis.py b/src/gpsea/view/_phenotype_analysis.py index 75b65866..256c3fb3 100644 --- a/src/gpsea/view/_phenotype_analysis.py +++ b/src/gpsea/view/_phenotype_analysis.py @@ -11,6 +11,8 @@ def summarize_hpo_analysis( """ Create a dataframe with counts, frequencies, and p values for the tested HPO terms. + The HPO terms that were not tested will *not* be included in the frame. + :param hpo: HPO data. :param result: the HPO term analysis results to show. """ @@ -51,10 +53,12 @@ def summarize_hpo_analysis( # Last, sort by corrected p value or just p value df = df.set_index(labeled_idx) + # and only report the tested HPO terms + with_p_value = df[("", p_val_col_name)].notna() if result.corrected_pvals is not None: - return df.sort_values(by=[("", corrected_p_val_col_name), ("", p_val_col_name)]) + return df.sort_values(by=[("", corrected_p_val_col_name), ("", p_val_col_name)]).loc[with_p_value] else: - return df.sort_values(by=("", p_val_col_name)) + return df.sort_values(by=("", p_val_col_name)).loc[with_p_value] def format_term_id( diff --git a/src/gpsea/view/_stats.py b/src/gpsea/view/_stats.py index b232ad15..5d9e1ab1 100644 --- a/src/gpsea/view/_stats.py +++ b/src/gpsea/view/_stats.py @@ -43,12 +43,19 @@ def _prepare_context( counts = Counter() for result in report.mtc_filter_results: if result.is_filtered_out(): - counts[result.reason] += 1 + counts[result.mtc_issue] += 1 n_skipped = 0 - reason_to_count = list() - for reason, count in sorted(counts.items(), key=lambda item: item[1], reverse=True): - reason_to_count.append({"reason": reason, "count": count}) + issue_to_count = list() + for mtc_issue, count in sorted( + counts.items(), + key=lambda issue2count: (issue2count[0].code, issue2count[0].reason) + ): + issue_to_count.append({ + "code": mtc_issue.code, + "reason": mtc_issue.reason, + "count": count, + }) n_skipped += count n_all = len(report.phenotypes) @@ -61,5 +68,5 @@ def _prepare_context( "skipped_hpo_count": n_skipped, "tested_hpo_count": n_tested, "total_hpo_count": n_all, - "reason_to_count": reason_to_count, + "issue_to_count": issue_to_count, } diff --git a/src/gpsea/view/templates/stats.html b/src/gpsea/view/templates/stats.html index 2a75c055..3fd48ed5 100644 --- a/src/gpsea/view/templates/stats.html +++ b/src/gpsea/view/templates/stats.html @@ -57,10 +57,9 @@

Phenotype testing report

- {% for skipped in reason_to_count %} + {% for skipped in issue_to_count %} - - + diff --git a/tests/analysis/test_mtc_filter.py b/tests/analysis/test_mtc_filter.py index 02710efd..88e085dd 100644 --- a/tests/analysis/test_mtc_filter.py +++ b/tests/analysis/test_mtc_filter.py @@ -1,4 +1,3 @@ -from itertools import count import typing import hpotk diff --git a/tests/view/test_stats.py b/tests/view/test_stats.py index d57b3ae2..f557a3e7 100644 --- a/tests/view/test_stats.py +++ b/tests/view/test_stats.py @@ -49,7 +49,7 @@ def hpo_term_analysis_result( gt_predicate=suox_gt_predicate, mtc_filter_name='Random MTC filter', mtc_filter_results=( - PhenotypeMtcResult.fail("Not too interesting"), + PhenotypeMtcResult.fail("RMF01", "Not too interesting"), PhenotypeMtcResult.ok(), ), mtc_name='fdr_bh', From 43b8ee4e630c7911a8e7f0d81fe11aa3152fed90 Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Tue, 10 Sep 2024 16:00:46 +0200 Subject: [PATCH 20/44] Export `PhenotypeMtcIssue`. --- src/gpsea/analysis/mtc_filter/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/gpsea/analysis/mtc_filter/__init__.py b/src/gpsea/analysis/mtc_filter/__init__.py index 4497ac34..57f0d614 100644 --- a/src/gpsea/analysis/mtc_filter/__init__.py +++ b/src/gpsea/analysis/mtc_filter/__init__.py @@ -5,10 +5,10 @@ See :ref:`MTC filters ` section for more info. """ -from ._impl import PhenotypeMtcFilter, PhenotypeMtcResult +from ._impl import PhenotypeMtcFilter, PhenotypeMtcResult, PhenotypeMtcIssue from ._impl import UseAllTermsMtcFilter, SpecifiedTermsMtcFilter, HpoMtcFilter __all__ = [ - 'PhenotypeMtcFilter', 'PhenotypeMtcResult', + 'PhenotypeMtcFilter', 'PhenotypeMtcResult', 'PhenotypeMtcIssue', 'UseAllTermsMtcFilter', 'SpecifiedTermsMtcFilter', 'HpoMtcFilter', ] From 9c98210f7d5aa8af74ca74473953fb7432fbdb01 Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Tue, 10 Sep 2024 16:01:46 +0200 Subject: [PATCH 21/44] Improve pydoc. --- src/gpsea/analysis/mtc_filter/_impl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gpsea/analysis/mtc_filter/_impl.py b/src/gpsea/analysis/mtc_filter/_impl.py index 332a53a2..ad691c40 100644 --- a/src/gpsea/analysis/mtc_filter/_impl.py +++ b/src/gpsea/analysis/mtc_filter/_impl.py @@ -35,7 +35,7 @@ class PhenotypeMtcResult: The phenotype can either pass the filter, in order to be included in the downstream analysis (:meth:`is_passed`) of be filtered out (:meth:`is_filtered_out`) in which case :attr:`mtc_issue` with more context - regarding culprit must be available. + regarding the culprit must be available. """ @staticmethod From 71cdc91f25cf45be84630bb71961845429313db9 Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Tue, 10 Sep 2024 16:16:24 +0200 Subject: [PATCH 22/44] Fix the p values to reflect the updated MTC filter. --- docs/tutorial.rst | 2 +- docs/user-guide/stats.rst | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/tutorial.rst b/docs/tutorial.rst index e448e8c4..35b0d278 100644 --- a/docs/tutorial.rst +++ b/docs/tutorial.rst @@ -314,4 +314,4 @@ was observed in 31/60 (52%) patients with a missense variant but it was observed in 19/19 (100%) patients with a frameshift variant. Fisher exact test computed a p value of `~0.0000562` and the p value corrected by Benjamini-Hochberg procedure -is `~0.000955`. +is `~0.000899`. diff --git a/docs/user-guide/stats.rst b/docs/user-guide/stats.rst index 991fbd34..5e9c8316 100644 --- a/docs/user-guide/stats.rst +++ b/docs/user-guide/stats.rst @@ -269,7 +269,7 @@ was observed in 31/60 (52%) patients with a missense variant but it was observed in 19/19 (100%) patients with a frameshift variant. Fisher exact test computed a p value of `~0.000242` and the p value corrected by Benjamini-Hochberg procedure -is `~0.00411`. +is `~0.00387`. The table includes all HPO terms of the cohort, including the terms that were not selected for testing and thus have no associated p value. From b694fec4faf6fbacf2bfb0dea40103879083bdee Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Wed, 11 Sep 2024 13:20:51 +0200 Subject: [PATCH 23/44] Remove X-linked modes of inheritance. --- .../predicate/genotype/_gt_predicates.py | 175 +----------------- .../predicate/genotype/test_gt_predicates.py | 56 +----- 2 files changed, 13 insertions(+), 218 deletions(-) diff --git a/src/gpsea/analysis/predicate/genotype/_gt_predicates.py b/src/gpsea/analysis/predicate/genotype/_gt_predicates.py index 341d17fb..a9972044 100644 --- a/src/gpsea/analysis/predicate/genotype/_gt_predicates.py +++ b/src/gpsea/analysis/predicate/genotype/_gt_predicates.py @@ -1,5 +1,4 @@ import dataclasses -import enum import typing from collections import defaultdict @@ -291,23 +290,6 @@ class GenotypeGroup: categorization: Categorization -class MendelianInheritanceAspect(enum.Enum): - AUTOSOMAL = 0 - """ - Related to chromosomes that do *not* determine the sex of an individual. - """ - - GONOSOMAL = 1 - """ - Related to chromosomes that determine the sex of an individual. - """ - - MITOCHONDRIAL = 2 - """ - Related to mitochondrial DNA. - """ - - class ModeOfInheritanceInfo: # NOT PART OF THE PUBLIC API!!! @@ -333,13 +315,6 @@ class ModeOfInheritanceInfo: description="Homozygous alternate or compound heterozygous", ), ) - HEMI = Categorization( - PatientCategory( - cat_id=3, - name="HEMI", - description="Hemizygous", - ), - ) @staticmethod def autosomal_dominant() -> "ModeOfInheritanceInfo": @@ -356,7 +331,6 @@ def autosomal_dominant() -> "ModeOfInheritanceInfo": ), ) return ModeOfInheritanceInfo( - mendelian_inheritance_aspect=MendelianInheritanceAspect.AUTOSOMAL, groups=groups, ) @@ -380,62 +354,11 @@ def autosomal_recessive() -> "ModeOfInheritanceInfo": ), ) return ModeOfInheritanceInfo( - mendelian_inheritance_aspect=MendelianInheritanceAspect.AUTOSOMAL, - groups=groups, - ) - - @staticmethod - def x_dominant() -> "ModeOfInheritanceInfo": - groups = ( - GenotypeGroup( - allele_count=0, - sex=None, - categorization=ModeOfInheritanceInfo.HOM_REF, - ), - GenotypeGroup( - allele_count=1, - sex=None, - categorization=ModeOfInheritanceInfo.HET, - ), - ) - return ModeOfInheritanceInfo( - mendelian_inheritance_aspect=MendelianInheritanceAspect.GONOSOMAL, - groups=groups, - ) - - @staticmethod - def x_recessive() -> "ModeOfInheritanceInfo": - groups = ( - GenotypeGroup( - allele_count=0, - sex=None, - categorization=ModeOfInheritanceInfo.HOM_REF, - ), - GenotypeGroup( - allele_count=1, - sex=Sex.FEMALE, - categorization=ModeOfInheritanceInfo.HET, - ), - GenotypeGroup( - allele_count=2, - sex=Sex.FEMALE, - categorization=ModeOfInheritanceInfo.BIALLELIC_ALT, - ), - GenotypeGroup( - allele_count=1, - sex=Sex.MALE, - categorization=ModeOfInheritanceInfo.HEMI, - ), - ) - - return ModeOfInheritanceInfo( - mendelian_inheritance_aspect=MendelianInheritanceAspect.GONOSOMAL, groups=groups, ) def __init__( self, - mendelian_inheritance_aspect: MendelianInheritanceAspect, groups: typing.Iterable[GenotypeGroup], ): # We want this to be hashable but also keep a non-hashable dict @@ -443,10 +366,6 @@ def __init__( # The correctness depends on two default dicts with same keys and values # comparing equal. hash_value = 17 - assert isinstance(mendelian_inheritance_aspect, MendelianInheritanceAspect) - self._aspect = mendelian_inheritance_aspect - - hash_value += 31 * hash(self._aspect) self._groups = defaultdict(list) for group in groups: @@ -461,10 +380,6 @@ def groups(self) -> typing.Iterator[GenotypeGroup]: # Flatten `values()` which is an iterable of lists. return (group for meta_group in self._groups.values() for group in meta_group) - @property - def mendelian_inheritance_aspect(self) -> MendelianInheritanceAspect: - return self._aspect - def get_groups_for_allele_count( self, allele_count: int, @@ -475,19 +390,9 @@ def get_groups_for_allele_count( # No group for this allele count is OK return () - def is_autosomal(self) -> bool: - return self._aspect == MendelianInheritanceAspect.AUTOSOMAL - - def is_gonosomal(self) -> bool: - return self._aspect == MendelianInheritanceAspect.GONOSOMAL - - def is_mitochondrial(self) -> bool: - return self._aspect == MendelianInheritanceAspect.MITOCHONDRIAL - def __eq__(self, value: object) -> bool: return ( isinstance(value, ModeOfInheritanceInfo) - and self._aspect == value._aspect and self._groups == value._groups ) @@ -495,7 +400,7 @@ def __hash__(self) -> int: return self._hash def __str__(self) -> str: - return f"ModeOfInheritanceInfo(aspect={self._aspect}, groups={self._groups})" + return f"ModeOfInheritanceInfo(groups={self._groups})" def __repr__(self) -> str: return str(self) @@ -518,7 +423,7 @@ def autosomal_dominant( :param variant_predicate: a predicate for choosing the variants for testing. """ - return ModeOfInheritancePredicate.from_moi_info( + return ModeOfInheritancePredicate._from_moi_info( variant_predicate=variant_predicate, mode_of_inheritance_data=ModeOfInheritanceInfo.autosomal_dominant(), ) @@ -535,46 +440,13 @@ def autosomal_recessive( :param variant_predicate: a predicate for choosing the variants for testing. """ - return ModeOfInheritancePredicate.from_moi_info( + return ModeOfInheritancePredicate._from_moi_info( variant_predicate=variant_predicate, mode_of_inheritance_data=ModeOfInheritanceInfo.autosomal_recessive(), ) @staticmethod - def x_dominant( - variant_predicate: VariantPredicate, - ) -> "ModeOfInheritancePredicate": - """ - Create a predicate that assigns the patient either into - homozygous reference or heterozygous - group in line with the X-linked dominant mode of inheritance. - - :param variant_predicate: a predicate for choosing the variants for testing. - """ - return ModeOfInheritancePredicate.from_moi_info( - variant_predicate=variant_predicate, - mode_of_inheritance_data=ModeOfInheritanceInfo.x_dominant(), - ) - - @staticmethod - def x_recessive( - variant_predicate: VariantPredicate, - ) -> "ModeOfInheritancePredicate": - """ - Create a predicate that assigns the patient either into - homozygous reference, heterozygous, biallelic alternative allele - (homozygous alternative or compound heterozygous), or hemizygous - group in line with the X-linked recessive mode of inheritance. - - :param variant_predicate: a predicate for choosing the variants for testing. - """ - return ModeOfInheritancePredicate.from_moi_info( - variant_predicate=variant_predicate, - mode_of_inheritance_data=ModeOfInheritanceInfo.x_recessive(), - ) - - @staticmethod - def from_moi_info( + def _from_moi_info( variant_predicate: VariantPredicate, mode_of_inheritance_data: ModeOfInheritanceInfo, ) -> "ModeOfInheritancePredicate": @@ -620,41 +492,12 @@ def test( ) -> typing.Optional[Categorization]: self._check_patient(patient) - if self._moi_info.is_autosomal(): - allele_count = self._allele_counter.count(patient) - groups = self._moi_info.get_groups_for_allele_count(allele_count) - if len(groups) == 1: - return groups[0].categorization - else: - return None - elif self._moi_info.is_gonosomal(): - if patient.sex.is_provided(): - allele_count = self._allele_counter.count(patient) - groups = self._moi_info.get_groups_for_allele_count(allele_count) - if len(groups) == 0: - # Unable to assign the individual. - return None - elif len(groups) == 1: - # We can only assign into one category no matter what the individual's sex is. - return groups[0].categorization - else: - # We choose depending on the sex. - for group in groups: - if group.sex is not None and group.sex == patient.sex: - return group.categorization - return None - else: - # We must have patient's sex - # to do any meaningful analysis - # in the non-autosomal scenario. - return None - - elif self._moi_info.is_mitochondrial(): - # Cannot deal with mitochondrial inheritance right now. - return None + allele_count = self._allele_counter.count(patient) + groups = self._moi_info.get_groups_for_allele_count(allele_count) + if len(groups) == 1: + return groups[0].categorization else: - # Bug, please report to the developers - raise ValueError("Unexpected mode of inheritance condition") + return None def __eq__(self, value: object) -> bool: return ( diff --git a/tests/analysis/predicate/genotype/test_gt_predicates.py b/tests/analysis/predicate/genotype/test_gt_predicates.py index 61ff29c3..6d82201c 100644 --- a/tests/analysis/predicate/genotype/test_gt_predicates.py +++ b/tests/analysis/predicate/genotype/test_gt_predicates.py @@ -136,62 +136,13 @@ def test_autosomal_recessive( assert categorization.category.name == name - @pytest.mark.parametrize( - "patient_name,name", - [ - ("adam", "HOM_REF"), - ("eve", "HET"), - ("cain", "HET"), - ], - ) - def test_x_dominant( - self, - patient_name: str, - name: str, - variant_predicate: VariantPredicate, - request: pytest.FixtureRequest, - ): - patient = request.getfixturevalue(patient_name) - predicate = ModeOfInheritancePredicate.x_dominant(variant_predicate) - - categorization = predicate.test(patient) - - assert categorization is not None - - assert categorization.category.name == name - - @pytest.mark.parametrize( - "patient_name,name", - [ - ("anakin", "HOM_REF"), - ("padme", "HET"), - ("luke", "HEMI"), - ("leia", "HET"), - ], - ) - def test_x_recessive( - self, - patient_name: str, - name: str, - variant_predicate: VariantPredicate, - request: pytest.FixtureRequest, - ): - patient = request.getfixturevalue(patient_name) - predicate = ModeOfInheritancePredicate.x_recessive(variant_predicate) - - categorization = predicate.test(patient) - - assert categorization is not None - - assert categorization.category.name == name - class TestFilteringPredicate: @pytest.fixture(scope="class") def x_recessive_gt_predicate(self) -> GenotypePolyPredicate: affects_suox = VariantPredicates.gene("SUOX") - return ModeOfInheritancePredicate.x_recessive( + return ModeOfInheritancePredicate.autosomal_recessive( variant_predicate=affects_suox, ) @@ -232,17 +183,18 @@ def test_filtering_predicate__explodes_when_not_subsetting( assert ( ve.value.args[0] - == "It makes no sense to subset the a predicate with 4 categorizations with the same number (4) of targets" + == "It makes no sense to subset the a predicate with 3 categorizations with the same number (3) of targets" ) def test_filtering_predicate__explodes_when_using_random_junk( self, x_recessive_gt_predicate: GenotypePolyPredicate, ): + random_junk = (0, 1) with pytest.raises(ValueError) as ve: filtering_predicate( predicate=x_recessive_gt_predicate, - targets=(0, 1), + targets=random_junk, # type: ignore ) assert ( From 134f85eabe6f62a407d132f26f9bb3b9a5b21de3 Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Wed, 11 Sep 2024 15:03:45 +0200 Subject: [PATCH 24/44] Implement monoallelic and biallelic predicates. --- .../analysis/predicate/genotype/__init__.py | 2 + .../predicate/genotype/_gt_predicates.py | 168 ++++++++++++++++-- tests/analysis/predicate/genotype/conftest.py | 86 +++++++-- .../predicate/genotype/test_gt_predicates.py | 81 +++++++++ 4 files changed, 312 insertions(+), 25 deletions(-) diff --git a/src/gpsea/analysis/predicate/genotype/__init__.py b/src/gpsea/analysis/predicate/genotype/__init__.py index 23ad8cfb..d684972c 100644 --- a/src/gpsea/analysis/predicate/genotype/__init__.py +++ b/src/gpsea/analysis/predicate/genotype/__init__.py @@ -2,12 +2,14 @@ from ._api import VariantPredicate from ._counter import AlleleCounter from ._gt_predicates import boolean_predicate, groups_predicate, filtering_predicate, sex_predicate, diagnosis_predicate +from ._gt_predicates import monoallelic_predicate, biallelic_predicate from ._gt_predicates import ModeOfInheritancePredicate from ._variant import VariantPredicates, ProteinPredicates __all__ = [ 'GenotypePolyPredicate', 'boolean_predicate', 'groups_predicate', 'filtering_predicate', 'sex_predicate', 'diagnosis_predicate', + 'monoallelic_predicate', 'biallelic_predicate', 'ModeOfInheritancePredicate', 'AlleleCounter', 'VariantPredicate', 'VariantPredicates', 'ProteinPredicates', diff --git a/src/gpsea/analysis/predicate/genotype/_gt_predicates.py b/src/gpsea/analysis/predicate/genotype/_gt_predicates.py index a9972044..d21be631 100644 --- a/src/gpsea/analysis/predicate/genotype/_gt_predicates.py +++ b/src/gpsea/analysis/predicate/genotype/_gt_predicates.py @@ -15,7 +15,7 @@ # TODO: implement __hash__, __eq__ on predicates -class AlleleCountingGenotypeBooleanPredicate(GenotypePolyPredicate): +class AlleleCountingGenotypePredicate(GenotypePolyPredicate): # NOT PART OF THE PUBLIC API """ The predicate tests presence of at least one matching allele in the patient. @@ -26,7 +26,7 @@ class AlleleCountingGenotypeBooleanPredicate(GenotypePolyPredicate): @staticmethod def for_variant_predicate(predicate: VariantPredicate): allele_counter = AlleleCounter(predicate=predicate) - return AlleleCountingGenotypeBooleanPredicate(allele_counter=allele_counter) + return AlleleCountingGenotypePredicate(allele_counter=allele_counter) def __init__(self, allele_counter: AlleleCounter): self._allele_counter = allele_counter @@ -34,12 +34,12 @@ def __init__(self, allele_counter: AlleleCounter): def get_categorizations(self) -> typing.Sequence[Categorization]: """ The predicate bins a patient into - :attr:`AlleleCountingGenotypeBooleanPredicate.NO` - or :class:`AlleleCountingGenotypeBooleanPredicate.YES` category. + :attr:`AlleleCountingGenotypePredicate.NO` + or :class:`AlleleCountingGenotypePredicate.YES` category. """ return ( - AlleleCountingGenotypeBooleanPredicate.YES, - AlleleCountingGenotypeBooleanPredicate.NO, + AlleleCountingGenotypePredicate.YES, + AlleleCountingGenotypePredicate.NO, ) def get_question_base(self) -> str: @@ -50,9 +50,9 @@ def test(self, patient: Patient) -> typing.Optional[Categorization]: allele_count = self._allele_counter.count(patient) if allele_count > 0: - return AlleleCountingGenotypeBooleanPredicate.YES + return AlleleCountingGenotypePredicate.YES elif allele_count == 0: - return AlleleCountingGenotypeBooleanPredicate.NO + return AlleleCountingGenotypePredicate.NO else: raise ValueError( f"Allele counter should return a non-negative allele count: {allele_count}" @@ -60,7 +60,7 @@ def test(self, patient: Patient) -> typing.Optional[Categorization]: def __eq__(self, value: object) -> bool: return ( - isinstance(value, AlleleCountingGenotypeBooleanPredicate) + isinstance(value, AlleleCountingGenotypePredicate) and self._allele_counter == value._allele_counter ) @@ -68,7 +68,7 @@ def __hash__(self) -> int: return hash((self._allele_counter,)) def __str__(self) -> str: - return f"AlleleCountingGenotypeBooleanPredicate(allele_counter={self._allele_counter})" + return f"AlleleCountingGenotypePredicate(allele_counter={self._allele_counter})" def __repr__(self) -> str: return str(self) @@ -79,7 +79,7 @@ def boolean_predicate(variant_predicate: VariantPredicate) -> GenotypePolyPredic Create a genotype boolean predicate from given `variant_predicate` to test for presence of at least one matching allele in the patient. """ - return AlleleCountingGenotypeBooleanPredicate.for_variant_predicate( + return AlleleCountingGenotypePredicate.for_variant_predicate( predicate=variant_predicate, ) @@ -186,6 +186,144 @@ def groups_predicate( ) +class MonoallelicGenotypePredicate(GenotypePolyPredicate): + # NOT PART OF THE PUBLIC API + + ZERO = Categorization( + PatientCategory(cat_id=0, name="Zero", description="Zero alleles") + ) + ONE = Categorization( + PatientCategory(cat_id=1, name="One", description="One allele") + ) + + @staticmethod + def for_variant_predicate(predicate: VariantPredicate): + allele_counter = AlleleCounter(predicate=predicate) + return MonoallelicGenotypePredicate(allele_counter=allele_counter) + + def __init__( + self, + allele_counter: AlleleCounter, + ): + self._allele_counter = allele_counter + + def get_categorizations(self) -> typing.Sequence[Categorization]: + return (MonoallelicGenotypePredicate.ZERO, MonoallelicGenotypePredicate.ONE) + + def get_question_base(self) -> str: + return "The variant allele count" + + def test(self, patient: Patient) -> typing.Optional[Categorization]: + self._check_patient(patient) + + allele_count = self._allele_counter.count(patient) + if allele_count == 0: + return MonoallelicGenotypePredicate.ZERO + elif allele_count == 1: + return MonoallelicGenotypePredicate.ONE + else: + return None + + def __eq__(self, value: object) -> bool: + return ( + isinstance(value, MonoallelicGenotypePredicate) + and self._allele_counter == value._allele_counter + ) + + def __hash__(self) -> int: + return hash((self._allele_counter,)) + + def __str__(self) -> str: + return f"MonoallelicGenotypePredicate(allele_counter={self._allele_counter})" + + def __repr__(self) -> str: + return str(self) + + +def monoallelic_predicate(variant_predicate: VariantPredicate) -> GenotypePolyPredicate: + return MonoallelicGenotypePredicate.for_variant_predicate(variant_predicate) + + +class BiallelicGenotypePredicate(GenotypePolyPredicate): + # NOT PART OF THE PUBLIC API + + AA = Categorization(PatientCategory(cat_id=0, name="AA", description="Biallelic A")) + AB = Categorization( + PatientCategory( + cat_id=1, name="AB", description="One copy of A and one copy of B" + ) + ) + BB = Categorization(PatientCategory(cat_id=2, name="BB", description="Biallelic B")) + + @staticmethod + def for_variant_predicates( + a_predicate: VariantPredicate, + b_predicate: VariantPredicate, + ): + return BiallelicGenotypePredicate( + a_counter=AlleleCounter(a_predicate), + b_counter=AlleleCounter(b_predicate), + ) + + def __init__( + self, + a_counter: AlleleCounter, + b_counter: AlleleCounter, + ): + self._a_counter = a_counter + self._b_counter = b_counter + + def get_categorizations(self) -> typing.Sequence[Categorization]: + return ( + BiallelicGenotypePredicate.AA, + BiallelicGenotypePredicate.AB, + BiallelicGenotypePredicate.BB, + ) + + def get_question_base(self) -> str: + return "The variant allele count" + + def test(self, patient: Patient) -> typing.Optional[Categorization]: + self._check_patient(patient) + + a_count = self._a_counter.count(patient) + b_count = self._b_counter.count(patient) + if a_count == 2 and b_count == 0: + return BiallelicGenotypePredicate.AA + elif a_count == 1 and b_count == 1: + return BiallelicGenotypePredicate.AB + elif a_count == 0 and b_count == 2: + return BiallelicGenotypePredicate.BB + else: + return None + + def __eq__(self, value: object) -> bool: + return ( + isinstance(value, BiallelicGenotypePredicate) + and self._a_counter == value._a_counter + and self._b_counter == value._b_counter + ) + + def __hash__(self) -> int: + return hash((self._a_counter, self._b_counter)) + + def __str__(self) -> str: + return f"BiallelicGenotypePredicate(a_counter={self._a_counter}, b_counter={self._b_counter})" + + def __repr__(self) -> str: + return str(self) + + +def biallelic_predicate( + a_predicate: VariantPredicate, + b_predicate: VariantPredicate, +) -> GenotypePolyPredicate: + return BiallelicGenotypePredicate.for_variant_predicates( + a_predicate=a_predicate, + b_predicate=b_predicate, + ) + + class FilteringGenotypePolyPredicate(GenotypePolyPredicate): # NOT PART OF THE PUBLIC API @@ -592,7 +730,7 @@ def create( pass else: raise ValueError(f"{d} is neither `str` nor `hpotk.TermId`") - + diagnosis_ids.append(d) if labels is None: @@ -631,7 +769,7 @@ def get_categorizations(self) -> typing.Sequence[Categorization]: return self._categorizations def get_question_base(self) -> str: - return 'What disease was diagnosed' + return "What disease was diagnosed" def test(self, patient: Patient) -> typing.Optional[Categorization]: categorization = None @@ -641,14 +779,14 @@ def test(self, patient: Patient) -> typing.Optional[Categorization]: except KeyError: # No match for this disease, no problem. continue - + if categorization is None: # First time we found a candidate disease categorization = candidate else: # Ambiguous match. We found several matching diagnoses! return None - + return categorization diff --git a/tests/analysis/predicate/genotype/conftest.py b/tests/analysis/predicate/genotype/conftest.py index 7138e93f..88fb5537 100644 --- a/tests/analysis/predicate/genotype/conftest.py +++ b/tests/analysis/predicate/genotype/conftest.py @@ -336,7 +336,7 @@ def cain( @pytest.fixture(scope="package") -def white_mutation( +def white_missense_mutation( genome_build: GenomeBuild, walt_label: SampleLabels, skyler_label: SampleLabels, @@ -364,7 +364,7 @@ def white_mutation( gene_id="a_gene", tx_id="tx:xyz", hgvs_cdna=None, - is_preferred=False, + is_preferred=True, variant_effects=( VariantEffect.MISSENSE_VARIANT, VariantEffect.SPLICE_DONOR_VARIANT, @@ -386,6 +386,56 @@ def white_mutation( ) +@pytest.fixture(scope="package") +def white_stop_gain_mutation( + genome_build: GenomeBuild, + walt_label: SampleLabels, + skyler_label: SampleLabels, + flynn_label: SampleLabels, + holly_label: SampleLabels, +) -> Variant: + chr22 = genome_build.contig_by_name("chr22") + assert chr22 is not None + return Variant( + variant_info=VariantInfo( + variant_coordinates=VariantCoordinates( + region=GenomicRegion( + contig=chr22, + start=200, + end=201, + strand=Strand.POSITIVE, + ), + ref="T", + alt="G", + change_length=0, + ) + ), + tx_annotations=( + TranscriptAnnotation( + gene_id="a_gene", + tx_id="tx:xyz", + hgvs_cdna=None, + is_preferred=True, + variant_effects=( + VariantEffect.STOP_GAINED, + ), + affected_exons=(5,), + protein_id="pt:xyz", + hgvsp=None, + protein_effect_coordinates=Region(80, 81), + ), + ), + genotypes=Genotypes.from_mapping( + { + walt_label: Genotype.HETEROZYGOUS, + skyler_label: Genotype.HETEROZYGOUS, + flynn_label: Genotype.HOMOZYGOUS_REFERENCE, + holly_label: Genotype.HOMOZYGOUS_ALTERNATE, + } + ), + ) + + @pytest.fixture(scope="package") def walt_label() -> SampleLabels: return SampleLabels("Walt") @@ -394,14 +444,18 @@ def walt_label() -> SampleLabels: @pytest.fixture(scope="package") def walt( walt_label: SampleLabels, - white_mutation: Variant, + white_missense_mutation: Variant, + white_stop_gain_mutation: Variant, ) -> Patient: return Patient( walt_label, sex=Sex.MALE, phenotypes=(), diseases=(), - variants=(white_mutation,), + variants=( + white_missense_mutation, + white_stop_gain_mutation, + ), ) @@ -413,14 +467,18 @@ def skyler_label() -> SampleLabels: @pytest.fixture(scope="package") def skyler( skyler_label: SampleLabels, - white_mutation: Variant, + white_missense_mutation: Variant, + white_stop_gain_mutation: Variant, ) -> Patient: return Patient( skyler_label, sex=Sex.FEMALE, phenotypes=(), diseases=(), - variants=(white_mutation,), + variants=( + white_missense_mutation, + white_stop_gain_mutation, + ), ) @@ -432,14 +490,18 @@ def flynn_label() -> SampleLabels: @pytest.fixture(scope="package") def flynn( flynn_label: SampleLabels, - white_mutation: Variant, + white_missense_mutation: Variant, + white_stop_gain_mutation: Variant, ) -> Patient: return Patient( flynn_label, sex=Sex.MALE, phenotypes=(), diseases=(), - variants=(white_mutation,), + variants=( + white_missense_mutation, + white_stop_gain_mutation, + ), ) @@ -451,14 +513,18 @@ def holly_label() -> SampleLabels: @pytest.fixture(scope="package") def holly( holly_label: SampleLabels, - white_mutation: Variant, + white_missense_mutation: Variant, + white_stop_gain_mutation: Variant, ) -> Patient: return Patient( holly_label, sex=Sex.FEMALE, phenotypes=(), diseases=(), - variants=(white_mutation,), + variants=( + white_missense_mutation, + white_stop_gain_mutation, + ), ) diff --git a/tests/analysis/predicate/genotype/test_gt_predicates.py b/tests/analysis/predicate/genotype/test_gt_predicates.py index 6d82201c..529f5fed 100644 --- a/tests/analysis/predicate/genotype/test_gt_predicates.py +++ b/tests/analysis/predicate/genotype/test_gt_predicates.py @@ -8,6 +8,8 @@ groups_predicate, filtering_predicate, sex_predicate, + monoallelic_predicate, + biallelic_predicate, VariantPredicates, VariantPredicate, ModeOfInheritancePredicate, @@ -137,6 +139,85 @@ def test_autosomal_recessive( assert categorization.category.name == name +class TestAllelePredicates: + + @pytest.mark.parametrize( + "individual_name,expected_name", + [ + ("adam", "Zero"), # 0/0 + ("eve", "One"), # 0/1 + ("cain", "One"), # 0/1 + ], + ) + def test_monoallelic_predicate_ad_family( + self, + individual_name: str, + expected_name: str, + request: pytest.FixtureRequest, + ): + is_missense = VariantPredicates.variant_effect(VariantEffect.MISSENSE_VARIANT, TX_ID) + gt_predicate = monoallelic_predicate(is_missense) + individual = request.getfixturevalue(individual_name) + + actual_cat = gt_predicate.test(individual) + + assert actual_cat is not None + assert actual_cat.category.name == expected_name + + @pytest.mark.parametrize( + "individual_name,not_none,expected_name", + [ + ("walt", True, "One"), # 1/2 + ("skyler", True, "One"), # 1/2 + ("flynn", False, None), # 1/1, hence 2 alleles which is too much to work with monoallelic predicate. + ("holly", True, "Zero"), # 2/2 + ], + ) + def test_monoallelic_predicate_ar_family( + self, + individual_name: str, + not_none: bool, + expected_name: str, + request: pytest.FixtureRequest, + ): + is_missense = VariantPredicates.variant_effect(VariantEffect.MISSENSE_VARIANT, TX_ID) + gt_predicate = monoallelic_predicate(is_missense) + individual = request.getfixturevalue(individual_name) + + actual_cat = gt_predicate.test(individual) + + if not_none: + assert actual_cat is not None + assert actual_cat.category.name == expected_name + else: + assert actual_cat is None + + @pytest.mark.parametrize( + "individual_name,expected_name", + [ + ("walt", "AB"), # 1/2 + ("skyler", "AB"), # 1/2 + ("flynn", "AA"), # 1/1 + ("holly", "BB"), # 2/2 + ], + ) + def test_biallelic_predicate( + self, + individual_name: str, + expected_name: str, + request: pytest.FixtureRequest, + ): + is_missense = VariantPredicates.variant_effect(VariantEffect.MISSENSE_VARIANT, TX_ID) + is_stop_gained = VariantPredicates.variant_effect(VariantEffect.STOP_GAINED, TX_ID) + gt_predicate = biallelic_predicate(is_missense, is_stop_gained) + individual = request.getfixturevalue(individual_name) + + actual_cat = gt_predicate.test(individual) + + assert actual_cat is not None + assert actual_cat.category.name == expected_name + + class TestFilteringPredicate: @pytest.fixture(scope="class") From 81e3d31b0791053282e93c6ff1c96b341d8ebcde Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Wed, 11 Sep 2024 15:09:01 +0200 Subject: [PATCH 25/44] Remove filtering predicate. --- .../predicates/filtering_predicate.rst | 56 ----------- .../predicates/genotype_predicates.rst | 1 - .../analysis/predicate/genotype/__init__.py | 4 +- .../predicate/genotype/_gt_predicates.py | 97 ------------------- .../predicate/genotype/test_gt_predicates.py | 84 ---------------- 5 files changed, 2 insertions(+), 240 deletions(-) delete mode 100644 docs/user-guide/predicates/filtering_predicate.rst diff --git a/docs/user-guide/predicates/filtering_predicate.rst b/docs/user-guide/predicates/filtering_predicate.rst deleted file mode 100644 index a5acef31..00000000 --- a/docs/user-guide/predicates/filtering_predicate.rst +++ /dev/null @@ -1,56 +0,0 @@ -.. _filtering-predicate: - - -=================== -Filtering predicate -=================== - -Sometimes a predicate can bin individuals into more genotype groups than necessary and there may be need -to consider only a subset of the groups. A `GenotypePolyPredicate` -created by :class:`~gpsea.analysis.predicate.genotype.filtering_predicate` can retain only a subset -of the target categorizations of interest. - -Example -------- - -Let's suppose we want test the genotype-phenotype association between variants -that lead to frameshift or a stop gain in a fictional transcript `NM_1234.5`, -and we are specifically interested in comparing the heterozygous variants -in a biallelic alternative allele genotypes (homozygous alternate and compound heterozygous). - -First, we set up a :class:`~gpsea.analysis.predicate.genotype.VariantPredicate` -for testing if a variant introduces a premature stop codon or leads to the shift of the reading frame: - ->>> from gpsea.model import VariantEffect ->>> from gpsea.analysis.predicate.genotype import VariantPredicates ->>> tx_id = 'NM_1234.5' ->>> is_frameshift_or_stop_gain = VariantPredicates.variant_effect(VariantEffect.FRAMESHIFT_VARIANT, tx_id) \ -... | VariantPredicates.variant_effect(VariantEffect.STOP_GAINED, tx_id) ->>> is_frameshift_or_stop_gain.get_question() -'(FRAMESHIFT_VARIANT on NM_1234.5 OR STOP_GAINED on NM_1234.5)' - -Then, we create :class:`~gpsea.analysis.predicate.genotype.ModeOfInheritancePredicate.autosomal_recessive` -to bin according to a genotype group: - ->>> from gpsea.analysis.predicate.genotype import ModeOfInheritancePredicate ->>> gt_predicate = ModeOfInheritancePredicate.autosomal_recessive(is_frameshift_or_stop_gain) ->>> gt_predicate.display_question() -'What is the genotype group: HOM_REF, HET, BIALLELIC_ALT' - -We see that the `gt_predicate` bins the patients into three groups: - ->>> cats = gt_predicate.get_categorizations() ->>> cats -(Categorization(category=HOM_REF), Categorization(category=HET), Categorization(category=BIALLELIC_ALT)) - -We wrap the categorizations of interest along with the `gt_predicate` by the `filtering_predicate` function, -and we will get a :class:`~gpsea.analysis.predicate.genotype.GenotypePolyPredicate` -that includes only the categories of interest: - ->>> from gpsea.analysis.predicate.genotype import filtering_predicate ->>> fgt_predicate = filtering_predicate( -... predicate=gt_predicate, -... targets=(cats[1], cats[2]), -... ) ->>> fgt_predicate.display_question() -'What is the genotype group: HET, BIALLELIC_ALT' \ No newline at end of file diff --git a/docs/user-guide/predicates/genotype_predicates.rst b/docs/user-guide/predicates/genotype_predicates.rst index 70fe391d..f1ff3bf7 100644 --- a/docs/user-guide/predicates/genotype_predicates.rst +++ b/docs/user-guide/predicates/genotype_predicates.rst @@ -24,7 +24,6 @@ the :class:`~gpsea.analysis.predicate.genotype.VariantPredicate` is all about. variant_predicates mode_of_inheritance_predicate - filtering_predicate male_female_predicate diagnosis_predicate groups_predicate diff --git a/src/gpsea/analysis/predicate/genotype/__init__.py b/src/gpsea/analysis/predicate/genotype/__init__.py index d684972c..14036885 100644 --- a/src/gpsea/analysis/predicate/genotype/__init__.py +++ b/src/gpsea/analysis/predicate/genotype/__init__.py @@ -1,14 +1,14 @@ from ._api import GenotypePolyPredicate from ._api import VariantPredicate from ._counter import AlleleCounter -from ._gt_predicates import boolean_predicate, groups_predicate, filtering_predicate, sex_predicate, diagnosis_predicate +from ._gt_predicates import boolean_predicate, groups_predicate, sex_predicate, diagnosis_predicate from ._gt_predicates import monoallelic_predicate, biallelic_predicate from ._gt_predicates import ModeOfInheritancePredicate from ._variant import VariantPredicates, ProteinPredicates __all__ = [ 'GenotypePolyPredicate', - 'boolean_predicate', 'groups_predicate', 'filtering_predicate', 'sex_predicate', 'diagnosis_predicate', + 'boolean_predicate', 'groups_predicate', 'sex_predicate', 'diagnosis_predicate', 'monoallelic_predicate', 'biallelic_predicate', 'ModeOfInheritancePredicate', 'AlleleCounter', 'VariantPredicate', diff --git a/src/gpsea/analysis/predicate/genotype/_gt_predicates.py b/src/gpsea/analysis/predicate/genotype/_gt_predicates.py index d21be631..89abe068 100644 --- a/src/gpsea/analysis/predicate/genotype/_gt_predicates.py +++ b/src/gpsea/analysis/predicate/genotype/_gt_predicates.py @@ -324,103 +324,6 @@ def biallelic_predicate( ) -class FilteringGenotypePolyPredicate(GenotypePolyPredicate): - # NOT PART OF THE PUBLIC API - - @staticmethod - def create( - predicate: "GenotypePolyPredicate", - targets: typing.Collection[Categorization], - ) -> "FilteringGenotypePolyPredicate": - # At least 2 target categorizations must be provided - if len(targets) <= 1: - raise ValueError( - f"At least 2 target categorizations must be provided but got {len(targets)}" - ) - - good_boys = tuple(isinstance(cat, Categorization) for cat in targets) - if not all(good_boys): - offenders = ", ".join( - str(i) for i, is_instance in enumerate(good_boys) if not is_instance - ) - raise ValueError( - f"The targets at following indices are not categorizations: [{offenders}]" - ) - - # All `allowed` categorizations must in fact be present in the `base` predicate. - cats_are_in_fact_present = tuple( - cat in predicate.get_categorizations() for cat in targets - ) - if not all(cats_are_in_fact_present): - missing = ", ".join( - c.category.name - for c, is_present in zip(targets, cats_are_in_fact_present) - if not is_present - ) - raise ValueError(f"Some from the categories are not present: {missing}") - - if len(targets) == predicate.n_categorizations(): - raise ValueError( - f"It makes no sense to subset the a predicate with {predicate.n_categorizations()} categorizations " - f"with the same number ({len(targets)}) of targets" - ) - - return FilteringGenotypePolyPredicate( - predicate=predicate, - allowed=targets, - ) - - def __init__( - self, - predicate: "GenotypePolyPredicate", - allowed: typing.Iterable[Categorization], - ): - self._predicate = predicate - self._allowed = tuple(allowed) - - def get_categorizations(self) -> typing.Sequence[Categorization]: - return self._allowed - - def get_question_base(self) -> str: - return self._predicate.get_question_base() - - def test(self, patient: Patient) -> typing.Optional[Categorization]: - cat = self._predicate.test(patient) - if cat in self._allowed: - return cat - else: - return None - - def __repr__(self): - return f"FilteringGenotypePolyPredicate(predicate={self._predicate}, allowed={self._allowed})" - - -def filtering_predicate( - predicate: GenotypePolyPredicate, - targets: typing.Collection[Categorization], -) -> GenotypePolyPredicate: - """ - Filtering predicate applies the base `predicate` but only returns the categorizations - from the provided `targets` collection. - - This can be useful if only some of the categorizations are interesting. - For instance, if we only seek to compare the differences between heterozygous and hemizygous variants, - but the predicate also bins the patients into homozygous reference, and biallelic alt genotype groups. - - See the :ref:`filtering-predicate` section for an example. - - The `predicate` is checked for being able to produce the all items in `targets` - and the `targets` must include at least 2 categorizations. - - :param predicate: the base predicate whose categorizations are subject to filteration. - :param targets: the categorizations to retain - """ - return FilteringGenotypePolyPredicate.create( - predicate=predicate, - targets=targets, - ) - - @dataclasses.dataclass(eq=True, frozen=True) class GenotypeGroup: allele_count: int diff --git a/tests/analysis/predicate/genotype/test_gt_predicates.py b/tests/analysis/predicate/genotype/test_gt_predicates.py index 529f5fed..cc2e50f6 100644 --- a/tests/analysis/predicate/genotype/test_gt_predicates.py +++ b/tests/analysis/predicate/genotype/test_gt_predicates.py @@ -1,12 +1,9 @@ -import typing - import pytest from gpsea.model import * from gpsea.analysis.predicate.genotype import ( GenotypePolyPredicate, groups_predicate, - filtering_predicate, sex_predicate, monoallelic_predicate, biallelic_predicate, @@ -218,87 +215,6 @@ def test_biallelic_predicate( assert actual_cat.category.name == expected_name -class TestFilteringPredicate: - - @pytest.fixture(scope="class") - def x_recessive_gt_predicate(self) -> GenotypePolyPredicate: - affects_suox = VariantPredicates.gene("SUOX") - return ModeOfInheritancePredicate.autosomal_recessive( - variant_predicate=affects_suox, - ) - - @pytest.mark.parametrize( - "indices, expected", - [ - ((0, 1), 2), - ((1, 0), 2), - ((1, 2), 2), - ], - ) - def test_filtering_predicate( - self, - indices: typing.Sequence[int], - expected: int, - x_recessive_gt_predicate: GenotypePolyPredicate, - ): - cats = x_recessive_gt_predicate.get_categorizations() - targets = [cats[i] for i in indices] - predicate = filtering_predicate( - predicate=x_recessive_gt_predicate, - targets=targets, - ) - - actual = len(predicate.get_categorizations()) - - assert actual == expected - - def test_filtering_predicate__explodes_when_not_subsetting( - self, - x_recessive_gt_predicate: GenotypePolyPredicate, - ): - with pytest.raises(ValueError) as ve: - filtering_predicate( - predicate=x_recessive_gt_predicate, - targets=x_recessive_gt_predicate.get_categorizations(), - ) - - assert ( - ve.value.args[0] - == "It makes no sense to subset the a predicate with 3 categorizations with the same number (3) of targets" - ) - - def test_filtering_predicate__explodes_when_using_random_junk( - self, - x_recessive_gt_predicate: GenotypePolyPredicate, - ): - random_junk = (0, 1) - with pytest.raises(ValueError) as ve: - filtering_predicate( - predicate=x_recessive_gt_predicate, - targets=random_junk, # type: ignore - ) - - assert ( - ve.value.args[0] - == "The targets at following indices are not categorizations: [0, 1]" - ) - - def test_filtering_predicate__explodes_when_using_one_category( - self, - x_recessive_gt_predicate: GenotypePolyPredicate, - ): - with pytest.raises(ValueError) as ve: - filtering_predicate( - predicate=x_recessive_gt_predicate, - targets=(x_recessive_gt_predicate.get_categorizations()[0],), - ) - - assert ( - ve.value.args[0] - == "At least 2 target categorizations must be provided but got 1" - ) - - class TestSexPredicate: def test_sex_predicate( From 328ec489d3c206d18c89476c74af6a79f025f589 Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Wed, 11 Sep 2024 15:21:54 +0200 Subject: [PATCH 26/44] Add docs. --- .../predicate/genotype/_gt_predicates.py | 36 +++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/src/gpsea/analysis/predicate/genotype/_gt_predicates.py b/src/gpsea/analysis/predicate/genotype/_gt_predicates.py index 89abe068..420110d8 100644 --- a/src/gpsea/analysis/predicate/genotype/_gt_predicates.py +++ b/src/gpsea/analysis/predicate/genotype/_gt_predicates.py @@ -241,6 +241,22 @@ def __repr__(self) -> str: def monoallelic_predicate(variant_predicate: VariantPredicate) -> GenotypePolyPredicate: + """ + + The predicate bins patient into one of two groups: `Zero` and `One`: + + +-----------+------------------+ + | Group | Allele count | + +===========+==================+ + | Zero | 0 | + +-----------+------------------+ + | One | 1 | + +-----------+------------------+ + + Individuals with different allele counts (e.g. `2) + are assigned the ``None`` group and, thus, omitted from the analysis. + """ + return MonoallelicGenotypePredicate.for_variant_predicate(variant_predicate) @@ -318,6 +334,26 @@ def biallelic_predicate( a_predicate: VariantPredicate, b_predicate: VariantPredicate, ) -> GenotypePolyPredicate: + """ + Get a predicate for binning the individuals into groups, + with respect to allele counts of variants selected by `a_predicate` and `b_predicate`. + + The predicate bins patient into one of three groups: `AA`, `AB` and `BB`: + + +-----------+------------------+------------------+ + | Group | `A` allele count | `B` allele count | + +===========+==================+==================+ + | AA | 2 | 0 | + +-----------+------------------+------------------+ + | AB | 1 | 1 | + +-----------+------------------+------------------+ + | AA | 0 | 2 | + +-----------+------------------+------------------+ + + Individuals with different allele counts (e.g. :math:`count_{A} = 0` and :math:`count_{B} = 1`) + are assigned the ``None`` group and, thus, omitted from the analysis. + + """ return BiallelicGenotypePredicate.for_variant_predicates( a_predicate=a_predicate, b_predicate=b_predicate, From 7422eb2285235de4f601355d3a1f88b35322fa85 Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Wed, 11 Sep 2024 15:54:06 +0200 Subject: [PATCH 27/44] Simplify the "allelic" predicates. --- .../predicate/genotype/_gt_predicates.py | 178 +++++++----------- tests/analysis/predicate/genotype/conftest.py | 94 +++++++-- .../predicate/genotype/test_gt_predicates.py | 49 ++--- 3 files changed, 159 insertions(+), 162 deletions(-) diff --git a/src/gpsea/analysis/predicate/genotype/_gt_predicates.py b/src/gpsea/analysis/predicate/genotype/_gt_predicates.py index 420110d8..c2bbb5f1 100644 --- a/src/gpsea/analysis/predicate/genotype/_gt_predicates.py +++ b/src/gpsea/analysis/predicate/genotype/_gt_predicates.py @@ -186,148 +186,112 @@ def groups_predicate( ) -class MonoallelicGenotypePredicate(GenotypePolyPredicate): - # NOT PART OF THE PUBLIC API +class PolyCountingGenotypePredicate(GenotypePolyPredicate): - ZERO = Categorization( - PatientCategory(cat_id=0, name="Zero", description="Zero alleles") - ) - ONE = Categorization( - PatientCategory(cat_id=1, name="One", description="One allele") - ) + A = Categorization(PatientCategory(cat_id=0, name="A", description="Monoallelic A")) + B = Categorization(PatientCategory(cat_id=1, name="B", description="Monoallelic B")) + + AA = Categorization(PatientCategory(cat_id=0, name="AA", description="Biallelic A")) + AB = Categorization(PatientCategory(cat_id=1, name="AB", description="Monoallelic A + monoallelic B")) + BB = Categorization(PatientCategory(cat_id=2, name="BB", description="Biallelic B")) @staticmethod - def for_variant_predicate(predicate: VariantPredicate): - allele_counter = AlleleCounter(predicate=predicate) - return MonoallelicGenotypePredicate(allele_counter=allele_counter) - - def __init__( - self, - allele_counter: AlleleCounter, + def monoallelic( + a_predicate: VariantPredicate, + b_predicate: VariantPredicate, ): - self._allele_counter = allele_counter - - def get_categorizations(self) -> typing.Sequence[Categorization]: - return (MonoallelicGenotypePredicate.ZERO, MonoallelicGenotypePredicate.ONE) - - def get_question_base(self) -> str: - return "The variant allele count" - - def test(self, patient: Patient) -> typing.Optional[Categorization]: - self._check_patient(patient) - - allele_count = self._allele_counter.count(patient) - if allele_count == 0: - return MonoallelicGenotypePredicate.ZERO - elif allele_count == 1: - return MonoallelicGenotypePredicate.ONE - else: - return None + count2cat = { + (0, 1): PolyCountingGenotypePredicate.A, + (1, 0): PolyCountingGenotypePredicate.B, + } - def __eq__(self, value: object) -> bool: - return ( - isinstance(value, MonoallelicGenotypePredicate) - and self._allele_counter == value._allele_counter + return PolyCountingGenotypePredicate._for_predicates_and_categories( + count2cat=count2cat, + a_predicate=a_predicate, + b_predicate=b_predicate, ) - def __hash__(self) -> int: - return hash((self._allele_counter,)) - - def __str__(self) -> str: - return f"MonoallelicGenotypePredicate(allele_counter={self._allele_counter})" - - def __repr__(self) -> str: - return str(self) - - -def monoallelic_predicate(variant_predicate: VariantPredicate) -> GenotypePolyPredicate: - """ - - The predicate bins patient into one of two groups: `Zero` and `One`: - - +-----------+------------------+ - | Group | Allele count | - +===========+==================+ - | Zero | 0 | - +-----------+------------------+ - | One | 1 | - +-----------+------------------+ - - Individuals with different allele counts (e.g. `2) - are assigned the ``None`` group and, thus, omitted from the analysis. - """ - - return MonoallelicGenotypePredicate.for_variant_predicate(variant_predicate) - - -class BiallelicGenotypePredicate(GenotypePolyPredicate): - # NOT PART OF THE PUBLIC API + @staticmethod + def biallelic( + a_predicate: VariantPredicate, + b_predicate: VariantPredicate, + ): + count2cat = { + (2, 0): PolyCountingGenotypePredicate.AA, + (1, 1): PolyCountingGenotypePredicate.AB, + (0, 2): PolyCountingGenotypePredicate.BB, + } - AA = Categorization(PatientCategory(cat_id=0, name="AA", description="Biallelic A")) - AB = Categorization( - PatientCategory( - cat_id=1, name="AB", description="One copy of A and one copy of B" + return PolyCountingGenotypePredicate._for_predicates_and_categories( + count2cat=count2cat, + a_predicate=a_predicate, + b_predicate=b_predicate, ) - ) - BB = Categorization(PatientCategory(cat_id=2, name="BB", description="Biallelic B")) - + @staticmethod - def for_variant_predicates( + def _for_predicates_and_categories( + count2cat: typing.Mapping[typing.Tuple[int, int], Categorization], a_predicate: VariantPredicate, b_predicate: VariantPredicate, - ): - return BiallelicGenotypePredicate( + ) -> "PolyCountingGenotypePredicate": + return PolyCountingGenotypePredicate( a_counter=AlleleCounter(a_predicate), b_counter=AlleleCounter(b_predicate), + count2cat=count2cat, ) def __init__( self, + count2cat: typing.Mapping[typing.Tuple[int, int], Categorization], a_counter: AlleleCounter, b_counter: AlleleCounter, ): + self._count2cat = dict(count2cat) + self._categorizations = tuple(count2cat.values()) self._a_counter = a_counter self._b_counter = b_counter - + def get_categorizations(self) -> typing.Sequence[Categorization]: - return ( - BiallelicGenotypePredicate.AA, - BiallelicGenotypePredicate.AB, - BiallelicGenotypePredicate.BB, - ) + return self._categorizations def get_question_base(self) -> str: - return "The variant allele count" + raise NotImplementedError def test(self, patient: Patient) -> typing.Optional[Categorization]: self._check_patient(patient) a_count = self._a_counter.count(patient) b_count = self._b_counter.count(patient) - if a_count == 2 and b_count == 0: - return BiallelicGenotypePredicate.AA - elif a_count == 1 and b_count == 1: - return BiallelicGenotypePredicate.AB - elif a_count == 0 and b_count == 2: - return BiallelicGenotypePredicate.BB - else: - return None + counts = (a_count, b_count) + + return self._count2cat.get(counts, None) - def __eq__(self, value: object) -> bool: - return ( - isinstance(value, BiallelicGenotypePredicate) - and self._a_counter == value._a_counter - and self._b_counter == value._b_counter - ) + # TODO: implement __hash__, __eq__ - def __hash__(self) -> int: - return hash((self._a_counter, self._b_counter)) - def __str__(self) -> str: - return f"BiallelicGenotypePredicate(a_counter={self._a_counter}, b_counter={self._b_counter})" +def monoallelic_predicate( + a_predicate: VariantPredicate, + b_predicate: VariantPredicate, +) -> GenotypePolyPredicate: + """ + + The predicate bins patient into one of two groups: `Zero` and `One`: - def __repr__(self) -> str: - return str(self) + +-----------+------------------+------------------+ + | Group | `A` allele count | `B` allele count | + +===========+==================+==================+ + | A | 1 | 0 | + +-----------+------------------+------------------+ + | B | 0 | 1 | + +-----------+------------------+------------------+ + + Individuals with different allele counts (e.g. :math:`count_{A} = 0` and :math:`count_{B} = 1`) + are assigned the ``None`` group and, thus, omitted from the analysis. + """ + return PolyCountingGenotypePredicate.monoallelic( + a_predicate=a_predicate, + b_predicate=b_predicate, + ) def biallelic_predicate( @@ -354,7 +318,7 @@ def biallelic_predicate( are assigned the ``None`` group and, thus, omitted from the analysis. """ - return BiallelicGenotypePredicate.for_variant_predicates( + return PolyCountingGenotypePredicate.biallelic( a_predicate=a_predicate, b_predicate=b_predicate, ) diff --git a/tests/analysis/predicate/genotype/conftest.py b/tests/analysis/predicate/genotype/conftest.py index 88fb5537..037e3712 100644 --- a/tests/analysis/predicate/genotype/conftest.py +++ b/tests/analysis/predicate/genotype/conftest.py @@ -220,7 +220,7 @@ def patient_w_frameshift( @pytest.fixture(scope="package") -def genesis_mutation( +def genesis_missense_mutation( genome_build: GenomeBuild, adam_label: SampleLabels, eve_label: SampleLabels, @@ -268,6 +268,54 @@ def genesis_mutation( ) +@pytest.fixture(scope="package") +def genesis_synonymous_mutation( + genome_build: GenomeBuild, + adam_label: SampleLabels, + eve_label: SampleLabels, + cain_label: SampleLabels, +) -> Variant: + chr22 = genome_build.contig_by_name("chr22") + assert chr22 is not None + return Variant( + variant_info=VariantInfo( + variant_coordinates=VariantCoordinates( + region=GenomicRegion( + contig=chr22, + start=200, + end=201, + strand=Strand.POSITIVE, + ), + ref="T", + alt="G", + change_length=0, + ) + ), + tx_annotations=( + TranscriptAnnotation( + gene_id="a_gene", + tx_id="tx:xyz", + hgvs_cdna=None, + is_preferred=True, + variant_effects=( + VariantEffect.SYNONYMOUS_VARIANT, + ), + affected_exons=(5,), + protein_id="pt:xyz", + hgvsp=None, + protein_effect_coordinates=Region(80, 81), + ), + ), + genotypes=Genotypes.from_mapping( + { + adam_label: Genotype.HETEROZYGOUS, + eve_label: Genotype.HOMOZYGOUS_REFERENCE, + cain_label: Genotype.HOMOZYGOUS_REFERENCE, + } + ), + ) + + @pytest.fixture(scope="package") def adam_label() -> SampleLabels: return SampleLabels("Adam") @@ -276,14 +324,18 @@ def adam_label() -> SampleLabels: @pytest.fixture(scope="package") def adam( adam_label: SampleLabels, - genesis_mutation: Variant, + genesis_missense_mutation: Variant, + genesis_synonymous_mutation: Variant, ) -> Patient: return Patient( adam_label, sex=Sex.MALE, phenotypes=(), diseases=(), - variants=(genesis_mutation,), + variants=( + genesis_missense_mutation, + genesis_synonymous_mutation, + ), ) @@ -295,14 +347,18 @@ def eve_label() -> SampleLabels: @pytest.fixture(scope="package") def eve( eve_label: SampleLabels, - genesis_mutation: Variant, + genesis_missense_mutation: Variant, + genesis_synonymous_mutation: Variant, ) -> Patient: return Patient( eve_label, sex=Sex.FEMALE, phenotypes=(), diseases=(), - variants=(genesis_mutation,), + variants=( + genesis_missense_mutation, + genesis_synonymous_mutation, + ), ) @@ -314,14 +370,18 @@ def cain_label() -> SampleLabels: @pytest.fixture(scope="package") def cain( cain_label: SampleLabels, - genesis_mutation: Variant, + genesis_missense_mutation: Variant, + genesis_synonymous_mutation: Variant, ) -> Patient: return Patient( cain_label, sex=Sex.MALE, phenotypes=(), diseases=(), - variants=(genesis_mutation,), + variants=( + genesis_missense_mutation, + genesis_synonymous_mutation, + ), ) @@ -387,7 +447,7 @@ def white_missense_mutation( @pytest.fixture(scope="package") -def white_stop_gain_mutation( +def white_synonymous_mutation( genome_build: GenomeBuild, walt_label: SampleLabels, skyler_label: SampleLabels, @@ -417,7 +477,7 @@ def white_stop_gain_mutation( hgvs_cdna=None, is_preferred=True, variant_effects=( - VariantEffect.STOP_GAINED, + VariantEffect.SYNONYMOUS_VARIANT, ), affected_exons=(5,), protein_id="pt:xyz", @@ -445,7 +505,7 @@ def walt_label() -> SampleLabels: def walt( walt_label: SampleLabels, white_missense_mutation: Variant, - white_stop_gain_mutation: Variant, + white_synonymous_mutation: Variant, ) -> Patient: return Patient( walt_label, @@ -454,7 +514,7 @@ def walt( diseases=(), variants=( white_missense_mutation, - white_stop_gain_mutation, + white_synonymous_mutation, ), ) @@ -468,7 +528,7 @@ def skyler_label() -> SampleLabels: def skyler( skyler_label: SampleLabels, white_missense_mutation: Variant, - white_stop_gain_mutation: Variant, + white_synonymous_mutation: Variant, ) -> Patient: return Patient( skyler_label, @@ -477,7 +537,7 @@ def skyler( diseases=(), variants=( white_missense_mutation, - white_stop_gain_mutation, + white_synonymous_mutation, ), ) @@ -491,7 +551,7 @@ def flynn_label() -> SampleLabels: def flynn( flynn_label: SampleLabels, white_missense_mutation: Variant, - white_stop_gain_mutation: Variant, + white_synonymous_mutation: Variant, ) -> Patient: return Patient( flynn_label, @@ -500,7 +560,7 @@ def flynn( diseases=(), variants=( white_missense_mutation, - white_stop_gain_mutation, + white_synonymous_mutation, ), ) @@ -514,7 +574,7 @@ def holly_label() -> SampleLabels: def holly( holly_label: SampleLabels, white_missense_mutation: Variant, - white_stop_gain_mutation: Variant, + white_synonymous_mutation: Variant, ) -> Patient: return Patient( holly_label, @@ -523,7 +583,7 @@ def holly( diseases=(), variants=( white_missense_mutation, - white_stop_gain_mutation, + white_synonymous_mutation, ), ) diff --git a/tests/analysis/predicate/genotype/test_gt_predicates.py b/tests/analysis/predicate/genotype/test_gt_predicates.py index cc2e50f6..1afa391a 100644 --- a/tests/analysis/predicate/genotype/test_gt_predicates.py +++ b/tests/analysis/predicate/genotype/test_gt_predicates.py @@ -141,9 +141,9 @@ class TestAllelePredicates: @pytest.mark.parametrize( "individual_name,expected_name", [ - ("adam", "Zero"), # 0/0 - ("eve", "One"), # 0/1 - ("cain", "One"), # 0/1 + ("adam", "A"), # 0/0 & 0/1 + ("eve", "B"), # 0/1 & 0/0 + ("cain", "B"), # 0/1 & 0/0 ], ) def test_monoallelic_predicate_ad_family( @@ -153,7 +153,8 @@ def test_monoallelic_predicate_ad_family( request: pytest.FixtureRequest, ): is_missense = VariantPredicates.variant_effect(VariantEffect.MISSENSE_VARIANT, TX_ID) - gt_predicate = monoallelic_predicate(is_missense) + is_synonymous = VariantPredicates.variant_effect(VariantEffect.SYNONYMOUS_VARIANT, TX_ID) + gt_predicate = monoallelic_predicate(is_missense, is_synonymous) individual = request.getfixturevalue(individual_name) actual_cat = gt_predicate.test(individual) @@ -161,41 +162,13 @@ def test_monoallelic_predicate_ad_family( assert actual_cat is not None assert actual_cat.category.name == expected_name - @pytest.mark.parametrize( - "individual_name,not_none,expected_name", - [ - ("walt", True, "One"), # 1/2 - ("skyler", True, "One"), # 1/2 - ("flynn", False, None), # 1/1, hence 2 alleles which is too much to work with monoallelic predicate. - ("holly", True, "Zero"), # 2/2 - ], - ) - def test_monoallelic_predicate_ar_family( - self, - individual_name: str, - not_none: bool, - expected_name: str, - request: pytest.FixtureRequest, - ): - is_missense = VariantPredicates.variant_effect(VariantEffect.MISSENSE_VARIANT, TX_ID) - gt_predicate = monoallelic_predicate(is_missense) - individual = request.getfixturevalue(individual_name) - - actual_cat = gt_predicate.test(individual) - - if not_none: - assert actual_cat is not None - assert actual_cat.category.name == expected_name - else: - assert actual_cat is None - @pytest.mark.parametrize( "individual_name,expected_name", [ - ("walt", "AB"), # 1/2 - ("skyler", "AB"), # 1/2 - ("flynn", "AA"), # 1/1 - ("holly", "BB"), # 2/2 + ("walt", "AB"), # 0/1 & 0/1 + ("skyler", "AB"), # 0/1 & 0/1 + ("flynn", "AA"), # 1/1 & 0/0 + ("holly", "BB"), # 0/0 & 1/1 ], ) def test_biallelic_predicate( @@ -205,8 +178,8 @@ def test_biallelic_predicate( request: pytest.FixtureRequest, ): is_missense = VariantPredicates.variant_effect(VariantEffect.MISSENSE_VARIANT, TX_ID) - is_stop_gained = VariantPredicates.variant_effect(VariantEffect.STOP_GAINED, TX_ID) - gt_predicate = biallelic_predicate(is_missense, is_stop_gained) + is_synonymous = VariantPredicates.variant_effect(VariantEffect.SYNONYMOUS_VARIANT, TX_ID) + gt_predicate = biallelic_predicate(is_missense, is_synonymous) individual = request.getfixturevalue(individual_name) actual_cat = gt_predicate.test(individual) From 83b82992b5cd9c2baed1011e9194d52337414743 Mon Sep 17 00:00:00 2001 From: Lauren Rekerle Date: Wed, 11 Sep 2024 09:05:28 -0500 Subject: [PATCH 28/44] Totally thought I had pushed this. Changed to "Individuals" --- src/gpsea/preprocessing/_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gpsea/preprocessing/_config.py b/src/gpsea/preprocessing/_config.py index fc23fa81..7c7b3e10 100644 --- a/src/gpsea/preprocessing/_config.py +++ b/src/gpsea/preprocessing/_config.py @@ -455,7 +455,7 @@ def load_phenopackets( # Turn phenopackets into a cohort using the cohort creator. # Keep track of the progress by wrapping the list of phenopackets # with TQDM 😎 - cohort_iter = tqdm(phenopackets, desc="Patients Created", file=sys.stdout, unit="files") + cohort_iter = tqdm(phenopackets, desc="Individuals Processed", file=sys.stdout, unit="individuals") notepad = cohort_creator.prepare_notepad("Phenopackets") cohort = cohort_creator.process(cohort_iter, notepad) From 920bb6c528ad832c67c44421a8fe20b7a77dc0e5 Mon Sep 17 00:00:00 2001 From: Lauren Rekerle Date: Wed, 11 Sep 2024 09:12:10 -0500 Subject: [PATCH 29/44] Updated docs to reflect new output --- docs/tutorial.rst | 2 +- docs/user-guide/input-data.rst | 2 +- docs/user-guide/predicates/hpo_predicate.rst | 2 +- docs/user-guide/predicates/variant_predicates.rst | 2 +- docs/user-guide/stats.rst | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/tutorial.rst b/docs/tutorial.rst index 35b0d278..f16a6c06 100644 --- a/docs/tutorial.rst +++ b/docs/tutorial.rst @@ -84,7 +84,7 @@ a :class:`~gpsea.model.Cohort`: ... phenopackets=phenopackets, ... cohort_creator=cohort_creator, ... ) -Patients Created: ... +Individuals Processed: ... and we will check that there are no Q/C issues: diff --git a/docs/user-guide/input-data.rst b/docs/user-guide/input-data.rst index b94379b3..1abd1ab8 100644 --- a/docs/user-guide/input-data.rst +++ b/docs/user-guide/input-data.rst @@ -80,7 +80,7 @@ loader function: >>> from gpsea.preprocessing import load_phenopackets >>> cohort, qc_results = load_phenopackets(phenopackets, cohort_creator) # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE -Patients Created: ... +Individuals Processed: ... >>> len(cohort) 19 diff --git a/docs/user-guide/predicates/hpo_predicate.rst b/docs/user-guide/predicates/hpo_predicate.rst index eaca60dc..a5fc0549 100644 --- a/docs/user-guide/predicates/hpo_predicate.rst +++ b/docs/user-guide/predicates/hpo_predicate.rst @@ -71,7 +71,7 @@ processed into a cohort >>> from gpsea.preprocessing import configure_caching_cohort_creator, load_phenopackets >>> cohort_creator = configure_caching_cohort_creator(hpo) >>> cohort, _ = load_phenopackets(phenopackets, cohort_creator) # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE -Patients Created: ... +Individuals Processed: ... we can create HPO predicates for testing all 260 HPO terms used in the cohort diff --git a/docs/user-guide/predicates/variant_predicates.rst b/docs/user-guide/predicates/variant_predicates.rst index cce6130c..1053fbbb 100644 --- a/docs/user-guide/predicates/variant_predicates.rst +++ b/docs/user-guide/predicates/variant_predicates.rst @@ -60,7 +60,7 @@ and we will convert the phenopacket into a :class:`~gpsea.model.Cohort`: >>> from gpsea.preprocessing import load_phenopackets >>> cohort, _ = load_phenopackets(phenopackets, cohort_creator) # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE -Patients Created: ... +Individuals Processed: ... To demonstrate the predicate API, we will use the variant ``1_8358231_8358231_T_C`` that corresponds to a pathogenic variant `VCV000522858.5 `_ diff --git a/docs/user-guide/stats.rst b/docs/user-guide/stats.rst index 5e9c8316..588e9332 100644 --- a/docs/user-guide/stats.rst +++ b/docs/user-guide/stats.rst @@ -102,7 +102,7 @@ with the default cohort creator: >>> from gpsea.preprocessing import configure_caching_cohort_creator, load_phenopackets >>> cohort_creator = configure_caching_cohort_creator(hpo) >>> cohort, qc_results = load_phenopackets(phenopackets, cohort_creator) # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE -Patients Created: ... +Individuals Processed: ... >>> qc_results.summarize() # doctest: +SKIP Validated under none policy No errors or warnings were found From 1704272a403403a04829f5a32ef5ad165ccb0b28 Mon Sep 17 00:00:00 2001 From: Lauren Rekerle Date: Wed, 11 Sep 2024 09:15:18 -0500 Subject: [PATCH 30/44] I missed one --- docs/user-guide/stats.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/user-guide/stats.rst b/docs/user-guide/stats.rst index 588e9332..9a649d6d 100644 --- a/docs/user-guide/stats.rst +++ b/docs/user-guide/stats.rst @@ -344,7 +344,7 @@ which we will use to preprocess the cohort >>> from gpsea.preprocessing import load_phenopackets >>> cohort, _ = load_phenopackets(phenopackets, cohort_creator) # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE -Patients Created: ... +Individuals Processed: ... resulting in a cohort consisting of 19 individuals From 5016de1dd6cb1bbaea96dbc238627c3344246182 Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Wed, 11 Sep 2024 16:25:15 +0200 Subject: [PATCH 31/44] Use `monoallelic_predicate` in tutorial. --- docs/report/tbx5_frameshift_vs_missense.csv | 4 +- docs/tutorial.rst | 16 +++--- docs/user-guide/stats.rst | 2 +- .../predicate/genotype/_gt_predicates.py | 55 +++++++++++-------- .../predicate/genotype/test_gt_predicates.py | 26 ++++++++- 5 files changed, 66 insertions(+), 37 deletions(-) diff --git a/docs/report/tbx5_frameshift_vs_missense.csv b/docs/report/tbx5_frameshift_vs_missense.csv index 76666c2a..4bffc6ea 100644 --- a/docs/report/tbx5_frameshift_vs_missense.csv +++ b/docs/report/tbx5_frameshift_vs_missense.csv @@ -1,4 +1,4 @@ -Genotype group,Missense,Missense,Frameshift,Frameshift,, +Allele group,Missense,Missense,Frameshift,Frameshift,, ,Count,Percent,Count,Percent,Corrected p values,p values Ventricular septal defect [HP:0001629],31/60,52%,19/19,100%,0.0008990549794102921,5.6190936213143254e-05 Abnormal atrioventricular conduction [HP:0005150],0/22,0%,3/3,100%,0.003478260869565217,0.00043478260869565214 @@ -12,7 +12,7 @@ Muscular ventricular septal defect [HP:0011623],6/59,10%,6/25,24%,0.299992259972 Pulmonary arterial hypertension [HP:0002092],4/6,67%,0/2,0%,0.6857142857142857,0.42857142857142855 Hypoplasia of the ulna [HP:0003022],1/12,8%,2/10,20%,0.831168831168831,0.5714285714285713 Hypoplasia of the radius [HP:0002984],30/62,48%,6/14,43%,1.0,0.7735491022101784 -Atrial septal defect [HP:0001631],42/44,95%,20/20,100%,1.0,1.0 Short thumb [HP:0009778],11/41,27%,8/30,27%,1.0,1.0 +Atrial septal defect [HP:0001631],42/44,95%,20/20,100%,1.0,1.0 Absent radius [HP:0003974],7/32,22%,6/25,24%,1.0,1.0 Short humerus [HP:0005792],7/17,41%,4/9,44%,1.0,1.0 diff --git a/docs/tutorial.rst b/docs/tutorial.rst index 35b0d278..5f8d7a06 100644 --- a/docs/tutorial.rst +++ b/docs/tutorial.rst @@ -189,21 +189,19 @@ Prepare genotype and phenotype predicates ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ We will create a predicate to bin patients into group -depending on presence of a missense and frameshift variant to test +depending on presence of a single allele of a missense or frameshift variant to test if there is a difference between frameshift and non-frameshift variants in the individuals of the *TBX5* cohort. >>> from gpsea.model import VariantEffect ->>> from gpsea.analysis.predicate.genotype import VariantPredicates, groups_predicate ->>> gt_predicate = groups_predicate( -... predicates=( -... VariantPredicates.variant_effect(VariantEffect.MISSENSE_VARIANT, tx_id), -... VariantPredicates.variant_effect(VariantEffect.FRAMESHIFT_VARIANT, tx_id) -... ), -... group_names=('Missense', 'Frameshift'), +>>> from gpsea.analysis.predicate.genotype import VariantPredicates, monoallelic_predicate +>>> gt_predicate = monoallelic_predicate( +... a_predicate=VariantPredicates.variant_effect(VariantEffect.MISSENSE_VARIANT, tx_id), +... b_predicate=VariantPredicates.variant_effect(VariantEffect.FRAMESHIFT_VARIANT, tx_id), +... names=('Missense', 'Frameshift') ... ) >>> gt_predicate.display_question() -'Genotype group: Missense, Frameshift' +'Allele group: Missense, Frameshift' .. note:: diff --git a/docs/user-guide/stats.rst b/docs/user-guide/stats.rst index 5e9c8316..3247a121 100644 --- a/docs/user-guide/stats.rst +++ b/docs/user-guide/stats.rst @@ -127,7 +127,7 @@ First, we create a :class:`~gpsea.analysis.predicate.genotype.VariantPredicate` to test if the variant leads to a frameshift (in this case): >>> from gpsea.model import VariantEffect ->>> from gpsea.analysis.predicate.genotype import VariantPredicates, boolean_predicate +>>> from gpsea.analysis.predicate.genotype import VariantPredicates >>> tx_id = 'NM_181486.4' >>> is_frameshift = VariantPredicates.variant_effect(VariantEffect.FRAMESHIFT_VARIANT, tx_id) >>> is_frameshift.get_question() diff --git a/src/gpsea/analysis/predicate/genotype/_gt_predicates.py b/src/gpsea/analysis/predicate/genotype/_gt_predicates.py index c2bbb5f1..e6645473 100644 --- a/src/gpsea/analysis/predicate/genotype/_gt_predicates.py +++ b/src/gpsea/analysis/predicate/genotype/_gt_predicates.py @@ -188,21 +188,15 @@ def groups_predicate( class PolyCountingGenotypePredicate(GenotypePolyPredicate): - A = Categorization(PatientCategory(cat_id=0, name="A", description="Monoallelic A")) - B = Categorization(PatientCategory(cat_id=1, name="B", description="Monoallelic B")) - - AA = Categorization(PatientCategory(cat_id=0, name="AA", description="Biallelic A")) - AB = Categorization(PatientCategory(cat_id=1, name="AB", description="Monoallelic A + monoallelic B")) - BB = Categorization(PatientCategory(cat_id=2, name="BB", description="Biallelic B")) - @staticmethod def monoallelic( a_predicate: VariantPredicate, b_predicate: VariantPredicate, + names: typing.Tuple[str, str], ): count2cat = { - (0, 1): PolyCountingGenotypePredicate.A, - (1, 0): PolyCountingGenotypePredicate.B, + (1, 0): Categorization(PatientCategory(cat_id=0, name=names[0], description=f"Monoallelic {names[0]}")), + (0, 1): Categorization(PatientCategory(cat_id=1, name=names[1], description=f"Monoallelic {names[1]}")), } return PolyCountingGenotypePredicate._for_predicates_and_categories( @@ -215,11 +209,24 @@ def monoallelic( def biallelic( a_predicate: VariantPredicate, b_predicate: VariantPredicate, + names: typing.Tuple[str, str], ): count2cat = { - (2, 0): PolyCountingGenotypePredicate.AA, - (1, 1): PolyCountingGenotypePredicate.AB, - (0, 2): PolyCountingGenotypePredicate.BB, + (2, 0): Categorization( + PatientCategory( + cat_id=0, name=f'{names[0]}/{names[0]}', description=f"Biallelic {names[0]}", + ) + ), + (1, 1): Categorization( + PatientCategory( + cat_id=1, name=f'{names[0]}/{names[1]}', description=f"{names[0]}/{names[1]}", + ), + ), + (0, 2): Categorization( + PatientCategory( + cat_id=2, name=f'{names[1]}/{names[1]}', description=f"Biallelic {names[1]}" + ), + ), } return PolyCountingGenotypePredicate._for_predicates_and_categories( @@ -255,7 +262,7 @@ def get_categorizations(self) -> typing.Sequence[Categorization]: return self._categorizations def get_question_base(self) -> str: - raise NotImplementedError + return 'Allele group' def test(self, patient: Patient) -> typing.Optional[Categorization]: self._check_patient(patient) @@ -272,31 +279,34 @@ def test(self, patient: Patient) -> typing.Optional[Categorization]: def monoallelic_predicate( a_predicate: VariantPredicate, b_predicate: VariantPredicate, + names: typing.Tuple[str, str] = ('A', 'B'), ) -> GenotypePolyPredicate: """ - The predicate bins patient into one of two groups: `Zero` and `One`: + The predicate bins patient into one of two groups: `A` and `B`: - +-----------+------------------+------------------+ - | Group | `A` allele count | `B` allele count | - +===========+==================+==================+ - | A | 1 | 0 | - +-----------+------------------+------------------+ - | B | 0 | 1 | - +-----------+------------------+------------------+ + +-----------+------------+------------+ + | Group | `A` count | `B` count | + +===========+============+============+ + | A | 1 | 0 | + +-----------+------------+------------+ + | B | 0 | 1 | + +-----------+------------+------------+ - Individuals with different allele counts (e.g. :math:`count_{A} = 0` and :math:`count_{B} = 1`) + Individuals with different allele counts (e.g. :math:`count_{A} = 0` and :math:`count_{B} = 2`) are assigned the ``None`` group and, thus, omitted from the analysis. """ return PolyCountingGenotypePredicate.monoallelic( a_predicate=a_predicate, b_predicate=b_predicate, + names=names, ) def biallelic_predicate( a_predicate: VariantPredicate, b_predicate: VariantPredicate, + names: typing.Tuple[str, str] = ('A', 'B'), ) -> GenotypePolyPredicate: """ Get a predicate for binning the individuals into groups, @@ -321,6 +331,7 @@ def biallelic_predicate( return PolyCountingGenotypePredicate.biallelic( a_predicate=a_predicate, b_predicate=b_predicate, + names=names, ) diff --git a/tests/analysis/predicate/genotype/test_gt_predicates.py b/tests/analysis/predicate/genotype/test_gt_predicates.py index 1afa391a..222660ad 100644 --- a/tests/analysis/predicate/genotype/test_gt_predicates.py +++ b/tests/analysis/predicate/genotype/test_gt_predicates.py @@ -141,9 +141,9 @@ class TestAllelePredicates: @pytest.mark.parametrize( "individual_name,expected_name", [ - ("adam", "A"), # 0/0 & 0/1 - ("eve", "B"), # 0/1 & 0/0 - ("cain", "B"), # 0/1 & 0/0 + ("adam", "B"), # 0/0 & 0/1 + ("eve", "A"), # 0/1 & 0/0 + ("cain", "A"), # 0/1 & 0/0 ], ) def test_monoallelic_predicate_ad_family( @@ -162,6 +162,16 @@ def test_monoallelic_predicate_ad_family( assert actual_cat is not None assert actual_cat.category.name == expected_name + def test_monoallelic_predicate__general_stuff( + self, + ): + is_missense = VariantPredicates.variant_effect(VariantEffect.MISSENSE_VARIANT, TX_ID) + is_synonymous = VariantPredicates.variant_effect(VariantEffect.SYNONYMOUS_VARIANT, TX_ID) + + gt_predicate = monoallelic_predicate(is_missense, is_synonymous) + + assert gt_predicate.display_question() == 'Allele group: A, B' + @pytest.mark.parametrize( "individual_name,expected_name", [ @@ -187,6 +197,16 @@ def test_biallelic_predicate( assert actual_cat is not None assert actual_cat.category.name == expected_name + def test_biallelic_predicate__general_stuff( + self, + ): + is_missense = VariantPredicates.variant_effect(VariantEffect.MISSENSE_VARIANT, TX_ID) + is_synonymous = VariantPredicates.variant_effect(VariantEffect.SYNONYMOUS_VARIANT, TX_ID) + + gt_predicate = biallelic_predicate(is_missense, is_synonymous) + + assert gt_predicate.display_question() == 'Allele group: A/A, A/B, B/B' + class TestSexPredicate: From 640cf406c917b10ad0e01d3e9ed93561fde4352b Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Wed, 11 Sep 2024 16:53:24 +0200 Subject: [PATCH 32/44] Remove `boolean_predicate`. --- .../analysis/predicate/genotype/__init__.py | 4 +- .../predicate/genotype/_gt_predicates.py | 71 +------------- .../analysis/pcats/test_hpo_term_analysis.py | 18 ++-- .../predicate/genotype/test_gt_predicates.py | 8 +- tests/analysis/test_mtc_filter.py | 20 ++-- tests/conftest.py | 6 +- tests/test_predicates.py | 96 +------------------ 7 files changed, 29 insertions(+), 194 deletions(-) diff --git a/src/gpsea/analysis/predicate/genotype/__init__.py b/src/gpsea/analysis/predicate/genotype/__init__.py index 14036885..b9bd8e8a 100644 --- a/src/gpsea/analysis/predicate/genotype/__init__.py +++ b/src/gpsea/analysis/predicate/genotype/__init__.py @@ -1,14 +1,14 @@ from ._api import GenotypePolyPredicate from ._api import VariantPredicate from ._counter import AlleleCounter -from ._gt_predicates import boolean_predicate, groups_predicate, sex_predicate, diagnosis_predicate +from ._gt_predicates import groups_predicate, sex_predicate, diagnosis_predicate from ._gt_predicates import monoallelic_predicate, biallelic_predicate from ._gt_predicates import ModeOfInheritancePredicate from ._variant import VariantPredicates, ProteinPredicates __all__ = [ 'GenotypePolyPredicate', - 'boolean_predicate', 'groups_predicate', 'sex_predicate', 'diagnosis_predicate', + 'groups_predicate', 'sex_predicate', 'diagnosis_predicate', 'monoallelic_predicate', 'biallelic_predicate', 'ModeOfInheritancePredicate', 'AlleleCounter', 'VariantPredicate', diff --git a/src/gpsea/analysis/predicate/genotype/_gt_predicates.py b/src/gpsea/analysis/predicate/genotype/_gt_predicates.py index e6645473..9bc8c6bd 100644 --- a/src/gpsea/analysis/predicate/genotype/_gt_predicates.py +++ b/src/gpsea/analysis/predicate/genotype/_gt_predicates.py @@ -7,7 +7,7 @@ from gpsea.model import Patient, Sex -from .._api import Categorization, PatientCategory, PatientCategories +from .._api import Categorization, PatientCategory from ._api import GenotypePolyPredicate from ._api import VariantPredicate from ._counter import AlleleCounter @@ -15,75 +15,6 @@ # TODO: implement __hash__, __eq__ on predicates -class AlleleCountingGenotypePredicate(GenotypePolyPredicate): - # NOT PART OF THE PUBLIC API - """ - The predicate tests presence of at least one matching allele in the patient. - """ - YES = Categorization(PatientCategories.YES) - NO = Categorization(PatientCategories.NO) - - @staticmethod - def for_variant_predicate(predicate: VariantPredicate): - allele_counter = AlleleCounter(predicate=predicate) - return AlleleCountingGenotypePredicate(allele_counter=allele_counter) - - def __init__(self, allele_counter: AlleleCounter): - self._allele_counter = allele_counter - - def get_categorizations(self) -> typing.Sequence[Categorization]: - """ - The predicate bins a patient into - :attr:`AlleleCountingGenotypePredicate.NO` - or :class:`AlleleCountingGenotypePredicate.YES` category. - """ - return ( - AlleleCountingGenotypePredicate.YES, - AlleleCountingGenotypePredicate.NO, - ) - - def get_question_base(self) -> str: - return self._allele_counter.get_question() - - def test(self, patient: Patient) -> typing.Optional[Categorization]: - self._check_patient(patient) - - allele_count = self._allele_counter.count(patient) - if allele_count > 0: - return AlleleCountingGenotypePredicate.YES - elif allele_count == 0: - return AlleleCountingGenotypePredicate.NO - else: - raise ValueError( - f"Allele counter should return a non-negative allele count: {allele_count}" - ) - - def __eq__(self, value: object) -> bool: - return ( - isinstance(value, AlleleCountingGenotypePredicate) - and self._allele_counter == value._allele_counter - ) - - def __hash__(self) -> int: - return hash((self._allele_counter,)) - - def __str__(self) -> str: - return f"AlleleCountingGenotypePredicate(allele_counter={self._allele_counter})" - - def __repr__(self) -> str: - return str(self) - - -def boolean_predicate(variant_predicate: VariantPredicate) -> GenotypePolyPredicate: - """ - Create a genotype boolean predicate from given `variant_predicate` - to test for presence of at least one matching allele in the patient. - """ - return AlleleCountingGenotypePredicate.for_variant_predicate( - predicate=variant_predicate, - ) - - class AlleleCountingGroupsPredicate(GenotypePolyPredicate): # NOT PART OF THE PUBLIC API diff --git a/tests/analysis/pcats/test_hpo_term_analysis.py b/tests/analysis/pcats/test_hpo_term_analysis.py index dbd8d508..9d18f87a 100644 --- a/tests/analysis/pcats/test_hpo_term_analysis.py +++ b/tests/analysis/pcats/test_hpo_term_analysis.py @@ -57,25 +57,25 @@ def test_compare_genotype_vs_phenotypes( assert results is not None assert results.total_tests == 4 - assert results.n_usable == (35, 18, 13, 25, 23) + assert results.n_usable == (17, 7, 5, 13, 12) assert results.pvals == pytest.approx( [ - 0.0721291631224236, - 1.0, + 0.35294117647058815, + 0.48571428571428565, float("nan"), - 0.35921595820909313, - 0.6668461434917216, + 0.1048951048951049, + 1., ], nan_ok=True, ) assert results.corrected_pvals is not None assert results.corrected_pvals == pytest.approx( [ - 0.2885166524896944, - 1.0, + 0.6476190476190475, + 0.6476190476190475, float("nan"), - 0.7184319164181863, - 0.8891281913222954, + 0.4195804195804196, + 1.0, ], nan_ok=True, ) diff --git a/tests/analysis/predicate/genotype/test_gt_predicates.py b/tests/analysis/predicate/genotype/test_gt_predicates.py index 222660ad..6d1b0988 100644 --- a/tests/analysis/predicate/genotype/test_gt_predicates.py +++ b/tests/analysis/predicate/genotype/test_gt_predicates.py @@ -175,10 +175,10 @@ def test_monoallelic_predicate__general_stuff( @pytest.mark.parametrize( "individual_name,expected_name", [ - ("walt", "AB"), # 0/1 & 0/1 - ("skyler", "AB"), # 0/1 & 0/1 - ("flynn", "AA"), # 1/1 & 0/0 - ("holly", "BB"), # 0/0 & 1/1 + ("walt", "A/B"), # 0/1 & 0/1 + ("skyler", "A/B"), # 0/1 & 0/1 + ("flynn", "A/A"), # 1/1 & 0/0 + ("holly", "B/B"), # 0/0 & 1/1 ], ) def test_biallelic_predicate( diff --git a/tests/analysis/test_mtc_filter.py b/tests/analysis/test_mtc_filter.py index 88e085dd..b0ab43de 100644 --- a/tests/analysis/test_mtc_filter.py +++ b/tests/analysis/test_mtc_filter.py @@ -67,8 +67,8 @@ def prepare_counts_df( ph_predicate: PhenotypePolyPredicate[hpotk.TermId], ): values = np.array(counts).reshape((2, 2)) - index = pd.Index(gt_predicate.get_categories()) - columns = pd.Index(ph_predicate.get_categories()) + index = pd.Index(ph_predicate.get_categories()) + columns = pd.Index(gt_predicate.get_categories()) return pd.DataFrame(data=values, index=index, columns=columns) @pytest.mark.parametrize( @@ -82,7 +82,7 @@ def prepare_counts_df( ) def test_one_genotype_has_zero_hpo_observations( self, - counts: typing.Tuple[int], + counts: typing.Sequence[int], expected: bool, gt_predicate: GenotypePolyPredicate, ph_predicate: PhenotypePolyPredicate[hpotk.TermId], @@ -200,7 +200,7 @@ def test_filter_terms_to_test( reasons = [r.reason for r in mtc_report] assert reasons == [ None, None, - 'Skipping term because all genotypes have same HPO observed proportions', + 'Skipping term with less than 7 observations (not powered for 2x2)', None, None, ] @@ -263,7 +263,7 @@ def test_min_observed_HPO_threshold( for i, p in enumerate(suox_pheno_predicates) } - # Ectopia lentis HP:0001083 (6 9 1 2), freqs are 6/15=0.4 and 1/3=0.33 + # Ectopia lentis HP:0001083 (1 2 3 1), freqs are 1/4=0.25 and 3/4=0.75 idx = curie2idx["HP:0001083"] ectopia = patient_counts[idx] ectopia_predicate = suox_pheno_predicates[idx] @@ -271,9 +271,9 @@ def test_min_observed_HPO_threshold( ectopia, ph_predicate=ectopia_predicate, ) - assert max_f == pytest.approx(0.4, abs=EPSILON) + assert max_f == pytest.approx(0.75, abs=EPSILON) - # Seizure HP:0001250 (17 7 11 0), freqs are 17/24=0.7083 and 11/11=1 + # Seizure HP:0001250 (11 5 0 1), freqs are 11/11=1.0 and 5/6=0.8333333 idx = curie2idx["HP:0001250"] seizure = patient_counts[idx] seizure_predicate = suox_pheno_predicates[idx] @@ -283,7 +283,7 @@ def test_min_observed_HPO_threshold( ) assert max_f == pytest.approx(1.0, abs=EPSILON) - # Sulfocysteinuria HP:0032350 (11 0 2 0), freqs are both 1 + # Sulfocysteinuria HP:0032350 (2 3 0 0), freqs are both 1 idx = curie2idx["HP:0032350"] sulfocysteinuria = patient_counts[idx] sulfocysteinuria_predicate = suox_pheno_predicates[idx] @@ -293,7 +293,7 @@ def test_min_observed_HPO_threshold( ) assert max_f == pytest.approx(1.0, abs=EPSILON) - # Neurodevelopmental delay HP:0012758 (4 13 4 4), freqs are 4/17 = 0.235 and 4/8=0.5 + # Neurodevelopmental delay HP:0012758 (4 0 4 5), freqs are 4/8 = 0.5 and 0/5=0.0 idx = curie2idx["HP:0012758"] ndelay = patient_counts[idx] ndelay_predicate = suox_pheno_predicates[idx] @@ -303,7 +303,7 @@ def test_min_observed_HPO_threshold( ) assert max_f == pytest.approx(0.5, abs=EPSILON) - # Hypertonia HP:0001276 (7 9 4 3) fresa are 7/16=0.4375 and 4/7=0.5714 + # Hypertonia HP:0001276 (4 2 3 3) freqs are 4/7=0.4375 and 2/5=0.5714 idx = curie2idx["HP:0001276"] hypertonia = patient_counts[idx] hypertonia_predicate = suox_pheno_predicates[idx] diff --git a/tests/conftest.py b/tests/conftest.py index 907b7d83..5459ce3f 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -9,7 +9,7 @@ from gpsea.analysis.mtc_filter import PhenotypeMtcResult from gpsea.analysis.pcats import HpoTermAnalysisResult -from gpsea.analysis.predicate.genotype import GenotypePolyPredicate, VariantPredicates, boolean_predicate +from gpsea.analysis.predicate.genotype import GenotypePolyPredicate, ModeOfInheritancePredicate, VariantPredicates from gpsea.analysis.predicate.phenotype import PhenotypePolyPredicate, HpoPredicate from gpsea.io import GpseaJSONDecoder from gpsea.model import * @@ -119,9 +119,7 @@ def suox_mane_tx_id() -> str: def suox_gt_predicate( suox_mane_tx_id: str, ) -> GenotypePolyPredicate: - # To bin the patients to a group with >1 MISSENSE variant or 0 MISSENSE variants. - - return boolean_predicate( + return ModeOfInheritancePredicate.autosomal_dominant( variant_predicate=VariantPredicates.variant_effect( effect=VariantEffect.MISSENSE_VARIANT, tx_id=suox_mane_tx_id diff --git a/tests/test_predicates.py b/tests/test_predicates.py index 467a5df1..71308b8e 100644 --- a/tests/test_predicates.py +++ b/tests/test_predicates.py @@ -4,8 +4,7 @@ from gpsea.analysis.predicate import PatientCategory, PatientCategories from gpsea.analysis.predicate.phenotype import HpoPredicate, DiseasePresencePredicate from gpsea.analysis.predicate.genotype import * -from gpsea.model import Cohort, Patient, FeatureType, VariantEffect -from gpsea.model.genome import Region +from gpsea.model import Cohort, Patient from gpsea.preprocessing import ProteinMetadataService @@ -89,100 +88,7 @@ def test_disease_predicate( assert actual.category == patient_category -@pytest.mark.parametrize('patient_id, variant_effect, expected_result', - (['HetSingleVar', VariantEffect.FRAMESHIFT_VARIANT, PatientCategories.YES], - ['HetSingleVar', VariantEffect.MISSENSE_VARIANT, PatientCategories.NO], - ['HetDoubleVar1', VariantEffect.STOP_GAINED, PatientCategories.YES], - ['HomoVar', VariantEffect.FRAMESHIFT_VARIANT, PatientCategories.YES], - ['LargeCNV', VariantEffect.STOP_LOST, PatientCategories.YES], - ['LargeCNV', VariantEffect.FEATURE_TRUNCATION, PatientCategories.YES])) -def test_VariantEffectPredicate(patient_id: str, - variant_effect: VariantEffect, - expected_result: PatientCategory, - toy_cohort: Cohort): - patient = find_patient(patient_id, toy_cohort) - predicate = boolean_predicate(VariantPredicates.variant_effect(effect=variant_effect, tx_id='NM_013275.6')) - result = predicate.test(patient) - assert result.category == expected_result - - -@pytest.mark.parametrize('patient_id, variant, hasVarResult', - (['HetSingleVar', '16_89279850_89279850_G_GC', PatientCategories.YES], - # the `HetSingleVar` patient does NOT have the variant. - ['HetSingleVar', '16_89279708_89279725_AGTGTTCGGGGCGGGGCC_A', PatientCategories.NO], - # but `HetDoubleVar2` below DOES have the variant. - ['HetDoubleVar2', '16_89279708_89279725_AGTGTTCGGGGCGGGGCC_A', PatientCategories.YES], - ['HetDoubleVar1', '16_89284601_89284602_GG_A', PatientCategories.YES], - ['HetDoubleVar1', '16_89280752_89280752_G_T', PatientCategories.YES], - # the `HomoVar` patient does NOT have the variant - ['HomoVar', '16_89280752_89280752_G_T', PatientCategories.NO], - ['HomoVar', '16_89279458_89279459_TG_T', PatientCategories.YES], - ['LargeCNV', '16_89190071_89439815_DEL', PatientCategories.YES])) -def test_VariantKeyPredicate(patient_id, variant, hasVarResult, toy_cohort): - predicate = boolean_predicate(VariantPredicates.variant_key(key=variant)) - patient = find_patient(patient_id, toy_cohort) - result = predicate.test(patient) - assert result.category == hasVarResult - - -@pytest.mark.parametrize('patient_id, exon, hasVarResult', - (['HetSingleVar', 9, PatientCategories.YES], - ['HetSingleVar', 13, PatientCategories.NO], - ['HetDoubleVar1', 9, PatientCategories.YES], - ['HetDoubleVar2', 10, PatientCategories.YES], - ['HetDoubleVar2', 9, PatientCategories.YES], - ['HomoVar', 10, PatientCategories.NO], - ['HomoVar', 9, PatientCategories.YES], - ['LargeCNV', 1, PatientCategories.NO], - ['LargeCNV', 13, PatientCategories.YES])) -def test_ExonPredicate(patient_id, exon, hasVarResult, toy_cohort): - patient = find_patient(patient_id, toy_cohort) - predicate = VariantPredicates.exon(exon=exon, tx_id='NM_013275.6') - predicate = boolean_predicate(predicate) - result = predicate.test(patient) - assert result.category == hasVarResult - @pytest.fixture(scope='module') def protein_predicates(protein_test_service: ProteinMetadataService) -> ProteinPredicates: return ProteinPredicates(protein_metadata_service=protein_test_service) - - -@pytest.mark.parametrize('patient_id, feature_type, hasVarResult', - (['HetDoubleVar2', FeatureType.REGION, PatientCategories.YES], - ['HetDoubleVar2', FeatureType.REPEAT, PatientCategories.NO], - ['HetSingleVar', FeatureType.REGION, PatientCategories.YES], - ['HomoVar', FeatureType.REGION, PatientCategories.YES], - ['HetDoubleVar1', FeatureType.REPEAT, PatientCategories.NO])) -## TODO Why do CNV not show as affecting a feature? -##['LargeCNV', FeatureType.REGION , HETEROZYGOUS])) -def test_ProteinFeatureTypePredicate(patient_id, feature_type, hasVarResult, toy_cohort, protein_predicates): - patient = find_patient(patient_id, toy_cohort) - predicate = boolean_predicate(protein_predicates.protein_feature_type(feature_type=feature_type, tx_id='NM_013275.6')) - result = predicate.test(patient) - assert result.category == hasVarResult - - -@pytest.mark.parametrize('patient_id, feature, hasVarResult', - (['HetDoubleVar2', 'Disordered', PatientCategories.YES], - ['HetDoubleVar2', 'BadFeature', PatientCategories.NO], - ['HetSingleVar', 'Disordered', PatientCategories.YES], - ['HomoVar', 'Disordered', PatientCategories.YES], - ['HetDoubleVar1', 'Disordered', PatientCategories.YES])) -def test_ProteinFeaturePredicate(patient_id, feature, hasVarResult, toy_cohort, protein_predicates): - predicate = boolean_predicate(protein_predicates.protein_feature(feature_id=feature, tx_id='NM_013275.6')) - patient = find_patient(patient_id, toy_cohort) - result = predicate.test(patient) - assert result.category == hasVarResult - -@pytest.mark.parametrize('patient_id, region, hasVarResult', - (['HetDoubleVar2', Region(2000, 2500), PatientCategories.YES], - ['HetDoubleVar2', Region(1000, 1500), PatientCategories.NO], - ['HomoVar', Region(2000, 2500), PatientCategories.YES], - ['HetSingleVar', Region(2000, 2500), PatientCategories.YES], - ['HetDoubleVar1', Region(600, 650), PatientCategories.YES])) -def test_ProteinRegionPredicate(patient_id, region, hasVarResult, toy_cohort): - predicate = boolean_predicate(VariantPredicates.region(region=region, tx_id='NM_013275.6')) - patient = find_patient(patient_id, toy_cohort) - result = predicate.test(patient) - assert result.category == hasVarResult From b0b157c0f462fbabaf7a123e008fea7d516c9d73 Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Wed, 11 Sep 2024 17:01:24 +0200 Subject: [PATCH 33/44] Fix bug in `HpoPredicate`. --- src/gpsea/analysis/predicate/phenotype/_pheno.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/gpsea/analysis/predicate/phenotype/_pheno.py b/src/gpsea/analysis/predicate/phenotype/_pheno.py index 01e692d8..004ea672 100644 --- a/src/gpsea/analysis/predicate/phenotype/_pheno.py +++ b/src/gpsea/analysis/predicate/phenotype/_pheno.py @@ -198,7 +198,7 @@ def test( return None def __eq__(self, value: object) -> bool: - return isinstance(value, PropagatingPhenotypePredicate) \ + return isinstance(value, HpoPredicate) \ and self._hpo.version == value._hpo.version \ and self._query == value._query \ and self._missing_implies_phenotype_excluded == value._missing_implies_phenotype_excluded @@ -207,7 +207,7 @@ def __hash__(self) -> int: return hash((self._hpo.version, self._query, self._missing_implies_phenotype_excluded)) def __repr__(self): - return f"PropagatingPhenotypeBooleanPredicate(query={self._query})" + return f"HpoPredicate(query={self._query})" class DiseasePresencePredicate(PhenotypePolyPredicate[hpotk.TermId]): From b32ceb1410407dd36ad497a6c00982c1445bc660 Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Thu, 12 Sep 2024 10:14:18 +0200 Subject: [PATCH 34/44] Add `AlwaysTrueVariantPredicate`. --- .../predicate/genotype/_predicates.py | 93 ++++++++++++------- .../analysis/predicate/genotype/_variant.py | 23 +++++ .../predicate/genotype/test_predicates.py | 9 +- 3 files changed, 93 insertions(+), 32 deletions(-) diff --git a/src/gpsea/analysis/predicate/genotype/_predicates.py b/src/gpsea/analysis/predicate/genotype/_predicates.py index 03fae44c..e8f3d824 100644 --- a/src/gpsea/analysis/predicate/genotype/_predicates.py +++ b/src/gpsea/analysis/predicate/genotype/_predicates.py @@ -10,10 +10,41 @@ from ._api import VariantPredicate +class AlwaysTrueVariantPredicate(VariantPredicate): + """ + `AlwaysTrueVariantPredicate` returns `True` for any variant. + """ + + @staticmethod + def get_instance() -> "AlwaysTrueVariantPredicate": + return ALWAYS_TRUE + + def get_question(self) -> str: + return "" + + def test(self, variant: Variant) -> bool: + return True + + def __eq__(self, value: object) -> bool: + return isinstance(value, AlwaysTrueVariantPredicate) + + def __hash__(self) -> int: + return 17 + + def __str__(self): + return repr(self) + + def __repr__(self): + return 'AlwaysTrueVariantPredicate()' + + +ALWAYS_TRUE = AlwaysTrueVariantPredicate() + + class VariantEffectPredicate(VariantPredicate): """ `VariantEffectPredicate` is a `VariantPredicate` that sets up testing - for a specific `VariantEffect` on a given transcript ID. + for a specific `VariantEffect` on a given transcript ID. Args: effect (VariantEffect): the variant effect to search for @@ -28,16 +59,6 @@ def get_question(self) -> str: return f'{self._effect.name} on {self._tx_id}' def test(self, variant: Variant) -> bool: - """Tests if the `Variant` causes the specified `VariantEffect` - on the given transcript. - - Args: - variant (Variant): a - - Returns: - bool: _description_ - """ - tx_anno = variant.get_tx_anno_by_tx_id(self._tx_id) if tx_anno is None: return False @@ -98,7 +119,7 @@ def __repr__(self): class VariantGenePredicate(VariantPredicate): """ - `VariantGenePredicate` tests if the variant + `VariantGenePredicate` tests if the variant is annotated to affect a gene denoted by a `symbol`. Args: @@ -134,7 +155,7 @@ def __repr__(self): class VariantTranscriptPredicate(VariantPredicate): """ - `VariantTranscriptPredicate` tests if the variant + `VariantTranscriptPredicate` tests if the variant is annotated to affect a transcript with `tx_id` accession. Args: @@ -170,24 +191,23 @@ def __repr__(self): class VariantExonPredicate(VariantPredicate): """ - `VariantExonPredicate` tests if the variant affects + `VariantExonPredicate` tests if the variant affects *n*-th exon of the transcript of interest. .. warning:: - We use 1-based numbering to number the exons, + We use 1-based numbering to number the exons, not the usual 0-based numbering of the computer science. - Therefore, the first exon of the transcript + Therefore, the first exon of the transcript has ``exon_number==1``, the second exon is ``2``, and so on ... .. warning:: - We do not check if the `exon_number` spans + We do not check if the `exon_number` spans beyond the number of exons of the given `transcript_id`! - Therefore, ``exon_number==10,000`` will effectively - return :attr:`GenotypeBooleanPredicate.FALSE` - for *all* patients!!! 😱 - Well, at least the patients of the *Homo sapiens sapiens* taxon... + Therefore, ``exon_number==10,000`` will effectively + return `False` for *all* variants!!! 😱 + Well, at least the genomic variants of the *Homo sapiens sapiens* taxon... :param exon: a positive `int` of the target exon :param tx_id: the accession of the transcript of interest, e.g. `NM_123456.7` @@ -195,10 +215,12 @@ class VariantExonPredicate(VariantPredicate): def __init__( self, - exon: int, + exon: int, tx_id: str, ): + assert isinstance(exon, int) and exon > 0, '`exon` must be a positive `int`' self._exon = exon + assert isinstance(tx_id, str) self._tx_id = tx_id def get_question(self) -> str: @@ -455,7 +477,8 @@ def __init__( self._tx_id = tx_id def get_question(self) -> str: - return f'variant affects aminoacid(s) between {self._region.start} and {self._region.end} on protein encoded by transcript {self._tx_id}' + return f'variant affects aminoacid(s) between {self._region.start} and {self._region.end} ' \ + f'on protein encoded by transcript {self._tx_id}' def test(self, variant: Variant) -> bool: tx_anno = variant.get_tx_anno_by_tx_id(self._tx_id) @@ -494,9 +517,9 @@ class ProteinFeatureTypePredicate(VariantPredicate): """ def __init__( - self, - feature_type: FeatureType, - tx_id: str, + self, + feature_type: FeatureType, + tx_id: str, protein_metadata_service: ProteinMetadataService, ): self._feature_type = feature_type @@ -504,7 +527,8 @@ def __init__( self._prot_service = protein_metadata_service def get_question(self) -> str: - return f'variant affects {self._feature_type.name} feature type on the protein encoded by transcript {self._tx_id}' + return f'variant affects {self._feature_type.name} feature type on the protein ' \ + f'encoded by transcript {self._tx_id}' def test(self, variant: Variant) -> bool: tx_anno = variant.get_tx_anno_by_tx_id(self._tx_id) @@ -519,7 +543,7 @@ def test(self, variant: Variant) -> bool: return False protein_meta = self._prot_service.annotate(protein_id) - for feat in protein_meta.protein_features: + for feat in protein_meta.protein_features: if feat.feature_type == self._feature_type and location.overlaps_with(feat.info.region): return True return False @@ -538,7 +562,11 @@ def __str__(self): return repr(self) def __repr__(self): - return f'ProteinFeatureTypePredicate(feature_type={self._feature_type}, tx_id={self._tx_id}, protein_metadata_service={self._prot_service})' + return 'ProteinFeatureTypePredicate(' \ + f'feature_type={self._feature_type}, ' \ + f'tx_id={self._tx_id}, ' \ + f'protein_metadata_service={self._prot_service}' \ + ')' class ProteinFeaturePredicate(VariantPredicate): @@ -574,7 +602,7 @@ def test(self, variant: Variant) -> bool: return False protein = self._prot_service.annotate(protein_id) - for feat in protein.protein_features: + for feat in protein.protein_features: if feat.info.name == self._feature_name and location.overlaps_with(feat.info.region): return True return False @@ -593,4 +621,7 @@ def __str__(self): return repr(self) def __repr__(self): - return f'ProteinFeaturePredicate(feature_name={self._feature_name}, tx_id={self._tx_id}, protein_metadata_service={self._prot_service})' + return f'ProteinFeaturePredicate(' \ + f'feature_name={self._feature_name}, ' \ + f'tx_id={self._tx_id}, ' \ + f'protein_metadata_service={self._prot_service})' diff --git a/src/gpsea/analysis/predicate/genotype/_variant.py b/src/gpsea/analysis/predicate/genotype/_variant.py index 54d39735..99f23638 100644 --- a/src/gpsea/analysis/predicate/genotype/_variant.py +++ b/src/gpsea/analysis/predicate/genotype/_variant.py @@ -18,6 +18,14 @@ class VariantPredicates: that are relatively simple to configure. """ + @staticmethod + def true() -> VariantPredicate: + """ + Prepare an absolutely inclusive :class:`VariantPredicate` - a predicate that returns `True` + for any variant whatsoever. + """ + return AlwaysTrueVariantPredicate.get_instance() + @staticmethod def all(predicates: typing.Iterable[VariantPredicate]) -> VariantPredicate: """ @@ -140,6 +148,21 @@ def exon( """ Prepare a :class:`VariantPredicate` that tests if the variant overlaps with an exon of a specific transcript. + .. warning:: + + We use 1-based numbering to number the exons, + not the usual 0-based numbering of the computer science. + Therefore, the first exon of the transcript + has ``exon_number==1``, the second exon is ``2``, and so on ... + + .. warning:: + + We do not check if the `exon_number` spans + beyond the number of exons of the given `transcript_id`! + Therefore, ``exon_number==10,000`` will effectively return `False` + for *all* variants!!! 😱 + Well, at least the genome variants of the *Homo sapiens sapiens* taxon... + Args: exon: a non-negative `int` with the index of the target exon (e.g. `0` for the 1st exon, `1` for the 2nd, ...) diff --git a/tests/analysis/predicate/genotype/test_predicates.py b/tests/analysis/predicate/genotype/test_predicates.py index a076adac..b0f20fb3 100644 --- a/tests/analysis/predicate/genotype/test_predicates.py +++ b/tests/analysis/predicate/genotype/test_predicates.py @@ -1,12 +1,19 @@ import pytest -from gpsea.analysis.predicate.genotype import VariantPredicates, ProteinPredicates, VariantPredicate +from gpsea.analysis.predicate.genotype import VariantPredicates, ProteinPredicates from gpsea.model import * from gpsea.model.genome import * class TestVariantPredicates: + def test_always_true_predicate( + self, + suox_cohort: Cohort, + ): + predicate = VariantPredicates.true() + assert all(predicate.test(v) for v in suox_cohort.all_variants()) + @pytest.mark.parametrize( 'effect, expected', [ From 5360c70857b24be21b56ad90677f149a46e341ed Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Thu, 12 Sep 2024 10:24:55 +0200 Subject: [PATCH 35/44] The MoI GT predicates test all variants by default. Ensure `exon_predicate` gets the correct args. --- .../analysis/predicate/genotype/_counter.py | 3 +- .../predicate/genotype/_gt_predicates.py | 11 ++++- .../predicate/genotype/test_gt_predicates.py | 48 +++++++++++++++++++ .../predicate/genotype/test_predicates.py | 7 ++- 4 files changed, 65 insertions(+), 4 deletions(-) diff --git a/src/gpsea/analysis/predicate/genotype/_counter.py b/src/gpsea/analysis/predicate/genotype/_counter.py index c74be909..8ad77d55 100644 --- a/src/gpsea/analysis/predicate/genotype/_counter.py +++ b/src/gpsea/analysis/predicate/genotype/_counter.py @@ -8,13 +8,14 @@ class AlleleCounter: :param predicate: a :class:`VariantPredicate` for selecting the target variants. """ - # TODO: this class should probably be an implementation detail, + # TODO: this class should probably be an implementation detail, # and not a public member of the package. def __init__( self, predicate: VariantPredicate, ): + assert isinstance(predicate, VariantPredicate) self._predicate = predicate def get_question(self) -> str: diff --git a/src/gpsea/analysis/predicate/genotype/_gt_predicates.py b/src/gpsea/analysis/predicate/genotype/_gt_predicates.py index 9bc8c6bd..c1209330 100644 --- a/src/gpsea/analysis/predicate/genotype/_gt_predicates.py +++ b/src/gpsea/analysis/predicate/genotype/_gt_predicates.py @@ -5,6 +5,7 @@ import hpotk +from gpsea.analysis.predicate.genotype._variant import VariantPredicates from gpsea.model import Patient, Sex from .._api import Categorization, PatientCategory @@ -397,7 +398,7 @@ class ModeOfInheritancePredicate(GenotypePolyPredicate): @staticmethod def autosomal_dominant( - variant_predicate: VariantPredicate, + variant_predicate: typing.Optional[VariantPredicate] = None, ) -> "ModeOfInheritancePredicate": """ Create a predicate that assigns the patient either @@ -406,6 +407,9 @@ def autosomal_dominant( :param variant_predicate: a predicate for choosing the variants for testing. """ + if variant_predicate is None: + variant_predicate = VariantPredicates.true() + return ModeOfInheritancePredicate._from_moi_info( variant_predicate=variant_predicate, mode_of_inheritance_data=ModeOfInheritanceInfo.autosomal_dominant(), @@ -413,7 +417,7 @@ def autosomal_dominant( @staticmethod def autosomal_recessive( - variant_predicate: VariantPredicate, + variant_predicate: typing.Optional[VariantPredicate] = None, ) -> "ModeOfInheritancePredicate": """ Create a predicate that assigns the patient either into @@ -423,6 +427,9 @@ def autosomal_recessive( :param variant_predicate: a predicate for choosing the variants for testing. """ + if variant_predicate is None: + variant_predicate = VariantPredicates.true() + return ModeOfInheritancePredicate._from_moi_info( variant_predicate=variant_predicate, mode_of_inheritance_data=ModeOfInheritanceInfo.autosomal_recessive(), diff --git a/tests/analysis/predicate/genotype/test_gt_predicates.py b/tests/analysis/predicate/genotype/test_gt_predicates.py index 6d1b0988..cd8e7ac6 100644 --- a/tests/analysis/predicate/genotype/test_gt_predicates.py +++ b/tests/analysis/predicate/genotype/test_gt_predicates.py @@ -110,6 +110,29 @@ def test_autosomal_dominant( assert categorization.category.name == name + @pytest.mark.parametrize( + "patient_name,name", + [ + ("adam", "HET"), # 0/0 & 0/1 + ("eve", "HET"), # 0/1 & 0/0 + ("cain", "HET"), # 0/1 & 0/0 + ], + ) + def test_autosomal_dominant__with_default_predicate( + self, + patient_name: str, + name: str, + request: pytest.FixtureRequest, + ): + patient = request.getfixturevalue(patient_name) + predicate = ModeOfInheritancePredicate.autosomal_dominant() + + categorization = predicate.test(patient) + + assert categorization is not None + + assert categorization.category.name == name + @pytest.mark.parametrize( "patient_name,name", [ @@ -135,6 +158,31 @@ def test_autosomal_recessive( assert categorization.category.name == name + @pytest.mark.parametrize( + "patient_name,name", + [ + # The White family has two variants: + ("walt", "BIALLELIC_ALT"), # 0/1 & 0/1 + ("skyler", "BIALLELIC_ALT"), # 0/1 & 0/1 + ("flynn", "BIALLELIC_ALT"), # 1/1 & 0/0 + ("holly", "BIALLELIC_ALT"), # 0/0 & 1/1 + ], + ) + def test_autosomal_recessive__with_default_predicate( + self, + patient_name: str, + name: str, + request: pytest.FixtureRequest, + ): + patient = request.getfixturevalue(patient_name) + predicate = ModeOfInheritancePredicate.autosomal_recessive() + + categorization = predicate.test(patient) + + assert categorization is not None + + assert categorization.category.name == name + class TestAllelePredicates: diff --git a/tests/analysis/predicate/genotype/test_predicates.py b/tests/analysis/predicate/genotype/test_predicates.py index b0f20fb3..e85e3a3f 100644 --- a/tests/analysis/predicate/genotype/test_predicates.py +++ b/tests/analysis/predicate/genotype/test_predicates.py @@ -53,7 +53,7 @@ def test_variant_key_predicate( @pytest.mark.parametrize( 'exon, expected', [ - (0, False), + (1, False), (4, True), (5, False), ] @@ -68,6 +68,11 @@ def test_exon_predicate( assert predicate.test(missense_variant) == expected + def test_exon_predicate_fails_on_invalid_exon(self): + with pytest.raises(AssertionError) as e: + VariantPredicates.exon(0, tx_id='tx:xyz') + assert e.value.args[0] == '`exon` must be a positive `int`' + @pytest.mark.parametrize( 'tx_id, expected', [ From 8cf004b69fe79ebe2a8f274011e20349b0434aa9 Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Thu, 12 Sep 2024 10:38:08 +0200 Subject: [PATCH 36/44] Update MoI predicates docs. --- .../mode_of_inheritance_predicate.rst | 112 +++++++++--------- 1 file changed, 54 insertions(+), 58 deletions(-) diff --git a/docs/user-guide/predicates/mode_of_inheritance_predicate.rst b/docs/user-guide/predicates/mode_of_inheritance_predicate.rst index 2efb4d6b..f213e9a7 100644 --- a/docs/user-guide/predicates/mode_of_inheritance_predicate.rst +++ b/docs/user-guide/predicates/mode_of_inheritance_predicate.rst @@ -4,8 +4,9 @@ Mode of Inheritance Predicates ============================== -There are five basic modes of inheritance for single-gene diseases: autosomal dominant, autosomal recessive, X-linked dominant, X-linked recessive, and mitochondrial (See -`Understanding Genetics, Appendix B `_). +There are five basic modes of inheritance for single-gene diseases: autosomal dominant, +autosomal recessive, X-linked dominant, X-linked recessive, and mitochondrial +(See `Understanding Genetics, Appendix B `_). The :class:`~gpsea.analysis.predicate.genotype.ModeOfInheritancePredicate` @@ -15,41 +16,24 @@ The :class:`~gpsea.analysis.predicate.genotype.ModeOfInheritancePredicate` suppo the following Mendelian modes of inheritance (MoI): -+-----------------------+-----------------------------------+------------------+------------------------+ -| Mode of inheritance | Sex | Allele count | Genotype category | -+=======================+===================================+==================+========================+ -| Autosomal dominant | `*` | 0 | `HOM_REF` | -+ +-----------------------------------+------------------+------------------------+ -| | `*` | 1 | `HET` | -+ +-----------------------------------+------------------+------------------------+ -| | `*` | :math:`\ge 2` | ``None`` | -+-----------------------+-----------------------------------+------------------+------------------------+ -| Autosomal recessive | `*` | 0 | `HOM_REF` | -+ +-----------------------------------+------------------+------------------------+ -| | `*` | 1 | `HET` | -+ +-----------------------------------+------------------+------------------------+ -| | `*` | 2 | `BIALLELIC_ALT` | -+ +-----------------------------------+------------------+------------------------+ -| | `*` | :math:`\ge 3` | ``None`` | -+-----------------------+-----------------------------------+------------------+------------------------+ -| X-linked dominant | `*` | 0 | `HOM_REF` | -+ +-----------------------------------+------------------+------------------------+ -| | `*` | 1 | `HET` | -+ +-----------------------------------+------------------+------------------------+ -| | `*` | :math:`\ge 2` | ``None`` | -+-----------------------+-----------------------------------+------------------+------------------------+ -| X-linked recessive | `*` | 0 | `HOM_REF` | -+ +-----------------------------------+------------------+------------------------+ -| | :class:`~gpsea.model.Sex.FEMALE` | 1 | `HET` | -+ + +------------------+------------------------+ -| | | 2 | `BIALLELIC_ALT` | -+ + +------------------+------------------------+ -| | | :math:`\ge 3` | ``None`` | -+ +-----------------------------------+------------------+------------------------+ -| | :class:`~gpsea.model.Sex.MALE` | 1 | `HEMI` | -+ + +------------------+------------------------+ -| | | :math:`\ge 2` | ``None`` | -+-----------------------+-----------------------------------+------------------+------------------------+ ++-----------------------+------------------+------------------------+ +| Mode of inheritance | Allele count | Genotype category | ++=======================+==================+========================+ +| Autosomal dominant | 0 | `HOM_REF` | ++ +------------------+------------------------+ +| | 1 | `HET` | ++ +------------------+------------------------+ +| | :math:`\ge 2` | ``None`` | ++-----------------------+------------------+------------------------+ +| Autosomal recessive | 0 | `HOM_REF` | ++ +------------------+------------------------+ +| | 1 | `HET` | ++ +------------------+------------------------+ +| | 2 | `BIALLELIC_ALT` | ++ +------------------+------------------------+ +| | :math:`\ge 3` | ``None`` | ++-----------------------+------------------+------------------------+ + .. note:: @@ -61,37 +45,49 @@ Then a predicate for the desired MoI can be created by one of * :func:`~gpsea.analysis.predicate.genotype.ModeOfInheritancePredicate.autosomal_dominant` * :func:`~gpsea.analysis.predicate.genotype.ModeOfInheritancePredicate.autosomal_recessive` -* :func:`~gpsea.analysis.predicate.genotype.ModeOfInheritancePredicate.x_dominant` -* :func:`~gpsea.analysis.predicate.genotype.ModeOfInheritancePredicate.x_recessive` -All constructors take an instance -of :class:`~gpsea.analysis.predicate.genotype.VariantPredicate` as an argument. +By default, the MoI predicates will use *all* variants recorded in the individual. +However, a :class:`~gpsea.analysis.predicate.genotype.VariantPredicate` +can be provided to select a variant subset, if necessary. + + +Assign individuals into genotype groups +--------------------------------------- +Here we show seting up a predicate for grouping individuals for differences +between genotypes of a disease with an autosomal recessive MoI. -Example -------- +We use :class:`~gpsea.analysis.predicate.genotype.ModeOfInheritancePredicate.autosomal_recessive` +to create the predicate: -Here we show seting up a predicate for grouping individuals based on -having a variant that leads to a frameshift or to a stop gain to a fictional transcript ``NM_1234.5`` -to test differences between the genotypes of a disease with an autosomal recessive MoI. +>>> from gpsea.analysis.predicate.genotype import ModeOfInheritancePredicate +>>> gt_predicate = ModeOfInheritancePredicate.autosomal_recessive() +>>> gt_predicate.display_question() +'What is the genotype group: HOM_REF, HET, BIALLELIC_ALT' -First, we set up a :class:`~gpsea.analysis.predicate.genotype.VariantPredicate` -for testing if a variant meets the condition: +The predicate will use *all* recorded variants to determine if the individual belongs into +homozygous reference (`HOM_REF`), heterozygous (`HET`), or biallelic alternative (`BIALLELIC_ALT`) +category. + + +Use a subset of variants for choosing the genotype group +-------------------------------------------------------- + +To select specific variants, a :class:`~gpsea.analysis.predicate.genotype.VariantPredicate` +can be registered with the MoI predicate. +For instance, the following can be done to only consider the variants that lead +to a missense change on a fictional transcript ``NM_1234.5`` +when assigning the genotype group. We set up the variant predicate: >>> from gpsea.model import VariantEffect >>> from gpsea.analysis.predicate.genotype import VariantPredicates >>> tx_id = 'NM_1234.5' ->>> is_frameshift_or_stop_gain = VariantPredicates.variant_effect(VariantEffect.FRAMESHIFT_VARIANT, tx_id) \ -... | VariantPredicates.variant_effect(VariantEffect.STOP_GAINED, tx_id) ->>> is_frameshift_or_stop_gain.get_question() -'(FRAMESHIFT_VARIANT on NM_1234.5 OR STOP_GAINED on NM_1234.5)' +>>> is_missense = VariantPredicates.variant_effect(VariantEffect.MISSENSE_VARIANT, tx_id) +>>> is_missense.get_question() +'MISSENSE_VARIANT on NM_1234.5' -Next, we use :class:`~gpsea.analysis.predicate.genotype.ModeOfInheritancePredicate.autosomal_recessive` -for assigning a patient into a genotype group: +and we use it to create the MoI predicate: ->>> from gpsea.analysis.predicate.genotype import ModeOfInheritancePredicate ->>> gt_predicate = ModeOfInheritancePredicate.autosomal_recessive(is_frameshift_or_stop_gain) +>>> gt_predicate = ModeOfInheritancePredicate.autosomal_recessive(is_missense) >>> gt_predicate.display_question() 'What is the genotype group: HOM_REF, HET, BIALLELIC_ALT' - -The `gt_predicate` can be used in downstream analysis, such as in :class:`~gpsea.analysis.pcats.HpoTermAnalysis`. From 2e4fbca9fed984d820017d2f4168a77065c87e4b Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Thu, 12 Sep 2024 11:15:41 +0200 Subject: [PATCH 37/44] Fix lint. --- .../predicate/genotype/_predicates.py | 30 +++++++++---------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/src/gpsea/analysis/predicate/genotype/_predicates.py b/src/gpsea/analysis/predicate/genotype/_predicates.py index e8f3d824..b1ac93e6 100644 --- a/src/gpsea/analysis/predicate/genotype/_predicates.py +++ b/src/gpsea/analysis/predicate/genotype/_predicates.py @@ -359,20 +359,20 @@ def __repr__(self): def _decode_operator(op: str) -> typing.Callable[[int, int], bool]: - if op == '<': - return operator.lt - elif op == '<=': - return operator.le - elif op == '==': - return operator.eq - elif op == '!=': - return operator.ne - elif op == '>=': - return operator.ge - elif op == '>': - return operator.gt - else: - raise ValueError(f'Unsupported operator {op}') + if op == '<': + return operator.lt + elif op == '<=': + return operator.le + elif op == '==': + return operator.eq + elif op == '!=': + return operator.ne + elif op == '>=': + return operator.ge + elif op == '>': + return operator.gt + else: + raise ValueError(f'Unsupported operator {op}') class ChangeLengthPredicate(VariantPredicate): @@ -416,7 +416,7 @@ def __repr__(self): class RefAlleleLengthPredicate(VariantPredicate): """ - `RefAlleleLengthPredicate` tests if the length of the variant's reference allele + `RefAlleleLengthPredicate` tests if the length of the variant's reference allele is greater than, equal, or less than certain value. """ From e3390fd1825542e96775e3823de1950cf3caab49 Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Thu, 12 Sep 2024 11:59:46 +0200 Subject: [PATCH 38/44] Extract protein metadata from a singular result. --- src/gpsea/preprocessing/_uniprot.py | 80 ++-- .../data/uniprot_response/ITPR1_HUMAN.json | 446 ++++++++++++++++++ tests/preprocessing/test_uniprot.py | 26 +- 3 files changed, 516 insertions(+), 36 deletions(-) create mode 100644 tests/preprocessing/data/uniprot_response/ITPR1_HUMAN.json diff --git a/src/gpsea/preprocessing/_uniprot.py b/src/gpsea/preprocessing/_uniprot.py index d1adb98b..ec17ac0d 100644 --- a/src/gpsea/preprocessing/_uniprot.py +++ b/src/gpsea/preprocessing/_uniprot.py @@ -10,10 +10,8 @@ class UniprotProteinMetadataService(ProteinMetadataService): """A class that creates ProteinMetadata objects from data found with the Uniprot REST API. - More info on the Uniprot REST API here - https://www.uniprot.org/help/programmatic_access - - Methods: - annotate(protein_id:str): Gets metadata and creates ProteinMetadata for given protein ID + More info on the Uniprot REST API are + in the `Programmatic access `_ section. """ def __init__( @@ -27,8 +25,8 @@ def __init__( @staticmethod def parse_uniprot_json( - payload: typing.Mapping[str, typing.Any], - protein_id: str, + payload: typing.Mapping[str, typing.Any], + protein_id: str, ) -> ProteinMetadata: """ Try to extract `ProteinMetadata` corresponding to `protein_id` from the Uniprot JSON `payload`. @@ -44,39 +42,52 @@ def parse_uniprot_json( results = payload['results'] if len(results) == 0: raise ValueError(f"No proteins found for ID {protein_id}. Please verify refseq ID.") + elif len(results) == 1: + # In case we get only one result, let's use it! + return UniprotProteinMetadataService._extract_metadata( + protein_id=protein_id, + data=results[0], + ) + else: + for protein in results: + if any(uni['id'] == protein_id for uni in protein['uniProtKBCrossReferences']): + return UniprotProteinMetadataService._extract_metadata( + protein_id=protein_id, + data=protein, + ) - for protein in results: - if any(uni['id'] == protein_id for uni in protein['uniProtKBCrossReferences']): - # `protein` has a cross-reference to the `protein_id` of interest - try: - protein_name = protein['proteinDescription']['recommendedName']['fullName']['value'] - except KeyError: - protein_name = protein['proteinDescription']['submissionNames'][0]['fullName']['value'] + raise ValueError(f'Could not find an entry for {protein_id} in Uniprot response') - all_features_list = [] - try: - for feature in protein['features']: - feature_start = int(feature['location']['start']['value']) - feature_end = int(feature['location']['end']['value']) - feature_name = feature['description'] - feature_info = FeatureInfo( - feature_name, - Region(start=feature_start, end=feature_end), - ) - feature_type = FeatureType[feature['type'].upper()] - protein_feature = ProteinFeature.create(feature_info, feature_type) - all_features_list.append(protein_feature) - except KeyError: - raise ValueError(f"No features for {protein_id}") + @staticmethod + def _extract_metadata(protein_id: str, data: typing.Mapping[str, typing.Any]) -> ProteinMetadata: + # `protein` has a cross-reference to the `protein_id` of interest + try: + protein_name = data['proteinDescription']['recommendedName']['fullName']['value'] + except KeyError: + protein_name = data['proteinDescription']['submissionNames'][0]['fullName']['value'] - try: - protein_length = protein["sequence"]["length"] - except KeyError as e: - raise ValueError(e) + all_features_list = [] + try: + for feature in data['features']: + feature_start = int(feature['location']['start']['value']) + feature_end = int(feature['location']['end']['value']) + feature_name = feature['description'] + feature_info = FeatureInfo( + feature_name, + Region(start=feature_start, end=feature_end), + ) + feature_type = FeatureType[feature['type'].upper()] + protein_feature = ProteinFeature.create(feature_info, feature_type) + all_features_list.append(protein_feature) + except KeyError: + raise ValueError(f"No features for {protein_id}") - return ProteinMetadata(protein_id, protein_name, all_features_list, protein_length) + try: + protein_length = data["sequence"]["length"] + except KeyError as e: + raise ValueError(e) - raise ValueError(f'Could not find an entry for {protein_id} in Uniprot response') + return ProteinMetadata(protein_id, protein_name, all_features_list, protein_length) def annotate(self, protein_id: str) -> ProteinMetadata: """ @@ -96,4 +107,5 @@ def annotate(self, protein_id: str) -> ProteinMetadata: raise ValueError(f"only works with a RefSeq database ID (e.g. NP_037407.4), but we got {protein_id}") api_url = self._url % protein_id response = requests.get(api_url, timeout=self._timeout).json() + return UniprotProteinMetadataService.parse_uniprot_json(response, protein_id) diff --git a/tests/preprocessing/data/uniprot_response/ITPR1_HUMAN.json b/tests/preprocessing/data/uniprot_response/ITPR1_HUMAN.json new file mode 100644 index 00000000..4e5fdb0c --- /dev/null +++ b/tests/preprocessing/data/uniprot_response/ITPR1_HUMAN.json @@ -0,0 +1,446 @@ +{ + "results": [ + { + "entryType": "UniProtKB reviewed (Swiss-Prot)", + "primaryAccession": "Q14643", + "uniProtkbId": "ITPR1_HUMAN", + "proteinDescription": { + "recommendedName": { + "fullName": { + "evidences": [ + { + "evidenceCode": "ECO:0000305" + } + ], + "value": "Inositol 1,4,5-trisphosphate-gated calcium channel ITPR1" + } + }, + "alternativeNames": [ + { + "fullName": { + "value": "IP3 receptor isoform 1" + }, + "shortNames": [ + { + "value": "IP3R 1" + }, + { + "evidences": [ + { + "evidenceCode": "ECO:0000303", + "source": "PubMed", + "id": "7945203" + } + ], + "value": "InsP3R1" + } + ] + }, + { + "fullName": { + "evidences": [ + { + "evidenceCode": "ECO:0000303", + "source": "PubMed", + "id": "8648241" + } + ], + "value": "Inositol 1,4,5 trisphosphate receptor" + } + }, + { + "fullName": { + "evidences": [ + { + "evidenceCode": "ECO:0000303", + "source": "PubMed", + "id": "7500840" + } + ], + "value": "Inositol 1,4,5-trisphosphate receptor type 1" + } + }, + { + "fullName": { + "evidences": [ + { + "evidenceCode": "ECO:0000303", + "source": "PubMed", + "id": "7852357" + }, + { + "evidenceCode": "ECO:0000303", + "source": "PubMed", + "id": "7945203" + } + ], + "value": "Type 1 inositol 1,4,5-trisphosphate receptor" + }, + "shortNames": [ + { + "value": "Type 1 InsP3 receptor" + } + ] + } + ] + }, + "genes": [ + { + "geneName": { + "evidences": [ + { + "evidenceCode": "ECO:0000303", + "source": "PubMed", + "id": "7852357" + }, + { + "evidenceCode": "ECO:0000312", + "source": "HGNC", + "id": "HGNC:6180" + } + ], + "value": "ITPR1" + }, + "synonyms": [ + { + "evidences": [ + { + "evidenceCode": "ECO:0000303", + "source": "PubMed", + "id": "7945203" + } + ], + "value": "INSP3R1" + } + ] + } + ], + "features": [ + { + "type": "Domain", + "location": { + "start": { + "value": 112, + "modifier": "EXACT" + }, + "end": { + "value": 166, + "modifier": "EXACT" + } + }, + "description": "MIR 1", + "evidences": [ + { + "evidenceCode": "ECO:0000255", + "source": "PROSITE-ProRule", + "id": "PRU00131" + } + ] + }, + { + "type": "Domain", + "location": { + "start": { + "value": 173, + "modifier": "EXACT" + }, + "end": { + "value": 223, + "modifier": "EXACT" + } + }, + "description": "MIR 2", + "evidences": [ + { + "evidenceCode": "ECO:0000255", + "source": "PROSITE-ProRule", + "id": "PRU00131" + } + ] + }, + { + "type": "Domain", + "location": { + "start": { + "value": 231, + "modifier": "EXACT" + }, + "end": { + "value": 287, + "modifier": "EXACT" + } + }, + "description": "MIR 3", + "evidences": [ + { + "evidenceCode": "ECO:0000255", + "source": "PROSITE-ProRule", + "id": "PRU00131" + } + ] + }, + { + "type": "Domain", + "location": { + "start": { + "value": 294, + "modifier": "EXACT" + }, + "end": { + "value": 373, + "modifier": "EXACT" + } + }, + "description": "MIR 4", + "evidences": [ + { + "evidenceCode": "ECO:0000255", + "source": "PROSITE-ProRule", + "id": "PRU00131" + } + ] + }, + { + "type": "Domain", + "location": { + "start": { + "value": 379, + "modifier": "EXACT" + }, + "end": { + "value": 435, + "modifier": "EXACT" + } + }, + "description": "MIR 5", + "evidences": [ + { + "evidenceCode": "ECO:0000255", + "source": "PROSITE-ProRule", + "id": "PRU00131" + } + ] + }, + { + "type": "Region", + "location": { + "start": { + "value": 1015, + "modifier": "EXACT" + }, + "end": { + "value": 1036, + "modifier": "EXACT" + } + }, + "description": "Disordered", + "evidences": [ + { + "evidenceCode": "ECO:0000256", + "source": "SAM", + "id": "MobiDB-lite" + } + ] + }, + { + "type": "Region", + "location": { + "start": { + "value": 1146, + "modifier": "EXACT" + }, + "end": { + "value": 1178, + "modifier": "EXACT" + } + }, + "description": "Disordered", + "evidences": [ + { + "evidenceCode": "ECO:0000256", + "source": "SAM", + "id": "MobiDB-lite" + } + ] + }, + { + "type": "Region", + "location": { + "start": { + "value": 1708, + "modifier": "EXACT" + }, + "end": { + "value": 1740, + "modifier": "EXACT" + } + }, + "description": "Disordered", + "evidences": [ + { + "evidenceCode": "ECO:0000256", + "source": "SAM", + "id": "MobiDB-lite" + } + ] + }, + { + "type": "Region", + "location": { + "start": { + "value": 1760, + "modifier": "EXACT" + }, + "end": { + "value": 1796, + "modifier": "EXACT" + } + }, + "description": "Disordered", + "evidences": [ + { + "evidenceCode": "ECO:0000256", + "source": "SAM", + "id": "MobiDB-lite" + } + ] + }, + { + "type": "Region", + "location": { + "start": { + "value": 1890, + "modifier": "EXACT" + }, + "end": { + "value": 1915, + "modifier": "EXACT" + } + }, + "description": "Disordered", + "evidences": [ + { + "evidenceCode": "ECO:0000256", + "source": "SAM", + "id": "MobiDB-lite" + } + ] + }, + { + "type": "Region", + "location": { + "start": { + "value": 1939, + "modifier": "EXACT" + }, + "end": { + "value": 1960, + "modifier": "EXACT" + } + }, + "description": "Disordered", + "evidences": [ + { + "evidenceCode": "ECO:0000256", + "source": "SAM", + "id": "MobiDB-lite" + } + ] + }, + { + "type": "Region", + "location": { + "start": { + "value": 2472, + "modifier": "EXACT" + }, + "end": { + "value": 2537, + "modifier": "EXACT" + } + }, + "description": "Interaction with ERP44", + "evidences": [ + { + "evidenceCode": "ECO:0000250", + "source": "UniProtKB", + "id": "P11881" + } + ] + }, + { + "type": "Region", + "location": { + "start": { + "value": 2729, + "modifier": "EXACT" + }, + "end": { + "value": 2758, + "modifier": "EXACT" + } + }, + "description": "Disordered", + "evidences": [ + { + "evidenceCode": "ECO:0000256", + "source": "SAM", + "id": "MobiDB-lite" + } + ] + } + ], + "uniProtKBCrossReferences": [ + { + "database": "RefSeq", + "id": "NP_001093422.2", + "properties": [ + { + "key": "NucleotideSequenceId", + "value": "NM_001099952.2" + } + ], + "isoformId": "Q14643-3" + }, + { + "database": "RefSeq", + "id": "NP_001161744.1", + "properties": [ + { + "key": "NucleotideSequenceId", + "value": "NM_001168272.1" + } + ], + "isoformId": "Q14643-2" + }, + { + "database": "RefSeq", + "id": "NP_002213.5", + "properties": [ + { + "key": "NucleotideSequenceId", + "value": "NM_002222.5" + } + ], + "isoformId": "Q14643-4" + }, + { + "database": "RefSeq", + "id": "XP_011531985.1", + "properties": [ + { + "key": "NucleotideSequenceId", + "value": "XM_011533683.2" + } + ] + } + ], + "sequence": { + "length": 2758 + }, + "extraAttributes": { + "uniParcId": "UPI00015E0851" + } + } + ] +} \ No newline at end of file diff --git a/tests/preprocessing/test_uniprot.py b/tests/preprocessing/test_uniprot.py index f87e478e..461ca863 100644 --- a/tests/preprocessing/test_uniprot.py +++ b/tests/preprocessing/test_uniprot.py @@ -15,6 +15,11 @@ def fpath_uniprot_response_dir( return os.path.join(fpath_preprocessing_data_dir, "uniprot_response") +def read_json_payload(path: str) -> typing.Mapping[str, typing.Any]: + with open(path) as fh: + return json.load(fh) + + class TestUniprotProteinMetadataService: @pytest.fixture @@ -27,8 +32,7 @@ def zn462_human_uniprot_json( fpath_uniprot_response_dir: str, ) -> typing.Mapping[str, typing.Any]: fpath_zn462_human = os.path.join(fpath_uniprot_response_dir, "ZN462_HUMAN.json") - with open(fpath_zn462_human) as f: - return json.load(f) + return read_json_payload(fpath_zn462_human) def test_zn462( self, @@ -44,6 +48,24 @@ def test_zn462( assert protein_metadata.label == "Ankyrin repeat domain-containing protein 11" assert len(protein_metadata.protein_features) == 16 assert protein_metadata.protein_length == 2663 + + def test_itpr1( + self, + fpath_uniprot_response_dir: str, + ): + response_json_path = os.path.join(fpath_uniprot_response_dir, 'ITPR1_HUMAN.json') + response_json = read_json_payload(response_json_path) + + protein_id = "NP_001365381.1" + protein_metadata = UniprotProteinMetadataService.parse_uniprot_json( + payload=response_json, + protein_id=protein_id, + ) + + assert protein_metadata.protein_id == protein_id + assert protein_metadata.label == "Inositol 1,4,5-trisphosphate-gated calcium channel ITPR1" + assert len(protein_metadata.protein_features) == 13 + assert protein_metadata.protein_length == 2758 @pytest.mark.skip("Run manually to regenerate SUOX `NP_001027558.1` metadata") def test_fetch_suox_protein_metadata( From 22bc9dfc6c666e25a54e1f80a119c91234580f82 Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Thu, 12 Sep 2024 14:51:27 +0200 Subject: [PATCH 39/44] Improve Pydoc for monoallelic and biallelic predicates. --- .../predicate/genotype/_gt_predicates.py | 85 ++++++++++++------- 1 file changed, 55 insertions(+), 30 deletions(-) diff --git a/src/gpsea/analysis/predicate/genotype/_gt_predicates.py b/src/gpsea/analysis/predicate/genotype/_gt_predicates.py index c1209330..f5fc9284 100644 --- a/src/gpsea/analysis/predicate/genotype/_gt_predicates.py +++ b/src/gpsea/analysis/predicate/genotype/_gt_predicates.py @@ -214,19 +214,32 @@ def monoallelic_predicate( names: typing.Tuple[str, str] = ('A', 'B'), ) -> GenotypePolyPredicate: """ + The predicate bins patient into one of two groups, `A` and `B`, + based on presence of *exactly* one allele of a variant + that meets the predicate criteria. + + The number of alleles :math:`count_{A}` and :math:`count_{B}` + is computed using `a_predicate` and `b_predicate` + and the individual is assigned into a group + based on the following table: - The predicate bins patient into one of two groups: `A` and `B`: - - +-----------+------------+------------+ - | Group | `A` count | `B` count | - +===========+============+============+ - | A | 1 | 0 | - +-----------+------------+------------+ - | B | 0 | 1 | - +-----------+------------+------------+ - - Individuals with different allele counts (e.g. :math:`count_{A} = 0` and :math:`count_{B} = 2`) - are assigned the ``None`` group and, thus, omitted from the analysis. + +-----------+-------------------+-------------------+ + | Group | :math:`count_{A}` | :math:`count_{B}` | + +===========+===================+===================+ + | A | 1 | 0 | + +-----------+-------------------+-------------------+ + | B | 0 | 1 | + +-----------+-------------------+-------------------+ + + The individuals with different allele counts + (e.g. :math:`count_{A} = 0` and :math:`count_{B} = 2`) + are assigned into the ``None`` group and, thus, omitted from the analysis. + + :param a_predicate: predicate to test if the variants + meet the criteria of the first group (named `A` by default). + :param b_predicate: predicate to test if the variants + meet the criteria of the second group (named `B` by default). + :param names: group names (default ``('A', 'B')``). """ return PolyCountingGenotypePredicate.monoallelic( a_predicate=a_predicate, @@ -241,24 +254,36 @@ def biallelic_predicate( names: typing.Tuple[str, str] = ('A', 'B'), ) -> GenotypePolyPredicate: """ - Get a predicate for binning the individuals into groups, - with respect to allele counts of variants selected by `a_predicate` and `b_predicate`. - - The predicate bins patient into one of three groups: `AA`, `AB` and `BB`: - - +-----------+------------------+------------------+ - | Group | `A` allele count | `B` allele count | - +===========+==================+==================+ - | AA | 2 | 0 | - +-----------+------------------+------------------+ - | AB | 1 | 1 | - +-----------+------------------+------------------+ - | AA | 0 | 2 | - +-----------+------------------+------------------+ - - Individuals with different allele counts (e.g. :math:`count_{A} = 0` and :math:`count_{B} = 1`) - are assigned the ``None`` group and, thus, omitted from the analysis. - + The predicate bins patient into one of the three groups, + `AA`, `AB`, and `BB`, + based on presence of one or two variant alleles + that meet the predicate criteria. + + The number of alleles :math:`count_{A}` and :math:`count_{B}` + is computed using `a_predicate` and `b_predicate` + and the individual is assigned into a group + based on the following table: + + +-----------+-------------------+-------------------+ + | Group | :math:`count_{A}` | :math:`count_{B}` | + +===========+===================+===================+ + | AA | 2 | 0 | + +-----------+-------------------+-------------------+ + | AB | 1 | 1 | + +-----------+-------------------+-------------------+ + | AA | 0 | 2 | + +-----------+-------------------+-------------------+ + + The individuals with different allele counts + (e.g. :math:`count_{A} = 1` and :math:`count_{B} = 2`) + are assigned into the ``None`` group and will be, thus, + omitted from the analysis. + + :param a_predicate: predicate to test if the variants + meet the criteria of the first group (named `A` by default). + :param b_predicate: predicate to test if the variants + meet the criteria of the second group (named `B` by default). + :param names: group names (default ``('A', 'B')``). """ return PolyCountingGenotypePredicate.biallelic( a_predicate=a_predicate, From 7aa36f9887c9dfde8ce121ec2a7ea6404d09e103 Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Thu, 12 Sep 2024 15:05:25 +0200 Subject: [PATCH 40/44] Compute __eq__ and __hash__ on allele predicate. --- .../predicate/genotype/_gt_predicates.py | 23 ++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/src/gpsea/analysis/predicate/genotype/_gt_predicates.py b/src/gpsea/analysis/predicate/genotype/_gt_predicates.py index f5fc9284..0ff4bf4c 100644 --- a/src/gpsea/analysis/predicate/genotype/_gt_predicates.py +++ b/src/gpsea/analysis/predicate/genotype/_gt_predicates.py @@ -189,7 +189,21 @@ def __init__( self._categorizations = tuple(count2cat.values()) self._a_counter = a_counter self._b_counter = b_counter + self._hash = self._compute_hash() + def _compute_hash(self) -> int: + hash_value = 17 + + self._groups = defaultdict(list) + for count, cat in self._count2cat.items(): + hash_value += 13 * hash(count) + hash_value += 13 * hash(cat) + + hash_value += 23 * hash(self._a_counter) + hash_value += 23 * hash(self._b_counter) + + return hash_value + def get_categorizations(self) -> typing.Sequence[Categorization]: return self._categorizations @@ -205,7 +219,14 @@ def test(self, patient: Patient) -> typing.Optional[Categorization]: return self._count2cat.get(counts, None) - # TODO: implement __hash__, __eq__ + def __eq__(self, value: object) -> bool: + return isinstance(value, PolyCountingGenotypePredicate) \ + and self._count2cat == value._count2cat \ + and self._a_counter == value._a_counter \ + and self._b_counter == value._b_counter + + def __hash__(self) -> int: + return self._hash def monoallelic_predicate( From 634b43cd47b96df43a754962936473f478fa07be Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Thu, 12 Sep 2024 16:26:15 +0200 Subject: [PATCH 41/44] Do not export `ModeOfInheritancePredicate`. --- .../mode_of_inheritance_predicate.rst | 26 +++--- docs/user-guide/stats.rst | 4 +- .../analysis/predicate/genotype/__init__.py | 2 + .../predicate/genotype/_gt_predicates.py | 88 +++++++++++++++---- .../predicate/genotype/test_gt_predicates.py | 11 +-- 5 files changed, 94 insertions(+), 37 deletions(-) diff --git a/docs/user-guide/predicates/mode_of_inheritance_predicate.rst b/docs/user-guide/predicates/mode_of_inheritance_predicate.rst index f213e9a7..6c4178e8 100644 --- a/docs/user-guide/predicates/mode_of_inheritance_predicate.rst +++ b/docs/user-guide/predicates/mode_of_inheritance_predicate.rst @@ -9,11 +9,11 @@ autosomal recessive, X-linked dominant, X-linked recessive, and mitochondrial (See `Understanding Genetics, Appendix B `_). -The :class:`~gpsea.analysis.predicate.genotype.ModeOfInheritancePredicate` -assigns the individual into a group based on the number of alleles -that match a condition specified by a :class:`~gpsea.analysis.predicate.genotype.VariantPredicate`. -The :class:`~gpsea.analysis.predicate.genotype.ModeOfInheritancePredicate` supports -the following Mendelian modes of inheritance (MoI): +The :class:`~gpsea.analysis.predicate.genotype.autosomal_dominant` +and :class:`~gpsea.analysis.predicate.genotype.autosomal_recessive` +assigns the individual into a group based on the number of the alleles +observed in the individual. +GPSEA supports the following Mendelian modes of inheritance (MoI): +-----------------------+------------------+------------------------+ @@ -40,11 +40,11 @@ the following Mendelian modes of inheritance (MoI): `BIALLELIC_ALT` includes both homozygous and compound heterozygous genotypes. Clinical judgment should be used to choose the MoI for the cohort analysis. -Then a predicate for the desired MoI can be created by one of -:class:`~gpsea.analysis.predicate.genotype.ModeOfInheritancePredicate` static constructors: +Then a predicate for the desired MoI can be created by calling one +of the following methods: -* :func:`~gpsea.analysis.predicate.genotype.ModeOfInheritancePredicate.autosomal_dominant` -* :func:`~gpsea.analysis.predicate.genotype.ModeOfInheritancePredicate.autosomal_recessive` +* :func:`~gpsea.analysis.predicate.genotype.autosomal_dominant` +* :func:`~gpsea.analysis.predicate.genotype.autosomal_recessive` By default, the MoI predicates will use *all* variants recorded in the individual. However, a :class:`~gpsea.analysis.predicate.genotype.VariantPredicate` @@ -57,11 +57,11 @@ Assign individuals into genotype groups Here we show seting up a predicate for grouping individuals for differences between genotypes of a disease with an autosomal recessive MoI. -We use :class:`~gpsea.analysis.predicate.genotype.ModeOfInheritancePredicate.autosomal_recessive` +We use :class:`~gpsea.analysis.predicate.genotype.autosomal_recessive` to create the predicate: ->>> from gpsea.analysis.predicate.genotype import ModeOfInheritancePredicate ->>> gt_predicate = ModeOfInheritancePredicate.autosomal_recessive() +>>> from gpsea.analysis.predicate.genotype import autosomal_recessive +>>> gt_predicate = autosomal_recessive() >>> gt_predicate.display_question() 'What is the genotype group: HOM_REF, HET, BIALLELIC_ALT' @@ -88,6 +88,6 @@ when assigning the genotype group. We set up the variant predicate: and we use it to create the MoI predicate: ->>> gt_predicate = ModeOfInheritancePredicate.autosomal_recessive(is_missense) +>>> gt_predicate = autosomal_recessive(is_missense) >>> gt_predicate.display_question() 'What is the genotype group: HOM_REF, HET, BIALLELIC_ALT' diff --git a/docs/user-guide/stats.rst b/docs/user-guide/stats.rst index 3247a121..cb622526 100644 --- a/docs/user-guide/stats.rst +++ b/docs/user-guide/stats.rst @@ -136,8 +136,8 @@ to test if the variant leads to a frameshift (in this case): and then we choose the expected mode of inheritance to test. In case of *TBX5*, we expect the autosomal dominant mode of inheritance: ->>> from gpsea.analysis.predicate.genotype import ModeOfInheritancePredicate ->>> gt_predicate = ModeOfInheritancePredicate.autosomal_dominant(is_frameshift) +>>> from gpsea.analysis.predicate.genotype import autosomal_dominant +>>> gt_predicate = autosomal_dominant(is_frameshift) >>> gt_predicate.display_question() 'What is the genotype group: HOM_REF, HET' diff --git a/src/gpsea/analysis/predicate/genotype/__init__.py b/src/gpsea/analysis/predicate/genotype/__init__.py index b9bd8e8a..a27bc0fb 100644 --- a/src/gpsea/analysis/predicate/genotype/__init__.py +++ b/src/gpsea/analysis/predicate/genotype/__init__.py @@ -2,6 +2,7 @@ from ._api import VariantPredicate from ._counter import AlleleCounter from ._gt_predicates import groups_predicate, sex_predicate, diagnosis_predicate +from ._gt_predicates import autosomal_dominant, autosomal_recessive from ._gt_predicates import monoallelic_predicate, biallelic_predicate from ._gt_predicates import ModeOfInheritancePredicate from ._variant import VariantPredicates, ProteinPredicates @@ -9,6 +10,7 @@ __all__ = [ 'GenotypePolyPredicate', 'groups_predicate', 'sex_predicate', 'diagnosis_predicate', + 'autosomal_dominant', 'autosomal_recessive', 'monoallelic_predicate', 'biallelic_predicate', 'ModeOfInheritancePredicate', 'AlleleCounter', 'VariantPredicate', diff --git a/src/gpsea/analysis/predicate/genotype/_gt_predicates.py b/src/gpsea/analysis/predicate/genotype/_gt_predicates.py index 0ff4bf4c..c5dbe07f 100644 --- a/src/gpsea/analysis/predicate/genotype/_gt_predicates.py +++ b/src/gpsea/analysis/predicate/genotype/_gt_predicates.py @@ -1,19 +1,18 @@ import dataclasses import typing +import warnings from collections import defaultdict import hpotk -from gpsea.analysis.predicate.genotype._variant import VariantPredicates from gpsea.model import Patient, Sex from .._api import Categorization, PatientCategory from ._api import GenotypePolyPredicate from ._api import VariantPredicate from ._counter import AlleleCounter - -# TODO: implement __hash__, __eq__ on predicates +from ._variant import VariantPredicates class AlleleCountingGroupsPredicate(GenotypePolyPredicate): @@ -313,6 +312,47 @@ def biallelic_predicate( ) +def autosomal_dominant( + variant_predicate: typing.Optional[VariantPredicate] = None, +) -> GenotypePolyPredicate: + """ + Create a predicate that assigns the patient either + into homozygous reference or heterozygous + group in line with the autosomal dominant mode of inheritance. + + :param variant_predicate: a predicate for choosing the variants for testing + or `None` if all variants should be used. + """ + if variant_predicate is None: + variant_predicate = VariantPredicates.true() + + return ModeOfInheritancePredicate._from_moi_info( + variant_predicate=variant_predicate, + mode_of_inheritance_data=ModeOfInheritanceInfo.autosomal_dominant(), + ) + + +def autosomal_recessive( + variant_predicate: typing.Optional[VariantPredicate] = None, +) -> GenotypePolyPredicate: + """ + Create a predicate that assigns the patient either into + homozygous reference, heterozygous, or biallelic alternative allele + (homozygous alternative or compound heterozygous) + group in line with the autosomal recessive mode of inheritance. + + :param variant_predicate: a predicate for choosing the variants for testing + or `None` if all variants should be used + """ + if variant_predicate is None: + variant_predicate = VariantPredicates.true() + + return ModeOfInheritancePredicate._from_moi_info( + variant_predicate=variant_predicate, + mode_of_inheritance_data=ModeOfInheritanceInfo.autosomal_recessive(), + ) + + @dataclasses.dataclass(eq=True, frozen=True) class GenotypeGroup: allele_count: int @@ -445,7 +485,7 @@ class ModeOfInheritancePredicate(GenotypePolyPredicate): @staticmethod def autosomal_dominant( variant_predicate: typing.Optional[VariantPredicate] = None, - ) -> "ModeOfInheritancePredicate": + ) -> GenotypePolyPredicate: """ Create a predicate that assigns the patient either into homozygous reference or heterozygous @@ -453,18 +493,18 @@ def autosomal_dominant( :param variant_predicate: a predicate for choosing the variants for testing. """ - if variant_predicate is None: - variant_predicate = VariantPredicates.true() - - return ModeOfInheritancePredicate._from_moi_info( - variant_predicate=variant_predicate, - mode_of_inheritance_data=ModeOfInheritanceInfo.autosomal_dominant(), + # TODO: remove before 1.0.0 + warnings.warn( + "Use `gpsea.analysis.predicate.genotype.autosomal_dominant` instead", + DeprecationWarning, stacklevel=2, ) + return autosomal_dominant(variant_predicate) + @staticmethod def autosomal_recessive( variant_predicate: typing.Optional[VariantPredicate] = None, - ) -> "ModeOfInheritancePredicate": + ) -> GenotypePolyPredicate: """ Create a predicate that assigns the patient either into homozygous reference, heterozygous, or biallelic alternative allele @@ -473,14 +513,14 @@ def autosomal_recessive( :param variant_predicate: a predicate for choosing the variants for testing. """ - if variant_predicate is None: - variant_predicate = VariantPredicates.true() - - return ModeOfInheritancePredicate._from_moi_info( - variant_predicate=variant_predicate, - mode_of_inheritance_data=ModeOfInheritanceInfo.autosomal_recessive(), + # TODO: remove before 1.0.0 + warnings.warn( + "Use `gpsea.analysis.predicate.genotype.autosomal_recessive` instead", + DeprecationWarning, stacklevel=2, ) + return autosomal_recessive(variant_predicate) + @staticmethod def _from_moi_info( variant_predicate: VariantPredicate, @@ -599,6 +639,12 @@ def test(self, patient: Patient) -> typing.Optional[Categorization]: else: return None + def __eq__(self, value: object) -> bool: + return isinstance(value, SexGenotypePredicate) + + def __hash__(self) -> int: + return 31 + INSTANCE = SexGenotypePredicate() @@ -662,6 +708,7 @@ def __init__( self._categorizations = tuple( sorted(categorizations.values(), key=lambda c: c.category.cat_id) ) + self._hash = hash(tuple(categorizations.items())) def get_categorizations(self) -> typing.Sequence[Categorization]: return self._categorizations @@ -686,6 +733,13 @@ def test(self, patient: Patient) -> typing.Optional[Categorization]: return None return categorization + + def __eq__(self, value: object) -> bool: + return isinstance(value, DiagnosisPredicate) \ + and self._id2cat == value._id2cat + + def __hash__(self) -> int: + return self._hash def diagnosis_predicate( diff --git a/tests/analysis/predicate/genotype/test_gt_predicates.py b/tests/analysis/predicate/genotype/test_gt_predicates.py index cd8e7ac6..425c63f5 100644 --- a/tests/analysis/predicate/genotype/test_gt_predicates.py +++ b/tests/analysis/predicate/genotype/test_gt_predicates.py @@ -7,9 +7,10 @@ sex_predicate, monoallelic_predicate, biallelic_predicate, + autosomal_dominant, + autosomal_recessive, VariantPredicates, VariantPredicate, - ModeOfInheritancePredicate, ) @@ -102,7 +103,7 @@ def test_autosomal_dominant( request: pytest.FixtureRequest, ): patient = request.getfixturevalue(patient_name) - predicate = ModeOfInheritancePredicate.autosomal_dominant(variant_predicate) + predicate = autosomal_dominant(variant_predicate) categorization = predicate.test(patient) @@ -125,7 +126,7 @@ def test_autosomal_dominant__with_default_predicate( request: pytest.FixtureRequest, ): patient = request.getfixturevalue(patient_name) - predicate = ModeOfInheritancePredicate.autosomal_dominant() + predicate = autosomal_dominant() categorization = predicate.test(patient) @@ -150,7 +151,7 @@ def test_autosomal_recessive( request: pytest.FixtureRequest, ): patient = request.getfixturevalue(patient_name) - predicate = ModeOfInheritancePredicate.autosomal_recessive(variant_predicate) + predicate = autosomal_recessive(variant_predicate) categorization = predicate.test(patient) @@ -175,7 +176,7 @@ def test_autosomal_recessive__with_default_predicate( request: pytest.FixtureRequest, ): patient = request.getfixturevalue(patient_name) - predicate = ModeOfInheritancePredicate.autosomal_recessive() + predicate = autosomal_recessive() categorization = predicate.test(patient) From d7109b43cbe7428a1bd1787204e68c118030e96f Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Thu, 12 Sep 2024 16:44:02 +0200 Subject: [PATCH 42/44] Improve legibility of the HPO MTC filter docs section. --- docs/user-guide/mtc.rst | 162 ++++++++++++++++++++++------------------ 1 file changed, 91 insertions(+), 71 deletions(-) diff --git a/docs/user-guide/mtc.rst b/docs/user-guide/mtc.rst index b01c95d1..7f47e228 100644 --- a/docs/user-guide/mtc.rst +++ b/docs/user-guide/mtc.rst @@ -192,76 +192,96 @@ We use static constructor :func:`~gpsea.analysis.mtc_filter.HpoMtcFilter.default for creating :class:`~gpsea.analysis.mtc_filter.HpoMtcFilter`. The constructor takes a threshold as an argument (e.g. 20% in the example above) and the method's logic is made up of 8 individual heuristics -designed to skip testing the HPO terms that are unlikely to yield significant or interesting results: - -+------------+-------------------+--------------------------------------------------------------------------------------------+ -| Code | Name | Description | -+------------+-------------------+--------------------------------------------------------------------------------------------+ -| `HMF01` | Skip terms that | The ``term_frequency_threshold`` determines the mininum proportion of individuals | -| | occur very rarely | with direct or indirect annotation by the HPO term to test. | -| | | We check each of the genotype groups (e.g., MISSENSE vs. not-MISSENSE), | -| | | and we only retain a term for testing if the proportion of individuals | -| | | in at least one genotype group is greater than | -| | | or equal to ``term_frequency_threshold``. | -| | | This is because of our assumption that even if there is statistical significance, | -| | | if a term is only seen in (for example) 7% of individuals | -| | | in the MISSENSE group and 2% in the not-MISSENSE group, | -| | | the term is unlikely to be of great interest because it is rare. | -+------------+-------------------+--------------------------------------------------------------------------------------------+ -| `HMF02` | Skip terms if | In a related heuristic, we skip terms if no genotype group has more | -| | no cell has more | than one count. This is not completely redundant with the previous condition, | -| | than one count | because some terms may have a small number of total observations. | -+------------+-------------------+--------------------------------------------------------------------------------------------+ -| `HMF03` | Skip terms if | Let's say a term such as | -| | all counts are | `Posterior polar cataract (HP:0001115) `_ | -| | identical | was observed in 7 of 11 individuals with MISSENSE variants | -| | to counts | and in 3 of 8 individuals with NONSENSE variants. | -| | for a child | If we find the same patient counts (7 of 11 and 3 of 8) in the parent term | -| | term | `Polar cataract HP:0010696 `_, | -| | | then we choose to not test the parent term. | -| | | | -| | | This is because the more specific an HPO term is, | -| | | the more information it has (the more interesting the correlation would be if it exists), | -| | | and the result of a test, such as the Fisher Exact test, would be exactly the same | -| | | for *Polar cataract* as for *Posterior polar cataract*. | -+------------+-------------------+--------------------------------------------------------------------------------------------+ -| `HMF04` | Skip terms if | If both (or all) of the genotype groups have the same proportion of individuals | -| | genotypes have | observed to be annotated to an HPO term, e.g., both are 50%, then skip the term, | -| | same HPO | because it is not possible that the Fisher exact test will return a significant result. | -| | proportions | | -+------------+-------------------+--------------------------------------------------------------------------------------------+ -| `HMF05` | Skip terms if | If one of the genotype groups has neither observed nor excluded observations | -| | there are no | for an HPO term, skip it. | -| | HPO observations | | -| | in a group | | -+------------+-------------------+--------------------------------------------------------------------------------------------+ -| `HMF06` | Skip term if | If the individuals are binned into 2 phenotype groups and 2 genotype groups (2x2) | -| | underpowered | and the total count of patients in all genotype-phenotype groups is less than 7, | -| | for 2x2 or 2x3 | or into 2 phenotype groups and 3 genotype groups (2x3) and the total count of patients | -| | analysis | is less than 6, then there is a lack even of the nominal statistical power | -| | | and the counts can never be significant. | -+------------+-------------------+--------------------------------------------------------------------------------------------+ -| `HMF07` | Skipping terms | The HPO has a number of other branches that describe modes of inheritance, | -| | that are not | past medical history, and clinical modifiers. | -| | descendents of | We do not think it makes much sense to test for enrichment of these terms, | -| | *Phenotypic* | so, all terms that are not descendants of | -| | *abnormality* | `Phenotypic abnormality `_ are filtered out. | -| | | | -+------------+-------------------+--------------------------------------------------------------------------------------------+ -| `HMF08` | Skipping | All the direct children of the root phenotype term | -| | "general" | `Phenotypic abnormality (HP:0000118) `_ | -| | level terms | are skipped, because of the assumption that if there is a valid signal, | -| | | it will derive from one of the more specific descendents. | -| | | | -| | | For instance, | -| | |`Abnormality of the nervous system `_ | -| | | (HP:0000707) is a child of *Phenotypic abnormality*, and this assumption implies | -| | | that if there is a signal from the nervous system, | -| | | it will lead to at least one of the descendents of | -| | | *Abnormality of the nervous system* being significant. | -| | | | -| | | See :ref:`general-hpo-terms` section for details. | -| | | | -+------------+-------------------+--------------------------------------------------------------------------------------------+ +designed to skip testing the HPO terms that are unlikely to yield significant or interesting results. +`HMF01` - Skip terms that occur very rarely +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The ``term_frequency_threshold`` determines the mininum proportion of individuals +with direct or indirect annotation by the HPO term to test. +We check each of the genotype groups (e.g., MISSENSE vs. not-MISSENSE), +and we only retain a term for testing if the proportion of individuals +in at least one genotype group is greater than +or equal to ``term_frequency_threshold``. +This is because of our assumption that even if there is statistical significance, +if a term is only seen in (for example) 7% of individuals +in the MISSENSE group and 2% in the not-MISSENSE group, +the term is unlikely to be of great interest because it is rare. + + +`HMF02` - Skip terms if no genotype group has more than one count +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +In a related heuristic, we skip terms if no genotype group has more +than one count. This is not completely redundant with the previous condition, +because some terms may have a small number of total observations. + + +`HMF03` - Skip terms if all counts are identical to counts for a child term +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Let's say a term such as +`Posterior polar cataract (HP:0001115) `_ +was observed in 7 of 11 individuals with MISSENSE variants +and in 3 of 8 individuals with NONSENSE variants. +If we find the same patient counts (7 of 11 and 3 of 8) in the parent term +`Polar cataract HP:0010696 `_, +then we choose to not test the parent term. + +This is because the more specific an HPO term is, +the more information it has (the more interesting the correlation would be if it exists), +and the result of a test, such as the Fisher Exact test, would be exactly the same +for *Polar cataract* as for *Posterior polar cataract*. + + +`HMF04` - Skip terms if genotypes have same HPO proportions +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +If both (or all) of the genotype groups have the same proportion of individuals +observed to be annotated to an HPO term, e.g., both are 50%, then skip the term, +because it is not possible that the Fisher exact test will return a significant result. + + +`HMF05` - Skip term if one of the genotype groups has neither observed nor excluded observations +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Skip terms if there are no HPO observations in a group. + + +`HMF06` - Skip term if underpowered for 2x2 or 2x3 analysis +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +If the individuals are binned into 2 phenotype groups and 2 genotype groups (2x2) +and the total count of patients in all genotype-phenotype groups is less than 7, +or into 2 phenotype groups and 3 genotype groups (2x3) and the total count of patients +is less than 6, then there is a lack even of the nominal statistical power +and the counts can never be significant. + + +`HMF07` - Skipping terms that are not descendents of *Phenotypic abnormality* +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The HPO has a number of other branches that describe modes of inheritance, +past medical history, and clinical modifiers. +We do not think it makes much sense to test for enrichment of these terms, +so, all terms that are not descendants of +`Phenotypic abnormality `_ are filtered out. + + +`HMF08` - Skipping "general" level terms +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +All the direct children of the root phenotype term +`Phenotypic abnormality (HP:0000118) `_ +are skipped, because of the assumption that if there is a valid signal, +it will derive from one of the more specific descendents. + +For instance, +`Abnormality of the nervous system `_ +(HP:0000707) is a child of *Phenotypic abnormality*, and this assumption implies +that if there is a signal from the nervous system, +it will lead to at least one of the descendents of +*Abnormality of the nervous system* being significant. + +See :ref:`general-hpo-terms` section for details. From 1b3c848ffb7ae54f031f9377bc0c2b579b7adf02 Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Thu, 12 Sep 2024 16:48:44 +0200 Subject: [PATCH 43/44] Prepare to unexport `ModeOfInheritancePredicate`. --- src/gpsea/analysis/predicate/genotype/__init__.py | 2 +- tests/conftest.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/gpsea/analysis/predicate/genotype/__init__.py b/src/gpsea/analysis/predicate/genotype/__init__.py index a27bc0fb..9bf0b26a 100644 --- a/src/gpsea/analysis/predicate/genotype/__init__.py +++ b/src/gpsea/analysis/predicate/genotype/__init__.py @@ -4,7 +4,7 @@ from ._gt_predicates import groups_predicate, sex_predicate, diagnosis_predicate from ._gt_predicates import autosomal_dominant, autosomal_recessive from ._gt_predicates import monoallelic_predicate, biallelic_predicate -from ._gt_predicates import ModeOfInheritancePredicate +from ._gt_predicates import ModeOfInheritancePredicate # TODO: remove before 1.0.0 from ._variant import VariantPredicates, ProteinPredicates __all__ = [ diff --git a/tests/conftest.py b/tests/conftest.py index 5459ce3f..900e7f45 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -9,7 +9,7 @@ from gpsea.analysis.mtc_filter import PhenotypeMtcResult from gpsea.analysis.pcats import HpoTermAnalysisResult -from gpsea.analysis.predicate.genotype import GenotypePolyPredicate, ModeOfInheritancePredicate, VariantPredicates +from gpsea.analysis.predicate.genotype import GenotypePolyPredicate, VariantPredicates, autosomal_dominant from gpsea.analysis.predicate.phenotype import PhenotypePolyPredicate, HpoPredicate from gpsea.io import GpseaJSONDecoder from gpsea.model import * @@ -119,7 +119,7 @@ def suox_mane_tx_id() -> str: def suox_gt_predicate( suox_mane_tx_id: str, ) -> GenotypePolyPredicate: - return ModeOfInheritancePredicate.autosomal_dominant( + return autosomal_dominant( variant_predicate=VariantPredicates.variant_effect( effect=VariantEffect.MISSENSE_VARIANT, tx_id=suox_mane_tx_id From df8359af8e8da38e25de48dcdfcd3427d1ed7d52 Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Fri, 13 Sep 2024 21:09:28 +0200 Subject: [PATCH 44/44] Make release `0.4.0`. --- docs/conf.py | 4 ++-- src/gpsea/__init__.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 436b59e1..c311c2f8 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -60,9 +60,9 @@ # built documents. # # The short X.Y version. -version = u'0.3' +version = u'0.4' # The full version, including alpha/beta/rc tags. -release = u'0.3.1.dev0' +release = u'0.4.0' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/src/gpsea/__init__.py b/src/gpsea/__init__.py index 327b371c..297ea01d 100644 --- a/src/gpsea/__init__.py +++ b/src/gpsea/__init__.py @@ -2,4 +2,4 @@ GPSEA is a library for analyzing genotype-phenotype correlations in cohorts of rare disease patients. """ -__version__ = "0.3.1.dev0" +__version__ = "0.4.0"
Using HPO MTC filter, 243 term(s) were omitted from statistical analysis.Using HPO MTC filter, 244 term(s) were omitted from statistical analysis.
Code
TODOSkipping term with only 2 observations (not powered for 2x2)51
TODOSkipping term because all genotypes have same HPO observed proportions50
TODOSkipping general term44
TODOSkipping term with only 3 observations (not powered for 2x2)27
TODOSkipping term with only 6 observations (not powered for 2x2)19HMF01Skipping term with maximum frequency that was less than threshold 0.210
TODOSkipping term with only 4 observations (not powered for 2x2)16HMF02Skipping term because no genotype has more than one observed HPO count4
TODOSkipping term with maximum frequency that was less than threshold 0.210HMF03Skipping term because of a child term with the same individual counts1
TODOSkipping term with only 5 observations (not powered for 2x2)10HMF04Skipping term because all genotypes have same HPO observed proportions50
TODOSkipping term with only 1 observations (not powered for 2x2)7HMF05Skipping term because one genotype had zero observations5
TODOSkipping term because one genotype had zero observations5HMF06Skipping term with less than 7 observations (not powered for 2x2)130
TODOSkipping term because no genotype has more than one observed HPO count4HMF08Skipping general term44
Reason Count
TODO{{ skipped.code }} {{ skipped.reason }} {{ skipped.count }}