diff --git a/.github/workflows/pages.yml b/.github/workflows/pages.yml index 41e7661f5..e2d72f8c2 100644 --- a/.github/workflows/pages.yml +++ b/.github/workflows/pages.yml @@ -52,6 +52,7 @@ jobs: cd docs sphinx-apidoc --separate --module-first -d 2 -H "API reference" --follow-links -o apidocs ../src/gpsea make clean html + # make clean html 2>&1 > /dev/null | less mv _build/html/* ../gh-pages/${DOCDIR} cd .. diff --git a/docs/tutorial.rst b/docs/tutorial.rst index ae7746260..35cabc3d5 100644 --- a/docs/tutorial.rst +++ b/docs/tutorial.rst @@ -42,8 +42,8 @@ A typical GPSEA analysis will consist of several steps. Starting with a collecti we perform input Q/C and functional variant annotation to prepare a cohort. With the cohort on hand, we generate reports with summary statistics, variant distributions, and most common Human Phenotype Ontology (HPO) terms or measurements. -We then configure the methods for partitioning the cohort into genotype and phenotype groups, -to test for possible associations between the groups. +We then configure the methods for partitioning the cohort into genotype and phenotype classes, +to test for possible associations between the classes. We finalize the analysis by statistical testing and evaluation of the results. @@ -240,54 +240,56 @@ with one or more variant alleles (*Count*): Partition the cohort by genotype and phenotype ============================================== -To test for genotype-phenotype associations, we need to partition the cohort into subsets. -In GPSEA, we always assign a cohort member into a genotype group, -where each individual is assigned into a single group and the groups do not overlap. -The phenotype is then used to either assign into a group or to calculate a numeric score. +To test for genotype-phenotype associations, we need to divide the cohort into classes. +In GPSEA, we always assign a cohort member into a genotype class, +where each individual is assigned into a single class and the classes do not overlap. +The phenotype is then used to either assign an individual into a class, +or to calculate a numeric score or survival. Partition by genotype --------------------- -In context of the tutorial, we assign each cohort member into a group +In context of the tutorial, we assign each cohort member into a class depending on presence of a single allele of a missense or truncating variant (e.g. frameshift, stop gain, or splice site region): >>> from gpsea.model import VariantEffect ->>> from gpsea.analysis.predicate.genotype import VariantPredicates, monoallelic_predicate ->>> is_missense = VariantPredicates.variant_effect(VariantEffect.MISSENSE_VARIANT, tx_id) +>>> from gpsea.analysis.predicate import variant_effect, anyof +>>> from gpsea.analysis.clf import monoallelic_classifier +>>> is_missense = variant_effect(VariantEffect.MISSENSE_VARIANT, tx_id) >>> truncating_effects = ( ... VariantEffect.FRAMESHIFT_VARIANT, ... VariantEffect.STOP_GAINED, ... VariantEffect.SPLICE_DONOR_VARIANT, ... VariantEffect.SPLICE_ACCEPTOR_VARIANT, ... ) ->>> is_truncating = VariantPredicates.any(VariantPredicates.variant_effect(e, tx_id) for e in truncating_effects) ->>> gt_predicate = monoallelic_predicate( +>>> is_truncating = anyof(variant_effect(e, tx_id) for e in truncating_effects) +>>> gt_clf = monoallelic_classifier( ... a_predicate=is_missense, ... b_predicate=is_truncating, ... a_label="Missense", b_label="Truncating", ... 
) ->>> gt_predicate.group_labels +>>> gt_clf.class_labels ('Missense', 'Truncating') This is a lot of code, and detailed explanations and examples are available in the :ref:`partitioning` section. -For now, it is enough to know that the `gt_predicate` will assign the individuals -into `Missense` or `Truncating` group. The individuals with the number of missense (or truncating) variants +For now, it is enough to know that the `gt_clf` will assign the individuals +into `Missense` or `Truncating` class. The individuals with the number of missense (or truncating) variants different than one will be omitted from the analysis. Partition by phenotype ---------------------- -We use HPO terms to assign the individuals into phenotype groups, +We use HPO terms to assign the individuals into phenotype classes, according to the term's presence or exclusion. The testing leverages the :ref:`true-path-rule` of ontologies. -We now prepare the predicates for assigning into phenotype groups: +We now prepare the classifiers for assigning into phenotype classes: ->>> from gpsea.analysis.predicate.phenotype import prepare_predicates_for_terms_of_interest ->>> pheno_predicates = prepare_predicates_for_terms_of_interest( +>>> from gpsea.analysis.clf import prepare_classifiers_for_terms_of_interest +>>> pheno_clfs = prepare_classifiers_for_terms_of_interest( ... cohort=cohort, ... hpo=hpo, ... ) @@ -299,7 +301,7 @@ Multiple testing correction By default, GPSEA performs a test for each HPO term used to annotate at least one individual in the cohort, and there are 369 such terms in *TBX5* cohort: ->>> len(pheno_predicates) +>>> len(pheno_clfs) 369 However, testing multiple hypothesis on the same dataset increases the chance of receiving false positive result. @@ -329,8 +331,8 @@ Now we can perform the testing and evaluate the results. >>> result = analysis.compare_genotype_vs_phenotypes( ... cohort=cohort, -... gt_predicate=gt_predicate, -... pheno_predicates=pheno_predicates, +... gt_clf=gt_clf, +... pheno_clfs=pheno_clfs, ... ) >>> result.total_tests 17 diff --git a/docs/user-guide/analyses/index.rst b/docs/user-guide/analyses/index.rst index 58fdabc02..d0aab1689 100644 --- a/docs/user-guide/analyses/index.rst +++ b/docs/user-guide/analyses/index.rst @@ -17,7 +17,7 @@ and explanations of how they are implemented by our software. :caption: Contents: partitioning/index - phenotype-groups + phenotype-classes mtc phenotype-scores measurements diff --git a/docs/user-guide/analyses/measurements.rst b/docs/user-guide/analyses/measurements.rst index c555d54a0..4661b2ccb 100644 --- a/docs/user-guide/analyses/measurements.rst +++ b/docs/user-guide/analyses/measurements.rst @@ -1,9 +1,9 @@ .. _measurement-stat: -========================== -Compare measurement values -========================== +==================== +Compare measurements +==================== **************** @@ -55,21 +55,21 @@ Missense should *NOT* be severe. TODO - create real predicate. >>> from gpsea.model import VariantEffect ->>> from gpsea.analysis.predicate.genotype import VariantPredicates ->>> is_missense = VariantPredicates.variant_effect(VariantEffect.MISSENSE_VARIANT, tx_id=tx_id) +>>> from gpsea.analysis.predicate import variant_effect +>>> is_missense = variant_effect(VariantEffect.MISSENSE_VARIANT, tx_id=tx_id) >>> is_missense.description 'MISSENSE_VARIANT on NM_000500.9' Assuming AR inheritance, we compare missense vs. 
rest: ->>> from gpsea.analysis.predicate.genotype import biallelic_predicate ->>> gt_predicate = biallelic_predicate( +>>> from gpsea.analysis.clf import biallelic_classifier +>>> gt_clf = biallelic_classifier( ... a_predicate=is_missense, ... b_predicate=~is_missense, ... a_label="Missense", b_label="Other", ... partitions=({0,}, {1, 2}), ... ) ->>> gt_predicate.group_labels +>>> gt_clf.class_labels ('Missense/Missense', 'Missense/Other OR Other/Other') Phenotype score @@ -118,7 +118,7 @@ We execute the analysis by running >>> result = score_analysis.compare_genotype_vs_phenotype_score( ... cohort=cohort, -... gt_predicate=gt_predicate, +... gt_clf=gt_clf, ... pheno_scorer=pheno_scorer, ... ) @@ -140,7 +140,7 @@ individual 14[PMID_30968594_individual_14] 1 664.0 Prepare genotype category legend: ->>> gt_id_to_name = {c.category.cat_id: c.category.name for c in gt_predicate.get_categorizations()} +>>> gt_id_to_name = {c.category.cat_id: c.category.name for c in gt_clf.get_categorizations()} >>> gt_id_to_name {0: 'Missense/Missense', 1: 'Missense/Other OR Other/Other'} diff --git a/docs/user-guide/analyses/partitioning/genotype/allele_count.rst b/docs/user-guide/analyses/partitioning/genotype/allele_count.rst index eb66450f4..00717536c 100644 --- a/docs/user-guide/analyses/partitioning/genotype/allele_count.rst +++ b/docs/user-guide/analyses/partitioning/genotype/allele_count.rst @@ -16,14 +16,15 @@ with the variant category analysis described in the :ref:`variant-category` sect The allele count analysis compares the allele counts while keeping the variant type constant, whereas the variant category analysis compares variant types while keeping the total allele count constant. -A predicate for the allele count analysis is created with -the :func:`~gpsea.analysis.predicate.genotype.allele_count` function. +A classifier for the allele count analysis is created with +the :func:`~gpsea.analysis.clf.allele_count` function. The function needs two arguments: -``target`` with a :class:`~gpsea.analysis.predicate.genotype.VariantPredicate` for selecting the target variant type, +``target`` with a :class:`~gpsea.analysis.predicate.VariantPredicate` for selecting the target variant type, and ``counts`` with the target allele counts. The ``target`` variant predicate typically selects a broad variant types - for instance, all variants that affect a gene of interest. The ``counts`` take the allele counts to compare. -The inputs are best illustrated on a few examples. + +Let's show a few examples. ******** @@ -33,18 +34,19 @@ Examples Compare the individuals with *EGFR* mutation ============================================ -We can use the allele count analysis to test for G/P associations between the individuals with no mutation in *EGFR* +We can use the allele count analysis to test for G/P associations +between the individuals with no mutation in *EGFR* and the individuals harboring 1 variant allele. 
-First, let's create a :class:`~gpsea.analysis.predicate.genotype.VariantPredicate` +First, let's create a :class:`~gpsea.analysis.predicate.VariantPredicate` to select variants that affect *EGFR*: ->>> from gpsea.analysis.predicate.genotype import VariantPredicates ->>> affects_egfr = VariantPredicates.gene(symbol="EGFR") +>>> from gpsea.analysis.predicate import gene +>>> affects_egfr = gene(symbol="EGFR") >>> affects_egfr.description 'affects EGFR' -Next, we create allele count predicate to partition the individuals +Next, we create allele count classifier to partition the individuals based on presence of zero or one *EGFR* mutation allele: .. figure:: img/allele-count-zero-one.png @@ -53,22 +55,23 @@ based on presence of zero or one *EGFR* mutation allele: :width: 600px ->>> from gpsea.analysis.predicate.genotype import allele_count ->>> gt_predicate = allele_count( +>>> from gpsea.analysis.clf import allele_count +>>> gt_clf = allele_count( ... counts=(0, 1), ... target=affects_egfr, ... ) ->>> gt_predicate.group_labels +>>> gt_clf.class_labels ('0', '1') -We create the predicate with two arguments. -The ``counts`` argument takes a tuple of the target allele counts, -to partition the individuals based on zero or one allele. -The ``target`` takes a :class:`~gpsea.analysis.predicate.genotype.VariantPredicate` -selecting the target variants. The ``target`` is particularly important +The ``allele_count`` needs two inputs. +The ``counts`` takes a tuple of the target allele counts, +in order to partition the individuals based on zero or one allele. +The ``target`` takes a :class:`~gpsea.analysis.predicate.VariantPredicate` +to select the target variants. +The ``target`` is particularly important if the cohort members include variants in other genes than *EGFR*. -The resulting ``gt_predicate`` can partition a cohort along the genotype axis, +The resulting ``gt_clf`` can partition a cohort along the genotype axis, e.g. to compare the patient survivals in a `survival analysis `. @@ -80,8 +83,8 @@ As another example, let's partition individuals based on one or two alleles of a For this example, the target mutation is any mutation that affects *LMNA*: ->>> from gpsea.analysis.predicate.genotype import VariantPredicates ->>> affects_lmna = VariantPredicates.gene(symbol="LMNA") +>>> from gpsea.analysis.predicate import gene +>>> affects_lmna = gene(symbol="LMNA") >>> affects_lmna.description 'affects LMNA' @@ -94,16 +97,14 @@ and we will compare the individuals with one allele with those with two alleles: :width: 600px ->>> gt_predicate = allele_count( +>>> gt_clf = allele_count( ... counts=(1, 2), ... target=affects_lmna, ... ) ->>> gt_predicate.group_labels +>>> gt_clf.class_labels ('1', '2') -The predicate will partition the individuals into two groups: +The classifier assigns the individuals into one of two classes: those with one *LMNA* variant allele and those with two *LMNA* variant alleles. -The individual with other allele counts (e.g. `0` or `3`) will be excluded -from the analysis. - +Any cohort member with other allele counts (e.g. `0` or `3`) is ignored. diff --git a/docs/user-guide/analyses/partitioning/genotype/diagnosis.rst b/docs/user-guide/analyses/partitioning/genotype/diagnosis.rst index da25592eb..9c1174102 100644 --- a/docs/user-guide/analyses/partitioning/genotype/diagnosis.rst +++ b/docs/user-guide/analyses/partitioning/genotype/diagnosis.rst @@ -4,26 +4,26 @@ Group by diagnosis ================== -.. figure:: img/diagnosis-predicate.png - :alt: Diagnosis predicate +.. 
figure:: img/diagnosis-classifier.png + :alt: Diagnosis classifier :align: center :width: 600px We can easily compare individuals diagnosed with different diseases. -:func:`~gpsea.analysis.predicate.genotype.diagnosis_predicate` groups the individuals +:func:`~gpsea.analysis.clf.diagnosis_classifier` groups the individuals based on a diagnosis presence. ->>> from gpsea.analysis.predicate.genotype import diagnosis_predicate ->>> gt_predicate = diagnosis_predicate( +>>> from gpsea.analysis.clf import diagnosis_classifier +>>> gt_clf = diagnosis_classifier( ... diagnoses=('OMIM:154700', 'OMIM:129600'), ... labels=('Marfan syndrome', 'Ectopia lentis, familial'), ... ) ->>> gt_predicate.group_labels +>>> gt_clf.class_labels ('OMIM:154700', 'OMIM:129600') -The predicate takes two or more disease identifiers (`diagnoses`) as well as their names (`labels`), -and it assigns the individuals based on their diagnoses. +The classifier takes two or more disease identifiers (`diagnoses`) as well as their names (`labels`), +and it classifies the individuals based on their diagnoses. Note, the assignment must be unambiguous; any individual labeled with two or more target diagnoses (e.g. an individual diagnosed with both *Marfan syndrome* and *Ectopia lentis, familial* in the example above) diff --git a/docs/user-guide/analyses/partitioning/genotype/img/allele-count-zero-one.png b/docs/user-guide/analyses/partitioning/genotype/img/allele-count-zero-one.png index 3bae270d2..144bf45e2 100644 Binary files a/docs/user-guide/analyses/partitioning/genotype/img/allele-count-zero-one.png and b/docs/user-guide/analyses/partitioning/genotype/img/allele-count-zero-one.png differ diff --git a/docs/user-guide/analyses/partitioning/genotype/img/allele-count.png b/docs/user-guide/analyses/partitioning/genotype/img/allele-count.png deleted file mode 100644 index 4f115b3a9..000000000 Binary files a/docs/user-guide/analyses/partitioning/genotype/img/allele-count.png and /dev/null differ diff --git a/docs/user-guide/analyses/partitioning/genotype/img/biallelic-classifier-w-partitions.png b/docs/user-guide/analyses/partitioning/genotype/img/biallelic-classifier-w-partitions.png new file mode 100644 index 000000000..793eb7f35 Binary files /dev/null and b/docs/user-guide/analyses/partitioning/genotype/img/biallelic-classifier-w-partitions.png differ diff --git a/docs/user-guide/analyses/partitioning/genotype/img/biallelic-classifier.png b/docs/user-guide/analyses/partitioning/genotype/img/biallelic-classifier.png new file mode 100644 index 000000000..3075ab925 Binary files /dev/null and b/docs/user-guide/analyses/partitioning/genotype/img/biallelic-classifier.png differ diff --git a/docs/user-guide/analyses/partitioning/genotype/img/biallelic-predicate-w-partitions.png b/docs/user-guide/analyses/partitioning/genotype/img/biallelic-predicate-w-partitions.png deleted file mode 100644 index b735f11b0..000000000 Binary files a/docs/user-guide/analyses/partitioning/genotype/img/biallelic-predicate-w-partitions.png and /dev/null differ diff --git a/docs/user-guide/analyses/partitioning/genotype/img/biallelic-predicate.png b/docs/user-guide/analyses/partitioning/genotype/img/biallelic-predicate.png deleted file mode 100644 index 4f39ec8c3..000000000 Binary files a/docs/user-guide/analyses/partitioning/genotype/img/biallelic-predicate.png and /dev/null differ diff --git a/docs/user-guide/analyses/partitioning/genotype/img/diagnosis-classifier.png b/docs/user-guide/analyses/partitioning/genotype/img/diagnosis-classifier.png new file mode 
100644 index 000000000..a994d2a20 Binary files /dev/null and b/docs/user-guide/analyses/partitioning/genotype/img/diagnosis-classifier.png differ diff --git a/docs/user-guide/analyses/partitioning/genotype/img/diagnosis-predicate.png b/docs/user-guide/analyses/partitioning/genotype/img/diagnosis-predicate.png deleted file mode 100644 index aa74dda4d..000000000 Binary files a/docs/user-guide/analyses/partitioning/genotype/img/diagnosis-predicate.png and /dev/null differ diff --git a/docs/user-guide/analyses/partitioning/genotype/img/monoallelic-classifier.png b/docs/user-guide/analyses/partitioning/genotype/img/monoallelic-classifier.png new file mode 100644 index 000000000..3dfc93e3f Binary files /dev/null and b/docs/user-guide/analyses/partitioning/genotype/img/monoallelic-classifier.png differ diff --git a/docs/user-guide/analyses/partitioning/genotype/img/monoallelic-predicate.png b/docs/user-guide/analyses/partitioning/genotype/img/monoallelic-predicate.png deleted file mode 100644 index af22c179e..000000000 Binary files a/docs/user-guide/analyses/partitioning/genotype/img/monoallelic-predicate.png and /dev/null differ diff --git a/docs/user-guide/analyses/partitioning/genotype/img/sex-classifier.png b/docs/user-guide/analyses/partitioning/genotype/img/sex-classifier.png new file mode 100644 index 000000000..33f6cd4d4 Binary files /dev/null and b/docs/user-guide/analyses/partitioning/genotype/img/sex-classifier.png differ diff --git a/docs/user-guide/analyses/partitioning/genotype/img/sex-predicate.png b/docs/user-guide/analyses/partitioning/genotype/img/sex-predicate.png deleted file mode 100644 index a00113278..000000000 Binary files a/docs/user-guide/analyses/partitioning/genotype/img/sex-predicate.png and /dev/null differ diff --git a/docs/user-guide/analyses/partitioning/genotype/index.rst b/docs/user-guide/analyses/partitioning/genotype/index.rst index 40ec2ce5c..3ef0ff046 100644 --- a/docs/user-guide/analyses/partitioning/genotype/index.rst +++ b/docs/user-guide/analyses/partitioning/genotype/index.rst @@ -1,20 +1,20 @@ -.. _genotype-predicates: +.. _genotype-classifiers: -################### -Genotype predicates -################### +#################### +Genotype classifiers +#################### -A genotype predicate partitions the individuals based on their genotype. -In GPSEA, genotype predicates leverage information from one of the three areas: +A genotype classifier assigns an individual into a class based on their genotype. +In GPSEA, genotype classifiers leverage information from one of the three areas: * Sex * Disease diagnosis * Presence of variant(s) that meet certain inclusion criteria (e.g. a missense variant in heterozygous genotype) -Partitioning based on sex or disease diagnosis is relatively straightforward - the individuals +Classification based on sex or disease diagnosis is relatively straightforward - the individuals are assigned by the biological sex or presence of a specific diagnosis. -See :ref:`group-by-sex` and :ref:`group-by-diagnosis` for more details. +See :ref:`group-by-sex` and :ref:`group-by-diagnosis` for more details. 
Partitioning based on variants is, however, much more flexible, to support the analysis of the broad spectrum of pathomechanisms diff --git a/docs/user-guide/analyses/partitioning/genotype/sex.rst b/docs/user-guide/analyses/partitioning/genotype/sex.rst index 320844367..6a7da2bd6 100644 --- a/docs/user-guide/analyses/partitioning/genotype/sex.rst +++ b/docs/user-guide/analyses/partitioning/genotype/sex.rst @@ -4,23 +4,23 @@ Group by sex ============ -.. figure:: img/sex-predicate.png - :alt: Sex predicate +.. figure:: img/sex-classifier.png + :alt: Sex classifier :align: center :width: 600px It is easy to investigate the differences between males and females. -The :func:`~gpsea.analysis.predicate.genotype.sex_predicate` partitions -the individuals based on their :class:`~gpsea.model.Sex`: +The :func:`~gpsea.analysis.clf.sex_classifier` assigns +an individual into a class based on the :class:`~gpsea.model.Sex`: ->>> from gpsea.analysis.predicate.genotype import sex_predicate ->>> gt_predicate = sex_predicate() ->>> gt_predicate.group_labels +>>> from gpsea.analysis.clf import sex_classifier +>>> gt_clf = sex_classifier() +>>> gt_clf.class_labels ('FEMALE', 'MALE') The individuals with :class:`~gpsea.model.Sex.UNKNOWN_SEX` will be omitted from the analysis. -Note that we implemented this predicate as a genotype predicate. +Note that we implemented this classification as a :class:`~gpsea.analysis.clf.GenotypeClassifier`. Currently, it is not possible to compare the distribution of genotypes across sexes. diff --git a/docs/user-guide/analyses/partitioning/genotype/variant_category.rst b/docs/user-guide/analyses/partitioning/genotype/variant_category.rst index 963b41a07..a5153e915 100644 --- a/docs/user-guide/analyses/partitioning/genotype/variant_category.rst +++ b/docs/user-guide/analyses/partitioning/genotype/variant_category.rst @@ -11,41 +11,41 @@ with those harboring :math:`AC_{B}=1` (where :math:`B` is e.g. a missense mutati Similarly, in an autosomal recessive disease, we may be interested in comparing the individuals with :math:`AC_{A} \ge 1` with those with :math:`AC_{A} = 0`. In both analyses, we compare two variant categories :math:`A` and :math:`B` -which are described by a :class:`~gpsea.analysis.predicate.genotype.VariantPredicate` +which are described by a :class:`~gpsea.analysis.predicate.VariantPredicate` (see :ref:`variant-predicates` section), while ensuring the allele count sum of both variant categories is :math:`k`. :math:`k = \sum_{i \in \{A, B\}} AC_{i}` -GPSEA provides two predicates: +GPSEA provides two genotype classifiers: .. 
table:: - +-----------+-----------------------+------------------------------------------------------------------+ - | :math:`k` | Name | Function | - +===========+=======================+==================================================================+ - | 1 | Monoallelic predicate | :func:`~gpsea.analysis.predicate.genotype.monoallelic_predicate` | - +-----------+-----------------------+------------------------------------------------------------------+ - | 2 | Biallelic predicate | :func:`~gpsea.analysis.predicate.genotype.biallelic_predicate` | - +-----------+-----------------------+------------------------------------------------------------------+ + +-----------+------------------------+------------------------------------------------------------------+ + | :math:`k` | Name | Function | + +===========+========================+==================================================================+ + | 1 | Monoallelic classifier | :func:`~gpsea.analysis.clf.monoallelic_classifier` | + +-----------+------------------------+------------------------------------------------------------------+ + | 2 | Biallelic classifier | :func:`~gpsea.analysis.clf.biallelic_classifier` | + +-----------+------------------------+------------------------------------------------------------------+ -.. _monoallelic-predicate: +.. _monoallelic-classifier: -********************* -Monoallelic predicate -********************* +********************** +Monoallelic classifier +********************** -Monoallelic predicate compares the individuals who have *one* allele of a variants of interest. -The predicate uses a pair of variant predicates, `A` and `B`, +Monoallelic classifier compares the individuals who have *one* allele of a variants of interest. +The classifier uses a pair of variant predicates, `A` and `B`, to compute the allele counts :math:`AC_{A}` and :math:`AC_{B}`, -in order to assign an individual into a genotype group. +in order to assign an individual into a genotype class. -.. table:: Monoallelic predicate genotype groups +.. table:: Monoallelic genotype classes +-----------------+-------------------+-------------------+ - | Genotype group | :math:`AC_{A}` | :math:`AC_{B}` | + | Genotype class | :math:`AC_{A}` | :math:`AC_{B}` | +=================+===================+===================+ | A | 1 | 0 | +-----------------+-------------------+-------------------+ @@ -57,8 +57,8 @@ in order to assign an individual into a genotype group. The individuals with :math:`\sum_{i \in \{A, B\}} AC_{i} \neq 1` are omitted from the analysis. -.. figure:: img/monoallelic-predicate.png - :alt: Monoallelic predicate +.. figure:: img/monoallelic-classifier.png + :alt: Monoallelic classifier :align: center :width: 600px @@ -66,7 +66,7 @@ are omitted from the analysis. Example ======= -Let's create a predicate to categorize the individuals +Let's create a classifier to categorize the individuals to those having one missense allele or to those having one frameshift allele with respect to fictional transcript ``NM_1234.5``. 
@@ -76,11 +76,11 @@ We start by creating the variant predicates for missense (`A`) and frameshift (`B`) variants: >>> from gpsea.model import VariantEffect ->>> from gpsea.analysis.predicate.genotype import VariantPredicates ->>> is_missense = VariantPredicates.variant_effect(effect=VariantEffect.MISSENSE_VARIANT, tx_id=tx_id) ->>> is_frameshift = VariantPredicates.variant_effect(effect=VariantEffect.FRAMESHIFT_VARIANT, tx_id=tx_id) +>>> from gpsea.analysis.predicate import variant_effect +>>> is_missense = variant_effect(effect=VariantEffect.MISSENSE_VARIANT, tx_id=tx_id) +>>> is_frameshift = variant_effect(effect=VariantEffect.FRAMESHIFT_VARIANT, tx_id=tx_id) -Monoallelic predicate lets us customize the category names. +Monoallelic classifier lets us customize the category names. Let's use `Missense` and `Frameshift` instead of the defaults: >>> a_label = "Missense" @@ -88,39 +88,40 @@ Let's use `Missense` and `Frameshift` instead of the defaults: Now we have all we need to create the predicate: ->>> from gpsea.analysis.predicate.genotype import monoallelic_predicate ->>> gt_predicate = monoallelic_predicate( +>>> from gpsea.analysis.clf import monoallelic_classifier +>>> gt_clf = monoallelic_classifier( ... a_predicate=is_missense, ... b_predicate=is_frameshift, ... a_label=a_label, b_label=b_label, ... ) ->>> gt_predicate.group_labels +>>> gt_clf.class_labels ('Missense', 'Frameshift') -.. _biallelic-predicate: +.. _biallelic-classifier: -******************* -Biallelic predicate -******************* +******************** +Biallelic classifier +******************** -Biallelic predicate compares the individuals with *two* alleles of the variants of interest. -The functionality is very similar to that of monoallelic predicate, with two differences. +Biallelic classifier compares the individuals with *two* alleles of the variants of interest. +The functionality is very similar to that of monoallelic classifier, with two differences: +(1) genotype classes and (2) partitions. -Categories -========== +Genotype classes +================ Biallelic locus can be present in one of three genotypes, allowing an individual -to be assigned into one of the three genotype groups: +to be assigned into one of the three genotype classes: -.. _biallelic-predicate-gt-groups: +.. _biallelic-gt-classes: -.. table:: Biallelic predicate genotype groups +.. table:: Biallelic genotype classes +-------+----------------+-------------------+-------------------+ - | Index | Genotype group | :math:`AC_{A}` | :math:`AC_{B}` | + | Index | Genotype class | :math:`AC_{A}` | :math:`AC_{B}` | +=======+================+===================+===================+ | 0 | A/A | 2 | 0 | +-------+----------------+-------------------+-------------------+ @@ -135,8 +136,8 @@ Note that :math:`\sum_{i \in \{A, B\}} AC_{i} = 2` and the individuals with a different allele count sum are omitted from the analysis. -.. figure:: img/biallelic-predicate.png - :alt: Biallelic predicate +.. figure:: img/biallelic-classifier.png + :alt: Biallelic classifier :align: center :width: 600px @@ -148,68 +149,68 @@ Let `A` and `B` correspond to *MISSENSE* and *FRAMESHIFT* variants, and let's reuse the variant predicates ``is_missense`` and ``is_frameshift`` from the previous section, to compare missense and frameshift variants in the context of an autosomal recessive disease. 
->>> from gpsea.analysis.predicate.genotype import biallelic_predicate ->>> gt_predicate = biallelic_predicate( +>>> from gpsea.analysis.clf import biallelic_classifier +>>> gt_clf = biallelic_classifier( ... a_predicate=is_missense, ... b_predicate=is_frameshift, ... a_label="Missense", b_label="Frameshift", ... ) ->>> gt_predicate.group_labels +>>> gt_clf.class_labels ('Missense/Missense', 'Missense/Frameshift', 'Frameshift/Frameshift') -The predicate will assign the individuals into one of three genotype groups: +The classifier assigns an individual into one of three genotype classes: -* `A/A` - two missense alleles -* `A/B` - one missense and one frameshift allele -* `B/B` - two frameshift alleles +* `Missense/Missense` - two missense alleles +* `Missense/Frameshift` - one missense and one frameshift allele +* `Frameshift/Frameshift` - two frameshift alleles Partitions ========== -Sometimes we are interested in lumping several genotype groups into a partition +Sometimes we are interested in lumping several genotype classes into a partition and then comparing the partitions. For instance, in the context of an autosomal recessive disease, we may want to compare individuals with two "mild" mutations with the individuals with at least one "severe" mutation. This comparison can be implemented using the `partitions` option. -A partition is a set of one or more genotype group indices -(see :ref:`biallelic-predicate-gt-groups` table). -Then, the `partitions` option needs two or more partitions. -Typically, two partitions should be provided. +A partition is a set of one or more genotype class indices +(see :ref:`biallelic-gt-classes` table). +Then, two (or more) partitions are provided to the biallelic classifier +via the `partitions` option. For example, we can compare the individuals with two missense alleles with those harboring one frameshift and one missense alleles, or two frameshift alleles. Let `A` and `B` correspond to *MISSENSE* and *FRAMESHIFT* variant. -According to :ref:`biallelic-predicate-gt-groups` table, -the `A/A` genotype group corresponds to index `0`, -and the `A/B` and `B/B` genotype groups correspond to indices `1` and `2`, respectively. -We form the corresponding partitions as: +According to :ref:`biallelic-gt-classes` table, +the `A/A` genotype class corresponds to index `0`, +and the `A/B` and `B/B` genotype classes correspond to indices `1` and `2`, respectively. +We form the partitions accordingly: >>> partitions = (0, {1, 2}) -Using the ``partitions``, the biallelic predicate splits the individuals -into two groups: +With the ``partitions``, the biallelic classifier splits the individuals +into two classes: * two missense alleles * one missense alelele and one frameshift allele or two frameshift alleles -.. figure:: img/biallelic-predicate-w-partitions.png - :alt: Biallelic predicate with partitions +.. figure:: img/biallelic-classifier-w-partitions.png + :alt: Biallelic classifier with partitions :align: center :width: 600px -This is how to use the ``partitions`` in code: +Using the ``partitions`` in code is straightforward: ->>> gt_predicate = biallelic_predicate( +>>> gt_clf = biallelic_classifier( ... a_predicate=is_missense, ... b_predicate=is_frameshift, ... a_label="Missense", b_label="Frameshift", ... partitions=partitions, ...
) ->>> gt_predicate.group_labels +>>> gt_clf.class_labels ('Missense/Missense', 'Missense/Frameshift OR Frameshift/Frameshift') diff --git a/docs/user-guide/analyses/partitioning/genotype/variant_predicates.rst b/docs/user-guide/analyses/partitioning/genotype/variant_predicates.rst index cb3040f3d..588d3da27 100644 --- a/docs/user-guide/analyses/partitioning/genotype/variant_predicates.rst +++ b/docs/user-guide/analyses/partitioning/genotype/variant_predicates.rst @@ -6,14 +6,14 @@ Variant Predicates ================== -Variant predicate is a core component to partition a cohort using the genomic variants identified in the cohort members. -A :class:`~gpsea.analysis.predicate.genotype.VariantPredicate` +Variant predicate is a core component to classify a cohort using the genomic variants identified in the cohort members. +A :class:`~gpsea.analysis.predicate.VariantPredicate` tests if a :class:`~gpsea.model.Variant` meets the inclusion criteria. For instance, a predicate can test if a variant is a deletion, leads to a missense change, or overlaps with a protein domain. -An array of variant predicates is available as static methods -of the :class:`~gpsea.analysis.predicate.genotype.VariantPredicates` class. +An array of builtin variant predicates is available as functions +of the :mod:`gpsea.analysis.predicate` module. The predicates operate on several lines of information: @@ -32,7 +32,7 @@ The predicates operate on several lines of information: The scope of the builtin predicates is fairly narrow and likely insufficient for real-life analyses. -The predicates can, however, be chained into a compound predicate +However, the predicates can be chained into a compound predicate to achive more expressivity for testing complex conditions, such as "variant is a missense or synonymous variant located in exon 6 of `NM_013275.6`". @@ -42,7 +42,7 @@ Examples ******** Here we show examples of several simple variant predicates and -and how to combine them to test a complex condition. +how to chain them for testing complex conditions. Load cohort @@ -83,32 +83,32 @@ Let's use builtin predicates to verify the properties of the variant ``1_8358231 We can check that the variant overlaps with *RERE* ->>> from gpsea.analysis.predicate.genotype import VariantPredicates ->>> gene = VariantPredicates.gene('RERE') +>>> import gpsea.analysis.predicate as vp +>>> gene = vp.gene('RERE') >>> gene.test(variant) True it overlaps with the *MANE* transcript >>> rere_mane_tx_id = 'NM_001042681.2' ->>> tx = VariantPredicates.transcript(rere_mane_tx_id) +>>> tx = vp.transcript(rere_mane_tx_id) >>> tx.test(variant) True it in fact overlaps with the exon 20, ->>> exon20 = VariantPredicates.exon(exon=20, tx_id=rere_mane_tx_id) +>>> exon20 = vp.exon(exon=20, tx_id=rere_mane_tx_id) >>> exon20.test(variant) True and leads to a missense mutation with respect to the MANE transcript >>> from gpsea.model import VariantEffect ->>> missense = VariantPredicates.variant_effect(VariantEffect.MISSENSE_VARIANT, tx_id=rere_mane_tx_id) +>>> missense = vp.variant_effect(VariantEffect.MISSENSE_VARIANT, tx_id=rere_mane_tx_id) >>> missense.test(variant) True -See :class:`~gpsea.analysis.predicate.genotype.VariantPredicates` +See the :mod:`gpsea.analysis.predicate` module for a complete list of the builtin predicates. @@ -118,7 +118,8 @@ Predicate chain Using the builtin predicates, we can build a logical chain to test complex conditions. 
For instance, we can test if the variant meets any of several conditions: ->>> nonsense = VariantPredicates.variant_effect(VariantEffect.STOP_GAINED, tx_id=rere_mane_tx_id) +>>> import gpsea.analysis.predicate as vp +>>> nonsense = vp.variant_effect(VariantEffect.STOP_GAINED, tx_id=rere_mane_tx_id) >>> missense_or_nonsense = missense | nonsense >>> missense_or_nonsense.test(variant) True @@ -136,7 +137,7 @@ e.g. to test if the variant is a *"chromosomal deletion" or a deletion which rem >>> from gpsea.model import VariantClass >>> chromosomal_deletion = "SO:1000029" ->>> predicate = VariantPredicates.structural_type(chromosomal_deletion) | (VariantPredicates.variant_class(VariantClass.DEL) & VariantPredicates.change_length("<=", -50)) +>>> predicate = vp.structural_type(chromosomal_deletion) | (vp.variant_class(VariantClass.DEL) & vp.change_length("<=", -50)) >>> predicate.description '(structural type is SO:1000029 OR (variant class is DEL AND change length <= -50))' @@ -154,7 +155,7 @@ for all variant effects except of :class:`~gpsea.model.VariantEffect.FRAMESHIFT_ ... VariantEffect.SYNONYMOUS_VARIANT, VariantEffect.MISSENSE_VARIANT, VariantEffect.INTRON_VARIANT, ... # and many more effects.. ... ) ->>> non_frameshift_predicate = VariantPredicates.all(VariantPredicates.variant_effect(eff, tx_id=rere_mane_tx_id) for eff in non_frameshift_effects) +>>> non_frameshift_predicate = vp.allof(vp.variant_effect(eff, tx_id=rere_mane_tx_id) for eff in non_frameshift_effects) However, this is clearly much better implemented by a logical *not* of a "is frameshift" predicate. @@ -164,7 +165,7 @@ and results in an inverted predicate. This is how we can use the predicate inversion to build the predicate for non-frameshift deletions: ->>> non_frameshift_del = ~VariantPredicates.variant_effect(VariantEffect.FRAMESHIFT_VARIANT, tx_id=rere_mane_tx_id) & VariantPredicates.variant_class(VariantClass.DEL) +>>> non_frameshift_del = ~vp.variant_effect(VariantEffect.FRAMESHIFT_VARIANT, tx_id=rere_mane_tx_id) & vp.variant_class(VariantClass.DEL) >>> non_frameshift_del.description '(NOT FRAMESHIFT_VARIANT on NM_001042681.2 AND variant class is DEL)' @@ -180,7 +181,7 @@ However, if a predicate seems to be missing, feel free to submit an issue in our `GitHub tracker `_, or to implement a custom predicate -by extending the :class:`~gpsea.analysis.predicate.genotype.VariantPredicate` class 😎. +by extending the :class:`~gpsea.analysis.predicate.VariantPredicate` class 😎. diff --git a/docs/user-guide/analyses/partitioning/index.rst b/docs/user-guide/analyses/partitioning/index.rst index 54c1f7083..cd4601734 100644 --- a/docs/user-guide/analyses/partitioning/index.rst +++ b/docs/user-guide/analyses/partitioning/index.rst @@ -4,32 +4,35 @@ Partitioning ############ -Searching for G/P associations usually requires to partition the individuals -into two or more discrete groups, to allow testing for the inter-group differences. -GPSEA reflects these requirements with its predicate API. +Searching for G/P associations usually requires to assign the individuals +into two or more discrete classes, to allow testing for the inter-class differences. +GPSEA reflects these requirements with its classifier API. -A *predicate* partitions an individual (:class:`~gpsea.model.Patient`) into one of several groups. -The groups must be *exclusive* - each individual must be assigned at most into one group. 
-In general, it is desirable that the groups cover all or at least the majority of the cohort being analyzed to maximize statistical power. -However, the predicate is allowed to return `None` if the individual cannot be assigned. +A *classifier* assigns an individual (:class:`~gpsea.model.Patient`) into one of several classes. +The classes must be *exclusive* - each individual must be assigned at most into one class. +In general, it is desirable that the classes cover all or at least the majority +of the cohort being analyzed to maximize statistical power. +However, the classifier is allowed to return `None` if the individual cannot be assigned. As a result, the individual will be omitted from the downstream analysis. -Predicates can be applied on both *genotype* and *phenotype*. -Genotype predicates (:class:`~gpsea.analysis.predicate.genotype.GenotypePolyPredicate`) -assign the individual into a group (mostly) based on the variant information, -while the phenotype predicates (:class:`~gpsea.analysis.predicate.phenotype.PhenotypePolyPredicate`) -decide on the group based on the HPO terms or a diagnosis. +Classifiers can be applied on both *genotype* and *phenotype*. +Genotype classifiers (:class:`~gpsea.analysis.clf.GenotypeClassifier`) +classify the individual (mostly) based on the variant information, +while the phenotype classifiers (:class:`~gpsea.analysis.clf.PhenotypeClassifier`) +decide on the class based on the HPO terms or a diagnosis. -Besides assigning an individual into a discrete group, -a continuous score (typically for phenotype) can also be computed. +Besides assigning an individual into a discrete phenotype class, +a continuous score or a survival can also be computed. A :class:`~gpsea.analysis.pscore.PhenotypeScorer` computes a phenotype score for an individual, and the differences in score distributions of genotype groups can be tested e.g. with Mann-Whitney U test. +An :class:`~gpsea.analysis.temporal.Endpoint` computes a survival +as time until death, disease onset, or onset of an HPO term. -Any GPSEA analysis needs a genotype predicate and a phenotype predicate/scorer. -The following sections show how to prepare the predicates and scorers -for your analyses. We also show a gallery with examples. +Any GPSEA analysis needs a genotype classifier and a phenotype classifier/scorer/endpoint. +The following sections show how to prepare the classifiers, scorers, and endpoints +for your analysis. We also show a gallery with examples. .. toctree:: diff --git a/docs/user-guide/analyses/partitioning/phenotype/hpo_predicate.rst b/docs/user-guide/analyses/partitioning/phenotype/hpo_predicate.rst index b259feff1..e11722f87 100644 --- a/docs/user-guide/analyses/partitioning/phenotype/hpo_predicate.rst +++ b/docs/user-guide/analyses/partitioning/phenotype/hpo_predicate.rst @@ -1,11 +1,11 @@ -.. _hpo-predicate: +.. _hpo-classifier: -HPO predicate -============= +HPO classifier +============== -When testing for presence or absence of an HPO term, the :class:`~gpsea.analysis.predicate.phenotype.HpoPredicate` +When testing for presence or absence of an HPO term, the :class:`~gpsea.analysis.clf.HpoClassifier` leverages the :ref:`true-path-rule` to take advantage of the HPO hierarchy. In result, an individual annotated with a term is implicitly annotated with all its ancestors. For instance, an individual annotated with `Ectopia lentis `_ @@ -16,10 +16,11 @@ is also annotated with `Abnormal lens morphology `_.
We need to load :class:`~hpotk.ontology.MinimalOntology` with HPO data to access the HPO hierarchy: @@ -28,19 +29,19 @@ We need to load :class:`~hpotk.ontology.MinimalOntology` with HPO data to access >>> store = hpotk.configure_ontology_store() >>> hpo = store.load_minimal_hpo(release='v2024-07-01') -and now we can set up a predicate to test for presence of *Abnormal lens morphology*: +and now we can set up the classifier to test for presence of *Abnormal lens morphology*: ->>> from gpsea.analysis.predicate.phenotype import HpoPredicate +>>> from gpsea.analysis.clf import HpoClassifier >>> query = hpotk.TermId.from_curie('HP:0000517') ->>> pheno_predicate = HpoPredicate( +>>> pheno_clf = HpoClassifier( ... hpo=hpo, ... query=query, ... ) ->>> pheno_predicate.name -'HPO Predicate' ->>> pheno_predicate.description +>>> pheno_clf.name +'HPO Classifier' +>>> pheno_clf.description 'Test for presence of Abnormal lens morphology [HP:0000517]' ->>> pheno_predicate.group_labels +>>> pheno_clf.class_labels ('Yes', 'No') @@ -48,21 +49,29 @@ and now we can set up a predicate to test for presence of *Abnormal lens morphol missing_implies_phenotype_excluded ---------------------------------- -In many cases, published reports of clinical data about individuals with rare diseases describes phenotypic features that were observed, but do not -provide a comprehensive list of features that were explicitly excluded. By default, GPSEA will only include features that are recorded as observed or excluded in a phenopacket. -Setting this argument to True will cause "n/a" entries to be set to "excluded". We provide this option for exploration but do not recommend its use for the -final analysis unless the assumption behind it is known to be true. +In many cases, published reports of clinical data about individuals with rare diseases +describe phenotypic features that were observed, but do not provide +a comprehensive list of features that were explicitly excluded. +By default, GPSEA will only include features that are recorded as observed or excluded in a phenopacket. + +However, setting ``missing_implies_excluded=True`` will cause "n/a" entries to be set to "excluded". +We provide this option for exploration but do not recommend its use +for the final analysis unless the assumption behind it is known to be true. -Predicates for all cohort phenotypes -==================================== +Classifiers for all cohort phenotypes +===================================== -Constructing phenotype predicates for all HPO terms of a cohort sounds a bit tedious. -The :func:`~gpsea.analysis.predicate.phenotype.prepare_predicates_for_terms_of_interest` +Constructing phenotype classifiers for all HPO terms of a cohort sounds a bit tedious. +The :func:`~gpsea.analysis.clf.prepare_classifiers_for_terms_of_interest` function cuts down the tedium. -For a given phenopacket collection (e.g. 156 patients with mutations in *WWOX* gene included in Phenopacket Store version `0.1.18`) + +Example +------- + +For a phenopacket collection (e.g. 156 patients with mutations in *WWOX* gene included in Phenopacket Store version `0.1.18`) >>> from ppktstore.registry import configure_phenopacket_registry >>> registry = configure_phenopacket_registry() @@ -79,14 +88,14 @@ processed into a cohort Individuals Processed: ... 
-we can create HPO predicates for testing all 369 HPO terms used in the cohort +we can create HPO classifiers for testing all 369 HPO terms used in the cohort: ->>> from gpsea.analysis.predicate.phenotype import prepare_predicates_for_terms_of_interest ->>> pheno_predicates = prepare_predicates_for_terms_of_interest( +>>> from gpsea.analysis.clf import prepare_classifiers_for_terms_of_interest +>>> pheno_clfs = prepare_classifiers_for_terms_of_interest( ... cohort=cohort, ... hpo=hpo, ... ) ->>> len(pheno_predicates) +>>> len(pheno_clfs) 369 and subject the predicates into further analysis, such as :class:`~gpsea.analysis.pcats.HpoTermAnalysis`. diff --git a/docs/user-guide/analyses/partitioning/phenotype/index.rst b/docs/user-guide/analyses/partitioning/phenotype/index.rst index aa0cbe49e..915ee4020 100644 --- a/docs/user-guide/analyses/partitioning/phenotype/index.rst +++ b/docs/user-guide/analyses/partitioning/phenotype/index.rst @@ -1,10 +1,10 @@ .. _phenotype-predicates-and-scorers: -################################ -Phenotype predicates and scorers -################################ +################################# +Phenotype classifiers and scorers +################################# -GPSEA offers several phenotype predicates. +GPSEA offers several phenotype classifiers. TODO -- separate explanations for HPO (Fisher), scores (Mann Whitney) and survival (log rank test). diff --git a/docs/user-guide/analyses/phenotype-groups.rst b/docs/user-guide/analyses/phenotype-classes.rst similarity index 77% rename from docs/user-guide/analyses/phenotype-groups.rst rename to docs/user-guide/analyses/phenotype-classes.rst index 92b87babc..af1c879eb 100644 --- a/docs/user-guide/analyses/phenotype-groups.rst +++ b/docs/user-guide/analyses/phenotype-classes.rst @@ -1,20 +1,20 @@ -.. _genotype-phenotype-groups: +.. _genotype-phenotype-classes: -===================================== -Compare genotype and phenotype groups -===================================== +====================================== +Compare genotype and phenotype classes +====================================== .. doctest:: :hide: >>> from gpsea import _overwrite -In this section, we show how to test the association between genotype and phenotype categories. +In this section, we show how to test the association between genotype and phenotype classes. We assume a cohort was preprocessed following the :ref:`input-data` section, -and we use predicates described in the :ref:`partitioning` to assign each cohort member -into a group along the genotype and phenotype axes. -We use Fisher exact test (FET) to test for differences between the groups +and we use classifiers described in the :ref:`partitioning` to assign each cohort member +into a class along the genotype and phenotype axes. +We use Fisher exact test (FET) to test for differences between the classes and we apply multiple testing correction to mitigate finding significant associations by chance. @@ -63,9 +63,9 @@ The ``p_value`` evaluates to `5.432292015291845e-06`, meaning there is a signifi The Fisher exact test evaluates whether the observed frequencies in a contingency table significantly deviate from the frequencies we would expect if there were no association between the variables. We want to test whether the frequency of `HP:0000486`` is significantly higher or lower in -one genotype group compared to what would be expected if there were no association. +one genotype class compared to what would be expected if there were no association. 
Note that by default, the *two-tailed* Fisher exact test is performed, meaning we have no -hypothesis as to whether there is a higher or lower frequency in one of the genotype groups. +hypothesis as to whether there is a higher or lower frequency in one of the genotype classes. However, we are typically interested in testing the associations between the genotype and multiple phenotypic features at once. GPSEA takes advantage of the HPO structure and simplifies the testing for all HPO terms encoded in the cohort. @@ -109,84 +109,95 @@ Configure analysis ================== We want to test the association between frameshift *TBX5* variants and phenotypic abnormalities. -GPSEA exposes a flexible predicate API that lets us create genotype and phenotype predicates +GPSEA exposes a flexible classifier API that lets us create genotype and phenotype classifiers to assign the cohort members into genotype and phenotype categories based on the variants -and the HPO terms. We need to create one genotype predicate and one or more phenotype predicates. +and the HPO terms. +We need to create one genotype classifier and one or more phenotype classifiers. -Genotype predicate ------------------- +Genotype classifier +------------------- -We want to separate the patients into two groups: a group *with* a frameshift variant -and a group *without* a frameshift variant (i.e. any other heterozygous variant). +We want to separate the patients into two classes: a class *with* a frameshift variant +and a class *without* a frameshift variant (i.e. any other heterozygous variant). We will use the *MANE* transcript for the analysis: >>> tx_id = 'NM_181486.4' -Building a genotype predicate is a two step process. -First, we create a :class:`~gpsea.analysis.predicate.genotype.VariantPredicate` +Building a genotype classifier is a two step process. +First, we create a :class:`~gpsea.analysis.predicate.VariantPredicate` to test if the variant is predicted to lead to a frameshift in `NM_181486.4`: >>> from gpsea.model import VariantEffect ->>> from gpsea.analysis.predicate.genotype import VariantPredicates ->>> is_frameshift = VariantPredicates.variant_effect(VariantEffect.FRAMESHIFT_VARIANT, tx_id) +>>> from gpsea.analysis.predicate import variant_effect +>>> is_frameshift = variant_effect(VariantEffect.FRAMESHIFT_VARIANT, tx_id) >>> is_frameshift.description 'FRAMESHIFT_VARIANT on NM_181486.4' -and then we wrap `is_frameshift` in a :class:`~gpsea.analysis.predicate.genotype.monoallelic_predicate` -to classify each *TBX5* cohort member either as an individual with one frameshift allele (`Frameshift`) -or as an idividual with one non-frameshift allele (`Other`): +.. note:: + + The :mod:`gpsea.analysis.predicate` documentation lists all available variant predicates + and :ref:`variant-predicates` exemplifies their usage. ->>> from gpsea.analysis.predicate.genotype import monoallelic_predicate ->>> gt_predicate = monoallelic_predicate( +To build a genotype classifier, we wrap `is_frameshift` +in a Monoallelic classifier (:class:`~gpsea.analysis.clf.monoallelic_classifier`), +to classify each *TBX5* cohort member either as an individual with one *frameshift* allele (`Frameshift`) +or as an individual with one *non-frameshift* allele (`Other`): + +>>> from gpsea.analysis.clf import monoallelic_classifier +>>> gt_clf = monoallelic_classifier( ... a_predicate=is_frameshift, ... a_label="Frameshift", ... b_label="Other", ... 
) ->>> gt_predicate.group_labels +>>> gt_clf.class_labels ('Frameshift', 'Other') -In the subsequent analysis, `gt_predicate` will assign a cohort member into the respective group. -Note, any patient with :math:`0` or :math:`\ge 2` alleles will be *omitted* from the analysis. +.. note:: + + See the :ref:`genotype-classifiers` for other genotype classifier examples. +In the subsequent analysis, `gt_clf` assigns an individual into a genotype class. +Note, any individual with :math:`0` or :math:`\ge 2` alleles will be *omitted* from the analysis. -Phenotype predicates --------------------- + +Phenotype classifiers +--------------------- We recommend testing the genotype phenotype association for all HPO terms that annotate the cohort members, while taking advantage of the HPO graph structure and of the :ref:`true-path-rule`. -We will use the :func:`~gpsea.analysis.predicate.phenotype.prepare_predicates_for_terms_of_interest` -utility function to generate phenotype predicates for all HPO terms. +We will use the :func:`~gpsea.analysis.clf.prepare_classifiers_for_terms_of_interest` +utility function to generate phenotype classifiers for all HPO terms. -The function needs HPO to prepare predicates, hence we need to load HPO: +The function needs HPO to prepare classifiers, hence we need to load HPO: >>> import hpotk >>> store = hpotk.configure_ontology_store() >>> hpo = store.load_minimal_hpo(release='v2024-07-01') -and then we can create the predicates +and then we can create the classifiers ->>> from gpsea.analysis.predicate.phenotype import prepare_predicates_for_terms_of_interest ->>> pheno_predicates = prepare_predicates_for_terms_of_interest( +>>> from gpsea.analysis.clf import prepare_classifiers_for_terms_of_interest +>>> pheno_clfs = prepare_classifiers_for_terms_of_interest( ... cohort=cohort, ... hpo=hpo, ... ) ->>> len(pheno_predicates) +>>> len(pheno_clfs) 369 The function finds 369 HPO terms that annotate at least one individual, including the *indirect* annotations whose presence is implied by the :ref:`true-path-rule`. -.. _phenotype-groups-statistical-analysis: +.. _phenotype-classes-statistical-analysis: Statistical analysis -------------------- We will use :ref:`fisher-exact-test` to test the association -between genotype and phenotype groups, as described previously. +between genotype and phenotype classes, as described previously. In the case of this cohort, we can test association between having a frameshift variant and one of 369 HPO terms. However, testing multiple hypotheses on the same dataset increases the risk of finding @@ -198,11 +209,12 @@ Phenotype MT filter selects a (sub)set of HPO terms for testing, for instance only the user-selected terms (see :class:`~gpsea.analysis.mtc_filter.SpecifiedTermsMtcFilter`) or the terms selected by :class:`~gpsea.analysis.mtc_filter.HpoMtcFilter`. -Multiple testing correction then adjusts the nominal p values for the increased risk +MTC then adjusts the nominal p values for the increased risk of false positive G/P associations. The available MTC procedures are listed in the :ref:`mtc-correction-procedures` section. -We must pick one of these to perform genotype-phenotype analysis. +We must choose a phenotype MT filter as well as a MTC procedure to perform genotype-phenotype analysis. + .. _default-hpo-analysis: @@ -210,7 +222,7 @@ Default analysis ^^^^^^^^^^^^^^^^ We recommend using HPO MT filter (:class:`~gpsea.analysis.mtc_filter.HpoMtcFilter`) as a phenotype MT filter -and Benjamini-Hochberg for multiple testing correction. 
+and Benjamini-Hochberg for MTC. The default analysis can be configured with :func:`~gpsea.analysis.pcats.configure_hpo_term_analysis` convenience method. >>> from gpsea.analysis.pcats import configure_hpo_term_analysis @@ -273,12 +285,12 @@ The ``analysis`` is identical to the one configured in the :ref:`default-hpo-ana Analysis ======== -We can now test associations between the genotype groups and the HPO terms: +We can now test associations between the genotype classes and the HPO terms: >>> result = analysis.compare_genotype_vs_phenotypes( ... cohort=cohort, -... gt_predicate=gt_predicate, -... pheno_predicates=pheno_predicates, +... gt_clf=gt_clf, +... pheno_clfs=pheno_clfs, ... ) >>> len(result.phenotypes) 369 @@ -286,8 +298,8 @@ We can now test associations between the genotype groups and the HPO terms: 24 -We tested the ``cohort`` for association between the genotype groups (``gt_predicate``) -and HPO terms (``pheno_predicates``). +We tested the ``cohort`` for association between the genotype classes (``gt_clf``) +and HPO terms (``pheno_clfs``). Thanks to phenotype MT filter, we only tested 24 out of 369 terms. The MT filter report shows the filtering details: @@ -299,7 +311,7 @@ The MT filter report shows the filtering details: .. raw:: html :file: report/tbx5_frameshift.mtc_report.html -.. doctest:: phenotype-groups +.. doctest:: phenotype-classes :hide: >>> if _overwrite: mtc_report.write('docs/user-guide/analyses/report/tbx5_frameshift.mtc_report.html') @@ -321,7 +333,7 @@ one HPO term per row. The rows are ordered by the corrected p value and nominal :file: report/tbx5_frameshift.csv :header-rows: 2 -.. doctest:: phenotype-groups +.. doctest:: phenotype-classes :hide: >>> if _overwrite: summary_df.to_csv('docs/user-guide/analyses/report/tbx5_frameshift.csv') diff --git a/docs/user-guide/analyses/phenotype-scores.rst b/docs/user-guide/analyses/phenotype-scores.rst index d10fbcc21..85347ed96 100644 --- a/docs/user-guide/analyses/phenotype-scores.rst +++ b/docs/user-guide/analyses/phenotype-scores.rst @@ -1,9 +1,9 @@ .. _phenotype-score-stats: -########################################### -Compare phenotype scores in genotype groups -########################################### +######################## +Compare phenotype scores +######################## .. doctest:: :hide: @@ -11,11 +11,11 @@ Compare phenotype scores in genotype groups >>> from gpsea import _overwrite -In this section, we show how to test for an association between genotype group and a phenotype score. +In this section, we show how to test for an association between a genotype class and a phenotype score. We assume a cohort was preprocessed following the :ref:`input-data` section, -and we use a genotype predicate and a phenotype scorer from the :ref:`partitioning` section to assign each cohort member -into a genotype group and to compute a phenotype score. -We use Mann-Whitney U test to test for differences in phenotype score distributions between the groups. +and we use a genotype classifier and a phenotype scorer from the :ref:`partitioning` section +to obtain a genotype class and phenotype score for each cohort member. +We use Mann-Whitney U test to test for differences in phenotype score distributions between the classes. .. 
_mann-whitney-u-test:
@@ -24,25 +24,25 @@ We use Mann-Whitney U test to test for differences in phenotype score distributi
Mann-Whitney U Test
*******************
-We may want to compare the total number of occurences of a specific set of phenotypic features between two different genotype groups.
+We may want to compare the total number of occurrences of a specific set of phenotypic features between two different genotype classes.
For instance, `Jordan et al (2018) `_ found that the total number of structural defects
-of the brain, eye, heart, and kidney and sensorineural hearing loss seen in individuals with point mutations in the Atrophin-1 domain of the RERE gene
+of the brain, eye, heart, and kidney and sensorineural hearing loss seen in individuals with point mutations in the Atrophin-1 domain of *RERE*
is significantly higher than expected based on the number of similar defects seen in individuals with putative loss-of-function variants.
Since there are five potential defects, each individual has a count ranging between 0 and 5.
-We perform a Mann-Whitney U Test (or Wilcoxon Rank-Sum Test) to compare the distribution of such counts between genotype groups.
-This is a non-parametric test that compares the medians of the two groups to determine if they come from the same distribution.
+We perform a Mann-Whitney U Test (or Wilcoxon Rank-Sum Test) to compare the distribution of such counts between genotype classes.
+This is a non-parametric test that compares the medians of the two classes to determine if they come from the same distribution.
>>> import scipy.stats as stats
->>> group1 = [0, 0, 1, 0, 2, 0, 1, 1, 1, 0, 2, 0, 0, 3, 1, 1, 1, 0]
->>> group2 = [4, 5, 3, 4, 3, 3, 3, 4, 4, 5, 5, 2, 3, 0, 3, 5, 2, 3]
->>> r = stats.mannwhitneyu(x=group1, y=group2, alternative = 'two-sided')
+>>> class1 = [0, 0, 1, 0, 2, 0, 1, 1, 1, 0, 2, 0, 0, 3, 1, 1, 1, 0]
+>>> class2 = [4, 5, 3, 4, 3, 3, 3, 4, 4, 5, 5, 2, 3, 0, 3, 5, 2, 3]
+>>> r = stats.mannwhitneyu(x=class1, y=class2, alternative = 'two-sided')
>>> p_value = r.pvalue
>>> float(p_value)
6.348081479150902e-06
-p value of `6.348081479150901e-06` suggests a significant difference between the groups.
+A p value of `6.348081479150902e-06` suggests a significant difference between the classes.
****************
@@ -110,13 +110,13 @@ In this example, the point mutation is a mutation that meets the following condi
* the :ref:`change-length-of-an-allele` is equal to `0`
>>> from gpsea.model import VariantEffect
->>> from gpsea.analysis.predicate.genotype import VariantPredicates
+>>> from gpsea.analysis.predicate import change_length, ref_length, anyof, variant_effect
>>> point_mutation_effects = (
... VariantEffect.MISSENSE_VARIANT,
... )
->>> point_mutation = VariantPredicates.change_length('==', 0) \
-... & VariantPredicates.ref_length('==', 1) \
-... & VariantPredicates.any(VariantPredicates.variant_effect(effect, tx_id) for effect in point_mutation_effects)
+>>> point_mutation = change_length('==', 0) \
+... & ref_length('==', 1) \
+... & anyof(variant_effect(effect, tx_id) for effect in point_mutation_effects)
>>> point_mutation.description
'((change length == 0 AND reference allele length == 1) AND MISSENSE_VARIANT on NM_001042681.2)'
@@ -129,20 +129,20 @@ For the loss of function predicate, the following variant effects are considered
... VariantEffect.START_LOST,
... VariantEffect.STOP_GAINED,
...
) ->>> lof_mutation = VariantPredicates.any(VariantPredicates.variant_effect(eff, tx_id) for eff in lof_effects) +>>> lof_mutation = anyof(variant_effect(eff, tx_id) for eff in lof_effects) >>> lof_mutation.description '(TRANSCRIPT_ABLATION on NM_001042681.2 OR FRAMESHIFT_VARIANT on NM_001042681.2 OR START_LOST on NM_001042681.2 OR STOP_GAINED on NM_001042681.2)' -The genotype predicate will bin the patient into two groups: a point mutation group or the loss of function group: +The genotype predicate will bin the patient into two classes: a point mutation or the loss of function: ->>> from gpsea.analysis.predicate.genotype import monoallelic_predicate ->>> gt_predicate = monoallelic_predicate( +>>> from gpsea.analysis.clf import monoallelic_classifier +>>> gt_clf = monoallelic_classifier( ... a_predicate=point_mutation, ... b_predicate=lof_mutation, ... a_label="Point", b_label="LoF", ... ) ->>> gt_predicate.group_labels +>>> gt_clf.class_labels ('Point', 'LoF') @@ -234,7 +234,7 @@ We execute the analysis by running >>> result = score_analysis.compare_genotype_vs_phenotype_score( ... cohort=cohort, -... gt_predicate=gt_predicate, +... gt_clf=gt_clf, ... pheno_scorer=pheno_scorer, ... ) @@ -263,7 +263,7 @@ Subject 2[PMID_29330883_Subject_2] 1 1 The data frame provides a `genotype` category and a `phenotype_score` for each patient. The genotype category should be interpreted in the context of the genotype predicate: ->>> gt_id_to_name = {c.category.cat_id: c.category.name for c in gt_predicate.get_categorizations()} +>>> gt_id_to_name = {c.category.cat_id: c.category.name for c in gt_clf.get_categorizations()} >>> gt_id_to_name {0: 'Point', 1: 'LoF'} @@ -285,7 +285,7 @@ to visualize the phenotype score distributions: ... ) -.. image:: /img/rere_phenotype_score_boxplot.png +.. image:: report/rere_phenotype_score_boxplot.png :alt: Phenotype score distribution :align: center :width: 600px @@ -293,7 +293,7 @@ to visualize the phenotype score distributions: .. doctest:: phenotype-scores :hide: - >>> if _overwrite: fig.savefig('docs/img/rere_phenotype_score_boxplot.png') + >>> if _overwrite: fig.savefig('docs/user-guide/analyses/report/rere_phenotype_score_boxplot.png') diff --git a/docs/img/rere_phenotype_score_boxplot.png b/docs/user-guide/analyses/report/rere_phenotype_score_boxplot.png similarity index 100% rename from docs/img/rere_phenotype_score_boxplot.png rename to docs/user-guide/analyses/report/rere_phenotype_score_boxplot.png diff --git a/docs/img/umod_km_curves.png b/docs/user-guide/analyses/report/umod_km_curves.png similarity index 100% rename from docs/img/umod_km_curves.png rename to docs/user-guide/analyses/report/umod_km_curves.png diff --git a/docs/user-guide/analyses/survival.rst b/docs/user-guide/analyses/survival.rst index 258fbe6b8..616bfa467 100644 --- a/docs/user-guide/analyses/survival.rst +++ b/docs/user-guide/analyses/survival.rst @@ -46,26 +46,27 @@ Configure analysis >>> tx_id = 'NM_003361.4' -Genotype predicate ------------------- + +Genotype classifier +------------------- One allele of exon 3 vs. one allele of elsewhere. 
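Both the *RERE* comparison above and the exon 3 comparison that follows rely on a monoallelic classifier. For intuition, its decision can be reduced to a lookup on the two allele counts, mirroring the count-to-category mapping used by `monoallelic_classifier`; plain strings stand in for `Categorization` objects and the class names are illustrative::

    from typing import Optional

    def assign_monoallelic(a_count: int, b_count: int) -> Optional[str]:
        """Assign a class from the allele counts of predicates A and B.

        Exactly one matching allele in total is expected; anything else is unclassifiable.
        """
        count2class = {
            (1, 0): "Point",  # one allele matching the A predicate
            (0, 1): "LoF",    # one allele matching the B predicate
        }
        return count2class.get((a_count, b_count))  # None -> omitted from the analysis

    assert assign_monoallelic(1, 0) == "Point"
    assert assign_monoallelic(0, 2) is None  # two matching alleles -> not monoallelic
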
->>> from gpsea.analysis.predicate.genotype import VariantPredicates ->>> is_in_exon3 = VariantPredicates.exon(exon=3, tx_id=tx_id) +>>> from gpsea.analysis.predicate import exon +>>> is_in_exon3 = exon(exon=3, tx_id=tx_id) >>> is_in_exon3.description 'overlaps with exon 3 of NM_003361.4' -Monoallelic predicate to compare one allele of *UMOD* exon 3 variant +Monoallelic classifier to compare one allele of *UMOD* exon 3 variant versus one allele of other *UMOD* variant: ->>> from gpsea.analysis.predicate.genotype import monoallelic_predicate ->>> gt_predicate = monoallelic_predicate( +>>> from gpsea.analysis.clf import monoallelic_classifier +>>> gt_clf = monoallelic_classifier( ... a_predicate=is_in_exon3, ... b_predicate=~is_in_exon3, ... a_label="Exon 3", b_label="Other", ... ) ->>> gt_predicate.group_labels +>>> gt_clf.class_labels ('Exon 3', 'Other') @@ -102,6 +103,7 @@ the genotype groups: >>> from gpsea.analysis.temporal.stats import LogRankTest >>> survival_statistic = LogRankTest() + Final analysis -------------- @@ -112,6 +114,7 @@ We will put the final analysis together into a :class:`~gpsea.analysis.temporal. ... statistic=survival_statistic, ... ) + Analysis ======== @@ -119,7 +122,7 @@ We execute the analysis by running >>> result = survival_analysis.compare_genotype_vs_survival( ... cohort=cohort, -... gt_predicate=gt_predicate, +... gt_clf=gt_clf, ... endpoint=endpoint, ... ) @@ -152,7 +155,7 @@ We can plot Kaplan-Meier curves: ... ) >>> _ = ax.grid(axis="y") -.. image:: /img/umod_km_curves.png +.. image:: report/umod_km_curves.png :alt: UMOD Kaplan-Meier curves :align: center :width: 600px @@ -160,7 +163,7 @@ We can plot Kaplan-Meier curves: .. doctest:: survival :hide: - >>> if _overwrite: fig.savefig('docs/img/umod_km_curves.png') + >>> if _overwrite: fig.savefig('docs/user-guide/analyses/report/umod_km_curves.png') Raw data @@ -179,7 +182,7 @@ AII.5[PMID_22034507_AII_5] 0 Survival(value=22280.25, is_censored=False AIII.4[PMID_22034507_AIII_4] 0 Survival(value=19723.5, is_censored=False) Each line corresponeds to an individual and the dataframe is indexed by the individual's identifier/label. -The `genotype` column contains the genotype group code, +The `genotype` column contains the genotype class code, and `phenotype` column includes a :class:`~gpsea.analysis.temporal.Survival` value or `None` if computing the survival was impossible (see :func:`~gpsea.analysis.temporal.endpoint.hpo_onset` for details). 
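The Kaplan-Meier curves shown above are estimated from exactly these `Survival` values. As a rough sketch of the underlying estimate (not the plotting code used here), the survival probabilities can be computed with plain NumPy; the event times and censoring flags below are invented::

    import numpy as np

    def kaplan_meier(times, censored):
        """Return (event_times, survival_probabilities) for right-censored data."""
        times = np.asarray(times, dtype=float)
        censored = np.asarray(censored, dtype=bool)
        event_times = np.unique(times[~censored])
        surv = []
        s = 1.0
        for t in event_times:
            at_risk = np.sum(times >= t)                # individuals still under observation
            events = np.sum((times == t) & ~censored)   # events observed at time t
            s *= 1.0 - events / at_risk
            surv.append(s)
        return event_times, np.array(surv)

    days = [19723.5, 22280.25, 25000.0, 26000.0]  # invented "days until endpoint"
    flags = [False, False, True, False]           # True -> censored observation
    print(kaplan_meier(days, flags))
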
The `Survival` reports the number of days until attaining the endpoint, diff --git a/src/gpsea/analysis/_base.py b/src/gpsea/analysis/_base.py index 3aad29657..1725cf9e8 100644 --- a/src/gpsea/analysis/_base.py +++ b/src/gpsea/analysis/_base.py @@ -6,8 +6,7 @@ import numpy as np import pandas as pd -from .predicate.phenotype import PhenotypePolyPredicate, P -from .predicate.genotype import GenotypePolyPredicate +from .clf import GenotypeClassifier, PhenotypeClassifier, P from ._partition import Partitioning @@ -34,7 +33,7 @@ def __init__( assert isinstance(pval, float) and (math.isnan(pval) or 0.0 <= pval <= 1.0) self._pval = float(pval) - + @property def statistic(self) -> typing.Optional[float]: """ @@ -57,7 +56,12 @@ def __eq__(self, value: object) -> bool: ) def __hash__(self) -> int: - return hash((self._statistic, self._pval,)) + return hash( + ( + self._statistic, + self._pval, + ) + ) def __str__(self) -> str: return repr(self) @@ -73,7 +77,7 @@ class AnalysisException(Exception): To aid troubleshooting, the exception includes :attr:`~gpsea.analysis.AnalysisException.data` - a mapping with any data that has been computed prior encountering the issues. """ - + def __init__( self, data: typing.Mapping[str, typing.Any], @@ -88,7 +92,7 @@ def data(self) -> typing.Mapping[str, typing.Any]: Get a mapping with (partial) data to aid troubleshooting. """ return self._data - + def __repr__(self) -> str: return f"AnalysisException(args={self.args}, data={self._data})" @@ -97,7 +101,7 @@ class Statistic(metaclass=abc.ABCMeta): """ Mixin for classes that are used to compute a nominal p value for a genotype-phenotype association. """ - + def __init__( self, name: str, @@ -110,12 +114,12 @@ def name(self) -> str: Get the name of the statistic (e.g. `Fisher Exact Test`, `Logrank test`). """ return self._name - + def __eq__(self, value: object) -> bool: if isinstance(value, Statistic): return self._name == value._name return NotImplemented - + def __hash__(self) -> int: return hash((self._name,)) @@ -127,22 +131,22 @@ class AnalysisResult(metaclass=abc.ABCMeta): def __init__( self, - gt_predicate: GenotypePolyPredicate, + gt_clf: GenotypeClassifier, statistic: Statistic, ): - assert isinstance(gt_predicate, GenotypePolyPredicate) - self._gt_predicate = gt_predicate + assert isinstance(gt_clf, GenotypeClassifier) + self._gt_clf = gt_clf assert isinstance(statistic, Statistic) self._statistic = statistic @property - def gt_predicate(self) -> GenotypePolyPredicate: + def gt_clf(self) -> GenotypeClassifier: """ - Get the genotype predicate used in the survival analysis that produced this result. + Get the genotype classifier used in the survival analysis that produced this result. 
""" - return self._gt_predicate - + return self._gt_clf + @property def statistic(self) -> Statistic: """ @@ -151,15 +155,19 @@ def statistic(self) -> Statistic: return self._statistic def __eq__(self, value: object) -> bool: - return isinstance(value, AnalysisResult) \ - and self._gt_predicate == value._gt_predicate \ + return ( + isinstance(value, AnalysisResult) + and self._gt_clf == value._gt_clf and self._statistic == value._statistic - + ) + def __hash__(self) -> int: - return hash(( - self._gt_predicate, - self._statistic, - )) + return hash( + ( + self._gt_clf, + self._statistic, + ) + ) class MultiPhenotypeAnalysisResult(typing.Generic[P], AnalysisResult): @@ -167,24 +175,24 @@ class MultiPhenotypeAnalysisResult(typing.Generic[P], AnalysisResult): `MultiPhenotypeAnalysisResult` reports the outcome of an analysis that tested the association of genotype with two or more phenotypes. """ - + def __init__( self, - gt_predicate: GenotypePolyPredicate, - pheno_predicates: typing.Iterable[PhenotypePolyPredicate[P]], + gt_clf: GenotypeClassifier, + pheno_clfs: typing.Iterable[PhenotypeClassifier[P]], statistic: Statistic, n_usable: typing.Sequence[int], all_counts: typing.Sequence[pd.DataFrame], - statistic_results: typing.Sequence[StatisticResult], + statistic_results: typing.Sequence[typing.Optional[StatisticResult]], corrected_pvals: typing.Optional[typing.Sequence[float]], - mtc_correction: typing.Optional[str] + mtc_correction: typing.Optional[str], ): super().__init__( - gt_predicate=gt_predicate, + gt_clf=gt_clf, statistic=statistic, ) - self._pheno_predicates = tuple(pheno_predicates) + self._pheno_clfs = tuple(pheno_clfs) self._n_usable = tuple(n_usable) self._all_counts = tuple(all_counts) @@ -205,44 +213,44 @@ def _check_sanity(self) -> typing.Sequence[str]: errors = [] # All sequences must have the same lengths ... for seq, name in ( - (self._n_usable, 'n_usable'), - (self._all_counts, 'all_counts'), - (self._statistic_results, 'statistic_results'), + (self._n_usable, "n_usable"), + (self._all_counts, "all_counts"), + (self._statistic_results, "statistic_results"), ): - if len(self._pheno_predicates) != len(seq): + if len(self._pheno_clfs) != len(seq): errors.append( - f"`len(pheno_predicates)` must be the same as `len({name})` but " - f"{len(self._pheno_predicates)}!={len(seq)}" + f"`len(pheno_clfs)` must be the same as `len({name})` but " + f"{len(self._pheno_clfs)}!={len(seq)}" ) # ... including the optional corrected p values - if self._corrected_pvals is not None and len(self._pheno_predicates) != len(self._corrected_pvals): + if self._corrected_pvals is not None and len(self._pheno_clfs) != len( + self._corrected_pvals + ): errors.append( f"`len(pheno_predicates)` must be the same as `len(corrected_pvals)` but " - f"{len(self._pheno_predicates)}!={len(self._corrected_pvals)}" + f"{len(self._pheno_clfs)}!={len(self._corrected_pvals)}" ) - if not isinstance(self._gt_predicate, GenotypePolyPredicate): - errors.append( - "`gt_predicate` must be an instance of `GenotypePolyPredicate`" - ) + if not isinstance(self._gt_clf, GenotypeClassifier): + errors.append("`gt_clf` must be an instance of `GenotypeClassifier`") return errors @property - def pheno_predicates( + def pheno_clfs( self, - ) -> typing.Sequence[PhenotypePolyPredicate[P]]: + ) -> typing.Sequence[PhenotypeClassifier[P]]: """ - Get the phenotype predicates used in the analysis. + Get the phenotype classifiers used in the analysis. 
""" - return self._pheno_predicates + return self._pheno_clfs @property def phenotypes(self) -> typing.Sequence[P]: """ Get the phenotypes that were tested for association with genotype in the analysis. """ - return tuple(p.phenotype for p in self._pheno_predicates) + return tuple(p.phenotype for p in self._pheno_clfs) @property def n_usable(self) -> typing.Sequence[int]: @@ -286,7 +294,9 @@ def pvals(self) -> typing.Sequence[float]: Get a sequence of nominal p values for each tested phenotype. The sequence includes a `NaN` value for each phenotype that was *not* tested. """ - return tuple(float('nan') if r is None else r.pval for r in self._statistic_results ) + return tuple( + float("nan") if r is None else r.pval for r in self._statistic_results + ) @property def corrected_pvals(self) -> typing.Optional[typing.Sequence[float]]: @@ -296,10 +306,10 @@ def corrected_pvals(self) -> typing.Optional[typing.Sequence[float]]: The sequence includes a `NaN` value for each phenotype that was *not* tested. """ return self._corrected_pvals - + def n_significant_for_alpha( self, - alpha: float = .05, + alpha: float = 0.05, ) -> typing.Optional[int]: """ Get the count of the corrected p values with the value being less than or equal to `alpha`. @@ -310,10 +320,10 @@ def n_significant_for_alpha( return None else: return sum(p_val <= alpha for p_val in self.corrected_pvals) - + def significant_phenotype_indices( self, - alpha: float = .05, + alpha: float = 0.05, pval_kind: typing.Literal["corrected", "nominal"] = "corrected", ) -> typing.Optional[typing.Sequence[int]]: """ @@ -328,10 +338,10 @@ def significant_phenotype_indices( vals = np.array(self.pvals) else: raise ValueError(f"Unsupported `pval_kind` value {pval_kind}") - + if vals is None: return None - + not_na = ~np.isnan(vals) significant = vals <= alpha selected = not_na & significant @@ -354,25 +364,29 @@ def mtc_correction(self) -> typing.Optional[str]: return self._mtc_correction def __eq__(self, value: object) -> bool: - return isinstance(value, MultiPhenotypeAnalysisResult) \ - and super(AnalysisResult, self).__eq__(value) \ - and self._pheno_predicates == value._pheno_predicates \ - and self._n_usable == value._n_usable \ - and self._all_counts == value._all_counts \ - and self._statistic_results == value._statistic_results \ - and self._corrected_pvals == value._corrected_pvals \ + return ( + isinstance(value, MultiPhenotypeAnalysisResult) + and super(AnalysisResult, self).__eq__(value) + and self._pheno_clfs == value._pheno_clfs + and self._n_usable == value._n_usable + and self._all_counts == value._all_counts + and self._statistic_results == value._statistic_results + and self._corrected_pvals == value._corrected_pvals and self._mtc_correction == value._mtc_correction - + ) + def __hash__(self) -> int: - return hash(( - super(AnalysisResult, self).__hash__(), - self._pheno_predicates, - self._n_usable, - self._all_counts, - self._statistic_results, - self._corrected_pvals, - self._mtc_correction, - )) + return hash( + ( + super(AnalysisResult, self).__hash__(), + self._pheno_clfs, + self._n_usable, + self._all_counts, + self._statistic_results, + self._corrected_pvals, + self._mtc_correction, + ) + ) class MonoPhenotypeAnalysisResult(AnalysisResult, metaclass=abc.ABCMeta): @@ -380,7 +394,7 @@ class MonoPhenotypeAnalysisResult(AnalysisResult, metaclass=abc.ABCMeta): `MonoPhenotypeAnalysisResult` reports the outcome of an analysis that tested a single genotype-phenotype association. 
""" - + GT_COL = "genotype" """ Name of column for storing genotype data. @@ -398,13 +412,13 @@ class MonoPhenotypeAnalysisResult(AnalysisResult, metaclass=abc.ABCMeta): def __init__( self, - gt_predicate: GenotypePolyPredicate, + gt_clf: GenotypeClassifier, phenotype: Partitioning, statistic: Statistic, data: pd.DataFrame, statistic_result: StatisticResult, ): - super().__init__(gt_predicate, statistic) + super().__init__(gt_clf, statistic) assert isinstance(phenotype, Partitioning) self._phenotype = phenotype @@ -416,7 +430,7 @@ def __init__( assert isinstance(statistic_result, StatisticResult) self._statistic_result = statistic_result - + @property def phenotype(self) -> Partitioning: """ @@ -431,12 +445,12 @@ def data(self) -> pd.DataFrame: The index of the data frame contains the identifiers of the tested individuals, and the values are stored in `genotype` and `phenotype` columns. - + The `genotype` column includes the genotype category ID - (:attr:`~gpsea.analysis.predicate.PatientCategory.cat_id`) + (:attr:`~gpsea.analysis.clf.PatientCategory.cat_id`) or `None` if the individual could not be assigned into a genotype group. The `phenotype` contains the phenotype values, and the data type depends on the analysis. - + Here are some common phenotype data types: * a phenotype score computed in :class:`~gpsea.analysis.pscore.PhenotypeScoreAnalysis` is a `float` @@ -469,16 +483,20 @@ def pval(self) -> float: return self._statistic_result.pval def __eq__(self, value: object) -> bool: - return isinstance(value, MonoPhenotypeAnalysisResult) \ - and super(AnalysisResult, self).__eq__(value) \ - and self._phenotype == value._phenotype \ - and self._statistic_result == value._statistic_result \ + return ( + isinstance(value, MonoPhenotypeAnalysisResult) + and super(AnalysisResult, self).__eq__(value) + and self._phenotype == value._phenotype + and self._statistic_result == value._statistic_result and self._data.equals(value._data) - + ) + def __hash__(self) -> int: - return hash(( - super(AnalysisResult, self).__hash__(), - self._phenotype, - self._statistic_result, - self._data, - )) + return hash( + ( + super(AnalysisResult, self).__hash__(), + self._phenotype, + self._statistic_result, + self._data, + ) + ) diff --git a/src/gpsea/analysis/clf/__init__.py b/src/gpsea/analysis/clf/__init__.py new file mode 100644 index 000000000..3cf48eda7 --- /dev/null +++ b/src/gpsea/analysis/clf/__init__.py @@ -0,0 +1,38 @@ +from ._api import Classifier, PatientCategory, Categorization +from ._api import C, GenotypeClassifier +from ._api import P, PhenotypeClassifier, PhenotypeCategorization +from ._counter import AlleleCounter +from ._pheno import HpoClassifier, DiseasePresenceClassifier +from ._gt_classifiers import ( + sex_classifier, + diagnosis_classifier, + monoallelic_classifier, + biallelic_classifier, + allele_count, +) +from ._util import ( + prepare_classifiers_for_terms_of_interest, + prepare_hpo_terms_of_interest, +) + + +__all__ = [ + "Classifier", + "PatientCategory", + "Categorization", + "GenotypeClassifier", + "C", + "AlleleCounter", + "sex_classifier", + "diagnosis_classifier", + "monoallelic_classifier", + "biallelic_classifier", + "allele_count", + "PhenotypeClassifier", + "PhenotypeCategorization", + "P", + "HpoClassifier", + "DiseasePresenceClassifier", + "prepare_classifiers_for_terms_of_interest", + "prepare_hpo_terms_of_interest", +] diff --git a/src/gpsea/analysis/clf/_api.py b/src/gpsea/analysis/clf/_api.py new file mode 100644 index 000000000..ad94f5874 --- /dev/null +++ 
b/src/gpsea/analysis/clf/_api.py @@ -0,0 +1,350 @@ +import abc +import os +import typing + +from collections import Counter + +import hpotk +import hpotk.util + +from gpsea.model import Patient + +from .._partition import Partitioning + + +class PatientCategory: + """ + `PatientCategory` represents one of several exclusive discrete classes. + + Patient class has :attr:`cat_id`, a unique numeric identifier of the class, + :attr:`name` with human-readable class name, and :attr:`description` with an optional verbose description. + """ + + def __init__( + self, + cat_id: int, + name: str, + description: typing.Optional[str] = None, + ): + self._cat_id = hpotk.util.validate_instance(cat_id, int, "cat_id") + self._name = hpotk.util.validate_instance(name, str, "name") + self._description = hpotk.util.validate_optional_instance( + description, str, "description" + ) + + @property + def cat_id(self) -> int: + """ + Get an `int` with the unique numeric identifier of the class. + """ + return self._cat_id + + @property + def name(self) -> str: + """ + Get a `str` with a human-readable name of the class. + """ + return self._name + + @property + def description(self) -> typing.Optional[str]: + """ + Get a `str` with an optional detailed class description. + """ + return self._description + + def __repr__(self) -> str: + return ( + f"PatientCategory(cat_id={self.cat_id}, " + f"name={self.name}, " + f"description={self.description})" + ) + + def __str__(self) -> str: + return self._name + + def __eq__(self, other) -> bool: + return ( + isinstance(other, PatientCategory) + and self.cat_id == other.cat_id + and self.name == other.name + and self.description == other.description + ) + + def __hash__(self) -> int: + return hash((self.cat_id, self.name, self.description)) + + +class Categorization: + """ + `Categorization` represents one of discrete classes a :class:`~gpsea.model.Patient` can be assigned into. + """ + + @staticmethod + def from_raw_parts( + cat_id: int, + name: str, + description: typing.Optional[str] = None, + ): + """ + Create `Categorization` from the `cat_id` identifier, `name`, and an optional `description`. + """ + return Categorization( + category=PatientCategory( + cat_id=cat_id, + name=name, + description=description, + ) + ) + + def __init__( + self, + category: PatientCategory, + ): + self._category = hpotk.util.validate_instance( + category, PatientCategory, "category" + ) + + @property + def category(self) -> PatientCategory: + return self._category + + def __eq__(self, other): + return isinstance(other, Categorization) and self._category == other._category + + def __hash__(self): + return hash((self._category,)) + + def __repr__(self): + return f"Categorization(category={self._category})" + + def __str__(self): + return repr(self) + + +C = typing.TypeVar("C", bound=Categorization) +""" +A generic bound for types that extend :class:`Categorization`. +""" + + +class Classifier(typing.Generic[C], Partitioning, metaclass=abc.ABCMeta): + """ + `Classifier` partitions a :class:`~gpsea.model.Patient` into one of several discrete classes + represented by a :class:`~gpsea.analysis.clf.Categorization`. + + The classes must be *exclusive* - the individual can be binned into one and only one class, + and *exhaustive* - the classes must cover all possible scenarios. + + However, if the individual cannot be assigned into any meaningful class, `None` can be returned. + As a rule of thumb, returning `None` will exclude the individual from the analysis. 
+ """ + + @abc.abstractmethod + def get_categorizations(self) -> typing.Sequence[C]: + """ + Get a sequence of all categories which the classifier can produce. + """ + pass + + def get_categories(self) -> typing.Iterator[PatientCategory]: + """ + Get an iterator with :class:`PatientCategory` instances that the classifier can produce. + """ + return (c.category for c in self.get_categorizations()) + + @property + def class_labels(self) -> typing.Collection[str]: + """ + Get a collection with names of the :class:`PatientCategory` items that the classifier can produce. + """ + return tuple(cat.name for cat in self.get_categories()) + + def summarize_classes(self) -> str: + cat_names = ", ".join(self.class_labels) + return f"{self.variable_name}: {cat_names}" + + def summarize( + self, + out: typing.TextIO, + ): + """ + Summarize the predicate into the `out` handle. + + The summary includes the name, summary, and the groups the predicate can assign individuals into. + """ + Partitioning.summarize(self, out) + out.write(self.summarize_classes()) + out.write(os.linesep) + + def n_categorizations(self) -> int: + """ + Get the number of categorizations the classifier can produce. + """ + return len(self.get_categorizations()) + + def get_category( + self, + cat_id: int, + ) -> PatientCategory: + """ + Get the category name for a :attr:`PatientCategory.cat_id`. + + :param cat_id: an `int` with the id. + :raises: ValueError if there is no such category was defined. + """ + for ctg in self.get_categories(): + if ctg.cat_id == cat_id: + return ctg + raise ValueError(f"No category for {cat_id} was found") + + def get_category_name( + self, + cat_id: int, + ) -> str: + """ + Get the category name for a :attr:`PatientCategory.cat_id`. + + :param cat_id: an `int` with the id. + :raises: ValueError if there is no such category was defined. + """ + return self.get_category(cat_id).name + + @staticmethod + def _check_categorizations( + categorizations: typing.Sequence[Categorization], + ) -> typing.Sequence[str]: + """ + Check that the categorizations meet the requirements. + + The requirements include: + + * the `cat_id` must be unique within the predicate + """ + issues = [] + # There must be at least one category + + # `cat_id` must be unique for a predicate! + cat_id_counts = Counter() + for c in categorizations: + cat_id_counts[c.category.cat_id] += 1 + + for cat_id, count in cat_id_counts.items(): + if count > 1: + issues.append(f"`cat_id` {cat_id} is present {count}>1 times") + + return issues + + @abc.abstractmethod + def test(self, individual: Patient) -> typing.Optional[C]: + """ + Assign an `individual` into a class. + + Return `None` if the individual cannot be assigned into any meaningful class. + """ + pass + + +class GenotypeClassifier(Classifier[Categorization], metaclass=abc.ABCMeta): + """ + `GenotypeClassifier` is a base class for all types + that assign an individual into a group based on the genotype. + """ + + pass + + +P = typing.TypeVar("P") +""" +Phenotype entity of interest, such as :class:`~hpotk.model.TermId`, representing an HPO term or an OMIM/MONDO term. + +However, phenotype entity can be anything as long as it is :class:`~typing.Hashable` and comparable +(have `__eq__` and `__lt__` magic methods). +""" + +YES = PatientCategory(1, 'Yes', 'The patient belongs to the group.') +""" +Category for a patient who *belongs* to the tested group. 
+""" + +NO = PatientCategory(0, 'No', 'The patient does not belong to the group.') +""" +Category for a patient who does *not* belong to the tested group. +""" + + +class PhenotypeCategorization(typing.Generic[P], Categorization): + """ + On top of the attributes of the `Categorization`, `PhenotypeCategorization` keeps track of the target phenotype `P`. + """ + + def __init__( + self, + category: PatientCategory, + phenotype: P, + ): + super().__init__(category) + self._phenotype = phenotype + + @property + def phenotype(self) -> P: + return self._phenotype + + def __repr__(self) -> str: + return ( + "PhenotypeCategorization(" + f"category={self._category}, " + f"phenotype={self._phenotype})" + ) + + def __str__(self) -> str: + return repr(self) + + def __eq__(self, other) -> bool: + return isinstance(other, PhenotypeCategorization) \ + and self._category == other._category \ + and self._phenotype == other._phenotype + + def __hash__(self) -> int: + return hash((self._category, self._phenotype)) + + +class PhenotypeClassifier( + typing.Generic[P], Classifier[PhenotypeCategorization[P]], + metaclass=abc.ABCMeta, +): + """ + Phenotype classifier assigns an individual into a class `P` based on the phenotype. + + The class `P` can be a :class:`~hpotk.model.TermId` representing an HPO term or an OMIM/MONDO term. + + Only one class can be investigated, and :attr:`phenotype` returns the investigated phenotype + (e.g. *Arachnodactyly* `HP:0001166`). + + As another hallmark of this predicate, one of the categorizations must correspond to the group of patients + who exibit the investigated phenotype. The categorization is provided + via :attr:`present_phenotype_categorization` property. + """ + + @property + @abc.abstractmethod + def phenotype(self) -> P: + """ + Get the phenotype entity of interest. + """ + pass + + @property + @abc.abstractmethod + def present_phenotype_categorization(self) -> PhenotypeCategorization[P]: + """ + Get the categorization which represents the group of the patients who exibit the investigated phenotype. + """ + pass + + @property + def present_phenotype_category(self) -> PatientCategory: + """ + Get the patient category that correspond to the group of the patients who exibit the investigated phenotype. 
+ """ + return self.present_phenotype_categorization.category diff --git a/src/gpsea/analysis/predicate/genotype/_counter.py b/src/gpsea/analysis/clf/_counter.py similarity index 97% rename from src/gpsea/analysis/predicate/genotype/_counter.py rename to src/gpsea/analysis/clf/_counter.py index 8ad77d555..9b21ed33b 100644 --- a/src/gpsea/analysis/predicate/genotype/_counter.py +++ b/src/gpsea/analysis/clf/_counter.py @@ -1,5 +1,6 @@ from gpsea.model import Patient, Genotype -from ._api import VariantPredicate + +from ..predicate import VariantPredicate class AlleleCounter: diff --git a/src/gpsea/analysis/predicate/genotype/_gt_predicates.py b/src/gpsea/analysis/clf/_gt_classifiers.py similarity index 82% rename from src/gpsea/analysis/predicate/genotype/_gt_predicates.py rename to src/gpsea/analysis/clf/_gt_classifiers.py index 36d47052a..51a4ce289 100644 --- a/src/gpsea/analysis/predicate/genotype/_gt_predicates.py +++ b/src/gpsea/analysis/clf/_gt_classifiers.py @@ -5,12 +5,11 @@ import hpotk from gpsea.model import Patient +from ..predicate import VariantPredicate, true -from .._api import Categorization, PatientCategory -from ._api import GenotypePolyPredicate -from ._api import VariantPredicate +from ._api import Categorization, PatientCategory +from ._api import GenotypeClassifier from ._counter import AlleleCounter -from ._variant import VariantPredicates def _fixate_partitions( @@ -22,10 +21,14 @@ def _fixate_partitions( fixed.append((partition,)) elif isinstance(partition, typing.Iterable): vals = tuple(partition) - assert all(isinstance(val, int) for val in vals), 'All indices must be `int`s!' + assert all( + isinstance(val, int) for val in vals + ), "All indices must be `int`s!" fixed.append(vals) else: - raise ValueError(f'Partition {i} is neither an `int` nor an iterable of `int`s: {partition}') + raise ValueError( + f"Partition {i} is neither an `int` nor an iterable of `int`s: {partition}" + ) return fixed @@ -125,7 +128,7 @@ def _compute_hash( return hash_value -class PolyCountingGenotypePredicate(GenotypePolyPredicate): +class PolyCountingGenotypeClassifier(GenotypeClassifier): # NOT PART OF THE PUBLIC API @staticmethod @@ -134,7 +137,7 @@ def monoallelic( b_predicate: VariantPredicate, a_label: str, b_label: str, - ) -> "PolyCountingGenotypePredicate": + ) -> "PolyCountingGenotypeClassifier": count2cat = { (1, 0): Categorization( PatientCategory( @@ -148,7 +151,7 @@ def monoallelic( ), } - return PolyCountingGenotypePredicate.for_predicates_and_categories( + return PolyCountingGenotypeClassifier.for_predicates_and_categories( total_count=1, count2cat=count2cat, a_predicate=a_predicate, @@ -162,14 +165,14 @@ def biallelic( a_label: str, b_label: str, partitions: typing.Iterable[typing.Iterable[int]], - ) -> "PolyCountingGenotypePredicate": + ) -> "PolyCountingGenotypeClassifier": count2cat = _build_count_to_cat( a_label=a_label, b_label=b_label, partitions=partitions, ) - return PolyCountingGenotypePredicate.for_predicates_and_categories( + return PolyCountingGenotypeClassifier.for_predicates_and_categories( total_count=2, count2cat=count2cat, a_predicate=a_predicate, @@ -182,8 +185,8 @@ def for_predicates_and_categories( count2cat: typing.Mapping[typing.Tuple[int, int], Categorization], a_predicate: VariantPredicate, b_predicate: VariantPredicate, - ) -> "PolyCountingGenotypePredicate": - return PolyCountingGenotypePredicate( + ) -> "PolyCountingGenotypeClassifier": + return PolyCountingGenotypeClassifier( total_count=total_count, a_counter=AlleleCounter(a_predicate), 
b_counter=AlleleCounter(b_predicate), @@ -209,12 +212,12 @@ def get_categorizations(self) -> typing.Sequence[Categorization]: @property def name(self) -> str: - return "Allele Group Predicate" + return "Allele Group Classifier" @property def description(self) -> str: allele = "allele" if self._total_count == 1 else "alleles" - return f"Partition by allele group ({self._total_count} {allele} per group)" + return f"Classify by allele group ({self._total_count} {allele} per group)" @property def variable_name(self) -> str: @@ -231,7 +234,7 @@ def test(self, patient: Patient) -> typing.Optional[Categorization]: def __eq__(self, value: object) -> bool: return ( - isinstance(value, PolyCountingGenotypePredicate) + isinstance(value, PolyCountingGenotypeClassifier) and self._count2cat == value._count2cat and self._a_counter == value._a_counter and self._b_counter == value._b_counter @@ -241,18 +244,18 @@ def __hash__(self) -> int: return self._hash -def monoallelic_predicate( +def monoallelic_classifier( a_predicate: VariantPredicate, b_predicate: typing.Optional[VariantPredicate] = None, a_label: str = "A", b_label: str = "B", -) -> GenotypePolyPredicate: +) -> GenotypeClassifier: """ - The predicate bins patient into one of two groups, `A` and `B`, + Monoallelic classifier bins patient into one of two groups, `A` and `B`, based on presence of *exactly one* allele of a variant that meets the predicate criteria. - See :ref:`monoallelic-predicate` for more information and an example usage. + See :ref:`monoallelic-classifier` for more information and an example usage. :param a_predicate: predicate to test if the variants meet the criteria of the first group (named `A` by default). @@ -268,7 +271,7 @@ def monoallelic_predicate( if b_predicate is None: b_predicate = ~a_predicate - return PolyCountingGenotypePredicate.monoallelic( + return PolyCountingGenotypeClassifier.monoallelic( a_predicate=a_predicate, b_predicate=b_predicate, a_label=a_label, @@ -276,20 +279,24 @@ def monoallelic_predicate( ) -def biallelic_predicate( +def biallelic_classifier( a_predicate: VariantPredicate, b_predicate: typing.Optional[VariantPredicate] = None, a_label: str = "A", b_label: str = "B", - partitions: typing.Collection[typing.Union[int, typing.Collection[int]]] = (0, 1, 2), -) -> GenotypePolyPredicate: + partitions: typing.Collection[typing.Union[int, typing.Collection[int]]] = ( + 0, + 1, + 2, + ), +) -> GenotypeClassifier: """ - The predicate bins patient into one of the three groups, + Biallelic classifier assigns an individual into one of the three classes, `AA`, `AB`, and `BB`, based on presence of *two* variant alleles - that meet the predicate criteria. + that meet the criteria. - See :ref:`biallelic-predicate` for more information and an example usage. + See :ref:`biallelic-classifier` for more information and an example usage. :param a_predicate: predicate to test if the variants meet the criteria of the first group (named `A` by default). 
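As a usage sketch of the `partitions` option, the two classes that carry a non-`A` allele can be merged into one; the transcript accession below is hypothetical, and the index-to-class mapping (0 for `AA`, 1 for `AB`, 2 for `BB`) is assumed from the default `partitions=(0, 1, 2)`::

    from gpsea.model import VariantEffect
    from gpsea.analysis.predicate import variant_effect
    from gpsea.analysis.clf import biallelic_classifier

    tx_id = "NM_123456.7"  # hypothetical transcript accession, for illustration only

    is_missense = variant_effect(VariantEffect.MISSENSE_VARIANT, tx_id)

    # Compare missense homozygotes against carriers of at least one non-missense allele
    # by merging the AB and BB classes into a single partition.
    gt_clf = biallelic_classifier(
        a_predicate=is_missense,  # b_predicate defaults to the negation, ~is_missense
        a_label="Missense",
        b_label="Other",
        partitions=(0, {1, 2}),   # assumed: {AA} vs. {AB, BB}
    )
    print(gt_clf.class_labels)
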
@@ -310,7 +317,7 @@ def biallelic_predicate( if b_predicate is None: b_predicate = ~a_predicate - return PolyCountingGenotypePredicate.biallelic( + return PolyCountingGenotypeClassifier.biallelic( a_predicate=a_predicate, b_predicate=b_predicate, a_label=a_label, @@ -344,9 +351,9 @@ def _build_ac_to_cat( def allele_count( counts: typing.Collection[typing.Union[int, typing.Collection[int]]], target: typing.Optional[VariantPredicate] = None, -) -> GenotypePolyPredicate: +) -> GenotypeClassifier: """ - Create a predicate to assign the patient into a group based on the allele count + Allele count classifier assigns the individual into a group based on the allele count of the target variants. The `counts` option takes an `int` collection or a collection of `int` collections. @@ -361,22 +368,22 @@ def allele_count( The following counts will partition the cohort into individuals with zero allele or one target allele: - >>> from gpsea.analysis.predicate.genotype import allele_count + >>> from gpsea.analysis.clf import allele_count >>> zero_vs_one = allele_count(counts=(0, 1)) - >>> zero_vs_one.summarize_groups() + >>> zero_vs_one.summarize_classes() 'Allele count: 0, 1' - These counts will create three groups for individuals with zero, one or two alleles: + These counts will create three classes for individuals with zero, one or two alleles: >>> zero_vs_one_vs_two = allele_count(counts=(0, 1, 2)) - >>> zero_vs_one_vs_two.summarize_groups() + >>> zero_vs_one_vs_two.summarize_classes() 'Allele count: 0, 1, 2' Last, the counts below will create two groups, one for the individuals with zero target variant type alleles, and one for the individuals with one or two alleles: >>> zero_vs_one_vs_two = allele_count(counts=(0, {1, 2})) - >>> zero_vs_one_vs_two.summarize_groups() + >>> zero_vs_one_vs_two.summarize_classes() 'Allele count: 0, 1 OR 2' Note that we wrap the last two allele counts in a set. @@ -386,7 +393,7 @@ def allele_count( or `None` if *all* variants in the individual should be used. 
""" if target is None: - target = VariantPredicates.true() + target = true() else: assert isinstance(target, VariantPredicate) @@ -396,15 +403,14 @@ def allele_count( count2cat = _build_ac_to_cat(counts) counter = AlleleCounter(predicate=target) - - return AlleleCountPredicate( + + return AlleleCountClassifier( count2cat=count2cat, counter=counter, ) -class AlleleCountPredicate(GenotypePolyPredicate): - +class AlleleCountClassifier(GenotypeClassifier): def __init__( self, count2cat: typing.Mapping[int, Categorization], @@ -415,7 +421,9 @@ def __init__( assert isinstance(counter, AlleleCounter) self._counter = counter - self._categorizations = tuple(_deduplicate_categorizations(self._count2cat.values())) + self._categorizations = tuple( + _deduplicate_categorizations(self._count2cat.values()) + ) self._hash = _compute_hash(self._count2cat, (self._counter,)) def get_categorizations(self) -> typing.Sequence[Categorization]: @@ -423,11 +431,11 @@ def get_categorizations(self) -> typing.Sequence[Categorization]: @property def name(self) -> str: - return "Allele Count Predicate" + return "Allele Count Classifier" @property def description(self) -> str: - return "Partition by the allele count" + return "Classify by the allele count" @property def variable_name(self) -> str: @@ -440,15 +448,17 @@ def test(self, patient: Patient) -> typing.Optional[Categorization]: return self._count2cat.get(count, None) def __eq__(self, value: object) -> bool: - return isinstance(value, AlleleCountPredicate) \ - and self._count2cat == value._count2cat \ + return ( + isinstance(value, AlleleCountClassifier) + and self._count2cat == value._count2cat and self._counter == value._counter + ) def __hash__(self) -> int: return self._hash -class SexGenotypePredicate(GenotypePolyPredicate): +class SexGenotypeClassifier(GenotypeClassifier): # NOT PART OF THE PUBLIC API def __init__(self): @@ -474,11 +484,11 @@ def get_categorizations(self) -> typing.Sequence[Categorization]: @property def name(self) -> str: - return "Sex Predicate" + return "Sex Classifier" @property def description(self) -> str: - return "Partition by sex" + return "Classify by sex" @property def variable_name(self) -> str: @@ -496,16 +506,16 @@ def test(self, patient: Patient) -> typing.Optional[Categorization]: return None def __eq__(self, value: object) -> bool: - return isinstance(value, SexGenotypePredicate) + return isinstance(value, SexGenotypeClassifier) def __hash__(self) -> int: return 31 -INSTANCE = SexGenotypePredicate() +INSTANCE = SexGenotypeClassifier() -def sex_predicate() -> GenotypePolyPredicate: +def sex_classifier() -> GenotypeClassifier: """ Get a genotype predicate for categorizing patients by their :class:`~gpsea.model.Sex`. @@ -514,13 +524,12 @@ def sex_predicate() -> GenotypePolyPredicate: return INSTANCE -class DiagnosisPredicate(GenotypePolyPredicate): - +class DiagnosisClassifier(GenotypeClassifier): @staticmethod def create( diagnoses: typing.Iterable[typing.Union[str, hpotk.TermId]], labels: typing.Optional[typing.Iterable[str]] = None, - ) -> "DiagnosisPredicate": + ) -> "DiagnosisClassifier": # First, collect the iterables and check sanity. 
diagnosis_ids = [] for d in diagnoses: @@ -538,10 +547,12 @@ def create( else: labels = tuple(labels) - assert (len(diagnosis_ids) >= 2), \ - f"We need at least 2 diagnoses: {len(diagnosis_ids)}" - assert len(diagnosis_ids) == len(labels), \ - f"The number of labels must match the number of diagnose IDs: {len(diagnosis_ids)}!={len(labels)}" + assert ( + len(diagnosis_ids) >= 2 + ), f"We need at least 2 diagnoses: {len(diagnosis_ids)}" + assert ( + len(diagnosis_ids) == len(labels) + ), f"The number of labels must match the number of diagnose IDs: {len(diagnosis_ids)}!={len(labels)}" # Then, prepare the categorizations. categorizations = { @@ -554,7 +565,7 @@ def create( } # Last, put the predicate together. - return DiagnosisPredicate(categorizations) + return DiagnosisClassifier(categorizations) def __init__( self, @@ -568,12 +579,12 @@ def __init__( @property def name(self) -> str: - return "Diagnosis Predicate" + return "Diagnosis Classifier" @property def description(self) -> str: diagnoses = ", ".join(cat.category.name for cat in self._categorizations) - return f"Partition the individual by presence of {diagnoses}" + return f"Classify the individual by presence of {diagnoses}" @property def variable_name(self) -> str: @@ -599,25 +610,24 @@ def test(self, patient: Patient) -> typing.Optional[Categorization]: return None return categorization - + def __eq__(self, value: object) -> bool: - return isinstance(value, DiagnosisPredicate) \ - and self._id2cat == value._id2cat - + return isinstance(value, DiagnosisClassifier) and self._id2cat == value._id2cat + def __hash__(self) -> int: return self._hash -def diagnosis_predicate( +def diagnosis_classifier( diagnoses: typing.Iterable[typing.Union[str, hpotk.TermId]], labels: typing.Optional[typing.Iterable[str]] = None, -) -> GenotypePolyPredicate: +) -> GenotypeClassifier: """ - Create a genotype predicate that bins the patient based on presence of a disease diagnosis, + Genotype classifier bins an individual based on presence of a disease diagnosis, as listed in :attr:`~gpsea.model.Patient.diseases` attribute. - If an individual is diagnosed with more than one disease from the provided `diagnoses`, - the individual will be assigned into no group (`None`). + If the individual is diagnosed with more than one disease from the provided `diagnoses`, + the individual is assigned into no group (`None`). See the :ref:`group-by-diagnosis` section for an example. @@ -626,7 +636,7 @@ def diagnosis_predicate( :param labels: an iterable with diagnose names or `None` if disease IDs should be used instead. The number of labels must match the number of predicates. """ - return DiagnosisPredicate.create( + return DiagnosisClassifier.create( diagnoses=diagnoses, labels=labels, ) diff --git a/src/gpsea/analysis/clf/_pheno.py b/src/gpsea/analysis/clf/_pheno.py new file mode 100644 index 000000000..a137d1a3c --- /dev/null +++ b/src/gpsea/analysis/clf/_pheno.py @@ -0,0 +1,185 @@ +import typing + +import hpotk + +from gpsea.model import Patient + +from ._api import PhenotypeClassifier, PhenotypeCategorization, YES, NO + + +class HpoClassifier(PhenotypeClassifier[hpotk.TermId]): + """ + `HpoClassifier` tests if a patient is annotated with an HPO term. + + Note, `query` must be a term of the provided `hpo`! + + See :ref:`hpo-classifier` section for an example usage. + + :param hpo: HPO ontology + :param query: the HPO term to test + :param missing_implies_phenotype_excluded: `True` if lack of an explicit annotation implies term's absence`. 
+ """ + + def __init__( + self, + hpo: hpotk.MinimalOntology, + query: hpotk.TermId, + missing_implies_phenotype_excluded: bool = False, + ): + assert isinstance(hpo, hpotk.MinimalOntology) + self._hpo = hpo + assert isinstance(query, hpotk.TermId) + self._query = query + self._query_label = self._hpo.get_term_name(query) + assert self._query_label is not None, f"Query {query} is in HPO" + assert isinstance(missing_implies_phenotype_excluded, bool) + self._missing_implies_phenotype_excluded = missing_implies_phenotype_excluded + self._phenotype_observed = PhenotypeCategorization( + category=YES, + phenotype=self._query, + ) + self._phenotype_excluded = PhenotypeCategorization( + category=NO, + phenotype=self._query, + ) + # Some tests depend on the order of `self._categorizations`. + self._categorizations = (self._phenotype_observed, self._phenotype_excluded) + + @property + def name(self) -> str: + return "HPO Classifier" + + @property + def description(self) -> str: + return f"Test for presence of {self._query_label} [{self._query.value}]" + + @property + def variable_name(self) -> str: + return self._query.value + + @property + def phenotype(self) -> hpotk.TermId: + return self._query + + @property + def present_phenotype_categorization(self) -> PhenotypeCategorization[hpotk.TermId]: + return self._phenotype_observed + + def get_categorizations( + self, + ) -> typing.Sequence[PhenotypeCategorization[hpotk.TermId]]: + return self._categorizations + + def test( + self, patient: Patient, + ) -> typing.Optional[PhenotypeCategorization[hpotk.TermId]]: + self._check_patient(patient) + + if len(patient.phenotypes) == 0: + return None + + for phenotype in patient.phenotypes: + if phenotype.is_present: + if self._query == phenotype.identifier or any( + self._query == anc + for anc in self._hpo.graph.get_ancestors(phenotype) + ): + return self._phenotype_observed + else: + if self._missing_implies_phenotype_excluded: + return self._phenotype_excluded + else: + if phenotype.identifier == self._query or any( + phenotype.identifier == anc + for anc in self._hpo.graph.get_ancestors(self._query) + ): + return self._phenotype_excluded + + return None + + def __eq__(self, value: object) -> bool: + return isinstance(value, HpoClassifier) \ + and self._hpo.version == value._hpo.version \ + and self._query == value._query \ + and self._missing_implies_phenotype_excluded == value._missing_implies_phenotype_excluded + + def __hash__(self) -> int: + return hash((self._hpo.version, self._query, self._missing_implies_phenotype_excluded)) + + def __repr__(self): + return f"HpoClassifier(query={self._query})" + + +class DiseasePresenceClassifier(PhenotypeClassifier[hpotk.TermId]): + """ + `DiseasePresenceClassifier` tests if an individual was diagnosed with a disease. + + :param disease_id_query: a disease identifier formatted either as a CURIE `str` (e.g. ``OMIM:256000``) + or as a :class:`~hpotk.TermId`. 
+ """ + + def __init__( + self, + disease_id_query: typing.Union[str, hpotk.TermId], + ): + if isinstance(disease_id_query, str): + self._query = hpotk.TermId.from_curie(disease_id_query) + elif isinstance(disease_id_query, hpotk.TermId): + self._query = disease_id_query + else: + raise AssertionError + + self._diagnosis_present = PhenotypeCategorization( + category=YES, + phenotype=disease_id_query, + ) + self._diagnosis_excluded = PhenotypeCategorization( + category=NO, + phenotype=disease_id_query, + ) + + @property + def name(self) -> str: + return "Disease Classifier" + + @property + def description(self) -> str: + return f"Partition based on a diagnosis of {self._query.value}" + + @property + def variable_name(self) -> str: + return self._query.value + + @property + def phenotype(self) -> hpotk.TermId: + return self._query + + @property + def present_phenotype_categorization(self) -> PhenotypeCategorization[hpotk.TermId]: + return self._diagnosis_present + + def get_categorizations( + self, + ) -> typing.Sequence[PhenotypeCategorization[hpotk.TermId]]: + return self._diagnosis_present, self._diagnosis_excluded + + def test( + self, patient: Patient + ) -> typing.Optional[PhenotypeCategorization[hpotk.TermId]]: + self._check_patient(patient) + + for dis in patient.diseases: + if dis.is_present and dis.identifier == self._query: + return self._diagnosis_present + + return self._diagnosis_excluded + + def __eq__(self, value: object) -> bool: + return isinstance(value, DiseasePresenceClassifier) \ + and self._query == value._query + + def __hash__(self) -> int: + return hash((self._query,)) + + def __repr__(self): + return f"DiseasePresenceClassifier(query={self._query})" diff --git a/src/gpsea/analysis/predicate/genotype/_test__gt_predicates.py b/src/gpsea/analysis/clf/_test__gt_classifiers.py similarity index 97% rename from src/gpsea/analysis/predicate/genotype/_test__gt_predicates.py rename to src/gpsea/analysis/clf/_test__gt_classifiers.py index 0eb50d0c7..a165d057b 100644 --- a/src/gpsea/analysis/predicate/genotype/_test__gt_predicates.py +++ b/src/gpsea/analysis/clf/_test__gt_classifiers.py @@ -1,7 +1,7 @@ import typing import pytest -from ._gt_predicates import _build_count_to_cat, _build_ac_to_cat, _qc_partitions +from ._gt_classifiers import _build_count_to_cat, _build_ac_to_cat, _qc_partitions @pytest.mark.parametrize( diff --git a/src/gpsea/analysis/predicate/phenotype/_util.py b/src/gpsea/analysis/clf/_util.py similarity index 86% rename from src/gpsea/analysis/predicate/phenotype/_util.py rename to src/gpsea/analysis/clf/_util.py index 95ae2b21d..7235252a9 100644 --- a/src/gpsea/analysis/predicate/phenotype/_util.py +++ b/src/gpsea/analysis/clf/_util.py @@ -4,18 +4,18 @@ import hpotk -from ._pheno import PhenotypePolyPredicate, HpoPredicate +from ._pheno import PhenotypeClassifier, HpoClassifier from gpsea.model import Patient -def prepare_predicates_for_terms_of_interest( +def prepare_classifiers_for_terms_of_interest( cohort: typing.Iterable[Patient], hpo: hpotk.MinimalOntology, missing_implies_excluded: bool = False, -) -> typing.Sequence[PhenotypePolyPredicate[hpotk.TermId]]: +) -> typing.Sequence[PhenotypeClassifier[hpotk.TermId]]: """ - A convenience method for creating a battery of :class:`PhenotypePolyPredicate` predicates + A convenience method for creating a suite of phenotype classifiers for testing all phenotypes of interest. :param cohort: a cohort of individuals to investigate. 
@@ -23,12 +23,14 @@ def prepare_predicates_for_terms_of_interest( :param missing_implies_excluded: `True` if absence of an annotation should be counted as its explicit exclusion. """ return tuple( - HpoPredicate( + HpoClassifier( hpo=hpo, query=term, missing_implies_phenotype_excluded=missing_implies_excluded, - ) for term in prepare_hpo_terms_of_interest( - cohort, hpo, + ) + for term in prepare_hpo_terms_of_interest( + cohort, + hpo, ) ) diff --git a/src/gpsea/analysis/mtc_filter/_impl.py b/src/gpsea/analysis/mtc_filter/_impl.py index 2166bd734..83387ead1 100644 --- a/src/gpsea/analysis/mtc_filter/_impl.py +++ b/src/gpsea/analysis/mtc_filter/_impl.py @@ -9,8 +9,7 @@ from hpotk.constants.hpo.base import PHENOTYPIC_ABNORMALITY -from ..predicate.genotype import GenotypePolyPredicate -from ..predicate.phenotype import PhenotypePolyPredicate, P +from ..clf import GenotypeClassifier, PhenotypeClassifier, P @dataclasses.dataclass(eq=True, frozen=True) @@ -38,6 +37,7 @@ class PhenotypeMtcResult: of be filtered out (:meth:`is_filtered_out`) in which case :attr:`mtc_issue` with more context regarding the culprit must be available. """ + # The following is used as a singleton for the "OK" result __ok_instance = None @@ -45,7 +45,9 @@ class PhenotypeMtcResult: def ok() -> "PhenotypeMtcResult": # singleton if PhenotypeMtcResult.__ok_instance is None: - PhenotypeMtcResult.__ok_instance = PhenotypeMtcResult(status=True, issue=None) + PhenotypeMtcResult.__ok_instance = PhenotypeMtcResult( + status=True, issue=None + ) return PhenotypeMtcResult.__ok_instance @staticmethod @@ -60,9 +62,13 @@ def __init__( ): assert isinstance(status, bool) if status: - assert issue is None, '`issue` must NOT be provided if the HPO term passed the MTC filter' + assert ( + issue is None + ), "`issue` must NOT be provided if the HPO term passed the MTC filter" else: - assert issue is not None, '`issue` must be provided if the HPO term failed the MTC filter' + assert ( + issue is not None + ), "`issue` must be provided if the HPO term failed the MTC filter" self._status = status self._issue = issue @@ -85,15 +91,17 @@ def reason(self) -> typing.Optional[str]: return self.mtc_issue.reason def __eq__(self, value: object) -> bool: - return isinstance(value, PhenotypeMtcResult) \ - and self._status == value._status \ + return ( + isinstance(value, PhenotypeMtcResult) + and self._status == value._status and self._issue == value._issue + ) def __hash__(self) -> int: return hash((self._status, self._issue)) def __str__(self) -> str: - return f'PhenotypeMtcResult(status={self._status}, issue={self._issue})' + return f"PhenotypeMtcResult(status={self._status}, issue={self._issue})" def __repr__(self) -> str: return str(self) @@ -121,16 +129,16 @@ class PhenotypeMtcFilter(typing.Generic[P], metaclass=abc.ABCMeta): @abc.abstractmethod def filter( self, - gt_predicate: GenotypePolyPredicate, - ph_predicates: typing.Sequence[PhenotypePolyPredicate[P]], + gt_clf: GenotypeClassifier, + pheno_clfs: typing.Sequence[PhenotypeClassifier[P]], counts: typing.Sequence[pd.DataFrame], cohort_size: int, ) -> typing.Sequence[PhenotypeMtcResult]: """ Test if the phenotype with given counts should be included in the downstream analysis. - :param gt_predicate: the predicate that produced the columns of the `count` data frame. - :param ph_predicates: the phenotype predicates that produced the rows of the `counts` data frames. + :param gt_clf: the clsasifier that produced the columns of the `count` data frame. 
+ :param pheno_clfs: the phenotype classifiers that produced the rows of the `counts` data frames. :param counts: a sequence of 2D data frames for the tested phenotypes. Each data frame corresponds to a genotype/phenotype contingency matrix. :param cohort_size: the size of the cohort. @@ -162,13 +170,13 @@ class UseAllTermsMtcFilter(PhenotypeMtcFilter[typing.Any]): def filter( self, - gt_predicate: GenotypePolyPredicate, - ph_predicates: typing.Sequence[PhenotypePolyPredicate[P]], + gt_clf: GenotypeClassifier, + pheno_clfs: typing.Sequence[PhenotypeClassifier[P]], counts: typing.Sequence[pd.DataFrame], cohort_size: int, ) -> typing.Sequence[PhenotypeMtcResult]: # Always OK! 😏 - return tuple(PhenotypeMtcFilter.OK for _ in ph_predicates) + return tuple(PhenotypeMtcFilter.OK for _ in pheno_clfs) def possible_results(self) -> typing.Collection[PhenotypeMtcResult]: return (PhenotypeMtcFilter.OK,) @@ -189,7 +197,9 @@ class SpecifiedTermsMtcFilter(PhenotypeMtcFilter[hpotk.TermId]): See :ref:`specified-terms-mt-filter` section for more info. """ - NON_SPECIFIED_TERM = PhenotypeMtcResult.fail(code="ST1", reason="Non-specified term") + NON_SPECIFIED_TERM = PhenotypeMtcResult.fail( + code="ST1", reason="Non-specified term" + ) """ The MTC filtering result returned when an HPO term does not belong among the selection of terms to be tested. @@ -210,14 +220,14 @@ def terms_to_test(self) -> typing.Collection[hpotk.TermId]: def filter( self, - gt_predicate: GenotypePolyPredicate, - ph_predicates: typing.Sequence[PhenotypePolyPredicate[P]], + gt_clf: GenotypeClassifier, + pheno_clfs: typing.Sequence[PhenotypeClassifier[P]], counts: typing.Sequence[pd.DataFrame], cohort_size: int, ) -> typing.Sequence[PhenotypeMtcResult]: results = [] - for predicate in ph_predicates: - if predicate.phenotype in self._terms_to_test: + for pclf in pheno_clfs: + if pclf.phenotype in self._terms_to_test: results.append(SpecifiedTermsMtcFilter.OK) else: results.append(SpecifiedTermsMtcFilter.NON_SPECIFIED_TERM) @@ -231,7 +241,7 @@ def possible_results(self) -> typing.Collection[PhenotypeMtcResult]: def filter_method_name(self) -> str: return "Specified terms MTC filter" - + @staticmethod def verify_term_id(val: typing.Union[str, hpotk.TermId]) -> hpotk.TermId: if isinstance(val, str): @@ -253,15 +263,19 @@ class HpoMtcFilter(PhenotypeMtcFilter[hpotk.TermId]): """ NO_GENOTYPE_HAS_MORE_THAN_ONE_HPO = PhenotypeMtcResult.fail( - "HMF02", "Skipping term because no genotype has more than one observed HPO count", + "HMF02", + "Skipping term because no genotype has more than one observed HPO count", ) SAME_COUNT_AS_THE_ONLY_CHILD = PhenotypeMtcResult.fail( - "HMF03", "Skipping term because of a child term with the same individual counts", + "HMF03", + "Skipping term because of a child term with the same individual counts", ) SKIPPING_SINCE_ONE_GENOTYPE_HAD_ZERO_OBSERVATIONS = PhenotypeMtcResult.fail( "HMF05", "Skipping term because one genotype had zero observations" ) - SKIPPING_NON_PHENOTYPE_TERM = PhenotypeMtcResult.fail("HMF07", "Skipping non phenotype term") + SKIPPING_NON_PHENOTYPE_TERM = PhenotypeMtcResult.fail( + "HMF07", "Skipping non phenotype term" + ) SKIPPING_GENERAL_TERM = PhenotypeMtcResult.fail("HMF08", "Skipping general term") @staticmethod @@ -289,14 +303,14 @@ def default_filter( """ # The very top of the ontology is completely uninteresting. 
# Knowing about association between `All`, or `Phenotypic abnormality` tells us very little - top_level_terms = {phenotypic_abnormality, } + top_level_terms = { + phenotypic_abnormality, + } top_level_terms.update(hpo.graph.get_ancestors(phenotypic_abnormality)) # Collect sets of the 1st level terms # e.g., Abnormality of the cardiovascular system (HP:0001626) - first_level_terms = set( - hpo.graph.get_children(phenotypic_abnormality) - ) + first_level_terms = set(hpo.graph.get_children(phenotypic_abnormality)) # the second level (children of the top level), contains terms we want to keep, # such as HP:0001611 Hypernasal speech, but it also contains "general" terms that @@ -349,13 +363,13 @@ def __init__( self._below_frequency_threshold = PhenotypeMtcResult.fail( code="HMF01", reason="Skipping term with maximum frequency that was" - f" less than threshold {self._hpo_term_frequency_filter}", + f" less than threshold {self._hpo_term_frequency_filter}", ) self._below_annotation_frequency_threshold = PhenotypeMtcResult.fail( code="HMF09", reason="Skipping term with maximum annotation frequency that was" - f" less than threshold {self._hpo_annotation_frequency_threshold}", + f" less than threshold {self._hpo_annotation_frequency_threshold}", ) # Do not perform a test if the counts in the genotype categories do not even have nominal statistical power @@ -375,28 +389,30 @@ def __init__( self._not_powered_for_2_by_2 = PhenotypeMtcResult.fail( code="HMF06", reason=f"Skipping term with less than {self._min_observations_for_2_by_2} observations" - " (not powered for 2x2)", + " (not powered for 2x2)", ) self._min_observations_for_2_by_3 = 6 self._not_powered_for_2_by_3 = PhenotypeMtcResult.fail( code="HMF06", reason=f"Skipping term with less than {self._min_observations_for_2_by_3} observations" - " (not powered for 2x3)", + " (not powered for 2x3)", ) def filter( self, - gt_predicate: GenotypePolyPredicate, - ph_predicates: typing.Sequence[PhenotypePolyPredicate[P]], + gt_clf: GenotypeClassifier, + pheno_clfs: typing.Sequence[PhenotypeClassifier[hpotk.TermId]], counts: typing.Sequence[pd.DataFrame], cohort_size: int, ) -> typing.Sequence[PhenotypeMtcResult]: - phenotypes = [p.phenotype for p in ph_predicates] + phenotypes = [p.phenotype for p in pheno_clfs] p_to_idx = {p: i for i, p in enumerate(phenotypes)} annotation_count_thr = self._hpo_annotation_frequency_threshold * cohort_size - results: typing.MutableSequence[typing.Optional[PhenotypeMtcResult]] = [None for _ in range(len(phenotypes))] + results: typing.MutableSequence[typing.Optional[PhenotypeMtcResult]] = [ + None for _ in range(len(phenotypes)) + ] for term_id in self._get_ordered_terms(phenotypes): try: idx = p_to_idx[term_id] @@ -410,18 +426,16 @@ def filter( results[idx] = HpoMtcFilter.SKIPPING_GENERAL_TERM continue - if not self._hpo.graph.is_ancestor_of( - PHENOTYPIC_ABNORMALITY, term_id - ): + if not self._hpo.graph.is_ancestor_of(PHENOTYPIC_ABNORMALITY, term_id): results[idx] = HpoMtcFilter.SKIPPING_NON_PHENOTYPE_TERM continue - ph_predicate = ph_predicates[idx] + ph_clf = pheno_clfs[idx] contingency_matrix = counts[idx] max_freq = HpoMtcFilter.get_maximum_group_observed_HPO_frequency( contingency_matrix, - ph_predicate=ph_predicate, + ph_clf=ph_clf, ) if max_freq < self._hpo_term_frequency_filter: results[idx] = self._below_frequency_threshold @@ -432,25 +446,33 @@ def filter( results[idx] = self._below_annotation_frequency_threshold continue - if contingency_matrix.shape == (2, 2) and total_count < 
self._min_observations_for_2_by_2: + if ( + contingency_matrix.shape == (2, 2) + and total_count < self._min_observations_for_2_by_2 + ): results[idx] = self._not_powered_for_2_by_2 continue - elif contingency_matrix.shape == (2, 3) and total_count < self._min_observations_for_2_by_3: + elif ( + contingency_matrix.shape == (2, 3) + and total_count < self._min_observations_for_2_by_3 + ): results[idx] = self._not_powered_for_2_by_3 continue if not HpoMtcFilter.some_cell_has_greater_than_one_count( counts=contingency_matrix, - ph_predicate=ph_predicate, + ph_clf=ph_clf, ): results[idx] = HpoMtcFilter.NO_GENOTYPE_HAS_MORE_THAN_ONE_HPO continue elif HpoMtcFilter.one_genotype_has_zero_hpo_observations( counts=contingency_matrix, - gt_predicate=gt_predicate, + gt_clf=gt_clf, ): - results[idx] = HpoMtcFilter.SKIPPING_SINCE_ONE_GENOTYPE_HAD_ZERO_OBSERVATIONS + results[idx] = ( + HpoMtcFilter.SKIPPING_SINCE_ONE_GENOTYPE_HAD_ZERO_OBSERVATIONS + ) continue # Check if the term has exactly one child with a very similar number of individuals @@ -469,7 +491,9 @@ def filter( break if child_contingency_matrix is not None: - if (contingency_matrix - child_contingency_matrix).abs().max(axis=None) < 1: + if (contingency_matrix - child_contingency_matrix).abs().max( + axis=None + ) < 1: # Do not test if the count is exactly the same to the counts in the only child term. results[idx] = HpoMtcFilter.SAME_COUNT_AS_THE_ONLY_CHILD continue @@ -486,7 +510,7 @@ def filter( term_name = self._hpo.get_term_name(phenotypes[i]) missing.append(term_name) - msg = 'Missing results for {}'.format(', '.join(missing)) + msg = "Missing results for {}".format(", ".join(missing)) raise ValueError(msg) # Ignoring the type hint, because we checked the type match above. @@ -512,14 +536,14 @@ def filter_method_name(self) -> str: @staticmethod def get_number_of_observed_hpo_observations( counts_frame: pd.DataFrame, - ph_predicate: PhenotypePolyPredicate[hpotk.TermId], + ph_clf: PhenotypeClassifier[hpotk.TermId], ) -> int: - return counts_frame.loc[ph_predicate.present_phenotype_category].sum() + return counts_frame.loc[ph_clf.present_phenotype_category].sum() @staticmethod def get_maximum_group_observed_HPO_frequency( counts_frame: pd.DataFrame, - ph_predicate: PhenotypePolyPredicate[hpotk.TermId], + ph_clf: PhenotypeClassifier[hpotk.TermId], ) -> float: """ Returns: @@ -528,22 +552,22 @@ def get_maximum_group_observed_HPO_frequency( all_hpo_count_per_gt = counts_frame.sum() if (all_hpo_count_per_gt == 0).all(): # Prevent division by zeros - return 0. + return 0.0 - present_hpo_count_per_gt = counts_frame.loc[ph_predicate.present_phenotype_category] + present_hpo_count_per_gt = counts_frame.loc[ph_clf.present_phenotype_category] return (present_hpo_count_per_gt / all_hpo_count_per_gt).max() @staticmethod def one_genotype_has_zero_hpo_observations( counts: pd.DataFrame, - gt_predicate: GenotypePolyPredicate, + gt_clf: GenotypeClassifier, ): - return any(counts.loc[:, c].sum() == 0 for c in gt_predicate.get_categories()) + return any(counts.loc[:, c].sum() == 0 for c in gt_clf.get_categories()) @staticmethod def some_cell_has_greater_than_one_count( counts: pd.DataFrame, - ph_predicate: PhenotypePolyPredicate[hpotk.TermId], + ph_clf: PhenotypeClassifier[hpotk.TermId], ) -> bool: """ If no genotype has more than one HPO count, we do not want to do a test. 
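These frequency and count helpers operate on plain `pandas` contingency frames whose rows come from a phenotype classifier and whose columns come from the genotype classifier. A self-contained sketch of the checks, with string labels standing in for the real category objects:

>>> import pandas as pd
>>> counts = pd.DataFrame(
...     data=[[1, 4], [9, 6]],
...     index=pd.Index(["Present", "Excluded"], name="Phenotype"),
...     columns=pd.Index(["Missense", "Truncating"], name="Genotype class"),
... )
>>> bool((counts.loc["Present"] > 1).any())  # mirrors the some_cell_has_greater_than_one_count check
True
>>> any(counts.loc[:, c].sum() == 0 for c in counts.columns)  # mirrors the one_genotype_has_zero_hpo_observations check
False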
For instance, if MISSENSE has one @@ -551,12 +575,12 @@ def some_cell_has_greater_than_one_count( Args: counts: pandas DataFrame with counts - ph_predicate: the phenotype predicate that produced the counts + ph_clf: the phenotype classifier that produced the counts Returns: true if at least one of the genotypes has more than one observed HPO count """ - return (counts.loc[ph_predicate.present_phenotype_category] > 1).any() + return (counts.loc[ph_clf.present_phenotype_category] > 1).any() def _get_ordered_terms( self, diff --git a/src/gpsea/analysis/pcats/__init__.py b/src/gpsea/analysis/pcats/__init__.py index ae5317d37..fe0d35bf3 100644 --- a/src/gpsea/analysis/pcats/__init__.py +++ b/src/gpsea/analysis/pcats/__init__.py @@ -1,10 +1,10 @@ """ -The `gpsea.analysis.pcats` tests the association between genotype and phenotype groups, -if the groups can be defined in terms of discrete, unique, and non-overlapping categories. +The `gpsea.analysis.pcats` tests the association between genotype and phenotype classes, +if the classess can be defined in terms of discrete, unique, and non-overlapping categories. -Each individual is assigned into a genotype and phenotype group -using :class:`~gpsea.analysis.predicate.genotype.GenotypePolyPredicate` -and :class:`~gpsea.analysis.predicate.phenotype.PhenotypePolyPredicate` respectively. +Each individual is assigned into a genotype and phenotype class +using :class:`~gpsea.analysis.clf.GenotypeClassifier` +and :class:`~gpsea.analysis.clf.PhenotypeClassifier` respectively. A contingency matrix with group counts is prepared and the counts are tested for association using :class:`~gpsea.analysis.pcats.stats.CountStatistic`, @@ -24,13 +24,15 @@ from ._impl import MultiPhenotypeAnalysis, MultiPhenotypeAnalysisResult from ._impl import DiseaseAnalysis from ._impl import HpoTermAnalysis, HpoTermAnalysisResult -from ._impl import apply_predicates_on_patients +from ._impl import apply_classifiers_on_individuals from ._config import configure_hpo_term_analysis __all__ = [ - 'MultiPhenotypeAnalysis', 'MultiPhenotypeAnalysisResult', - 'DiseaseAnalysis', - 'HpoTermAnalysis', 'HpoTermAnalysisResult', - 'apply_predicates_on_patients', - 'configure_hpo_term_analysis', + "MultiPhenotypeAnalysis", + "MultiPhenotypeAnalysisResult", + "DiseaseAnalysis", + "HpoTermAnalysis", + "HpoTermAnalysisResult", + "apply_classifiers_on_individuals", + "configure_hpo_term_analysis", ] diff --git a/src/gpsea/analysis/pcats/_impl.py b/src/gpsea/analysis/pcats/_impl.py index 080bd487f..c6ae48895 100644 --- a/src/gpsea/analysis/pcats/_impl.py +++ b/src/gpsea/analysis/pcats/_impl.py @@ -12,44 +12,44 @@ from gpsea.model import Patient -from ..predicate.genotype import GenotypePolyPredicate -from ..predicate.phenotype import P, PhenotypePolyPredicate +from ..clf import GenotypeClassifier +from ..clf import P, PhenotypeClassifier from ..mtc_filter import PhenotypeMtcFilter, PhenotypeMtcResult from .stats import CountStatistic from .._base import MultiPhenotypeAnalysisResult, StatisticResult -DEFAULT_MTC_PROCEDURE = 'fdr_bh' + +DEFAULT_MTC_PROCEDURE = "fdr_bh" """ Use Benjamini-Hochberg as the default MTC procedure. 
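`'fdr_bh'` is the `statsmodels` code for the Benjamini-Hochberg step-up procedure. Purely as an illustration of what that correction does to a handful of nominal p values, calling `statsmodels` directly rather than anything GPSEA-specific:

>>> from statsmodels.stats.multitest import multipletests
>>> nominal = [0.001, 0.01, 0.04, 0.2]
>>> _, corrected, _, _ = multipletests(nominal, alpha=0.05, method="fdr_bh")
>>> [round(float(p), 3) for p in corrected]
[0.004, 0.02, 0.053, 0.2]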
""" -def apply_predicates_on_patients( - patients: typing.Iterable[Patient], - gt_predicate: GenotypePolyPredicate, - pheno_predicates: typing.Sequence[PhenotypePolyPredicate[P]], +def apply_classifiers_on_individuals( + individuals: typing.Iterable[Patient], + gt_clf: GenotypeClassifier, + pheno_clfs: typing.Sequence[PhenotypeClassifier[P]], ) -> typing.Tuple[ typing.Sequence[int], typing.Sequence[pd.DataFrame], ]: """ - Apply the phenotype predicates `pheno_predicates` and the genotype predicate `gt_predicate` - to bin the `patients` into categories. + Classify individuals with the genotype and phenotype classifiers. - Note, it may not be possible to bin *all* patients with a genotype/phenotype pair, - since a predicate is allowed to return `None` (e.g. if it bins the patient into MISSENSE or NONSENSE groups - but the patient has no MISSENSE or NONSENSE variants). If this happens, the patient will not be "usable" + Note, it may not be possible to classify *all* individuals with a genotype/phenotype pair, + since a clasifier is allowed to return `None` (e.g. if it assigns the individual into MISSENSE or NONSENSE groups + but the patient has no MISSENSE or NONSENSE variants). If this happens, the individual will not be "usable" for the phenotype `P`. Args: - patients: a sequence of the patients to bin into categories - gt_predicate: a genotype predicate to apply - pheno_predicates: a sequence with the phenotype predicates to apply + individuals: a sequence of individuals to classify + gt_clf: classifier to assign a genotype class + pheno_clfs: a sequence of phenotype classifiers to apply Returns: a tuple with 2 items: - - a sequence with counts of patients that could be binned according to the phenotype `P`. + - a sequence with counts of individuals that could be classified according to the phenotype `P`. - a sequence with data frames with counts of patients in i-th phenotype category and j-th genotype category where i and j are rows and columns of the data frame. """ @@ -57,7 +57,7 @@ def apply_predicates_on_patients( # Apply genotype and phenotype predicates count_dict = {} - for ph_predicate in pheno_predicates: + for ph_predicate in pheno_clfs: if ph_predicate.phenotype not in count_dict: # Make an empty frame for keeping track of the counts. count_dict[ph_predicate.phenotype] = pd.DataFrame( @@ -67,14 +67,14 @@ def apply_predicates_on_patients( name=ph_predicate.variable_name, ), columns=pd.Index( - data=gt_predicate.get_categories(), - name=gt_predicate.variable_name, + data=gt_clf.get_categories(), + name=gt_clf.variable_name, ), ) - for patient in patients: + for patient in individuals: pheno_cat = ph_predicate.test(patient) - geno_cat = gt_predicate.test(patient) + geno_cat = gt_clf.test(patient) if pheno_cat is not None and geno_cat is not None: count_dict[pheno_cat.phenotype].loc[ @@ -84,17 +84,15 @@ def apply_predicates_on_patients( # Convert dicts to numpy arrays n_usable_patients = [ - n_usable_patient_counter[ph_predicate.phenotype] - for ph_predicate in pheno_predicates + n_usable_patient_counter[ph_predicate.phenotype] for ph_predicate in pheno_clfs ] - counts = [count_dict[ph_predicate.phenotype] for ph_predicate in pheno_predicates] + counts = [count_dict[ph_predicate.phenotype] for ph_predicate in pheno_clfs] return n_usable_patients, counts class MultiPhenotypeAnalysis(typing.Generic[P], metaclass=abc.ABCMeta): - def __init__( self, count_statistic: CountStatistic, @@ -112,48 +110,50 @@ def __init__( (e.g. 
Bonferroni MTC) or false discovery rate for the FDR procedures (e.g. Benjamini-Hochberg). """ assert isinstance(count_statistic, CountStatistic) - assert len(count_statistic.supports_shape) == 2, "The statistic must support 2D contingency tables" + assert ( + len(count_statistic.supports_shape) == 2 + ), "The statistic must support 2D contingency tables" self._count_statistic = count_statistic self._mtc_correction = mtc_correction - assert isinstance(mtc_alpha, float) and 0. <= mtc_alpha <= 1. + assert isinstance(mtc_alpha, float) and 0.0 <= mtc_alpha <= 1.0 self._mtc_alpha = mtc_alpha def compare_genotype_vs_phenotypes( self, cohort: typing.Iterable[Patient], - gt_predicate: GenotypePolyPredicate, - pheno_predicates: typing.Iterable[PhenotypePolyPredicate[P]], + gt_clf: GenotypeClassifier, + pheno_clfs: typing.Iterable[PhenotypeClassifier[P]], ) -> MultiPhenotypeAnalysisResult[P]: # Check compatibility between the count statistic and predicate. issues = MultiPhenotypeAnalysis._check_compatibility( count_statistic=self._count_statistic, - gt_predicate=gt_predicate, - pheno_predicates=pheno_predicates, + gt_clf=gt_clf, + pheno_clfs=pheno_clfs, ) if len(issues) != 0: msg = os.linesep.join(issues) - raise ValueError(f'Cannot execute the analysis: {msg}') + raise ValueError(f"Cannot execute the analysis: {msg}") return self._compute_result( cohort=cohort, - gt_predicate=gt_predicate, - pheno_predicates=pheno_predicates, + gt_clf=gt_clf, + pheno_clfs=pheno_clfs, ) @abc.abstractmethod def _compute_result( self, cohort: typing.Iterable[Patient], - gt_predicate: GenotypePolyPredicate, - pheno_predicates: typing.Iterable[PhenotypePolyPredicate[P]], + gt_clf: GenotypeClassifier, + pheno_clfs: typing.Iterable[PhenotypeClassifier[P]], ) -> MultiPhenotypeAnalysisResult[P]: pass @staticmethod def _check_compatibility( count_statistic: CountStatistic, - gt_predicate: GenotypePolyPredicate, - pheno_predicates: typing.Iterable[PhenotypePolyPredicate[P]], + gt_clf: GenotypeClassifier, + pheno_clfs: typing.Iterable[PhenotypeClassifier[P]], ) -> typing.Collection[str]: # There should be 2 items due to check in `__init__`. 
(pheno, geno) = count_statistic.supports_shape @@ -165,16 +165,16 @@ def _check_compatibility( elif isinstance(pheno, typing.Sequence): pheno_accepted = pheno else: - issues.append('Cannot use a count statistic that does not check phenotypes') + issues.append("Cannot use a count statistic that does not check phenotypes") pheno_failed = [] - for i, ph_predicate in enumerate(pheno_predicates): + for i, ph_predicate in enumerate(pheno_clfs): if ph_predicate.n_categorizations() not in pheno_accepted: pheno_failed.append(i) if len(pheno_failed) != 0: issues.append( - 'Phenotype predicates {} are incompatible with the count statistic'.format( - ', '.join(str(i) for i in pheno_failed) + "Phenotype predicates {} are incompatible with the count statistic".format( + ", ".join(str(i) for i in pheno_failed) ) ) @@ -184,12 +184,16 @@ def _check_compatibility( elif isinstance(geno, typing.Sequence): geno_accepted = geno elif pheno is None: - raise ValueError('Cannot use a count statistic that does not check genotypes') + raise ValueError( + "Cannot use a count statistic that does not check genotypes" + ) else: - raise ValueError(f'Cannot use a count statistic that supports shape {pheno, geno}') + raise ValueError( + f"Cannot use a count statistic that supports shape {pheno, geno}" + ) - if gt_predicate.n_categorizations() not in geno_accepted: - issues.append('Genotype predicate is incompatible with the count statistic') + if gt_clf.n_categorizations() not in geno_accepted: + issues.append("Genotype predicate is incompatible with the count statistic") return issues @@ -229,22 +233,21 @@ def _apply_mtc( class DiseaseAnalysis(MultiPhenotypeAnalysis[hpotk.TermId]): - def _compute_result( self, cohort: typing.Iterable[Patient], - gt_predicate: GenotypePolyPredicate, - pheno_predicates: typing.Iterable[PhenotypePolyPredicate[hpotk.TermId]], + gt_clf: GenotypeClassifier, + pheno_clfs: typing.Iterable[PhenotypeClassifier[P]], ) -> MultiPhenotypeAnalysisResult[hpotk.TermId]: - pheno_predicates = tuple(pheno_predicates) - if len(pheno_predicates) == 0: + pheno_clfs = tuple(pheno_clfs) + if len(pheno_clfs) == 0: raise ValueError("No phenotype predicates were provided") # 1 - Count the patients - n_usable, all_counts = apply_predicates_on_patients( - patients=cohort, - gt_predicate=gt_predicate, - pheno_predicates=pheno_predicates, + n_usable, all_counts = apply_classifiers_on_individuals( + individuals=cohort, + gt_clf=gt_clf, + pheno_clfs=pheno_clfs, ) # 2 - Compute nominal p values @@ -257,10 +260,10 @@ def _compute_result( corrected_pvals = self._apply_mtc(stats=stats) return MultiPhenotypeAnalysisResult( - gt_predicate=gt_predicate, + gt_clf=gt_clf, statistic=self._count_statistic, mtc_correction=self._mtc_correction, - pheno_predicates=pheno_predicates, + pheno_clfs=pheno_clfs, n_usable=n_usable, all_counts=all_counts, statistic_results=stats, @@ -278,21 +281,20 @@ class HpoTermAnalysisResult(MultiPhenotypeAnalysisResult[hpotk.TermId]): def __init__( self, - gt_predicate: GenotypePolyPredicate, + gt_clf: GenotypeClassifier, statistic: CountStatistic, mtc_correction: typing.Optional[str], - pheno_predicates: typing.Iterable[PhenotypePolyPredicate[hpotk.TermId]], + pheno_clfs: typing.Iterable[PhenotypeClassifier[hpotk.TermId]], n_usable: typing.Sequence[int], all_counts: typing.Sequence[pd.DataFrame], statistic_results: typing.Sequence[typing.Optional[StatisticResult]], corrected_pvals: typing.Optional[typing.Sequence[float]], mtc_filter_name: str, mtc_filter_results: typing.Sequence[PhenotypeMtcResult], - ): 
super().__init__( - gt_predicate=gt_predicate, - pheno_predicates=pheno_predicates, + gt_clf=gt_clf, + pheno_clfs=pheno_clfs, statistic=statistic, n_usable=n_usable, all_counts=all_counts, @@ -309,10 +311,10 @@ def __init__( def _check_hpo_result_sanity(self) -> typing.Sequence[str]: errors = [] - if len(self._pheno_predicates) != len(self._mtc_filter_results): + if len(self._pheno_clfs) != len(self._mtc_filter_results): errors.append( f"`len(pheno_predicates)` must be the same as `len(mtc_filter_results)` but " - f"{len(self._pheno_predicates)}!={len(self._mtc_filter_results)}" + f"{len(self._pheno_clfs)}!={len(self._mtc_filter_results)}" ) return errors @@ -338,17 +340,24 @@ def n_filtered_out(self) -> int: return sum(result.is_filtered_out() for result in self.mtc_filter_results) def __eq__(self, other): - return isinstance(other, HpoTermAnalysisResult) \ - and super(MultiPhenotypeAnalysisResult, self).__eq__(other) \ - and self._mtc_filter_name == other._mtc_filter_name \ + return ( + isinstance(other, HpoTermAnalysisResult) + and super(MultiPhenotypeAnalysisResult, self).__eq__(other) + and self._mtc_filter_name == other._mtc_filter_name and self._mtc_filter_results == other._mtc_filter_results + ) def __hash__(self): - return hash(( - super(MultiPhenotypeAnalysisResult, self).__hash__(), - self._pheno_predicates, self._n_usable, self._all_counts, - self._mtc_filter_name, self._mtc_filter_results, - )) + return hash( + ( + super(MultiPhenotypeAnalysisResult, self).__hash__(), + self._pheno_clfs, + self._n_usable, + self._all_counts, + self._mtc_filter_name, + self._mtc_filter_results, + ) + ) class HpoTermAnalysis(MultiPhenotypeAnalysis[hpotk.TermId]): @@ -381,25 +390,25 @@ def __init__( def _compute_result( self, cohort: typing.Iterable[Patient], - gt_predicate: GenotypePolyPredicate, - pheno_predicates: typing.Iterable[PhenotypePolyPredicate[hpotk.TermId]], + gt_clf: GenotypeClassifier, + pheno_clfs: typing.Iterable[PhenotypeClassifier[hpotk.TermId]], ) -> HpoTermAnalysisResult: - pheno_predicates = tuple(pheno_predicates) - if len(pheno_predicates) == 0: + pheno_clfs = tuple(pheno_clfs) + if len(pheno_clfs) == 0: raise ValueError("No phenotype predicates were provided") # 1 - Count the patients - n_usable, all_counts = apply_predicates_on_patients( + n_usable, all_counts = apply_classifiers_on_individuals( cohort, - gt_predicate, - pheno_predicates, + gt_clf, + pheno_clfs, ) # 2 - Apply MTC filter and select p values to MTC cohort_size = sum(1 for _ in cohort) mtc_filter_results = self._mtc_filter.filter( - gt_predicate=gt_predicate, - ph_predicates=pheno_predicates, + gt_clf=gt_clf, + pheno_clfs=pheno_clfs, counts=all_counts, cohort_size=cohort_size, ) @@ -424,10 +433,10 @@ def _compute_result( corrected_pvals[mtc_mask] = self._apply_mtc(stats=results[mtc_mask]) return HpoTermAnalysisResult( - gt_predicate=gt_predicate, + gt_clf=gt_clf, statistic=self._count_statistic, mtc_correction=self._mtc_correction, - pheno_predicates=pheno_predicates, + pheno_clfs=pheno_clfs, n_usable=n_usable, all_counts=all_counts, statistic_results=results, @@ -437,7 +446,7 @@ def _compute_result( ) -WHATEVER = typing.TypeVar('WHATEVER') +WHATEVER = typing.TypeVar("WHATEVER") def slice_list_in_numpy_style( diff --git a/src/gpsea/analysis/predicate/__init__.py b/src/gpsea/analysis/predicate/__init__.py index 5b4592402..4e6186333 100644 --- a/src/gpsea/analysis/predicate/__init__.py +++ b/src/gpsea/analysis/predicate/__init__.py @@ -1,5 +1,43 @@ -from ._api import PolyPredicate, PatientCategory, 
Categorization +from ._api import VariantPredicate +from ._variant import ( + allof, + anyof, + change_length, + exon, + gene, + is_large_imprecise_sv, + is_structural_deletion, + is_structural_variant, + protein_feature, + protein_feature_type, + protein_region, + ref_length, + structural_type, + transcript, + true, + variant_class, + variant_effect, + variant_key, +) __all__ = [ - 'PolyPredicate', 'PatientCategory', 'Categorization', + "VariantPredicate", + "true", + "allof", + "anyof", + "variant_effect", + "variant_key", + "gene", + "transcript", + "exon", + "protein_region", + "is_large_imprecise_sv", + "is_structural_variant", + "structural_type", + "variant_class", + "ref_length", + "change_length", + "is_structural_deletion", + "protein_feature_type", + "protein_feature", ] diff --git a/src/gpsea/analysis/predicate/_api.py b/src/gpsea/analysis/predicate/_api.py index 1e5283403..7afa33a80 100644 --- a/src/gpsea/analysis/predicate/_api.py +++ b/src/gpsea/analysis/predicate/_api.py @@ -1,234 +1,223 @@ import abc -import os +import warnings import typing -from collections import Counter - -import hpotk -import hpotk.util - -from gpsea.model import Patient +from gpsea.model import Variant from .._partition import Partitioning -class PatientCategory: +class VariantPredicate(Partitioning, metaclass=abc.ABCMeta): """ - `PatientCategory` represents one of several exclusive discrete groups. + `VariantPredicate` tests if a variant meets a certain criterion. - Patient category has :attr:`cat_id`, a unique numeric identifier of the group, - :attr:`name` with human-readable group name, and :attr:`description` with an optional verbose description. - """ + The subclasses *MUST* implement all abstract methods of this class + *plus* ``__eq__`` and ``__hash__``, to support building the compound predicates. - def __init__(self, cat_id: int, - name: str, - description: typing.Optional[str] = None): - self._cat_id = hpotk.util.validate_instance(cat_id, int, 'cat_id') - self._name = hpotk.util.validate_instance(name, str, 'name') - self._description = hpotk.util.validate_optional_instance(description, str, 'description') + We *strongly* recommend implementing ``__str__`` and ``__repr__`` as well. + """ - @property - def cat_id(self) -> int: + def get_question(self) -> str: """ - Get an `int` with the unique numeric identifier of the group. + Prepare a `str` with the question the predicate can answer. """ - return self._cat_id + # TODO: remove in `v1.0.0` + warnings.warn( + "`get_question` will be removed soon. Use `description` property instead", + DeprecationWarning, + stacklevel=2, + ) + return self.description - @property - def name(self) -> str: + @abc.abstractmethod + def test(self, variant: Variant) -> bool: """ - Get a `str` with a human-readable name of the group. + Test if the `variant` meets a criterion. + + Args: + variant: an instance of :class:`~gpsea.model.Variant` to test. + + Returns: + bool: `True` if the variant meets the criterion and `False` otherwise. """ - return self._name + pass - @property - def description(self) -> typing.Optional[str]: + def __and__(self, other): """ - Get a `str` with an optional detailed group description. + Create a variant predicate which passes if *BOTH* `self` and `other` pass. 
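Together with the `__or__` and `__invert__` overloads defined just below, this lets simple predicates be composed inline. A small self-contained sketch (`NM_123.4` is a fictional transcript):

>>> from gpsea.model import VariantEffect
>>> from gpsea.analysis.predicate import variant_effect
>>> missense = variant_effect(VariantEffect.MISSENSE_VARIANT, tx_id="NM_123.4")
>>> nonsense = variant_effect(VariantEffect.STOP_GAINED, tx_id="NM_123.4")
>>> (missense & nonsense).description
'(MISSENSE_VARIANT on NM_123.4 AND STOP_GAINED on NM_123.4)'
>>> (missense | nonsense).description
'(MISSENSE_VARIANT on NM_123.4 OR STOP_GAINED on NM_123.4)'
>>> (~missense).description
'NOT MISSENSE_VARIANT on NM_123.4'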
""" - return self._description - - def __repr__(self) -> str: - return f"PatientCategory(cat_id={self.cat_id}, " \ - f"name={self.name}, " \ - f"description={self.description})" - - def __str__(self) -> str: - return self._name + if isinstance(other, VariantPredicate): + if self == other: + return self - def __eq__(self, other) -> bool: - return isinstance(other, PatientCategory) \ - and self.cat_id == other.cat_id \ - and self.name == other.name \ - and self.description == other.description + if isinstance(self, AllVariantPredicate) and isinstance( + other, AllVariantPredicate + ): + # Merging two *all* variant predicates is equivalent + # to chaining their predicates + return AllVariantPredicate(*self.predicates, *other.predicates) + else: + return AllVariantPredicate(self, other) + else: + return NotImplemented - def __hash__(self) -> int: - return hash((self.cat_id, self.name, self.description)) + def __or__(self, other): + """ + Create a variant predicate which passes if *EITHER* `self` *OR* `other` passes. + """ + if isinstance(other, VariantPredicate): + if self == other: + return self + if isinstance(self, AnyVariantPredicate) and isinstance( + other, AnyVariantPredicate + ): + # Merging two any variant predicates is equivalent + # to chaining their predicates + return AnyVariantPredicate(*self.predicates, *other.predicates) + else: + return AnyVariantPredicate(self, other) + else: + return NotImplemented -class Categorization: - """ - `Categorization` represents one of discrete group a :class:`~gpsea.model.Patient` can be assigned into. - """ - - @staticmethod - def from_raw_parts( - cat_id: int, - name: str, - description: typing.Optional[str] = None, - ): + def __invert__(self): """ - Create `Categorization` from the `cat_id` identifier, `name`, and an optional `description`. + Create a variant predicate that passes if the underlying predicate fails. 
""" - return Categorization( - category=PatientCategory( - cat_id=cat_id, - name=name, - description=description, - ) - ) + if isinstance(self, InvVariantPredicate): + # Unwrap to prevent double negation + return self._inner + else: + return InvVariantPredicate(self) + + +class LogicalVariantPredicate(VariantPredicate, metaclass=abc.ABCMeta): + # NOT PART OF THE PUBLIC API def __init__( self, - category: PatientCategory, + *predicates, ): - self._category = hpotk.util.validate_instance(category, PatientCategory, 'category') + if len(predicates) == 0: + raise ValueError("Predicates must not be empty!") + self._predicates = tuple(predicates) @property - def category(self) -> PatientCategory: - return self._category + def predicates(self) -> typing.Sequence[VariantPredicate]: + return self._predicates - def __eq__(self, other): - return isinstance(other, Categorization) and self._category == other._category + @property + def name(self) -> str: + sep = f" {self._separator_symbol()} " + return "(" + sep.join(p.name for p in self._predicates) + ")" - def __hash__(self): - return hash((self._category,)) + @property + def description(self) -> str: + sep = f" {self._separator_word().upper()} " + return "(" + sep.join(p.description for p in self._predicates) + ")" - def __repr__(self): - return f'Categorization(category={self._category})' + @property + def variable_name(self) -> str: + sep = f" {self._separator_symbol()} " + return "(" + sep.join(p.variable_name for p in self._predicates) + ")" - def __str__(self): - return repr(self) + @abc.abstractmethod + def _separator_symbol(self) -> str: + pass + @abc.abstractmethod + def _separator_word(self) -> str: + pass -C = typing.TypeVar('C', bound=Categorization) -""" -A generic bound for types that extend :class:`Categorization`. -""" +class AnyVariantPredicate(LogicalVariantPredicate): + # NOT PART OF THE PUBLIC API -class PolyPredicate(typing.Generic[C], Partitioning, metaclass=abc.ABCMeta): - """ - `PolyPredicate` partitions a :class:`~gpsea.model.Patient` into one of several discrete groups - represented by a :class:`~gpsea.analysis.predicate.Categorization`. + def _separator_symbol(self) -> str: + return "|" - The groups must be *exclusive* - the patient can be binned into one and only one group, - and *exhaustive* - the groups must cover all possible scenarios. + def _separator_word(self) -> str: + return "or" - However, if the patient cannot be assigned into any meaningful category, `None` can be returned. - As a rule of thumb, returning `None` will exclude the patient from the analysis. - """ + def test(self, variant: Variant) -> bool: + return any(predicate.test(variant) for predicate in self._predicates) - @abc.abstractmethod - def get_categorizations(self) -> typing.Sequence[C]: - """ - Get a sequence of all categories which the predicate can produce. - """ - pass + def __eq__(self, value: object) -> bool: + if isinstance(value, AnyVariantPredicate): + return self._predicates == value._predicates + return False - def get_categories(self) -> typing.Iterator[PatientCategory]: - """ - Get an iterator with :class:`PatientCategory` instances that the predicate can produce. - """ - return (c.category for c in self.get_categorizations()) + def __hash__(self) -> int: + return 17 * hash(self._predicates) - @property - def group_labels(self) -> typing.Collection[str]: - """ - Get a collection with names of the :class:`PatientCategory` items that the predicate can produce. 
- """ - return tuple(cat.name for cat in self.get_categories()) + def __str__(self) -> str: + return "(" + " OR ".join(str(p) for p in self._predicates) + ")" - def summarize_groups(self) -> str: - cat_names = ', '.join(self.group_labels) - return f"{self.variable_name}: {cat_names}" + def __repr__(self) -> str: + return f"AnyVariantPredicate(predicates={self._predicates})" - def summarize( - self, - out: typing.TextIO, - ): - """ - Summarize the predicate into the `out` handle. - The summary includes the name, summary, and the groups the predicate can assign individuals into. - """ - Partitioning.summarize(self, out) - out.write(self.summarize_groups()) - out.write(os.linesep) +class AllVariantPredicate(LogicalVariantPredicate): + # NOT PART OF THE PUBLIC API - def n_categorizations(self) -> int: - """ - Get the number of categorizations the predicate can produce. - """ - return len(self.get_categorizations()) + def _separator_symbol(self) -> str: + return "&" - def get_category( - self, - cat_id: int, - ) -> PatientCategory: - """ - Get the category name for a :class:`PatientCategory.cat_id`. + def _separator_word(self) -> str: + return "and" - :param cat_id: an `int` with the id. - :raises: ValueError if there is no such category was defined. - """ - for ctg in self.get_categories(): - if ctg.cat_id == cat_id: - return ctg - raise ValueError(f'No category for {cat_id} was found') + def test(self, variant: Variant) -> bool: + return all(predicate.test(variant) for predicate in self._predicates) + + def __eq__(self, value: object) -> bool: + if isinstance(value, AllVariantPredicate): + return self._predicates == value._predicates + return False + + def __hash__(self) -> int: + return 31 * hash(self._predicates) + + def __str__(self) -> str: + return "(" + " AND ".join(str(p) for p in self._predicates) + ")" + + def __repr__(self) -> str: + return f"AllVariantPredicate(predicates={self._predicates})" + + +class InvVariantPredicate(VariantPredicate): + # NOT PART OF THE PUBLIC API - def get_category_name( + def __init__( self, - cat_id: int, - ) -> str: - """ - Get the category name for a :class:`PatientCategory.cat_id`. + inner: VariantPredicate, + ): + self._inner = inner - :param cat_id: an `int` with the id. - :raises: ValueError if there is no such category was defined. - """ - return self.get_category(cat_id).name - - @staticmethod - def _check_categorizations( - categorizations: typing.Sequence[Categorization], - ) -> typing.Sequence[str]: - """ - Check that the categorizations meet the requirements. + @property + def name(self) -> str: + return "NOT " + self._inner.name - The requirements include: + @property + def description(self) -> str: + return "NOT " + self._inner.description - * the `cat_id` must be unique within the predicate - """ - issues = [] - # There must be at least one category + @property + def variable_name(self) -> str: + return "NOT " + self._inner.variable_name - # `cat_id` must be unique for a predicate! 
- cat_id_counts = Counter() - for c in categorizations: - cat_id_counts[c.category.cat_id] += 1 + def test(self, variant: Variant) -> bool: + return not self._inner.test(variant) - for cat_id, count in cat_id_counts.items(): - if count > 1: - issues.append(f'`cat_id` {cat_id} is present {count}>1 times') - - return issues + def __eq__(self, value: object) -> bool: + if isinstance(value, InvVariantPredicate): + return self._inner == value._inner + return False - @abc.abstractmethod - def test(self, patient: Patient) -> typing.Optional[C]: - """ - Assign a `patient` into a categorization. + def __hash__(self) -> int: + return -hash(self._inner) - Return `None` if the patient cannot be assigned into any meaningful category. - """ - pass + def __str__(self) -> str: + return f"NOT {self._inner}" + + def __repr__(self) -> str: + return f"NotVariantPredicate(inner={self._inner})" diff --git a/src/gpsea/analysis/predicate/genotype/_predicates.py b/src/gpsea/analysis/predicate/_predicates.py similarity index 100% rename from src/gpsea/analysis/predicate/genotype/_predicates.py rename to src/gpsea/analysis/predicate/_predicates.py diff --git a/src/gpsea/analysis/predicate/_variant.py b/src/gpsea/analysis/predicate/_variant.py new file mode 100644 index 000000000..a1a3a1eae --- /dev/null +++ b/src/gpsea/analysis/predicate/_variant.py @@ -0,0 +1,442 @@ +import typing + +import hpotk + +from gpsea.model import VariantClass, VariantEffect, ProteinMetadata, FeatureType +from gpsea.model.genome import Region +from ._api import VariantPredicate, AllVariantPredicate, AnyVariantPredicate +from ._predicates import ( + AlwaysTrueVariantPredicate, + ChangeLengthPredicate, + IsLargeImpreciseStructuralVariantPredicate, + ProteinFeaturePredicate, + ProteinFeatureTypePredicate, + ProteinRegionPredicate, + RefAlleleLengthPredicate, + StructuralTypePredicate, + VariantClassPredicate, + VariantEffectPredicate, + VariantExonPredicate, + VariantGenePredicate, + VariantKeyPredicate, + VariantTranscriptPredicate, +) + + +# We do not need more than just one instance of these predicates. +IS_TRANSLOCATION = VariantClassPredicate(VariantClass.TRANSLOCATION) +IS_LARGE_IMPRECISE_SV = IsLargeImpreciseStructuralVariantPredicate() + + +def true() -> VariantPredicate: + """ + The most inclusive variant predicate - returns `True` for any variant whatsoever. + """ + return AlwaysTrueVariantPredicate.get_instance() + + +def allof(predicates: typing.Iterable[VariantPredicate]) -> VariantPredicate: + """ + Prepare a :class:`VariantPredicate` that returns `True` if ALL `predicates` evaluate to `True`. + + This is useful for building compound predicates programmatically. + + **Example** + + Build a predicate to test if variant has a functional annotation to genes *SURF1* and *SURF2*: + + >>> from gpsea.analysis.predicate import allof, gene + + >>> genes = ('SURF1', 'SURF2',) + >>> predicate = allof(gene(g) for g in genes) + >>> predicate.description + '(affects SURF1 AND affects SURF2)' + + Args: + predicates: an iterable of predicates to test + """ + predicates = tuple(predicates) + if len(predicates) == 1: + # No need to wrap one predicate into a logical predicate. + return predicates[0] + else: + return AllVariantPredicate(*predicates) + + +def anyof(predicates: typing.Iterable[VariantPredicate]) -> VariantPredicate: + """ + Prepare a :class:`VariantPredicate` that returns `True` if ANY of the `predicates` evaluates to `True`. + + This can be useful for building compound predicates programmatically. 
+ + **Example** + + Build a predicate to test if variant leads to a missense + or nonsense change on a fictional transcript `NM_123456.7`: + + >>> from gpsea.model import VariantEffect + >>> from gpsea.analysis.predicate import anyof, variant_effect + + >>> tx_id = 'NM_123456.7' + >>> effects = (VariantEffect.MISSENSE_VARIANT, VariantEffect.STOP_GAINED,) + >>> predicate = anyof(variant_effect(e, tx_id) for e in effects) + >>> predicate.description + '(MISSENSE_VARIANT on NM_123456.7 OR STOP_GAINED on NM_123456.7)' + + Args: + predicates: an iterable of predicates to test + """ + predicates = tuple(predicates) + if len(predicates) == 1: + # No need to wrap one predicate into a logical predicate. + return predicates[0] + else: + return AnyVariantPredicate(*predicates) + + +def variant_effect( + effect: VariantEffect, + tx_id: str, +) -> VariantPredicate: + """ + Prepare a :class:`VariantPredicate` to test if the functional annotation predicts the variant to lead to + a certain variant effect. + + **Example** + + Make a predicate for testing if the variant leads to a missense change on transcript `NM_123.4`: + + >>> from gpsea.model import VariantEffect + >>> from gpsea.analysis.predicate import variant_effect + >>> predicate = variant_effect(VariantEffect.MISSENSE_VARIANT, tx_id='NM_123.4') + >>> predicate.description + 'MISSENSE_VARIANT on NM_123.4' + + Args: + effect: the target :class:`~gpsea.model.VariantEffect` + tx_id: a `str` with the accession ID of the target transcript (e.g. `NM_123.4`) + """ + return VariantEffectPredicate(effect, tx_id) + + +def variant_key(key: str) -> VariantPredicate: + """ + Prepare a :class:`VariantPredicate` that tests if the variant matches the provided `key`. + + Args: + key: a `str` with the variant key (e.g. `X_12345_12345_C_G` or `22_10001_20000_INV`) + """ + return VariantKeyPredicate(key) + + +def gene(symbol: str) -> VariantPredicate: + """ + Prepare a :class:`VariantPredicate` that tests if the variant affects a given gene. + + Args: + symbol: a `str` with the gene symbol (e.g. ``'FBN1'``). + """ + return VariantGenePredicate(symbol) + + +def transcript(tx_id: str) -> VariantPredicate: + """ + Prepare a :class:`VariantPredicate` that tests if the variant affects a transcript. + + Args: + tx_id: a `str` with the accession ID of the target transcript (e.g. `NM_123.4`) + """ + return VariantTranscriptPredicate(tx_id) + + +def exon( + exon: int, + tx_id: str, +) -> VariantPredicate: + """ + Prepare a :class:`VariantPredicate` that tests if the variant overlaps with an exon of a specific transcript. + + .. warning:: + + We use 1-based numbering to number the exons, + not the usual 0-based numbering of the computer science. + Therefore, the first exon of the transcript + has ``exon_number==1``, the second exon is ``2``, and so on ... + + .. warning:: + + We do not check if the `exon_number` spans + beyond the number of exons of the given `transcript_id`! + Therefore, ``exon_number==10,000`` will effectively return `False` + for *all* variants!!! 😱 + Well, at least the genome variants of the *Homo sapiens sapiens* taxon... + + Args: + exon: a positive `int` with the index of the target exon + (e.g. `1` for the 1st exon, `2` for the 2nd, ...) + tx_id: a `str` with the accession ID of the target transcript (e.g. 
`NM_123.4`) + """ + return VariantExonPredicate(exon, tx_id) + + +def protein_region( + region: typing.Union[typing.Tuple[int, int], Region], + tx_id: str, +) -> VariantPredicate: + """ + Prepare a :class:`VariantPredicate` that tests if the variant + overlaps with a region on a protein of a specific transcript. + + + Example + ------- + + Create a predicate to test if the variant overlaps with the 5th aminoacid + of the protein encoded by a fictional transcript `NM_1234.5`: + + >>> from gpsea.analysis.predicate import protein_region + >>> overlaps_with_fifth_aa = protein_region(region=(5, 5), tx_id="NM_1234.5") + >>> overlaps_with_fifth_aa.description + 'overlaps with [5,5] region of the protein encoded by NM_1234.5' + + Create a predicate to test if the variant overlaps with the first 20 aminoacid residues of the same transcript: + + >>> overlaps_with_first_20 = protein_region(region=(1, 20), tx_id="NM_1234.5") + >>> overlaps_with_first_20.description + 'overlaps with [1,20] region of the protein encoded by NM_1234.5' + + Args: + region: a :class:`~gpsea.model.genome.Region` that gives the start and end coordinate + of the region of interest on a protein strand or a tuple with 1-based coordinates. + """ + if isinstance(region, Region): + pass + elif ( + isinstance(region, tuple) + and len(region) == 2 + and all(isinstance(r, int) and r > 0 for r in region) + ): + start = region[0] - 1 # Convert to 0-based + end = region[1] + region = Region(start=start, end=end) + else: + raise ValueError( + f"region must be a `Region` or a tuple with two positive `int`s, but got {region}" + ) + + return ProteinRegionPredicate(region, tx_id) + + +def is_large_imprecise_sv() -> VariantPredicate: + """ + Prepare a :class:`VariantPredicate` for testing if the variant is a large structural variant (SV) + without exact breakpoint coordinates. + """ + return IS_LARGE_IMPRECISE_SV + + +def is_structural_variant( + threshold: int = 50, +) -> VariantPredicate: + """ + Prepare a :class:`VariantPredicate` for testing if the variant is a structural variant (SV). + + SVs are usually defined as variant affecting more than a certain number of base pairs. + The thresholds vary in the literature, but here we use 50bp as a default. + + Any variant that affects at least `threshold` base pairs is considered an SV. + Large SVs with unknown breakpoint coordinates or translocations + (:class:`~gpsea.model.VariantClass.TRANSLOCATION`) are always considered as an SV. + + Args: + threshold: a non-negative `int` with the number of base pairs that must be affected + """ + assert threshold >= 0, "`threshold` must be non-negative!" + return ( + change_length("<=", -threshold) + | change_length(">=", threshold) + | is_large_imprecise_sv() + | IS_TRANSLOCATION + ) + + +def structural_type( + curie: typing.Union[str, hpotk.TermId], +) -> VariantPredicate: + """ + Prepare a :class:`VariantPredicate` for testing if the variant has a certain structural type. + + We recommend using a descendant of `structural_variant` + (`SO:0001537 `_) + as the structural type. + + **Example** + + Make a predicate for testing if the variant is a chromosomal deletion (`SO:1000029`): + + >>> from gpsea.analysis.predicate import structural_type + >>> predicate = structural_type('SO:1000029') + >>> predicate.description + 'structural type is SO:1000029' + + Args: + curie: compact uniform resource identifier (CURIE) with the structural type to test. 
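The builders that do not carry a doctest of their own (`variant_key`, `gene`, `transcript`, `exon`) follow the same pattern. Judging from the `allof` example above, the gene predicate, for instance, describes itself as follows (`FBN1` is an arbitrary symbol):

>>> from gpsea.analysis.predicate import gene
>>> gene("FBN1").description
'affects FBN1'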
+ """ + return StructuralTypePredicate.from_curie(curie) + + +def variant_class( + variant_class: VariantClass, +) -> VariantPredicate: + """ + Prepare a :class:`VariantPredicate` for testing if the variant + is of a certain :class:`~gpsea.model.VariantClass`. + + **Example** + + Make a predicate to test if the variant is a deletion: + + >>> from gpsea.model import VariantClass + >>> from gpsea.analysis.predicate import variant_class + >>> predicate = variant_class(VariantClass.DEL) + >>> predicate.description + 'variant class is DEL' + + Args: + variant_class: the variant class to test. + """ + return VariantClassPredicate( + query=variant_class, + ) + + +def ref_length( + operator: typing.Literal["<", "<=", "==", "!=", ">=", ">"], + length: int, +) -> VariantPredicate: + """ + Prepare a :class:`VariantPredicate` for testing if the reference (REF) allele + of variant is above, below, or (not) equal to certain `length`. + + .. seealso:: + + See :ref:`length-of-the-reference-allele` for more info. + + **Example** + + Prepare a predicate that tests that the REF allele includes more than 5 base pairs: + + >>> from gpsea.analysis.predicate import ref_length + >>> predicate = ref_length('>', 5) + >>> predicate.description + 'reference allele length > 5' + + Args: + operator: a `str` with the desired test. Must be one of ``{ '<', '<=', '==', '!=', '>=', '>' }``. + length: a non-negative `int` with the length threshold. + """ + return RefAlleleLengthPredicate(operator, length) + + +def change_length( + operator: typing.Literal["<", "<=", "==", "!=", ">=", ">"], + threshold: int, +) -> VariantPredicate: + """ + Prepare a :class:`VariantPredicate` for testing if the variant's change length + is above, below, or (not) equal to certain `threshold`. + + .. seealso:: + + See :ref:`change-length-of-an-allele` for more info. + + **Example** + + Make a predicate for testing if the change length is less than or equal to `-10`, + e.g. to test if a variant is a *deletion* leading to removal of at least 10 base pairs: + + >>> from gpsea.analysis.predicate import change_length + >>> predicate = change_length('<=', -10) + >>> predicate.description + 'change length <= -10' + + Args: + operator: a `str` with the desired test. Must be one of ``{ '<', '<=', '==', '!=', '>=', '>' }``. + threshold: an `int` with the threshold. Can be negative, zero, or positive. + """ + return ChangeLengthPredicate(operator, threshold) + + +def is_structural_deletion( + threshold: int = -50, +) -> VariantPredicate: + """ + Prepare a :class:`VariantPredicate` for testing if the variant + is a `chromosomal deletion `_ or a structural variant deletion + that leads to removal of at least *n* base pairs (50bp by default). + + .. note:: + + The predicate uses :meth:`~gpsea.model.VariantCoordinates.change_length` + to determine if the length of the variant is above or below `threshold`. + + **IMPORTANT**: the change lengths of deletions are *negative*, since the alternate allele + is shorter than the reference allele. See :ref:`change-length-of-an-allele` for more info. 
+ + **Example** + + Prepare a predicate for testing if the variant is a chromosomal deletion that removes at least 20 base pairs: + + >>> from gpsea.analysis.predicate import is_structural_deletion + >>> predicate = is_structural_deletion(-20) + >>> predicate.description + '(structural type is SO:1000029 OR (variant class is DEL AND change length <= -20))' + + Args: + threshold: an `int` with the change length threshold to determine if a variant is "structural" + (-50 bp by default). + """ + chromosomal_deletion = "SO:1000029" + return structural_type(chromosomal_deletion) | ( + variant_class(VariantClass.DEL) & change_length("<=", threshold) + ) + + +def protein_feature_type( + feature_type: typing.Union[FeatureType, str], + protein_metadata: ProteinMetadata, +) -> VariantPredicate: + """ + Prepare a :class:`~gpsea.analysis.predicate.VariantPredicate` + to test if the variant affects a `feature_type` of a protein. + + Args: + feature_type: the target protein :class:`~gpsea.model.FeatureType` + (e.g. :class:`~gpsea.model.FeatureType.DOMAIN`). + protein_metadata: the information about the protein. + """ + if isinstance(feature_type, FeatureType): + FeatureType.deprecation_warning() + feature_type = feature_type.name + return ProteinFeatureTypePredicate( + feature_type=feature_type, + protein_metadata=protein_metadata, + ) + + +def protein_feature( + feature_id: str, + protein_metadata: ProteinMetadata, +) -> VariantPredicate: + """ + Prepare a :class:`VariantPredicate` to test if the variant affects a protein feature + labeled with the provided `feature_id`. + + Args: + feature_id: the id of the target protein feature (e.g. `ANK 1`) + protein_metadata: the information about the protein. + """ + return ProteinFeaturePredicate( + feature_id=feature_id, + protein_metadata=protein_metadata, + ) diff --git a/src/gpsea/analysis/predicate/genotype/__init__.py b/src/gpsea/analysis/predicate/genotype/__init__.py deleted file mode 100644 index c35fcaa5e..000000000 --- a/src/gpsea/analysis/predicate/genotype/__init__.py +++ /dev/null @@ -1,16 +0,0 @@ -from ._api import GenotypePolyPredicate -from ._api import VariantPredicate -from ._counter import AlleleCounter -from ._gt_predicates import sex_predicate, diagnosis_predicate -from ._gt_predicates import monoallelic_predicate, biallelic_predicate -from ._gt_predicates import allele_count -from ._variant import VariantPredicates - -__all__ = [ - 'GenotypePolyPredicate', - 'sex_predicate', 'diagnosis_predicate', - 'monoallelic_predicate', 'biallelic_predicate', - 'allele_count', - 'AlleleCounter', 'VariantPredicate', - 'VariantPredicates', -] diff --git a/src/gpsea/analysis/predicate/genotype/_api.py b/src/gpsea/analysis/predicate/genotype/_api.py deleted file mode 100644 index 9d11072fc..000000000 --- a/src/gpsea/analysis/predicate/genotype/_api.py +++ /dev/null @@ -1,226 +0,0 @@ -import abc -import warnings -import typing - -from gpsea.model import Variant -from .._api import PolyPredicate, Categorization -from ..._partition import Partitioning - - -class GenotypePolyPredicate(PolyPredicate[Categorization], metaclass=abc.ABCMeta): - """ - `GenotypePolyPredicate` is a base class for all :class:`~gpsea.analysis.predicate.PolyPredicate` - that assign an individual into a group based on the genotype. - """ - pass - - -class VariantPredicate(Partitioning, metaclass=abc.ABCMeta): - """ - `VariantPredicate` tests if a variant meets a certain criterion. 
- - The subclasses *MUST* implement all abstract methods of this class - *plus* ``__eq__`` and ``__hash__``, to support building the compound predicates. - - We *strongly* recommend implementing ``__str__`` and ``__repr__`` as well. - """ - - def get_question(self) -> str: - """ - Prepare a `str` with the question the predicate can answer. - """ - # TODO: remove in `v1.0.0` - warnings.warn( - "`get_question` will be removed soon. Use `description` property instead", - DeprecationWarning, stacklevel=2, - ) - return self.description - - @abc.abstractmethod - def test(self, variant: Variant) -> bool: - """ - Test if the `variant` meets a criterion. - - Args: - variant: an instance of :class:`~gpsea.model.Variant` to test. - - Returns: - bool: `True` if the variant meets the criterion and `False` otherwise. - """ - pass - - def __and__(self, other): - """ - Create a variant predicate which passes if *BOTH* `self` and `other` pass. - """ - if isinstance(other, VariantPredicate): - if self == other: - return self - - if isinstance(self, AllVariantPredicate) and isinstance(other, AllVariantPredicate): - # Merging two *all* variant predicates is equivalent - # to chaining their predicates - return AllVariantPredicate(*self.predicates, *other.predicates) - else: - return AllVariantPredicate(self, other) - else: - return NotImplemented - - def __or__(self, other): - """ - Create a variant predicate which passes if *EITHER* `self` *OR* `other` passes. - """ - if isinstance(other, VariantPredicate): - if self == other: - return self - - if isinstance(self, AnyVariantPredicate) and isinstance(other, AnyVariantPredicate): - # Merging two any variant predicates is equivalent - # to chaining their predicates - return AnyVariantPredicate(*self.predicates, *other.predicates) - else: - return AnyVariantPredicate(self, other) - else: - return NotImplemented - - def __invert__(self): - """ - Create a variant predicate that passes if the underlying predicate fails. 
- """ - if isinstance(self, InvVariantPredicate): - # Unwrap to prevent double negation - return self._inner - else: - return InvVariantPredicate(self) - - -class LogicalVariantPredicate(VariantPredicate, metaclass=abc.ABCMeta): - # NOT PART OF THE PUBLIC API - - def __init__( - self, - *predicates, - ): - if len(predicates) == 0: - raise ValueError('Predicates must not be empty!') - self._predicates = tuple(predicates) - - @property - def predicates(self) -> typing.Sequence[VariantPredicate]: - return self._predicates - - @property - def name(self) -> str: - sep = f" {self._separator_symbol()} " - return "(" + sep.join(p.name for p in self._predicates) + ")" - - @property - def description(self) -> str: - sep = f" {self._separator_word().upper()} " - return "(" + sep.join(p.description for p in self._predicates) + ")" - - @property - def variable_name(self) -> str: - sep = f" {self._separator_symbol()} " - return "(" + sep.join(p.variable_name for p in self._predicates) + ")" - - @abc.abstractmethod - def _separator_symbol(self) -> str: - pass - - @abc.abstractmethod - def _separator_word(self) -> str: - pass - - -class AnyVariantPredicate(LogicalVariantPredicate): - # NOT PART OF THE PUBLIC API - - def _separator_symbol(self) -> str: - return "|" - - def _separator_word(self) -> str: - return "or" - - def test(self, variant: Variant) -> bool: - return any(predicate.test(variant) for predicate in self._predicates) - - def __eq__(self, value: object) -> bool: - if isinstance(value, AnyVariantPredicate): - return self._predicates == value._predicates - return False - - def __hash__(self) -> int: - return 17 * hash(self._predicates) - - def __str__(self) -> str: - return '(' + ' OR '.join(str(p) for p in self._predicates) + ')' - - def __repr__(self) -> str: - return f'AnyVariantPredicate(predicates={self._predicates})' - - -class AllVariantPredicate(LogicalVariantPredicate): - # NOT PART OF THE PUBLIC API - - def _separator_symbol(self) -> str: - return "&" - - def _separator_word(self) -> str: - return "and" - - def test(self, variant: Variant) -> bool: - return all(predicate.test(variant) for predicate in self._predicates) - - def __eq__(self, value: object) -> bool: - if isinstance(value, AllVariantPredicate): - return self._predicates == value._predicates - return False - - def __hash__(self) -> int: - return 31 * hash(self._predicates) - - def __str__(self) -> str: - return '(' + ' AND '.join(str(p) for p in self._predicates) + ')' - - def __repr__(self) -> str: - return f'AllVariantPredicate(predicates={self._predicates})' - - -class InvVariantPredicate(VariantPredicate): - # NOT PART OF THE PUBLIC API - - def __init__( - self, - inner: VariantPredicate, - ): - self._inner = inner - - @property - def name(self) -> str: - return "NOT " + self._inner.name - - @property - def description(self) -> str: - return "NOT " + self._inner.description - - @property - def variable_name(self) -> str: - return "NOT " + self._inner.variable_name - - def test(self, variant: Variant) -> bool: - return not self._inner.test(variant) - - def __eq__(self, value: object) -> bool: - if isinstance(value, InvVariantPredicate): - return self._inner == value._inner - return False - - def __hash__(self) -> int: - return -hash(self._inner) - - def __str__(self) -> str: - return f'NOT {self._inner}' - - def __repr__(self) -> str: - return f'NotVariantPredicate(inner={self._inner})' diff --git a/src/gpsea/analysis/predicate/genotype/_variant.py b/src/gpsea/analysis/predicate/genotype/_variant.py deleted file mode 
100644 index 0a39d170b..000000000 --- a/src/gpsea/analysis/predicate/genotype/_variant.py +++ /dev/null @@ -1,445 +0,0 @@ -import typing - -import hpotk - -from gpsea.model import VariantClass, VariantEffect, ProteinMetadata, FeatureType -from gpsea.model.genome import Region -from ._api import VariantPredicate, AllVariantPredicate, AnyVariantPredicate -from ._predicates import ( - AlwaysTrueVariantPredicate, - ChangeLengthPredicate, - IsLargeImpreciseStructuralVariantPredicate, - ProteinFeaturePredicate, - ProteinFeatureTypePredicate, - ProteinRegionPredicate, - RefAlleleLengthPredicate, - StructuralTypePredicate, - VariantClassPredicate, - VariantEffectPredicate, - VariantExonPredicate, - VariantGenePredicate, - VariantKeyPredicate, - VariantTranscriptPredicate, -) - - -# We do not need more than just one instance of these predicates. -IS_TRANSLOCATION = VariantClassPredicate(VariantClass.TRANSLOCATION) -IS_LARGE_IMPRECISE_SV = IsLargeImpreciseStructuralVariantPredicate() - - -class VariantPredicates: - """ - `VariantPredicates` is a static utility class to provide the variant predicates - that are relatively simple to configure. - """ - - @staticmethod - def true() -> VariantPredicate: - """ - Prepare an absolutely inclusive :class:`VariantPredicate` - a predicate that returns `True` - for any variant whatsoever. - """ - return AlwaysTrueVariantPredicate.get_instance() - - @staticmethod - def all(predicates: typing.Iterable[VariantPredicate]) -> VariantPredicate: - """ - Prepare a :class:`VariantPredicate` that returns `True` if ALL `predicates` evaluate to `True`. - - This is useful for building compound predicates programmatically. - - **Example** - - Build a predicate to test if variant has a functional annotation to genes *SURF1* and *SURF2*: - - >>> from gpsea.analysis.predicate.genotype import VariantPredicates - - >>> genes = ('SURF1', 'SURF2',) - >>> predicate = VariantPredicates.all(VariantPredicates.gene(g) for g in genes) - >>> predicate.description - '(affects SURF1 AND affects SURF2)' - - Args: - predicates: an iterable of predicates to test - """ - predicates = tuple(predicates) - if len(predicates) == 1: - # No need to wrap one predicate into a logical predicate. - return predicates[0] - else: - return AllVariantPredicate(*predicates) - - @staticmethod - def any(predicates: typing.Iterable[VariantPredicate]) -> VariantPredicate: - """ - Prepare a :class:`VariantPredicate` that returns `True` if ANY of the `predicates` evaluates to `True`. - - This can be useful for building compound predicates programmatically. - - **Example** - - Build a predicate to test if variant leads to a missense - or nonsense change on a fictional transcript `NM_123456.7`: - - >>> from gpsea.model import VariantEffect - >>> from gpsea.analysis.predicate.genotype import VariantPredicates - - >>> tx_id = 'NM_123456.7' - >>> effects = (VariantEffect.MISSENSE_VARIANT, VariantEffect.STOP_GAINED,) - >>> predicate = VariantPredicates.any(VariantPredicates.variant_effect(e, tx_id) for e in effects) - >>> predicate.description - '(MISSENSE_VARIANT on NM_123456.7 OR STOP_GAINED on NM_123456.7)' - - Args: - predicates: an iterable of predicates to test - """ - predicates = tuple(predicates) - if len(predicates) == 1: - # No need to wrap one predicate into a logical predicate. 
- return predicates[0] - else: - return AnyVariantPredicate(*predicates) - - @staticmethod - def variant_effect( - effect: VariantEffect, - tx_id: str, - ) -> VariantPredicate: - """ - Prepare a :class:`VariantPredicate` to test if the functional annotation predicts the variant to lead to - a certain variant effect. - - **Example** - - Make a predicate for testing if the variant leads to a missense change on transcript `NM_123.4`: - - >>> from gpsea.model import VariantEffect - >>> from gpsea.analysis.predicate.genotype import VariantPredicates - >>> predicate = VariantPredicates.variant_effect(VariantEffect.MISSENSE_VARIANT, tx_id='NM_123.4') - >>> predicate.description - 'MISSENSE_VARIANT on NM_123.4' - - Args: - effect: the target :class:`~gpsea.model.VariantEffect` - tx_id: a `str` with the accession ID of the target transcript (e.g. `NM_123.4`) - """ - return VariantEffectPredicate(effect, tx_id) - - @staticmethod - def variant_key(key: str) -> VariantPredicate: - """ - Prepare a :class:`VariantPredicate` that tests if the variant matches the provided `key`. - - Args: - key: a `str` with the variant key (e.g. `X_12345_12345_C_G` or `22_10001_20000_INV`) - """ - return VariantKeyPredicate(key) - - @staticmethod - def gene(symbol: str) -> VariantPredicate: - """ - Prepare a :class:`VariantPredicate` that tests if the variant affects a given gene. - - Args: - symbol: a `str` with the gene symbol (e.g. ``'FBN1'``). - """ - return VariantGenePredicate(symbol) - - @staticmethod - def transcript(tx_id: str) -> VariantPredicate: - """ - Prepare a :class:`VariantPredicate` that tests if the variant affects a transcript. - - Args: - tx_id: a `str` with the accession ID of the target transcript (e.g. `NM_123.4`) - """ - return VariantTranscriptPredicate(tx_id) - - @staticmethod - def exon( - exon: int, - tx_id: str, - ) -> VariantPredicate: - """ - Prepare a :class:`VariantPredicate` that tests if the variant overlaps with an exon of a specific transcript. - - .. warning:: - - We use 1-based numbering to number the exons, - not the usual 0-based numbering of the computer science. - Therefore, the first exon of the transcript - has ``exon_number==1``, the second exon is ``2``, and so on ... - - .. warning:: - - We do not check if the `exon_number` spans - beyond the number of exons of the given `transcript_id`! - Therefore, ``exon_number==10,000`` will effectively return `False` - for *all* variants!!! 😱 - Well, at least the genome variants of the *Homo sapiens sapiens* taxon... - - Args: - exon: a positive `int` with the index of the target exon - (e.g. `1` for the 1st exon, `2` for the 2nd, ...) - tx_id: a `str` with the accession ID of the target transcript (e.g. `NM_123.4`) - """ - return VariantExonPredicate(exon, tx_id) - - @staticmethod - def region( - region: typing.Union[typing.Tuple[int, int], Region], - tx_id: str, - ) -> VariantPredicate: - """ - Prepare a :class:`VariantPredicate` that tests if the variant - overlaps with a region on a protein of a specific transcript. 
- - - Example - ------- - - Create a predicate to test if the variant overlaps with the 5th aminoacid - of the protein encoded by a fictional transcript `NM_1234.5`: - - >>> from gpsea.analysis.predicate.genotype import VariantPredicates - >>> overlaps_with_fifth_aa = VariantPredicates.region(region=(5, 5), tx_id="NM_1234.5") - >>> overlaps_with_fifth_aa.description - 'overlaps with [5,5] region of the protein encoded by NM_1234.5' - - Create a predicate to test if the variant Overlaps with the first 20 aminoacid residues of the same transcript: - - >>> overlaps_with_first_20 = VariantPredicates.region(region=(1, 20), tx_id="NM_1234.5") - >>> overlaps_with_first_20.description - 'overlaps with [1,20] region of the protein encoded by NM_1234.5' - - Args: - region: a :class:`~gpsea.model.genome.Region` that gives the start and end coordinate - of the region of interest on a protein strand or a tuple with 1-based coordinates. - """ - if isinstance(region, Region): - pass - elif isinstance(region, tuple) and len(region) == 2 and all(isinstance(c, int) and c > 0 for c in region): - start = region[0] - 1 # Convert to 0-based - end = region[1] - region = Region(start=start, end=end) - else: - raise ValueError(f'region must be a `Region` or a tuple with two positive `int`s, but got {region}') - - return ProteinRegionPredicate(region, tx_id) - - @staticmethod - def is_large_imprecise_sv() -> VariantPredicate: - """ - Prepare a :class:`VariantPredicate` for testing if the variant is a large structural variant (SV) - without exact breakpoint coordinates. - """ - return IS_LARGE_IMPRECISE_SV - - @staticmethod - def is_structural_variant( - threshold: int = 50, - ) -> VariantPredicate: - """ - Prepare a :class:`VariantPredicate` for testing if the variant is a structural variant (SV). - - SVs are usually defined as variant affecting more than a certain number of base pairs. - The thresholds vary in the literature, but here we use 50bp as a default. - - Any variant that affects at least `threshold` base pairs is considered an SV. - Large SVs with unknown breakpoint coordinates or translocations - (:class:`~gpsea.model.VariantClass.TRANSLOCATION`) are always considered as an SV. - - Args: - threshold: a non-negative `int` with the number of base pairs that must be affected - """ - assert threshold >= 0, "`threshold` must be non-negative!" - return ( - VariantPredicates.change_length("<=", -threshold) - | VariantPredicates.change_length(">=", threshold) - | VariantPredicates.is_large_imprecise_sv() - | IS_TRANSLOCATION - ) - - @staticmethod - def structural_type( - curie: typing.Union[str, hpotk.TermId], - ) -> VariantPredicate: - """ - Prepare a :class:`VariantPredicate` for testing if the variant has a certain structural type. - - We recommend using a descendant of `structural_variant` - (`SO:0001537 `_) - as the structural type. - - **Example** - - Make a predicate for testing if the variant is a chromosomal deletion (`SO:1000029`): - - >>> from gpsea.analysis.predicate.genotype import VariantPredicates - >>> predicate = VariantPredicates.structural_type('SO:1000029') - >>> predicate.description - 'structural type is SO:1000029' - - Args: - curie: compact uniform resource identifier (CURIE) with the structural type to test. - """ - return StructuralTypePredicate.from_curie(curie) - - @staticmethod - def variant_class( - variant_class: VariantClass, - ) -> VariantPredicate: - """ - Prepare a :class:`VariantPredicate` for testing if the variant - is of a certain :class:`~gpsea.model.VariantClass`. 
- - **Example** - - Make a predicate to test if the variant is a deletion: - - >>> from gpsea.model import VariantClass - >>> from gpsea.analysis.predicate.genotype import VariantPredicates - >>> predicate = VariantPredicates.variant_class(VariantClass.DEL) - >>> predicate.description - 'variant class is DEL' - - Args: - variant_class: the variant class to test. - """ - return VariantClassPredicate( - query=variant_class, - ) - - @staticmethod - def ref_length( - operator: typing.Literal["<", "<=", "==", "!=", ">=", ">"], - length: int, - ) -> VariantPredicate: - """ - Prepare a :class:`VariantPredicate` for testing if the reference (REF) allele - of variant is above, below, or (not) equal to certain `length`. - - .. seealso:: - - See :ref:`length-of-the-reference-allele` for more info. - - **Example** - - Prepare a predicate that tests that the REF allele includes more than 5 base pairs: - - >>> from gpsea.analysis.predicate.genotype import VariantPredicates - >>> predicate = VariantPredicates.ref_length('>', 5) - >>> predicate.description - 'reference allele length > 5' - - Args: - operator: a `str` with the desired test. Must be one of ``{ '<', '<=', '==', '!=', '>=', '>' }``. - length: a non-negative `int` with the length threshold. - """ - return RefAlleleLengthPredicate(operator, length) - - @staticmethod - def change_length( - operator: typing.Literal["<", "<=", "==", "!=", ">=", ">"], - threshold: int, - ) -> VariantPredicate: - """ - Prepare a :class:`VariantPredicate` for testing if the variant's change length - is above, below, or (not) equal to certain `threshold`. - - .. seealso:: - - See :ref:`change-length-of-an-allele` for more info. - - **Example** - - Make a predicate for testing if the change length is less than or equal to `-10`, - e.g. to test if a variant is a *deletion* leading to removal of at least 10 base pairs: - - >>> from gpsea.analysis.predicate.genotype import VariantPredicates - >>> predicate = VariantPredicates.change_length('<=', -10) - >>> predicate.description - 'change length <= -10' - - Args: - operator: a `str` with the desired test. Must be one of ``{ '<', '<=', '==', '!=', '>=', '>' }``. - threshold: an `int` with the threshold. Can be negative, zero, or positive. - """ - return ChangeLengthPredicate(operator, threshold) - - @staticmethod - def is_structural_deletion( - threshold: int = -50, - ) -> VariantPredicate: - """ - Prepare a :class:`VariantPredicate` for testing if the variant - is a `chromosomal deletion `_ or a structural variant deletion - that leads to removal of at least *n* base pairs (50bp by default). - - .. note:: - - The predicate uses :meth:`~gpsea.model.VariantCoordinates.change_length` - to determine if the length of the variant is above or below `threshold`. - - **IMPORTANT**: the change lengths of deletions are *negative*, since the alternate allele - is shorter than the reference allele. See :ref:`change-length-of-an-allele` for more info. - - **Example** - - Prepare a predicate for testing if the variant is a chromosomal deletion that removes at least 20 base pairs: - - >>> from gpsea.analysis.predicate.genotype import VariantPredicates - >>> predicate = VariantPredicates.is_structural_deletion(-20) - >>> predicate.description - '(structural type is SO:1000029 OR (variant class is DEL AND change length <= -20))' - - Args: - threshold: an `int` with the change length threshold to determine if a variant is "structural" - (-50 bp by default). 
- """ - chromosomal_deletion = "SO:1000029" - return VariantPredicates.structural_type(chromosomal_deletion) | ( - VariantPredicates.variant_class(VariantClass.DEL) - & VariantPredicates.change_length("<=", threshold) - ) - - @staticmethod - def protein_feature_type( - feature_type: typing.Union[FeatureType, str], - protein_metadata: ProteinMetadata, - ) -> VariantPredicate: - """ - Prepare a :class:`~gpsea.analysis.predicate.genotype.VariantPredicate` - to test if the variant affects a `feature_type` of a protein. - - Args: - feature_type: the target protein :class:`~gpsea.model.FeatureType` - (e.g. :class:`~gpsea.model.FeatureType.DOMAIN`). - protein_metadata: the information about the protein. - """ - if isinstance(feature_type, FeatureType): - FeatureType.deprecation_warning() - feature_type = feature_type.name - return ProteinFeatureTypePredicate( - feature_type=feature_type, - protein_metadata=protein_metadata, - ) - - @staticmethod - def protein_feature( - feature_id: str, - protein_metadata: ProteinMetadata, - ) -> VariantPredicate: - """ - Prepare a :class:`VariantPredicate` to test if the variant affects a protein feature - labeled with the provided `feature_id`. - - Args: - feature_id: the id of the target protein feature (e.g. `ANK 1`) - protein_metadata: the information about the protein. - """ - return ProteinFeaturePredicate( - feature_id=feature_id, - protein_metadata=protein_metadata, - ) diff --git a/src/gpsea/analysis/predicate/phenotype/__init__.py b/src/gpsea/analysis/predicate/phenotype/__init__.py deleted file mode 100644 index b322ca9db..000000000 --- a/src/gpsea/analysis/predicate/phenotype/__init__.py +++ /dev/null @@ -1,19 +0,0 @@ -""" -The `gpsea.analysis.predicate.phenotype` package provides the :class:`PhenotypePolyPredicate` -for assigning :class:`~gpsea.model.Patient` into a phenotype group. - -An individual can be assigned based on presence/absence of a disease diagnosis (:class:`DiseasePresencePredicate`) -or using the phenotype features encoded into HPO terms (:class:`~gpsea.analysis.predicate.phenotype.HpoPredicate`). -""" - -from ._pheno import PhenotypePolyPredicate, HpoPredicate -from ._pheno import DiseasePresencePredicate -from ._pheno import PhenotypeCategorization, P -from ._util import prepare_predicates_for_terms_of_interest, prepare_hpo_terms_of_interest - -__all__ = [ - 'PhenotypePolyPredicate', 'HpoPredicate', - 'DiseasePresencePredicate', - 'PhenotypeCategorization', 'P', - 'prepare_predicates_for_terms_of_interest', 'prepare_hpo_terms_of_interest', -] diff --git a/src/gpsea/analysis/predicate/phenotype/_pheno.py b/src/gpsea/analysis/predicate/phenotype/_pheno.py deleted file mode 100644 index 08d34a7bc..000000000 --- a/src/gpsea/analysis/predicate/phenotype/_pheno.py +++ /dev/null @@ -1,316 +0,0 @@ -import abc -import typing - -import hpotk -from hpotk.util import validate_instance - -from gpsea.model import Patient - -from .._api import PolyPredicate, PatientCategory, Categorization - -P = typing.TypeVar("P") -""" -Phenotype entity of interest, such as :class:`~hpotk.model.TermId`, representing an HPO term or an OMIM/MONDO term. - -However, phenotype entity can be anything as long as it is :class:`~typing.Hashable` and comparable -(have `__eq__` and `__lt__` magic methods). -""" - -YES = PatientCategory(1, 'Yes', 'The patient belongs to the group.') -""" -Category for a patient who *belongs* to the tested group. 
-""" - -NO = PatientCategory(0, 'No', 'The patient does not belong to the group.') -""" -Category for a patient who does *not* belong to the tested group. -""" - - -class PhenotypeCategorization(typing.Generic[P], Categorization): - """ - On top of the attributes of the `Categorization`, `PhenotypeCategorization` keeps track of the target phenotype `P`. - """ - - def __init__( - self, - category: PatientCategory, - phenotype: P, - ): - super().__init__(category) - self._phenotype = phenotype - - @property - def phenotype(self) -> P: - return self._phenotype - - def __repr__(self) -> str: - return ( - "PhenotypeCategorization(" - f"category={self._category}, " - f"phenotype={self._phenotype})" - ) - - def __str__(self) -> str: - return repr(self) - - def __eq__(self, other) -> bool: - return isinstance(other, PhenotypeCategorization) \ - and self._category == other._category \ - and self._phenotype == other._phenotype - - def __hash__(self) -> int: - return hash((self._category, self._phenotype)) - - -class PhenotypePolyPredicate( - typing.Generic[P], PolyPredicate[PhenotypeCategorization[P]], metaclass=abc.ABCMeta -): - """ - `PhenotypePolyPredicate` is a base class for all :class:`~gpsea.analysis.predicate.PolyPredicate` - that assigns an individual into a group based on phenotype. - The predicate assigns an individual into one of phenotype categories `P`. - - Each phenotype category `P` can be a :class:`~hpotk.model.TermId` representing an HPO term or an OMIM/MONDO term. - - Only one category can be investigated, and :attr:`phenotype` returns the investigated phenotype - (e.g. *Arachnodactyly* `HP:0001166`). - - As another hallmark of this predicate, one of the categorizations must correspond to the group of patients - who exibit the investigated phenotype. The categorization is provided - via :attr:`present_phenotype_categorization` property. - """ - - @property - @abc.abstractmethod - def phenotype(self) -> P: - """ - Get the phenotype entity of interest. - """ - pass - - @property - @abc.abstractmethod - def present_phenotype_categorization(self) -> PhenotypeCategorization[P]: - """ - Get the categorization which represents the group of the patients who exibit the investigated phenotype. - """ - pass - - @property - def present_phenotype_category(self) -> PatientCategory: - """ - Get the patient category that correspond to the group of the patients who exibit the investigated phenotype. - """ - return self.present_phenotype_categorization.category - - -class HpoPredicate(PhenotypePolyPredicate[hpotk.TermId]): - """ - `HpoPredicate` tests if a patient is annotated with an HPO term. - - Note, `query` must be a term of the provided `hpo`! - - See :ref:`hpo-predicate` section for an example usage. - - :param hpo: HPO ontology - :param query: the HPO term to test - :param missing_implies_phenotype_excluded: `True` if lack of an explicit annotation implies term's absence`. 
- """ - - def __init__( - self, - hpo: hpotk.MinimalOntology, - query: hpotk.TermId, - missing_implies_phenotype_excluded: bool = False, - ): - self._hpo = validate_instance(hpo, hpotk.MinimalOntology, "hpo") - self._query = validate_instance( - query, hpotk.TermId, "phenotypic_feature" - ) - self._query_label = self._hpo.get_term_name(query) - assert self._query_label is not None, f"Query {query} is in HPO" - self._missing_implies_phenotype_excluded = validate_instance( - missing_implies_phenotype_excluded, - bool, - "missing_implies_phenotype_excluded", - ) - self._phenotype_observed = PhenotypeCategorization( - category=YES, - phenotype=self._query, - ) - self._phenotype_excluded = PhenotypeCategorization( - category=NO, - phenotype=self._query, - ) - # Some tests depend on the order of `self._categorizations`. - self._categorizations = (self._phenotype_observed, self._phenotype_excluded) - - @property - def name(self) -> str: - return "HPO Predicate" - - @property - def description(self) -> str: - return f"Test for presence of {self._query_label} [{self._query.value}]" - - @property - def variable_name(self) -> str: - return self._query.value - - @property - def phenotype(self) -> hpotk.TermId: - return self._query - - @property - def present_phenotype_categorization(self) -> PhenotypeCategorization[hpotk.TermId]: - return self._phenotype_observed - - def get_categorizations( - self, - ) -> typing.Sequence[PhenotypeCategorization[hpotk.TermId]]: - return self._categorizations - - def test( - self, patient: Patient - ) -> typing.Optional[PhenotypeCategorization[hpotk.TermId]]: - """An HPO TermID is given when initializing the class. - Given a Patient class, this function tests whether the patient has the - given phenotype. - - Args: - patient (Patient): A Patient class representing a patient. - - Returns: - typing.Optional[PhenotypeCategorization[P]]: PhenotypeCategorization, - either "YES" if the phenotype - is listed and is not excluded, or - "NO" if the phenotype is listed and excluded, - otherwise will return None. - Unless _missing_implies_phenotype_excluded is True, then - will return "NO" if the phenotype is listed and excluded - or not listed. - """ - self._check_patient(patient) - - if len(patient.phenotypes) == 0: - return None - - for phenotype in patient.phenotypes: - if phenotype.is_present: - if self._query == phenotype.identifier or any( - self._query == anc - for anc in self._hpo.graph.get_ancestors(phenotype) - ): - return self._phenotype_observed - else: - if self._missing_implies_phenotype_excluded: - return self._phenotype_excluded - else: - if phenotype.identifier == self._query or any( - phenotype.identifier == anc - for anc in self._hpo.graph.get_ancestors(self._query) - ): - return self._phenotype_excluded - - return None - - def __eq__(self, value: object) -> bool: - return isinstance(value, HpoPredicate) \ - and self._hpo.version == value._hpo.version \ - and self._query == value._query \ - and self._missing_implies_phenotype_excluded == value._missing_implies_phenotype_excluded - - def __hash__(self) -> int: - return hash((self._hpo.version, self._query, self._missing_implies_phenotype_excluded)) - - def __repr__(self): - return f"HpoPredicate(query={self._query})" - - -class DiseasePresencePredicate(PhenotypePolyPredicate[hpotk.TermId]): - """ - `DiseasePresencePredicate` tests if an individual was diagnosed with a disease. - - :param disease_id_query: a disease identifier formatted either as a CURIE `str` (e.g. 
``OMIM:256000``) - or as a :class:`~hpotk.TermId`. - """ - - def __init__( - self, - disease_id_query: typing.Union[str, hpotk.TermId], - ): - if isinstance(disease_id_query, str): - self._query = hpotk.TermId.from_curie(disease_id_query) - elif isinstance(disease_id_query, hpotk.TermId): - self._query = disease_id_query - else: - raise AssertionError - - self._diagnosis_present = PhenotypeCategorization( - category=YES, - phenotype=disease_id_query, - ) - self._diagnosis_excluded = PhenotypeCategorization( - category=NO, - phenotype=disease_id_query, - ) - - @property - def name(self) -> str: - return "Disease Predicate" - - @property - def description(self) -> str: - return f"Partition based on a diagnosis of {self._query.value}" - - @property - def variable_name(self) -> str: - return self._query.value - - @property - def phenotype(self) -> hpotk.TermId: - return self._query - - @property - def present_phenotype_categorization(self) -> PhenotypeCategorization[hpotk.TermId]: - return self._diagnosis_present - - def get_categorizations( - self, - ) -> typing.Sequence[PhenotypeCategorization[hpotk.TermId]]: - return self._diagnosis_present, self._diagnosis_excluded - - def test( - self, patient: Patient - ) -> typing.Optional[PhenotypeCategorization[hpotk.TermId]]: - """ - Test if the `patient` was diagnosed with a disease. - - Args: - patient (Patient): a `patient` instance to be tested. - - Returns: - typing.Optional[PhenotypeCategorization[P]]: PhenotypeCategorization, - either "YES" if the phenotype - is listed and is not excluded, or - "NO" if the disease is not listed - or is excluded. - """ - self._check_patient(patient) - - for dis in patient.diseases: - if dis.is_present and dis.identifier == self._query: - return self._diagnosis_present - - return self._diagnosis_excluded - - def __eq__(self, value: object) -> bool: - return isinstance(value, DiseasePresencePredicate) \ - and self._query == value._query - - def __hash__(self) -> int: - return hash((self._query,)) - - def __repr__(self): - return f"DiseasePresencePredicate(query={self._query})" diff --git a/src/gpsea/analysis/pscore/_api.py b/src/gpsea/analysis/pscore/_api.py index 8c622ed20..72049f04e 100644 --- a/src/gpsea/analysis/pscore/_api.py +++ b/src/gpsea/analysis/pscore/_api.py @@ -4,7 +4,7 @@ import pandas as pd from gpsea.model import Patient -from ..predicate.genotype import GenotypePolyPredicate +from ..clf import GenotypeClassifier from .stats import PhenotypeScoreStatistic from .._base import MonoPhenotypeAnalysisResult, Statistic, StatisticResult @@ -59,6 +59,7 @@ class FunctionPhenotypeScorer(PhenotypeScorer): """ `FunctionPhenotypeScorer` computes the phenotype score using the provided function/closure. """ + # NOT PART OF THE PUBLIC API @property @@ -95,7 +96,7 @@ class PhenotypeScoreAnalysisResult(MonoPhenotypeAnalysisResult): The :attr:`~gpsea.analysis.MonoPhenotypeAnalysisResult.data` property provides a data frame with phenotype score for each tested individual: - + ========== ======== ========= patient_id genotype phenotype ========== ======== ========= @@ -109,22 +110,22 @@ class PhenotypeScoreAnalysisResult(MonoPhenotypeAnalysisResult): The DataFrame index includes the identifiers of the tested individuals and the values are stored in `genotype` and `phenotype` columns. 
- The `genotype` includes the genotype category ID (:attr:`~gpsea.analysis.predicate.PatientCategory.cat_id`) + The `genotype` includes the genotype category ID (:attr:`~gpsea.analysis.clf.PatientCategory.cat_id`) or `None` if the patient cannot be assigned into any genotype category. - + The `phenotype` contains a `float` with the phenotype score. A `NaN` value is used if the phenotype score is impossible to compute. """ def __init__( self, - gt_predicate: GenotypePolyPredicate, + gt_clf: GenotypeClassifier, phenotype: PhenotypeScorer, statistic: Statistic, data: pd.DataFrame, statistic_result: StatisticResult, ): - super().__init__(gt_predicate, phenotype, statistic, data, statistic_result) + super().__init__(gt_clf, phenotype, statistic, data, statistic_result) assert isinstance(phenotype, PhenotypeScorer) def phenotype_scorer(self) -> PhenotypeScorer: @@ -150,25 +151,24 @@ def plot_boxplots( """ # skip the patients with unassigned genotype group bla = self._data.notna() - not_na_gts = bla.all(axis='columns') + not_na_gts = bla.all(axis="columns") data = self._data.loc[not_na_gts] - + # Check that the provided genotype predicate defines the same categories # as those found in `data.` actual = set(data[MonoPhenotypeAnalysisResult.GT_COL].unique()) - expected = set(c.cat_id for c in self._gt_predicate.get_categories()) - assert actual == expected, 'Mismatch in the genotype categories' - + expected = set(c.cat_id for c in self._gt_clf.get_categories()) + assert actual == expected, "Mismatch in the genotype classes" + x = [ data.loc[ - data[MonoPhenotypeAnalysisResult.GT_COL] == c.category.cat_id, MonoPhenotypeAnalysisResult.PH_COL + data[MonoPhenotypeAnalysisResult.GT_COL] == c.category.cat_id, + MonoPhenotypeAnalysisResult.PH_COL, ].to_list() - for c in self._gt_predicate.get_categorizations() - ] - - gt_cat_names = [ - c.category.name for c in self._gt_predicate.get_categorizations() + for c in self._gt_clf.get_categorizations() ] + + gt_cat_names = [c.category.name for c in self._gt_clf.get_categorizations()] bplot = ax.boxplot( x=x, patch_artist=True, @@ -179,16 +179,17 @@ def plot_boxplots( patch.set_facecolor(color) def __eq__(self, value: object) -> bool: - return isinstance(value, PhenotypeScoreAnalysisResult) \ - and super(MonoPhenotypeAnalysisResult, self).__eq__(value) - + return isinstance(value, PhenotypeScoreAnalysisResult) and super( + MonoPhenotypeAnalysisResult, self + ).__eq__(value) + def __hash__(self) -> int: return super(MonoPhenotypeAnalysisResult, self).__hash__() def __str__(self) -> str: return ( "PhenotypeScoreAnalysisResult(" - f"gt_predicate={self._gt_predicate}, " + f"gt_clf={self._gt_clf}, " f"phenotype_scorer={self._phenotype}, " f"statistic={self._statistic}, " f"data={self._data}, " @@ -201,11 +202,11 @@ def __repr__(self) -> str: class PhenotypeScoreAnalysis: """ - `PhenotypeScoreAnalysis` tests the association between two or more genotype groups + `PhenotypeScoreAnalysis` tests the association between two or more genotype classes and a phenotype score. - The genotype groups are created by a :class:`~gpsea.analysis.predicate.genotype.GenotypePolyPredicate` - and the phenotype score is computed with :class:`~gpsea.analysis.pscore.PhenotypeScorer`. + A genotype class is assigned by a :class:`~gpsea.analysis.clf.GenotypeClassifier` + and the phenotype score is computed with a :class:`~gpsea.analysis.pscore.PhenotypeScorer`. 
The association is tested with a :class:`~gpsea.analysis.pscore.stats.PhenotypeScoreStatistic` and the results are reported as a :class:`PhenotypeScoreAnalysisResult`. @@ -221,18 +222,18 @@ def __init__( def compare_genotype_vs_phenotype_score( self, cohort: typing.Iterable[Patient], - gt_predicate: GenotypePolyPredicate, + gt_clf: GenotypeClassifier, pheno_scorer: PhenotypeScorer, ) -> PhenotypeScoreAnalysisResult: """ Compute the association between genotype groups and phenotype score. :param cohort: the cohort to analyze. - :param gt_predicate: a predicate for assigning an individual into a genotype group. + :param gt_clf: a classifier for assigning an individual into a genotype class. :param pheno_scorer: the scorer to compute phenotype score. """ assert ( - gt_predicate.n_categorizations() == 2 + gt_clf.n_categorizations() == 2 ), "We only support 2 genotype categories at this point" assert isinstance(pheno_scorer, PhenotypeScorer) @@ -243,29 +244,37 @@ def compare_genotype_vs_phenotype_score( columns=MonoPhenotypeAnalysisResult.DATA_COLUMNS, ) - # Apply the predicates on the patients - for patient in cohort: - gt_cat = gt_predicate.test(patient) + # Apply the classifier and scorer on the individuals + for individual in cohort: + gt_cat = gt_clf.test(individual) if gt_cat is None: - data.loc[patient.patient_id, MonoPhenotypeAnalysisResult.GT_COL] = None + data.loc[individual.patient_id, MonoPhenotypeAnalysisResult.GT_COL] = None else: - data.loc[patient.patient_id, MonoPhenotypeAnalysisResult.GT_COL] = gt_cat.category.cat_id - - data.loc[patient.patient_id, MonoPhenotypeAnalysisResult.PH_COL] = pheno_scorer.score(patient) + data.loc[individual.patient_id, MonoPhenotypeAnalysisResult.GT_COL] = ( + gt_cat.category.cat_id + ) + + data.loc[individual.patient_id, MonoPhenotypeAnalysisResult.PH_COL] = ( + pheno_scorer.score(individual) + ) # Sort by PatientCategory.cat_id and unpack. # For now, we only allow to have up to 2 groups. 
- x_key, y_key = sorted(data[MonoPhenotypeAnalysisResult.GT_COL].dropna().unique()) + x_key, y_key = sorted( + data[MonoPhenotypeAnalysisResult.GT_COL].dropna().unique() + ) x = data.loc[ - data[MonoPhenotypeAnalysisResult.GT_COL] == x_key, MonoPhenotypeAnalysisResult.PH_COL + data[MonoPhenotypeAnalysisResult.GT_COL] == x_key, + MonoPhenotypeAnalysisResult.PH_COL, ].to_numpy(dtype=float) # type: ignore y = data.loc[ - data[MonoPhenotypeAnalysisResult.GT_COL] == y_key, MonoPhenotypeAnalysisResult.PH_COL + data[MonoPhenotypeAnalysisResult.GT_COL] == y_key, + MonoPhenotypeAnalysisResult.PH_COL, ].to_numpy(dtype=float) # type: ignore result = self._statistic.compute_pval(scores=(x, y)) return PhenotypeScoreAnalysisResult( - gt_predicate=gt_predicate, + gt_clf=gt_clf, phenotype=pheno_scorer, statistic=self._statistic, data=data, diff --git a/src/gpsea/analysis/temporal/_api.py b/src/gpsea/analysis/temporal/_api.py index 05d12e967..0ecffec10 100644 --- a/src/gpsea/analysis/temporal/_api.py +++ b/src/gpsea/analysis/temporal/_api.py @@ -8,12 +8,12 @@ import scipy.stats from gpsea.model import Patient -from ..predicate.genotype import GenotypePolyPredicate from ._base import Survival from ._util import prepare_censored_data from .stats import SurvivalStatistic +from ..clf import GenotypeClassifier from .._base import MonoPhenotypeAnalysisResult, StatisticResult, AnalysisException from .._partition import ContinuousPartitioning @@ -56,7 +56,7 @@ class SurvivalAnalysisResult(MonoPhenotypeAnalysisResult): ============ =========== ============================================ The index includes the individual IDs (`patient_id`), and then there are 2 columns - with the `genotype` group id (:attr:`~gpsea.analysis.predicate.PatientCategory.cat_id`) + with the `genotype` group id (:attr:`~gpsea.analysis.clf.PatientCategory.cat_id`) and the `phenotype` with the survival represented as :class:`~gpsea.analysis.temporal.Survival` object. A `genotype` value may be missing (`None`) if the individual cannot be assigned @@ -66,14 +66,14 @@ class SurvivalAnalysisResult(MonoPhenotypeAnalysisResult): def __init__( self, - gt_predicate: GenotypePolyPredicate, + gt_clf: GenotypeClassifier, endpoint: Endpoint, statistic: SurvivalStatistic, data: pd.DataFrame, statistic_result: StatisticResult, ): super().__init__( - gt_predicate=gt_predicate, + gt_clf=gt_clf, phenotype=endpoint, statistic=statistic, data=data, @@ -103,7 +103,7 @@ def plot_kaplan_meier_curves( :param ax: a Matplotlib `Axes` to draw on. 
""" - for pat_cat in self._gt_predicate.get_categories(): + for pat_cat in self._gt_clf.get_categories(): survivals = self._data.loc[ self._data[MonoPhenotypeAnalysisResult.GT_COL] == pat_cat.cat_id, MonoPhenotypeAnalysisResult.PH_COL, @@ -113,14 +113,13 @@ def plot_kaplan_meier_curves( censored_data = prepare_censored_data(survivals=non_na) data = scipy.stats.ecdf(censored_data) data.sf.plot(ax, label=pat_cat.name) - + ax.legend() def __eq__(self, value: object) -> bool: - return ( - isinstance(value, SurvivalAnalysisResult) - and super(MonoPhenotypeAnalysisResult, self).__eq__(value) - ) + return isinstance(value, SurvivalAnalysisResult) and super( + MonoPhenotypeAnalysisResult, self + ).__eq__(value) def __hash__(self) -> int: return super(MonoPhenotypeAnalysisResult, self).__hash__() @@ -128,7 +127,7 @@ def __hash__(self) -> int: def __str__(self) -> str: return ( "SurvivalAnalysisResult(" - f"gt_predicate={self._gt_predicate}, " + f"gt_clf={self._gt_clf}, " f"endpoint={self._phenotype}, " f"statistic={self._statistic}, " f"data={self._data}, " @@ -143,7 +142,7 @@ class SurvivalAnalysis: """ `SurvivalAnalysis` compares the survivals of genotype groups with respect to an :class:`~gpsea.analysis.temporal.Endpoint`. - + The cohort is partitioned into groups using a genotype predicate and survival is computed for each cohort member. The difference between survivals is tested with selected :class:`~gpsea.analysis.temporal.stats.SurvivalStatistic`. @@ -162,13 +161,13 @@ def __init__( def compare_genotype_vs_survival( self, cohort: typing.Iterable[Patient], - gt_predicate: GenotypePolyPredicate, + gt_clf: GenotypeClassifier, endpoint: Endpoint, ) -> SurvivalAnalysisResult: """ Execute the survival analysis on a given `cohort`. """ - + idx = pd.Index((patient.patient_id for patient in cohort), name="patient_id") data = pd.DataFrame( None, @@ -178,11 +177,13 @@ def compare_genotype_vs_survival( survivals = defaultdict(list) # Apply the predicate and the survival metric on the cohort for patient in cohort: - gt_cat = gt_predicate.test(patient) + gt_cat = gt_clf.test(patient) if gt_cat is None: data.loc[patient.patient_id, MonoPhenotypeAnalysisResult.GT_COL] = None else: - data.loc[patient.patient_id, MonoPhenotypeAnalysisResult.GT_COL] = gt_cat.category.cat_id + data.loc[patient.patient_id, MonoPhenotypeAnalysisResult.GT_COL] = ( + gt_cat.category.cat_id + ) survival = endpoint.compute_survival(patient) data.loc[patient.patient_id, MonoPhenotypeAnalysisResult.PH_COL] = survival # type: ignore @@ -190,7 +191,7 @@ def compare_genotype_vs_survival( if gt_cat is not None and survival is not None: survivals[gt_cat].append(survival) - vals = tuple(survivals[gt_cat] for gt_cat in gt_predicate.get_categorizations()) + vals = tuple(survivals[gt_cat] for gt_cat in gt_clf.get_categorizations()) result = self._statistic.compute_pval(vals) if math.isnan(result.pval): raise AnalysisException( @@ -199,7 +200,7 @@ def compare_genotype_vs_survival( ) return SurvivalAnalysisResult( - gt_predicate=gt_predicate, + gt_clf=gt_clf, endpoint=endpoint, statistic=self._statistic, data=data, diff --git a/src/gpsea/preprocessing/_vep.py b/src/gpsea/preprocessing/_vep.py index a4d3d53de..7c9596027 100644 --- a/src/gpsea/preprocessing/_vep.py +++ b/src/gpsea/preprocessing/_vep.py @@ -10,13 +10,9 @@ class VepFunctionalAnnotator(FunctionalAnnotator): - """A `FunctionalAnnotator` that uses Variant Effect Predictor (VEP) REST API to - do functional variant annotation. 
- - Args: - include_computational_txs (bool): Include computational transcripts, such as - RefSeq `XM_`. - timeout (int): Timeout in seconds + """ + `VepFunctionalAnnotator` uses the Variant Effect Predictor (VEP) REST API + to perform functional variant annotation. """ NONCODING_EFFECTS = { @@ -51,16 +47,6 @@ def __init__(self, self._timeout = timeout def annotate(self, variant_coordinates: VariantCoordinates) -> typing.Sequence[TranscriptAnnotation]: - """Perform functional annotation using Variant Effect Predictor (VEP) REST API. - - Args: - variant_coordinates (VariantCoordinates): A VariantCoordinates object - Returns: - typing.Sequence[TranscriptAnnotation]: A sequence of transcript - annotations for the variant coordinates - Raises: - ValueError if VEP times out or does not return a response or if the response is not formatted as we expect. - """ response = self.fetch_response(variant_coordinates) return self.process_response(variant_coordinates.variant_key, response) diff --git a/src/gpsea/view/_phenotype_analysis.py b/src/gpsea/view/_phenotype_analysis.py index d2645e341..8be597356 100644 --- a/src/gpsea/view/_phenotype_analysis.py +++ b/src/gpsea/view/_phenotype_analysis.py @@ -22,23 +22,23 @@ def summarize_hpo_analysis( pheno_idx = pd.Index(result.phenotypes) # Column index: multiindex of counts and percentages for all genotype predicate groups gt_idx = pd.MultiIndex.from_product( - iterables=(result.gt_predicate.get_categories(), ("Count", "Percent")), - names=(result.gt_predicate.variable_name, None), + iterables=(result.gt_clf.get_categories(), ("Count", "Percent")), + names=(result.gt_clf.variable_name, None), ) # We'll fill this frame with data df = pd.DataFrame(index=pheno_idx, columns=gt_idx) - for ph_predicate, count in zip(result.pheno_predicates, result.all_counts): + for ph_clf, count in zip(result.pheno_clfs, result.all_counts): # Sum across the phenotype categories (collapse the rows). 
gt_totals = count.sum() for gt_cat in count.columns: - cnt = count.loc[ph_predicate.present_phenotype_category, gt_cat] + cnt = count.loc[ph_clf.present_phenotype_category, gt_cat] total = gt_totals[gt_cat] - df.loc[ph_predicate.phenotype, (gt_cat, "Count")] = f"{cnt}/{total}" + df.loc[ph_clf.phenotype, (gt_cat, "Count")] = f"{cnt}/{total}" pct = 0 if total == 0 else round(cnt * 100 / total) - df.loc[ph_predicate.phenotype, (gt_cat, "Percent")] = f"{pct}%" + df.loc[ph_clf.phenotype, (gt_cat, "Percent")] = f"{pct}%" # Add columns with p values and corrected p values (if present) p_val_col_name = "p values" @@ -56,7 +56,9 @@ def summarize_hpo_analysis( # and only report the tested HPO terms with_p_value = df[("", p_val_col_name)].notna() if result.corrected_pvals is not None: - return df.sort_values(by=[("", corrected_p_val_col_name), ("", p_val_col_name)]).loc[with_p_value] + return df.sort_values( + by=[("", corrected_p_val_col_name), ("", p_val_col_name)] + ).loc[with_p_value] else: return df.sort_values(by=("", p_val_col_name)).loc[with_p_value] diff --git a/tests/analysis/predicate/genotype/conftest.py b/tests/analysis/clf/conftest.py similarity index 73% rename from tests/analysis/predicate/genotype/conftest.py rename to tests/analysis/clf/conftest.py index 27da841e4..b270d955e 100644 --- a/tests/analysis/predicate/genotype/conftest.py +++ b/tests/analysis/clf/conftest.py @@ -1,17 +1,13 @@ import pytest -import hpotk - from gpsea.model import ( Genotype, Genotypes, - ImpreciseSvInfo, Patient, SampleLabels, Sex, TranscriptAnnotation, Variant, - VariantClass, VariantCoordinates, VariantEffect, VariantInfo, @@ -21,182 +17,9 @@ GenomicRegion, Region, Strand, - ) -@pytest.fixture(scope="package") -def sample_labels() -> SampleLabels: - return SampleLabels("jim") - - -@pytest.fixture(scope="package") -def missense_variant( - genome_build: GenomeBuild, - sample_labels: SampleLabels, -) -> Variant: - chr22 = genome_build.contig_by_name("chr22") - assert chr22 is not None - return Variant( - variant_info=VariantInfo( - variant_coordinates=VariantCoordinates( - region=GenomicRegion( - contig=chr22, - start=100, - end=101, - strand=Strand.POSITIVE, - ), - ref="C", - alt="G", - change_length=0, - ) - ), - tx_annotations=( - TranscriptAnnotation( - gene_id="a_gene", - tx_id="tx:xyz", - hgvs_cdna=None, - is_preferred=False, - variant_effects=( - VariantEffect.MISSENSE_VARIANT, - VariantEffect.SPLICE_DONOR_VARIANT, - ), - affected_exons=(4,), - protein_id="pt:xyz", - hgvsp=None, - protein_effect_coordinates=Region(40, 41), - ), - TranscriptAnnotation( - gene_id="a_gene", - tx_id="tx:abc", - hgvs_cdna=None, - is_preferred=False, - variant_effects=(VariantEffect.INTRON_VARIANT,), - affected_exons=None, - protein_id=None, - hgvsp=None, - protein_effect_coordinates=None, - ), - ), - genotypes=Genotypes.single(sample_labels, Genotype.HETEROZYGOUS), - ) - - -@pytest.fixture(scope="package") -def frameshift_variant( - genome_build: GenomeBuild, - sample_labels: SampleLabels, -) -> Variant: - chr22 = genome_build.contig_by_name("chr22") - assert chr22 is not None - return Variant( - variant_info=VariantInfo( - variant_coordinates=VariantCoordinates( - region=GenomicRegion( - contig=chr22, - start=110, - end=113, - strand=Strand.POSITIVE, - ), - ref="CCC", - alt="C", - change_length=-2, - ) - ), - tx_annotations=( - TranscriptAnnotation( - gene_id="a_gene", - tx_id="tx:xyz", - hgvs_cdna=None, - is_preferred=False, - variant_effects=(VariantEffect.FRAMESHIFT_VARIANT,), - affected_exons=(5,), - 
protein_id="pt:xyz", - hgvsp=None, - protein_effect_coordinates=Region(43, 44), - ), - TranscriptAnnotation( - gene_id="a_gene", - tx_id="tx:abc", - hgvs_cdna=None, - is_preferred=False, - variant_effects=(VariantEffect.INTRON_VARIANT,), - affected_exons=None, - protein_id=None, - hgvsp=None, - protein_effect_coordinates=None, - ), - ), - genotypes=Genotypes.single(sample_labels, Genotype.HETEROZYGOUS), - ) - - -@pytest.fixture(scope="package") -def structural_variant( - sample_labels: SampleLabels, -) -> Variant: - return Variant( - variant_info=VariantInfo( - sv_info=ImpreciseSvInfo( - structural_type=hpotk.TermId.from_curie( - "SO:1000029" - ), # chromosomal_deletion - variant_class=VariantClass.DEL, - gene_id="HGNC:21316", - gene_symbol="ANKRD11", - ), - ), - tx_annotations=( - TranscriptAnnotation( - gene_id="ANKRD11", - tx_id="NM_013275.6", - hgvs_cdna=None, - is_preferred=True, - variant_effects=(VariantEffect.TRANSCRIPT_ABLATION,), - affected_exons=range(13), # I counted 13 exons - protein_id=None, - hgvsp=None, - protein_effect_coordinates=None, - ), - ), - genotypes=Genotypes.single(sample_labels, Genotype.HETEROZYGOUS), - ) - - -@pytest.fixture(scope="package") -def patient_w_missense( - sample_labels: SampleLabels, - missense_variant: Variant, -) -> Patient: - return Patient.from_raw_parts( - labels=sample_labels, - sex=Sex.UNKNOWN_SEX, - age=None, - vital_status=None, - phenotypes=(), - measurements=(), - diseases=(), - variants=(missense_variant,), - ) - - -@pytest.fixture(scope="package") -def patient_w_frameshift( - sample_labels: SampleLabels, - frameshift_variant: Variant, -) -> Patient: - return Patient.from_raw_parts( - labels=sample_labels, - sex=Sex.UNKNOWN_SEX, - age=None, - vital_status=None, - phenotypes=(), - measurements=(), - diseases=(), - variants=(frameshift_variant,), - ) - - """ Genesis family - Autosomal dominant but can also be used as X dominant. 
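Before the remaining test renames, a minimal sketch of the module-level predicate helpers that replace the removed `VariantPredicates` static methods. The module alias, function names, and placeholder values below are the ones used by the updated tests in this patch; expected outputs are omitted where they cannot be verified from the diff.

>>> from gpsea.model import VariantEffect
>>> import gpsea.analysis.predicate as vp
>>> always_false = ~vp.true()  # negate the always-true predicate
>>> is_missense = vp.variant_effect(VariantEffect.MISSENSE_VARIANT, tx_id="tx:xyz")  # placeholder transcript ID
>>> either = vp.anyof((is_missense, vp.gene("FBN1")))  # passes if any inner predicate passes
>>> both = vp.allof((vp.gene("SURF1"), vp.gene("SURF2")))  # passes only if all inner predicates pass

As before, `&`, `|`, and `~` remain available on any `VariantPredicate`, and `allof`/`anyof` still raise `ValueError` when given an empty iterable, as exercised by the updated `test_predicates.py` further below.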
diff --git a/tests/analysis/predicate/genotype/test_allele_counter.py b/tests/analysis/clf/test_allele_counter.py similarity index 96% rename from tests/analysis/predicate/genotype/test_allele_counter.py rename to tests/analysis/clf/test_allele_counter.py index 38bde8192..98b90e532 100644 --- a/tests/analysis/predicate/genotype/test_allele_counter.py +++ b/tests/analysis/clf/test_allele_counter.py @@ -12,7 +12,9 @@ VariantInfo, ) from gpsea.model.genome import Contig, GenomeBuild, GenomicRegion, Region, Strand -from gpsea.analysis.predicate.genotype import AlleleCounter, VariantPredicates +from gpsea.analysis.clf import AlleleCounter +import gpsea.analysis.predicate as vp + @pytest.fixture(scope="module") @@ -209,7 +211,7 @@ def test_count_keys( variant_key: str, expected: int, ): - predicate = VariantPredicates.variant_key(key=variant_key) + predicate = vp.variant_key(key=variant_key) counter = AlleleCounter(predicate) assert counter.count(patient) == expected @@ -226,7 +228,7 @@ def test_count_keys( def test_count_effects( self, patient: Patient, variant_effect: VariantEffect, tx_id: str, expected: int ): - predicate = VariantPredicates.variant_effect(effect=variant_effect, tx_id=tx_id) + predicate = vp.variant_effect(effect=variant_effect, tx_id=tx_id) counter = AlleleCounter(predicate) assert counter.count(patient) == expected diff --git a/tests/analysis/predicate/genotype/test_gt_predicates.py b/tests/analysis/clf/test_gt_predicates.py similarity index 73% rename from tests/analysis/predicate/genotype/test_gt_predicates.py rename to tests/analysis/clf/test_gt_predicates.py index bf9957a6d..82e1a0e1e 100644 --- a/tests/analysis/predicate/genotype/test_gt_predicates.py +++ b/tests/analysis/clf/test_gt_predicates.py @@ -1,13 +1,13 @@ import pytest from gpsea.model import Patient, Sex, SampleLabels, VariantEffect -from gpsea.analysis.predicate.genotype import ( - sex_predicate, - monoallelic_predicate, - biallelic_predicate, +from gpsea.analysis.clf import ( + sex_classifier, + monoallelic_classifier, + biallelic_classifier, allele_count, - VariantPredicates, ) +from gpsea.analysis.predicate import variant_effect TX_ID = "tx:xyz" @@ -52,7 +52,7 @@ def test_ad_family__missense_subset( name: str, request: pytest.FixtureRequest, ): - is_missense = VariantPredicates.variant_effect(VariantEffect.MISSENSE_VARIANT, tx_id=TX_ID) + is_missense = variant_effect(VariantEffect.MISSENSE_VARIANT, tx_id=TX_ID) patient = request.getfixturevalue(patient_name) predicate = allele_count( counts=((0,), (1,)), @@ -105,7 +105,7 @@ def test_ar_family__only_missense( request: pytest.FixtureRequest, ): patient = request.getfixturevalue(patient_name) - is_missense = VariantPredicates.variant_effect(VariantEffect.MISSENSE_VARIANT, tx_id=TX_ID) + is_missense = variant_effect(VariantEffect.MISSENSE_VARIANT, tx_id=TX_ID) predicate = allele_count( counts=((0,), (1,), (2,)), target=is_missense, @@ -127,7 +127,7 @@ def test_eq_and_hash(self): def test_summarize_groups(self): a = allele_count(counts=((0, 1), (2,))) - assert a.summarize_groups() == "Allele count: 0 OR 1, 2" + assert a.summarize_classes() == "Allele count: 0 OR 1, 2" class TestAllelePredicates: @@ -146,12 +146,12 @@ def test_monoallelic_predicate_ad_family( expected_name: str, request: pytest.FixtureRequest, ): - is_missense = VariantPredicates.variant_effect(VariantEffect.MISSENSE_VARIANT, TX_ID) - is_synonymous = VariantPredicates.variant_effect(VariantEffect.SYNONYMOUS_VARIANT, TX_ID) - gt_predicate = monoallelic_predicate(is_missense, is_synonymous) + 
is_missense = variant_effect(VariantEffect.MISSENSE_VARIANT, TX_ID) + is_synonymous = variant_effect(VariantEffect.SYNONYMOUS_VARIANT, TX_ID) + gt_clf = monoallelic_classifier(is_missense, is_synonymous) individual = request.getfixturevalue(individual_name) - actual_cat = gt_predicate.test(individual) + actual_cat = gt_clf.test(individual) assert actual_cat is not None assert actual_cat.category.name == expected_name @@ -159,12 +159,12 @@ def test_monoallelic_predicate_ad_family( def test_monoallelic_predicate__general_stuff( self, ): - is_missense = VariantPredicates.variant_effect(VariantEffect.MISSENSE_VARIANT, TX_ID) - is_synonymous = VariantPredicates.variant_effect(VariantEffect.SYNONYMOUS_VARIANT, TX_ID) + is_missense = variant_effect(VariantEffect.MISSENSE_VARIANT, TX_ID) + is_synonymous = variant_effect(VariantEffect.SYNONYMOUS_VARIANT, TX_ID) - gt_predicate = monoallelic_predicate(is_missense, is_synonymous) + gt_predicate = monoallelic_classifier(is_missense, is_synonymous) - assert gt_predicate.summarize_groups() == 'Allele group: A, B' + assert gt_predicate.summarize_classes() == 'Allele group: A, B' @pytest.mark.parametrize( "individual_name,expected_name", @@ -181,9 +181,9 @@ def test_biallelic_predicate( expected_name: str, request: pytest.FixtureRequest, ): - is_missense = VariantPredicates.variant_effect(VariantEffect.MISSENSE_VARIANT, TX_ID) - is_synonymous = VariantPredicates.variant_effect(VariantEffect.SYNONYMOUS_VARIANT, TX_ID) - gt_predicate = biallelic_predicate(is_missense, is_synonymous) + is_missense = variant_effect(VariantEffect.MISSENSE_VARIANT, TX_ID) + is_synonymous = variant_effect(VariantEffect.SYNONYMOUS_VARIANT, TX_ID) + gt_predicate = biallelic_classifier(is_missense, is_synonymous) individual = request.getfixturevalue(individual_name) actual_cat = gt_predicate.test(individual) @@ -194,12 +194,12 @@ def test_biallelic_predicate( def test_biallelic_predicate__general_stuff( self, ): - is_missense = VariantPredicates.variant_effect(VariantEffect.MISSENSE_VARIANT, TX_ID) - is_synonymous = VariantPredicates.variant_effect(VariantEffect.SYNONYMOUS_VARIANT, TX_ID) + is_missense = variant_effect(VariantEffect.MISSENSE_VARIANT, TX_ID) + is_synonymous = variant_effect(VariantEffect.SYNONYMOUS_VARIANT, TX_ID) - gt_predicate = biallelic_predicate(is_missense, is_synonymous) + gt_predicate = biallelic_classifier(is_missense, is_synonymous) - assert gt_predicate.summarize_groups() == 'Allele group: A/A, A/B, B/B' + assert gt_predicate.summarize_classes() == 'Allele group: A/A, A/B, B/B' class TestSexPredicate: @@ -211,7 +211,7 @@ def test_sex_predicate( jane = TestSexPredicate.make_patient('Jane', Sex.FEMALE) miffy = TestSexPredicate.make_patient('Miffy', Sex.UNKNOWN_SEX) - gt_predicate = sex_predicate() + gt_predicate = sex_classifier() female, male = gt_predicate.get_categorizations() assert gt_predicate.test(joe) == male @@ -219,9 +219,9 @@ def test_sex_predicate( assert gt_predicate.test(miffy) is None def test_summarize_groups(self): - gt_predicate = sex_predicate() + gt_predicate = sex_classifier() - assert gt_predicate.summarize_groups() == "Sex: FEMALE, MALE" + assert gt_predicate.summarize_classes() == "Sex: FEMALE, MALE" @staticmethod def make_patient(label: str, sex: Sex) -> Patient: diff --git a/tests/analysis/predicate/test_phenotype.py b/tests/analysis/clf/test_phenotype.py similarity index 59% rename from tests/analysis/predicate/test_phenotype.py rename to tests/analysis/clf/test_phenotype.py index 78ec8855c..35f44f67b 100644 --- 
a/tests/analysis/predicate/test_phenotype.py +++ b/tests/analysis/clf/test_phenotype.py @@ -2,8 +2,8 @@ from gpsea.model import Cohort -from gpsea.analysis.predicate.phenotype import PhenotypePolyPredicate -from gpsea.analysis.predicate.phenotype import prepare_hpo_terms_of_interest, prepare_predicates_for_terms_of_interest +from gpsea.analysis.clf import PhenotypeClassifier +from gpsea.analysis.clf import prepare_hpo_terms_of_interest, prepare_classifiers_for_terms_of_interest def test_prepare_hpo_terms_of_interest( @@ -22,10 +22,10 @@ def test_prepare_predicates_for_terms_of_interest( suox_cohort: Cohort, hpo: hpotk.MinimalOntology, ): - predicates = prepare_predicates_for_terms_of_interest( + predicates = prepare_classifiers_for_terms_of_interest( cohort=suox_cohort, hpo=hpo, ) assert len(predicates) == 71 - assert all(isinstance(p, PhenotypePolyPredicate) for p in predicates) + assert all(isinstance(p, PhenotypeClassifier) for p in predicates) diff --git a/tests/analysis/conftest.py b/tests/analysis/conftest.py index bde9f2ac9..b48d29988 100644 --- a/tests/analysis/conftest.py +++ b/tests/analysis/conftest.py @@ -2,22 +2,25 @@ import hpotk -from gpsea.model import * -from gpsea.model.genome import * -from gpsea.analysis.predicate.genotype import VariantPredicate - - -class AlwaysFalseVariantPredicate(VariantPredicate): - def get_question(self) -> str: - return "No question asked, just always returns False" - - def test(self, variant: Variant) -> bool: - return False +from gpsea.model import ( + Cohort, + SampleLabels, + Patient, + Sex, + Phenotype, + Variant, + Genotype, + Genotypes, + VariantCoordinates, + VariantEffect, +) +from gpsea.model.genome import GenomeBuild +from gpsea.analysis.predicate import VariantPredicate, true @pytest.fixture(scope="package") def always_false_variant_predicate() -> VariantPredicate: - return AlwaysFalseVariantPredicate() + return ~true() @pytest.fixture(scope="package") diff --git a/tests/analysis/pcats/test_disease.py b/tests/analysis/pcats/test_disease.py index 9e16b3002..92da0a63c 100644 --- a/tests/analysis/pcats/test_disease.py +++ b/tests/analysis/pcats/test_disease.py @@ -5,20 +5,18 @@ from gpsea.analysis.pcats import DiseaseAnalysis from gpsea.analysis.pcats.stats import CountStatistic, FisherExactTest -from gpsea.analysis.predicate.genotype import GenotypePolyPredicate -from gpsea.analysis.predicate.phenotype import DiseasePresencePredicate +from gpsea.analysis.clf import GenotypeClassifier, DiseasePresenceClassifier class TestDiseaseAnalysis: - - @pytest.fixture(scope='class') + @pytest.fixture(scope="class") def count_statistic(self) -> CountStatistic: return FisherExactTest() - @pytest.fixture(scope='class') - def suox_disease(self) -> DiseasePresencePredicate: - sulfite_oxidase_deficiency = hpotk.TermId.from_curie('OMIM:272300') - return DiseasePresencePredicate( + @pytest.fixture(scope="class") + def suox_disease(self) -> DiseasePresenceClassifier: + sulfite_oxidase_deficiency = hpotk.TermId.from_curie("OMIM:272300") + return DiseasePresenceClassifier( disease_id_query=sulfite_oxidase_deficiency, ) @@ -29,21 +27,21 @@ def analysis( ) -> DiseaseAnalysis: return DiseaseAnalysis( count_statistic=count_statistic, - mtc_correction='fdr_bh', - mtc_alpha=.05, + mtc_correction="fdr_bh", + mtc_alpha=0.05, ) def test_compare_genotype_vs_phenotypes( self, analysis: DiseaseAnalysis, suox_cohort: Cohort, - suox_gt_predicate: GenotypePolyPredicate, - suox_disease: DiseasePresencePredicate, + suox_gt_clf: GenotypeClassifier, + suox_disease: 
DiseasePresenceClassifier, ): results = analysis.compare_genotype_vs_phenotypes( cohort=suox_cohort.all_patients, - gt_predicate=suox_gt_predicate, - pheno_predicates=(suox_disease,), + gt_clf=suox_gt_clf, + pheno_clfs=(suox_disease,), ) assert results is not None diff --git a/tests/analysis/pcats/test_hpo_term_analysis.py b/tests/analysis/pcats/test_hpo_term_analysis.py index cb9a46a1b..45278867e 100644 --- a/tests/analysis/pcats/test_hpo_term_analysis.py +++ b/tests/analysis/pcats/test_hpo_term_analysis.py @@ -9,12 +9,10 @@ from gpsea.analysis.mtc_filter import PhenotypeMtcFilter, HpoMtcFilter from gpsea.analysis.pcats import HpoTermAnalysis from gpsea.analysis.pcats.stats import CountStatistic, FisherExactTest -from gpsea.analysis.predicate.genotype import GenotypePolyPredicate -from gpsea.analysis.predicate.phenotype import PhenotypePolyPredicate +from gpsea.analysis.clf import GenotypeClassifier, PhenotypeClassifier class TestHpoTermAnalysis: - @pytest.fixture(scope="class") def count_statistic(self) -> CountStatistic: return FisherExactTest() @@ -47,13 +45,13 @@ def test_compare_genotype_vs_phenotypes( self, analysis: HpoTermAnalysis, suox_cohort: Cohort, - suox_gt_predicate: GenotypePolyPredicate, - suox_pheno_predicates: typing.Sequence[PhenotypePolyPredicate[hpotk.TermId]], + suox_gt_clf: GenotypeClassifier, + suox_pheno_clfs: typing.Sequence[PhenotypeClassifier[hpotk.TermId]], ): results = analysis.compare_genotype_vs_phenotypes( cohort=suox_cohort.all_patients, - gt_predicate=suox_gt_predicate, - pheno_predicates=suox_pheno_predicates, + gt_clf=suox_gt_clf, + pheno_clfs=suox_pheno_clfs, ) assert results is not None @@ -86,13 +84,13 @@ def test_compare_genotype_vs_phenotypes_can_handle_if_no_phenotypes_are_left_aft self, analysis: HpoTermAnalysis, degenerated_cohort: Cohort, - suox_gt_predicate: GenotypePolyPredicate, - suox_pheno_predicates: typing.Sequence[PhenotypePolyPredicate[hpotk.TermId]], + suox_gt_clf: GenotypeClassifier, + suox_pheno_clfs: typing.Sequence[PhenotypeClassifier[hpotk.TermId]], ): result = analysis.compare_genotype_vs_phenotypes( cohort=degenerated_cohort, - gt_predicate=suox_gt_predicate, - pheno_predicates=suox_pheno_predicates, + gt_clf=suox_gt_clf, + pheno_clfs=suox_pheno_clfs, ) assert ( diff --git a/tests/analysis/predicate/conftest.py b/tests/analysis/predicate/conftest.py new file mode 100644 index 000000000..497479b64 --- /dev/null +++ b/tests/analysis/predicate/conftest.py @@ -0,0 +1,191 @@ +import pytest + +import hpotk + +from gpsea.model import ( + SampleLabels, + Patient, + Variant, + VariantInfo, + VariantCoordinates, + TranscriptAnnotation, + VariantEffect, + Genotype, + Genotypes, + VariantClass, + Sex, + ImpreciseSvInfo, +) +from gpsea.model.genome import GenomeBuild, GenomicRegion, Strand, Region + + +@pytest.fixture(scope="package") +def sample_labels() -> SampleLabels: + return SampleLabels("jim") + + +@pytest.fixture(scope="package") +def missense_variant( + genome_build: GenomeBuild, + sample_labels: SampleLabels, +) -> Variant: + chr22 = genome_build.contig_by_name("chr22") + assert chr22 is not None + return Variant( + variant_info=VariantInfo( + variant_coordinates=VariantCoordinates( + region=GenomicRegion( + contig=chr22, + start=100, + end=101, + strand=Strand.POSITIVE, + ), + ref="C", + alt="G", + change_length=0, + ) + ), + tx_annotations=( + TranscriptAnnotation( + gene_id="a_gene", + tx_id="tx:xyz", + hgvs_cdna=None, + is_preferred=False, + variant_effects=( + VariantEffect.MISSENSE_VARIANT, + 
VariantEffect.SPLICE_DONOR_VARIANT, + ), + affected_exons=(4,), + protein_id="pt:xyz", + hgvsp=None, + protein_effect_coordinates=Region(40, 41), + ), + TranscriptAnnotation( + gene_id="a_gene", + tx_id="tx:abc", + hgvs_cdna=None, + is_preferred=False, + variant_effects=(VariantEffect.INTRON_VARIANT,), + affected_exons=None, + protein_id=None, + hgvsp=None, + protein_effect_coordinates=None, + ), + ), + genotypes=Genotypes.single(sample_labels, Genotype.HETEROZYGOUS), + ) + + +@pytest.fixture(scope="package") +def frameshift_variant( + genome_build: GenomeBuild, + sample_labels: SampleLabels, +) -> Variant: + chr22 = genome_build.contig_by_name("chr22") + assert chr22 is not None + return Variant( + variant_info=VariantInfo( + variant_coordinates=VariantCoordinates( + region=GenomicRegion( + contig=chr22, + start=110, + end=113, + strand=Strand.POSITIVE, + ), + ref="CCC", + alt="C", + change_length=-2, + ) + ), + tx_annotations=( + TranscriptAnnotation( + gene_id="a_gene", + tx_id="tx:xyz", + hgvs_cdna=None, + is_preferred=False, + variant_effects=(VariantEffect.FRAMESHIFT_VARIANT,), + affected_exons=(5,), + protein_id="pt:xyz", + hgvsp=None, + protein_effect_coordinates=Region(43, 44), + ), + TranscriptAnnotation( + gene_id="a_gene", + tx_id="tx:abc", + hgvs_cdna=None, + is_preferred=False, + variant_effects=(VariantEffect.INTRON_VARIANT,), + affected_exons=None, + protein_id=None, + hgvsp=None, + protein_effect_coordinates=None, + ), + ), + genotypes=Genotypes.single(sample_labels, Genotype.HETEROZYGOUS), + ) + + +@pytest.fixture(scope="package") +def structural_variant( + sample_labels: SampleLabels, +) -> Variant: + return Variant( + variant_info=VariantInfo( + sv_info=ImpreciseSvInfo( + structural_type=hpotk.TermId.from_curie( + "SO:1000029" + ), # chromosomal_deletion + variant_class=VariantClass.DEL, + gene_id="HGNC:21316", + gene_symbol="ANKRD11", + ), + ), + tx_annotations=( + TranscriptAnnotation( + gene_id="ANKRD11", + tx_id="NM_013275.6", + hgvs_cdna=None, + is_preferred=True, + variant_effects=(VariantEffect.TRANSCRIPT_ABLATION,), + affected_exons=range(13), # I counted 13 exons + protein_id=None, + hgvsp=None, + protein_effect_coordinates=None, + ), + ), + genotypes=Genotypes.single(sample_labels, Genotype.HETEROZYGOUS), + ) + + +@pytest.fixture(scope="package") +def patient_w_missense( + sample_labels: SampleLabels, + missense_variant: Variant, +) -> Patient: + return Patient.from_raw_parts( + labels=sample_labels, + sex=Sex.UNKNOWN_SEX, + age=None, + vital_status=None, + phenotypes=(), + measurements=(), + diseases=(), + variants=(missense_variant,), + ) + + +@pytest.fixture(scope="package") +def patient_w_frameshift( + sample_labels: SampleLabels, + frameshift_variant: Variant, +) -> Patient: + return Patient.from_raw_parts( + labels=sample_labels, + sex=Sex.UNKNOWN_SEX, + age=None, + vital_status=None, + phenotypes=(), + measurements=(), + diseases=(), + variants=(frameshift_variant,), + ) diff --git a/tests/analysis/predicate/genotype/__init__.py b/tests/analysis/predicate/genotype/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/analysis/predicate/genotype/test_predicates.py b/tests/analysis/predicate/test_predicates.py similarity index 83% rename from tests/analysis/predicate/genotype/test_predicates.py rename to tests/analysis/predicate/test_predicates.py index b1991f1e8..954346083 100644 --- a/tests/analysis/predicate/genotype/test_predicates.py +++ b/tests/analysis/predicate/test_predicates.py @@ -1,6 +1,6 @@ import pytest -from 
+import gpsea.analysis.predicate as vp
 from gpsea.model import (
     Cohort,
     FeatureInfo,
@@ -19,7 +19,7 @@ def test_always_true_predicate(
         self,
         suox_cohort: Cohort,
     ):
-        predicate = VariantPredicates.true()
+        predicate = vp.true()
 
         assert all(predicate.test(v) for v in suox_cohort.all_variants())
 
     @pytest.mark.parametrize(
@@ -37,7 +37,7 @@ def test_variant_effect_predicate(
         self,
         missense_variant: Variant,
         effect: VariantEffect,
         expected: bool,
     ):
-        predicate = VariantPredicates.variant_effect(effect, tx_id='tx:xyz')
+        predicate = vp.variant_effect(effect, tx_id='tx:xyz')
 
         assert predicate.test(missense_variant) == expected
@@ -54,7 +54,7 @@ def test_variant_key_predicate(
         self,
         missense_variant: Variant,
         variant_key: str,
         expected: bool,
     ):
-        predicate = VariantPredicates.variant_key(variant_key)
+        predicate = vp.variant_key(variant_key)
 
         assert predicate.test(missense_variant) == expected
@@ -72,13 +72,13 @@ def test_exon_predicate(
         self,
         missense_variant: Variant,
         exon: int,
         expected: bool,
     ):
-        predicate = VariantPredicates.exon(exon, tx_id='tx:xyz')
+        predicate = vp.exon(exon, tx_id='tx:xyz')
 
         assert predicate.test(missense_variant) == expected
 
     def test_exon_predicate_fails_on_invalid_exon(self):
         with pytest.raises(AssertionError) as e:
-            VariantPredicates.exon(0, tx_id='tx:xyz')
+            vp.exon(0, tx_id='tx:xyz')
 
         assert e.value.args[0] == '`exon` must be a positive `int`'
 
     @pytest.mark.parametrize(
@@ -94,7 +94,7 @@ def test_transcript_predicate(
         self,
         missense_variant: Variant,
         tx_id: str,
         expected: bool,
     ):
-        predicate = VariantPredicates.transcript(tx_id)
+        predicate = vp.transcript(tx_id)
 
         assert predicate.test(missense_variant) == expected
@@ -111,7 +111,7 @@ def test_gene_predicate(
         self,
         missense_variant: Variant,
         symbol: str,
         expected: bool,
     ):
-        predicate = VariantPredicates.gene(symbol)
+        predicate = vp.gene(symbol)
 
         assert predicate.test(missense_variant) == expected
@@ -120,7 +120,7 @@ def test_is_large_imprecise_sv(
         self,
         missense_variant: Variant,
         structural_variant: Variant,
     ):
-        predicate = VariantPredicates.is_large_imprecise_sv()
+        predicate = vp.is_large_imprecise_sv()
 
         assert predicate.test(missense_variant) is False
         assert predicate.test(structural_variant) is True
@@ -130,7 +130,7 @@ def test_is_structural_predicate(
         self,
         missense_variant: Variant,
         structural_variant: Variant,
     ):
-        predicate = VariantPredicates.is_structural_variant()
+        predicate = vp.is_structural_variant()
 
         assert predicate.test(missense_variant) is False
         assert predicate.test(structural_variant) is True
@@ -140,7 +140,7 @@ def test_structural_type(
         self,
         missense_variant: Variant,
         structural_variant: Variant,
     ):
-        predicate = VariantPredicates.structural_type('SO:1000029')
+        predicate = vp.structural_type('SO:1000029')
 
         assert predicate.test(missense_variant) is False
         assert predicate.test(structural_variant) is True
@@ -149,7 +149,7 @@ def test_variant_class(
         self,
         missense_variant: Variant,
     ):
-        predicate = VariantPredicates.variant_class(VariantClass.SNV)
+        predicate = vp.variant_class(VariantClass.SNV)
 
         assert predicate.test(missense_variant) is True
@@ -158,7 +158,7 @@ def test_change_length(
         self,
         missense_variant: Variant,
         structural_variant: Variant,
     ):
-        predicate = VariantPredicates.change_length('==', 0)
+        predicate = vp.change_length('==', 0)
 
         # variant is an SNP
         assert predicate.test(missense_variant) is True
@@ -171,14 +171,14 @@ def test_change_length_is_false_for_imprecise_SVs_no_matter_what(
         self,
         structural_variant: Variant,
     ):
         for op in ("<", "<=", "==", "!=", ">=", ">"):
-            predicate = VariantPredicates.change_length(op, 0)
+            predicate = vp.change_length(op, 0)
 
             assert predicate.test(structural_variant) is False
 
     def test_structural_deletion(
         self,
         structural_variant: Variant,
     ):
-        predicate = VariantPredicates.is_structural_deletion()
+        predicate = vp.is_structural_deletion()
 
         assert predicate.test(structural_variant) is True
@@ -217,7 +217,7 @@ def test_protein_feature_type(
         self,
         protein_metadata: ProteinMetadata,
         expected: bool,
     ):
-        predicate = VariantPredicates.protein_feature_type(
+        predicate = vp.protein_feature_type(
             feature_type=feature_type,
             protein_metadata=protein_metadata,
         )
@@ -239,7 +239,7 @@ def test_protein_feature_id(
         self,
         feature_id: str,
         protein_metadata: ProteinMetadata,
         expected: bool,
     ):
-        predicate = VariantPredicates.protein_feature(
+        predicate = vp.protein_feature(
             feature_id=feature_id,
             protein_metadata=protein_metadata,
         )
@@ -253,8 +253,8 @@ class TestLogicalVariantPredicate:
     """
 
     def test_equivalent_predicates_are_not_chained(self):
-        a1 = VariantPredicates.gene(symbol='A')
-        a2 = VariantPredicates.gene(symbol='A')
+        a1 = vp.gene(symbol='A')
+        a2 = vp.gene(symbol='A')
 
         assert a1 & a2 is a1
         assert a1 | a2 is a1
@@ -278,7 +278,7 @@ def test_und_predicate(
         self,
         missense_variant: Variant,
         left: str,
         right: str,
         expected: bool,
     ):
-        predicate = VariantPredicates.transcript(tx_id=left) & VariantPredicates.transcript(tx_id=right)
+        predicate = vp.transcript(tx_id=left) & vp.transcript(tx_id=right)
 
         assert predicate.test(missense_variant) == expected
@@ -298,7 +298,7 @@ def test_or_predicate(
         self,
         missense_variant: Variant,
         left: str,
         right: str,
         expected: bool,
     ):
-        predicate = VariantPredicates.transcript(tx_id=left) | VariantPredicates.transcript(tx_id=right)
+        predicate = vp.transcript(tx_id=left) | vp.transcript(tx_id=right)
 
         assert predicate.test(missense_variant) == expected
@@ -315,14 +315,14 @@ def test_inv_predicate(
         self,
         missense_variant: Variant,
         tx_id: str,
         expected: bool,
     ):
-        predicate = ~VariantPredicates.transcript(tx_id)
+        predicate = ~vp.transcript(tx_id)
 
         assert predicate.test(missense_variant) == expected
 
     def test_no_double_inv_happens(
         self,
     ):
-        predicate = VariantPredicates.gene('FBN1')
+        predicate = vp.gene('FBN1')
         # Inverting a predicate must produce a new predicate.
         inv_predicate = ~predicate
@@ -337,7 +337,7 @@ def test_empty_all_predicate_raises_error(
         self,
     ):
         with pytest.raises(ValueError) as e:
             empty = ()
-            VariantPredicates.all(empty)
+            vp.allof(empty)
 
         assert e.value.args[0] == 'Predicates must not be empty!'
 
     def test_empty_any_predicate_raises_error(
         self,
     ):
         with pytest.raises(ValueError) as e:
             empty = ()
-            VariantPredicates.any(empty)
+            vp.anyof(empty)
 
         assert e.value.args[0] == 'Predicates must not be empty!'
     def test_logical_predicates_are_hashable(self):
-        a = VariantPredicates.gene(symbol='A')
-        b = VariantPredicates.gene(symbol='B')
+        a = vp.gene(symbol='A')
+        b = vp.gene(symbol='B')
 
         a_and_b = a & b
 
         assert isinstance(hash(a_and_b), int)
diff --git a/tests/analysis/pscore/test_pscore_api.py b/tests/analysis/pscore/test_pscore_api.py
index 24040b46e..2190eab22 100644
--- a/tests/analysis/pscore/test_pscore_api.py
+++ b/tests/analysis/pscore/test_pscore_api.py
@@ -4,13 +4,12 @@
 import pandas as pd
 
 from gpsea.analysis import StatisticResult
-from gpsea.analysis.predicate.genotype import GenotypePolyPredicate
+from gpsea.analysis.clf import GenotypeClassifier
 from gpsea.analysis.pscore import PhenotypeScoreAnalysisResult, PhenotypeScorer
 from gpsea.analysis.pscore.stats import MannWhitneyStatistic
 
 
 class TestPhenotypeScoreAnalysisResult:
-
     @pytest.fixture(scope="class")
     def phenotype_scorer(self) -> PhenotypeScorer:
         return PhenotypeScorer.wrap_scoring_function(
@@ -21,7 +20,7 @@ def phenotype_scorer(self) -> PhenotypeScorer:
     @pytest.fixture(scope="class")
     def result(
         self,
-        suox_gt_predicate: GenotypePolyPredicate,
+        suox_gt_clf: GenotypeClassifier,
         phenotype_scorer: PhenotypeScorer,
     ) -> PhenotypeScoreAnalysisResult:
         data = pd.DataFrame(
@@ -29,25 +28,25 @@ def result(
                 "patient_id": ["A", "B", "C"],
                 "genotype": [0, 1, None],
                 "phenotype": [
-                    10.,
-                    float('nan'),
-                    -4.,
+                    10.0,
+                    float("nan"),
+                    -4.0,
                 ],
             }
         ).set_index("patient_id")
         return PhenotypeScoreAnalysisResult(
-            gt_predicate=suox_gt_predicate,
+            gt_clf=suox_gt_clf,
             phenotype=phenotype_scorer,
             statistic=MannWhitneyStatistic(),
             data=data,
-            statistic_result=StatisticResult(statistic=.2, pval=0.1234),
+            statistic_result=StatisticResult(statistic=0.2, pval=0.1234),
         )
 
     def test_properties(
         self,
         result: PhenotypeScoreAnalysisResult,
     ):
-        assert tuple(result.gt_predicate.group_labels) == (
+        assert tuple(result.gt_clf.class_labels) == (
             "0",
             "1",
         )
@@ -62,4 +61,4 @@ def test_complete_records(
         assert records.shape == (1, 2)
         assert records.loc["A", "genotype"] == 0
-        assert records.loc["A", "phenotype"] == pytest.approx(10.)
+        assert records.loc["A", "phenotype"] == pytest.approx(10.0)
diff --git a/tests/analysis/temporal/test_temporal_api.py b/tests/analysis/temporal/test_temporal_api.py
index c486fbf7b..3ca549c41 100644
--- a/tests/analysis/temporal/test_temporal_api.py
+++ b/tests/analysis/temporal/test_temporal_api.py
@@ -11,12 +11,8 @@
 from gpsea.model import Cohort, Patient, Status, VitalStatus, Disease, Age
 from gpsea.io import GpseaJSONDecoder
 from gpsea.analysis import StatisticResult
-from gpsea.analysis.predicate.genotype import (
-    GenotypePolyPredicate,
-    VariantPredicates,
-    diagnosis_predicate,
-    monoallelic_predicate,
-)
+from gpsea.analysis.clf import GenotypeClassifier, diagnosis_classifier, monoallelic_classifier
+from gpsea.analysis.predicate import exon
 from gpsea.analysis.temporal import SurvivalAnalysis, SurvivalAnalysisResult, Survival
 from gpsea.analysis.temporal.endpoint import hpo_onset, death
 from gpsea.analysis.temporal.stats import LogRankTest
@@ -32,9 +28,9 @@ def umod_cohort(
 
 @pytest.fixture(scope="module")
-def umod_gt_predicate() -> GenotypePolyPredicate:
-    in_exon_3 = VariantPredicates.exon(3, tx_id="NM_003361.4")
-    return monoallelic_predicate(
+def umod_gt_clf() -> GenotypeClassifier:
+    in_exon_3 = exon(3, tx_id="NM_003361.4")
+    return monoallelic_classifier(
         a_predicate=in_exon_3,
         b_predicate=~in_exon_3,
         a_label="Exon 3",
@@ -43,7 +39,6 @@ def umod_gt_predicate() -> GenotypePolyPredicate:
 
 class TestSurvivalAnalysis:
-
     @pytest.fixture(scope="class")
     def survival_analysis(self) -> SurvivalAnalysis:
         return SurvivalAnalysis(statistic=LogRankTest())
@@ -53,9 +48,8 @@ def test_compare_genotype_vs_survival(
         self,
         survival_analysis: SurvivalAnalysis,
         hpo: hpotk.MinimalOntology,
         umod_cohort: Cohort,
-        umod_gt_predicate: GenotypePolyPredicate,
+        umod_gt_clf: GenotypeClassifier,
     ):
-
         endpoint = hpo_onset(
             hpo=hpo,
             term_id="HP:0003774",  # Stage 5 chronic kidney disease
         )
 
         result = survival_analysis.compare_genotype_vs_survival(
             cohort=umod_cohort,
-            gt_predicate=umod_gt_predicate,
+            gt_clf=umod_gt_clf,
             endpoint=endpoint,
         )
@@ -75,7 +69,7 @@ def test_raises_an_exception_for_an_invalid_dataset(
     ):
         d_one = Disease.from_raw_parts("OMIM:100000", name="One", is_observed=True)
         d_two = Disease.from_raw_parts("OMIM:200000", name="Two", is_observed=True)
-        gt_predicate = diagnosis_predicate(
+        gt_predicate = diagnosis_classifier(
             diagnoses=(d.identifier for d in (d_one, d_two)),
             labels=(d.name for d in (d_one, d_two)),
         )
@@ -96,7 +90,7 @@ def test_raises_an_exception_for_an_invalid_dataset(
         with pytest.raises(AnalysisException) as e:
             survival_analysis.compare_genotype_vs_survival(
                 cohort=cohort,
-                gt_predicate=gt_predicate,
+                gt_clf=gt_predicate,
                 endpoint=endpoint,
             )
@@ -106,11 +100,10 @@ def test_raises_an_exception_for_an_invalid_dataset(
 
 class TestSurvivalAnalysisResult:
-
     @pytest.fixture(scope="class")
     def result(
         self,
-        umod_gt_predicate: GenotypePolyPredicate,
+        umod_gt_clf: GenotypeClassifier,
     ) -> SurvivalAnalysisResult:
         data = pd.DataFrame(
             data={
@@ -129,7 +122,7 @@ def result(
             }
         ).set_index("patient_id")
         return SurvivalAnalysisResult(
-            gt_predicate=umod_gt_predicate,
+            gt_clf=umod_gt_clf,
             endpoint=death(),
             statistic=LogRankTest(),
             data=data,
@@ -140,7 +133,7 @@ def test_properties(
         self,
         result: SurvivalAnalysisResult,
     ):
-        assert tuple(result.gt_predicate.group_labels) == (
+        assert tuple(result.gt_clf.class_labels) == (
             "Exon 3",
             "Other exon",
         )
diff --git a/tests/analysis/test_analysis_result.py b/tests/analysis/test_analysis_result.py
index f256a8bd2..3e8f9e657 100644
--- a/tests/analysis/test_analysis_result.py
+++ b/tests/analysis/test_analysis_result.py
@@ -5,37 +5,36 @@
 import pytest
 
 from gpsea.analysis import MultiPhenotypeAnalysisResult, StatisticResult
-from gpsea.analysis.predicate.genotype import GenotypePolyPredicate
-from gpsea.analysis.predicate.phenotype import HpoPredicate
+from gpsea.analysis.clf import GenotypeClassifier, HpoClassifier
 from gpsea.analysis.pcats.stats import FisherExactTest
 
 
 @pytest.fixture(scope="class")
 def multi_phenotype_analysis_result(
     hpo: hpotk.MinimalOntology,
-    suox_gt_predicate: GenotypePolyPredicate,
+    suox_gt_clf: GenotypeClassifier,
 ) -> MultiPhenotypeAnalysisResult:
-    is_arachnodactyly = HpoPredicate(
+    is_arachnodactyly = HpoClassifier(
         hpo=hpo,
         query=hpotk.TermId.from_curie("HP:0001166"),  # Arachnodactyly
     )
-    is_seizure = HpoPredicate(
+    is_seizure = HpoClassifier(
         hpo=hpo,
         query=hpotk.TermId.from_curie("HP:0001250"),  # Seizure
     )
-    is_polydactyly = HpoPredicate(
+    is_polydactyly = HpoClassifier(
         hpo=hpo,
         query=hpotk.TermId.from_curie("HP:0100259"),  # Postaxial polydactyly
    )
-    is_clinodactyly = HpoPredicate(
+    is_clinodactyly = HpoClassifier(
         hpo=hpo,
         query=hpotk.TermId.from_curie("HP:0030084"),  # Clinodactyly
     )
     return MultiPhenotypeAnalysisResult(
-        gt_predicate=suox_gt_predicate,
+        gt_clf=suox_gt_clf,
         statistic=FisherExactTest(),
         mtc_correction="fdr_bh",
-        pheno_predicates=(
+        pheno_clfs=(
             is_arachnodactyly,
             is_seizure,
             is_polydactyly,
@@ -46,27 +45,27 @@ def multi_phenotype_analysis_result(
             pd.DataFrame(
                 data=[[10, 5], [10, 15]],
                 index=pd.Index(is_arachnodactyly.get_categories()),
-                columns=pd.Index(suox_gt_predicate.get_categories()),
+                columns=pd.Index(suox_gt_clf.get_categories()),
             ),
             pd.DataFrame(
                 data=[[5, 0], [5, 10]],
                 index=pd.Index(is_seizure.get_categories()),
-                columns=pd.Index(suox_gt_predicate.get_categories()),
+                columns=pd.Index(suox_gt_clf.get_categories()),
             ),
             pd.DataFrame(
                 data=[[50, 0], [5, 60]],
                 index=pd.Index(is_polydactyly.get_categories()),
-                columns=pd.Index(suox_gt_predicate.get_categories()),
+                columns=pd.Index(suox_gt_clf.get_categories()),
             ),
             pd.DataFrame(
                 data=[[0, 0], [10, 0]],
                 index=pd.Index(is_clinodactyly.get_categories()),
-                columns=pd.Index(suox_gt_predicate.get_categories()),
+                columns=pd.Index(suox_gt_clf.get_categories()),
             ),
         ),
         statistic_results=(
             None,
-            StatisticResult(statistic=1., pval=0.005),
+            StatisticResult(statistic=1.0, pval=0.005),
             StatisticResult(statistic=10.0, pval=0.0005),
             StatisticResult(statistic=0.1, pval=0.05),
         ),
@@ -75,7 +74,6 @@ def multi_phenotype_analysis_result(
 
 class TestMultiPhenotypeAnalysisResult:
-
     def test_significant_phenotype_indices(
         self,
         multi_phenotype_analysis_result: MultiPhenotypeAnalysisResult,
diff --git a/tests/analysis/test_mtc_filter.py b/tests/analysis/test_mtc_filter.py
index 2599eedc1..62664b082 100644
--- a/tests/analysis/test_mtc_filter.py
+++ b/tests/analysis/test_mtc_filter.py
@@ -6,28 +6,26 @@
 import pytest
 
 from gpsea.analysis.mtc_filter import HpoMtcFilter, SpecifiedTermsMtcFilter
-from gpsea.analysis.predicate.genotype import GenotypePolyPredicate
-from gpsea.analysis.predicate.phenotype import PhenotypePolyPredicate, HpoPredicate
-from gpsea.analysis.pcats import apply_predicates_on_patients
+from gpsea.analysis.clf import GenotypeClassifier, PhenotypeClassifier, HpoClassifier
+from gpsea.analysis.pcats import apply_classifiers_on_individuals
 from gpsea.model import Cohort
 
 
-@pytest.fixture(scope='module')
+@pytest.fixture(scope="module")
 def patient_counts(
     suox_cohort: Cohort,
-    suox_gt_predicate: GenotypePolyPredicate,
-    suox_pheno_predicates: typing.Sequence[PhenotypePolyPredicate[hpotk.TermId]],
+    suox_gt_clf: GenotypeClassifier,
+    suox_pheno_clfs: typing.Sequence[PhenotypeClassifier[hpotk.TermId]],
 ) -> typing.Sequence[pd.DataFrame]:
-    _, counts = apply_predicates_on_patients(
-        patients=suox_cohort.all_patients,
-        gt_predicate=suox_gt_predicate,
-        pheno_predicates=suox_pheno_predicates,
+    _, counts = apply_classifiers_on_individuals(
+        individuals=suox_cohort.all_patients,
+        gt_clf=suox_gt_clf,
+        pheno_clfs=suox_pheno_clfs,
     )
     return counts
 
 
 class TestHpoMtcFilter:
-
     @pytest.fixture
     def mtc_filter(
         self,
@@ -36,26 +34,26 @@ def mtc_filter(
         return HpoMtcFilter.default_filter(
             hpo=hpo,
             term_frequency_threshold=0.2,
-            annotation_frequency_threshold=.1,
+            annotation_frequency_threshold=0.1,
         )
 
-    @pytest.fixture(scope='class')
-    def gt_predicate(
+    @pytest.fixture(scope="class")
+    def gt_clf(
         self,
-        suox_gt_predicate: GenotypePolyPredicate,
-    ) -> GenotypePolyPredicate:
-        return suox_gt_predicate
+        suox_gt_clf: GenotypeClassifier,
+    ) -> GenotypeClassifier:
+        return suox_gt_clf
 
-    @pytest.fixture(scope='class')
+    @pytest.fixture(scope="class")
     def ph_predicate(
         self,
         hpo: hpotk.MinimalOntology,
-    ) -> PhenotypePolyPredicate[hpotk.TermId]:
+    ) -> PhenotypeClassifier[hpotk.TermId]:
         """
         For the purpose of testing counts, let's pretend the counts
         were created by this predicate.
         """
-        return HpoPredicate(
+        return HpoClassifier(
             hpo=hpo,
             query=hpotk.TermId.from_curie("HP:0001250"),  # Seizure
             missing_implies_phenotype_excluded=False,
         )
 
     @staticmethod
     def prepare_counts_df(
         counts,
-        gt_predicate: GenotypePolyPredicate,
-        ph_predicate: PhenotypePolyPredicate[hpotk.TermId],
+        gt_predicate: GenotypeClassifier,
+        ph_predicate: PhenotypeClassifier[hpotk.TermId],
     ):
         values = np.array(counts).reshape((2, 2))
         index = pd.Index(ph_predicate.get_categories())
@@ -79,20 +77,20 @@ def prepare_counts_df(
             ((10, 0, 15, 1), False),
             ((10, 0, 15, 0), True),
             ((0, 15, 0, 19), True),
-        ]
+        ],
     )
     def test_one_genotype_has_zero_hpo_observations(
         self,
         counts: typing.Sequence[int],
         expected: bool,
-        gt_predicate: GenotypePolyPredicate,
-        ph_predicate: PhenotypePolyPredicate[hpotk.TermId],
+        gt_clf: GenotypeClassifier,
+        ph_predicate: PhenotypeClassifier[hpotk.TermId],
     ):
-        counts_df = TestHpoMtcFilter.prepare_counts_df(counts, gt_predicate, ph_predicate)
+        counts_df = TestHpoMtcFilter.prepare_counts_df(counts, gt_clf, ph_predicate)
 
         actual = HpoMtcFilter.one_genotype_has_zero_hpo_observations(
             counts=counts_df,
-            gt_predicate=gt_predicate,
+            gt_clf=gt_clf,
         )
 
         assert actual == expected
@@ -102,26 +100,24 @@ def test_one_genotype_has_zero_hpo_observations(
         [
             ((1, 0, 20, 30), False),
             ((2, 0, 20, 30), True),
-
             ((0, 1, 20, 30), False),
             ((0, 2, 20, 30), True),
-
             ((0, 0, 20, 30), False),
             ((2, 2, 20, 30), True),
-        ]
+        ],
     )
     def test_some_cell_has_greater_than_one_count(
-            self,
-            counts: typing.Tuple[int],
-            expected: bool,
-            gt_predicate: GenotypePolyPredicate,
-            ph_predicate: PhenotypePolyPredicate[hpotk.TermId],
+        self,
+        counts: typing.Tuple[int],
+        expected: bool,
+        gt_clf: GenotypeClassifier,
+        ph_predicate: PhenotypeClassifier[hpotk.TermId],
     ):
-        counts_df = TestHpoMtcFilter.prepare_counts_df(counts, gt_predicate, ph_predicate)
+        counts_df = TestHpoMtcFilter.prepare_counts_df(counts, gt_clf, ph_predicate)
 
         actual = HpoMtcFilter.some_cell_has_greater_than_one_count(
             counts=counts_df,
-            ph_predicate=ph_predicate,
+            ph_clf=ph_predicate,
         )
 
         assert actual == expected
@@ -129,25 +125,25 @@ def test_some_cell_has_greater_than_one_count(
     @pytest.mark.parametrize(
         "counts, expected",
         [
-            ((1, 2, 99, 198), .01),
-            ((1, 3, 99, 197), .015),
-            ((0, 0, 100, 200), 0.),
-            ((0, 0, 0, 200), 0.),
-            ((0, 0, 0, 0), 0.),
-        ]
+            ((1, 2, 99, 198), 0.01),
+            ((1, 3, 99, 197), 0.015),
+            ((0, 0, 100, 200), 0.0),
+            ((0, 0, 0, 200), 0.0),
+            ((0, 0, 0, 0), 0.0),
+        ],
     )
     def test_get_maximum_group_observed_HPO_frequency(
         self,
         counts: typing.Tuple[int],
         expected: float,
-        gt_predicate: GenotypePolyPredicate,
-        ph_predicate: PhenotypePolyPredicate[hpotk.TermId],
+        gt_clf: GenotypeClassifier,
+        ph_predicate: PhenotypeClassifier[hpotk.TermId],
    ):
-        counts_df = TestHpoMtcFilter.prepare_counts_df(counts, gt_predicate, ph_predicate)
+        counts_df = TestHpoMtcFilter.prepare_counts_df(counts, gt_clf, ph_predicate)
 
         actual = HpoMtcFilter.get_maximum_group_observed_HPO_frequency(
             counts_frame=counts_df,
-            ph_predicate=ph_predicate,
+            ph_clf=ph_predicate,
         )
 
         assert actual == pytest.approx(expected)
@@ -155,14 +151,14 @@ def test_get_maximum_group_observed_HPO_frequency(
     def test_filter_terms_to_test(
         self,
         mtc_filter: HpoMtcFilter,
-        suox_gt_predicate: GenotypePolyPredicate,
-        suox_pheno_predicates: typing.Sequence[PhenotypePolyPredicate[hpotk.TermId]],
+        suox_gt_clf: GenotypeClassifier,
+        suox_pheno_clfs: typing.Sequence[PhenotypeClassifier[hpotk.TermId]],
         patient_counts: typing.Sequence[pd.DataFrame],
         suox_cohort: Cohort,
     ):
         mtc_report = mtc_filter.filter(
-            gt_predicate=suox_gt_predicate,
-            ph_predicates=suox_pheno_predicates,
+            gt_clf=suox_gt_clf,
+            pheno_clfs=suox_pheno_clfs,
             counts=patient_counts,
             cohort_size=len(suox_cohort),
         )
@@ -175,14 +171,16 @@ def test_filter_terms_to_test(
         reasons = [r.reason for r in mtc_report]
         assert reasons == [
-            None, None,
-            'Skipping term with less than 7 observations (not powered for 2x2)',
-            None, None,
+            None,
+            None,
+            "Skipping term with less than 7 observations (not powered for 2x2)",
+            None,
+            None,
         ]
 
     def test_min_observed_HPO_threshold(
         self,
-        suox_pheno_predicates: typing.Sequence[PhenotypePolyPredicate[hpotk.TermId]],
+        suox_pheno_clfs: typing.Sequence[PhenotypeClassifier[hpotk.TermId]],
         patient_counts: typing.Sequence[pd.DataFrame],
     ):
         """
@@ -196,68 +194,62 @@ def test_min_observed_HPO_threshold(
         for all of the HPO terms in the dictionary.
""" EPSILON = 0.001 - curie2idx = { - p.phenotype.value: i - for i, p - in enumerate(suox_pheno_predicates) - } + curie2idx = {p.phenotype.value: i for i, p in enumerate(suox_pheno_clfs)} # Ectopia lentis HP:0001083 (1 2 3 1), freqs are 1/4=0.25 and 3/4=0.75 idx = curie2idx["HP:0001083"] ectopia = patient_counts[idx] - ectopia_predicate = suox_pheno_predicates[idx] + ectopia_predicate = suox_pheno_clfs[idx] max_f = HpoMtcFilter.get_maximum_group_observed_HPO_frequency( ectopia, - ph_predicate=ectopia_predicate, + ph_clf=ectopia_predicate, ) assert max_f == pytest.approx(0.75, abs=EPSILON) # Seizure HP:0001250 (11 5 0 1), freqs are 11/11=1.0 and 5/6=0.8333333 idx = curie2idx["HP:0001250"] seizure = patient_counts[idx] - seizure_predicate = suox_pheno_predicates[idx] + seizure_predicate = suox_pheno_clfs[idx] max_f = HpoMtcFilter.get_maximum_group_observed_HPO_frequency( - seizure, - ph_predicate=seizure_predicate + seizure, ph_clf=seizure_predicate ) assert max_f == pytest.approx(1.0, abs=EPSILON) # Sulfocysteinuria HP:0032350 (2 3 0 0), freqs are both 1 idx = curie2idx["HP:0032350"] sulfocysteinuria = patient_counts[idx] - sulfocysteinuria_predicate = suox_pheno_predicates[idx] + sulfocysteinuria_predicate = suox_pheno_clfs[idx] max_f = HpoMtcFilter.get_maximum_group_observed_HPO_frequency( sulfocysteinuria, - ph_predicate=sulfocysteinuria_predicate, + ph_clf=sulfocysteinuria_predicate, ) assert max_f == pytest.approx(1.0, abs=EPSILON) # Neurodevelopmental delay HP:0012758 (4 0 4 5), freqs are 4/8 = 0.5 and 0/5=0.0 idx = curie2idx["HP:0012758"] ndelay = patient_counts[idx] - ndelay_predicate = suox_pheno_predicates[idx] + ndelay_predicate = suox_pheno_clfs[idx] max_f = HpoMtcFilter.get_maximum_group_observed_HPO_frequency( ndelay, - ph_predicate=ndelay_predicate, + ph_clf=ndelay_predicate, ) assert max_f == pytest.approx(0.5, abs=EPSILON) # Hypertonia HP:0001276 (4 2 3 3) freqs are 4/7=0.4375 and 2/5=0.5714 idx = curie2idx["HP:0001276"] hypertonia = patient_counts[idx] - hypertonia_predicate = suox_pheno_predicates[idx] + hypertonia_predicate = suox_pheno_clfs[idx] max_f = HpoMtcFilter.get_maximum_group_observed_HPO_frequency( hypertonia, - ph_predicate=hypertonia_predicate, + ph_clf=hypertonia_predicate, ) assert max_f == pytest.approx(0.5714, abs=EPSILON) class TestSpecifyTermsMtcFilter: - def test_specified_term_mtc_filter( self, - suox_gt_predicate: GenotypePolyPredicate, - suox_pheno_predicates: typing.Sequence[PhenotypePolyPredicate[hpotk.TermId]], + suox_gt_clf: GenotypeClassifier, + suox_pheno_clfs: typing.Sequence[PhenotypeClassifier[hpotk.TermId]], patient_counts: typing.Sequence[pd.DataFrame], suox_cohort: Cohort, ): @@ -268,14 +260,12 @@ def test_specified_term_mtc_filter( reason for filtering out is 'Non-specified term' """ specified_filter = SpecifiedTermsMtcFilter( - terms_to_test=( - hpotk.TermId.from_curie("HP:0032350"), - ), + terms_to_test=(hpotk.TermId.from_curie("HP:0032350"),), ) mtc_report = specified_filter.filter( - gt_predicate=suox_gt_predicate, - ph_predicates=suox_pheno_predicates, + gt_clf=suox_gt_clf, + pheno_clfs=suox_pheno_clfs, counts=patient_counts, cohort_size=len(suox_cohort), ) @@ -284,16 +274,20 @@ def test_specified_term_mtc_filter( is_passed = [r.is_passed() for r in mtc_report] assert is_passed == [ - False, False, True, False, False, + False, + False, + True, + False, + False, ] reasons = [r.reason for r in mtc_report] assert reasons == [ - 'Non-specified term', - 'Non-specified term', + "Non-specified term", + "Non-specified term", None, - 
-            'Non-specified term',
+            "Non-specified term",
+            "Non-specified term",
         ]
 
     @pytest.mark.parametrize(
@@ -301,13 +295,9 @@ def test_specified_term_mtc_filter(
         [
             ("NotACurie", "The CURIE NotACurie has no colon `:` or underscore `_`"),
             (0, "0 is neither `str` nor `hpotk.TermId`"),
-        ]
+        ],
     )
-    def test_explodes_if_invalid_terms_provided(
-        self,
-        val: typing.Any,
-        msg: str
-    ):
+    def test_explodes_if_invalid_terms_provided(self, val: typing.Any, msg: str):
         with pytest.raises(ValueError) as e:
             SpecifiedTermsMtcFilter(
                 terms_to_test=(val,),
diff --git a/tests/conftest.py b/tests/conftest.py
index eb3725662..ae9d9f216 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -12,13 +12,9 @@
     ValidationRunner,
 )
 
-from gpsea.analysis.predicate.genotype import (
-    GenotypePolyPredicate,
-    VariantPredicates,
-    biallelic_predicate,
-    allele_count,
-)
-from gpsea.analysis.predicate.phenotype import PhenotypePolyPredicate, HpoPredicate
+from gpsea.analysis.clf import GenotypeClassifier, biallelic_classifier, allele_count
+from gpsea.analysis.clf import PhenotypeClassifier, HpoClassifier
+from gpsea.analysis.predicate import variant_effect
 from gpsea.io import GpseaJSONDecoder
 from gpsea.model import (
     Cohort,
@@ -155,45 +151,45 @@ def suox_mane_tx_id() -> str:
 
 @pytest.fixture(scope="session")
-def suox_gt_predicate(
+def suox_gt_clf(
     suox_mane_tx_id: str,
-) -> GenotypePolyPredicate:
+) -> GenotypeClassifier:
     return allele_count(
         counts=((0,), (1,)),
-        target=VariantPredicates.variant_effect(
+        target=variant_effect(
             effect=VariantEffect.MISSENSE_VARIANT, tx_id=suox_mane_tx_id
         ),
     )
 
 
 @pytest.fixture(scope="session")
-def suox_pheno_predicates(
+def suox_pheno_clfs(
     hpo: hpotk.MinimalOntology,
-) -> typing.Sequence[PhenotypePolyPredicate[hpotk.TermId]]:
+) -> typing.Sequence[PhenotypeClassifier[hpotk.TermId]]:
     """
-    Get predicates for test for presence of 5 HPO terms:
+    Get classifiers to test for presence of 5 HPO terms:
     Seizure, Ectopia lentis, Sulfocysteinuria, Neurodevelopmental delay, and Hypertonia.
 
     Note, these are just a *SUBSET* of all phenotypes that can be tested for in the *SUOX* cohort.
""" return ( - HpoPredicate( + HpoClassifier( hpo=hpo, query=hpotk.TermId.from_curie("HP:0001250"), # Seizure ), - HpoPredicate( + HpoClassifier( hpo=hpo, query=hpotk.TermId.from_curie("HP:0001083"), # Ectopia lentis ), - HpoPredicate( + HpoClassifier( hpo=hpo, query=hpotk.TermId.from_curie("HP:0032350"), # Sulfocysteinuria ), - HpoPredicate( + HpoClassifier( hpo=hpo, query=hpotk.TermId.from_curie("HP:0012758"), # Neurodevelopmental delay ), - HpoPredicate( + HpoClassifier( hpo=hpo, query=hpotk.TermId.from_curie("HP:0001276"), # Hypertonia ), @@ -250,11 +246,11 @@ def cyp21a2_mane_tx_id() -> str: @pytest.fixture(scope="session") -def cyp21a2_gt_predicate( +def cyp21a2_gt_clf( cyp21a2_mane_tx_id: str, -) -> GenotypePolyPredicate: - return biallelic_predicate( - a_predicate=VariantPredicates.variant_effect( +) -> GenotypeClassifier: + return biallelic_classifier( + a_predicate=variant_effect( effect=VariantEffect.MISSENSE_VARIANT, tx_id=cyp21a2_mane_tx_id, ), diff --git a/tests/test_predicates.py b/tests/test_predicates.py index 43e8d8d25..ba71ffdfb 100644 --- a/tests/test_predicates.py +++ b/tests/test_predicates.py @@ -1,7 +1,7 @@ import hpotk import pytest -from gpsea.analysis.predicate.phenotype import HpoPredicate, DiseasePresencePredicate +from gpsea.analysis.clf import HpoClassifier, DiseasePresenceClassifier from gpsea.model import Cohort, Patient @@ -42,7 +42,7 @@ def test_phenotype_predicate__present_or_excluded( ): patient = find_patient(patient_id, toy_cohort) term_id = hpotk.TermId.from_curie(curie) - predicate = HpoPredicate(hpo=hpo, query=term_id) + predicate = HpoClassifier(hpo=hpo, query=term_id) actual = predicate.test(patient) assert actual is not None @@ -57,7 +57,7 @@ def test_phenotype_predicate__unknown( # Not Measured and not Observed - 'HP:0006280', # Chronic pancreatitis patient = find_patient('HetSingleVar', toy_cohort) term_id = hpotk.TermId.from_curie('HP:0006280') - predicate = HpoPredicate(hpo=hpo, query=term_id) + predicate = HpoClassifier(hpo=hpo, query=term_id) actual = predicate.test(patient) assert actual is None @@ -80,7 +80,7 @@ def test_disease_predicate( ): patient = find_patient(patient_id, toy_cohort) disease_id = hpotk.TermId.from_curie("OMIM:148050") - predicate = DiseasePresencePredicate(disease_id) + predicate = DiseasePresenceClassifier(disease_id) actual = predicate.test(patient) assert actual is not None diff --git a/tests/view/test_view.py b/tests/view/test_view.py index ace650cd3..9dea42967 100644 --- a/tests/view/test_view.py +++ b/tests/view/test_view.py @@ -8,16 +8,19 @@ from gpsea.analysis import StatisticResult from gpsea.analysis.pcats import HpoTermAnalysisResult from gpsea.analysis.pcats.stats import FisherExactTest -from gpsea.analysis.predicate.genotype import GenotypePolyPredicate -from gpsea.analysis.predicate.phenotype import HpoPredicate +from gpsea.analysis.clf import GenotypeClassifier, HpoClassifier from gpsea.analysis.mtc_filter import PhenotypeMtcResult from gpsea.model import Cohort -from gpsea.view import CohortViewer, CohortVariantViewer, MtcStatsViewer, summarize_hpo_analysis +from gpsea.view import ( + CohortViewer, + CohortVariantViewer, + MtcStatsViewer, + summarize_hpo_analysis, +) @pytest.mark.skip("Just for manual testing and debugging") class TestCohortViewer: - @pytest.fixture def cohort_viewer( self, @@ -69,26 +72,25 @@ def test_viewer( class TestMtcStatsViewer: - @pytest.fixture(scope="class") def hpo_term_analysis_result( self, hpo: hpotk.MinimalOntology, - suox_gt_predicate: GenotypePolyPredicate, + 
     ) -> HpoTermAnalysisResult:
-        is_arachnodactyly = HpoPredicate(
+        is_arachnodactyly = HpoClassifier(
             hpo=hpo,
             query=hpotk.TermId.from_curie("HP:0001166"),  # Arachnodactyly
         )
-        is_seizure = HpoPredicate(
+        is_seizure = HpoClassifier(
             hpo=hpo,
             query=hpotk.TermId.from_curie("HP:0001250"),  # Seizure
         )
         return HpoTermAnalysisResult(
-            gt_predicate=suox_gt_predicate,
+            gt_clf=suox_gt_clf,
             statistic=FisherExactTest(),
             mtc_correction="fdr_bh",
-            pheno_predicates=(
+            pheno_clfs=(
                 is_arachnodactyly,
                 is_seizure,
             ),
@@ -97,17 +99,17 @@ def hpo_term_analysis_result(
                 pd.DataFrame(
                     data=[[10, 5], [10, 15]],
                     index=pd.Index(is_arachnodactyly.get_categories()),
-                    columns=pd.Index(suox_gt_predicate.get_categories()),
+                    columns=pd.Index(suox_gt_clf.get_categories()),
                 ),
                 pd.DataFrame(
                     data=[[5, 0], [5, 10]],
                     index=pd.Index(is_seizure.get_categories()),
-                    columns=pd.Index(suox_gt_predicate.get_categories()),
+                    columns=pd.Index(suox_gt_clf.get_categories()),
                ),
             ),
             statistic_results=(
-                StatisticResult(statistic=None, pval=math.nan),
-                StatisticResult(statistic=1.23, pval=0.01),
+                StatisticResult(statistic=None, pval=math.nan),
+                StatisticResult(statistic=1.23, pval=0.01),
             ),
             corrected_pvals=(math.nan, 0.01),
             mtc_filter_name="Random MTC filter",