From b694fec4faf6fbacf2bfb0dea40103879083bdee Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Wed, 11 Sep 2024 13:20:51 +0200 Subject: [PATCH 1/8] Remove X-linked modes of inheritance. --- .../predicate/genotype/_gt_predicates.py | 175 +----------------- .../predicate/genotype/test_gt_predicates.py | 56 +----- 2 files changed, 13 insertions(+), 218 deletions(-) diff --git a/src/gpsea/analysis/predicate/genotype/_gt_predicates.py b/src/gpsea/analysis/predicate/genotype/_gt_predicates.py index 341d17fb..a9972044 100644 --- a/src/gpsea/analysis/predicate/genotype/_gt_predicates.py +++ b/src/gpsea/analysis/predicate/genotype/_gt_predicates.py @@ -1,5 +1,4 @@ import dataclasses -import enum import typing from collections import defaultdict @@ -291,23 +290,6 @@ class GenotypeGroup: categorization: Categorization -class MendelianInheritanceAspect(enum.Enum): - AUTOSOMAL = 0 - """ - Related to chromosomes that do *not* determine the sex of an individual. - """ - - GONOSOMAL = 1 - """ - Related to chromosomes that determine the sex of an individual. - """ - - MITOCHONDRIAL = 2 - """ - Related to mitochondrial DNA. - """ - - class ModeOfInheritanceInfo: # NOT PART OF THE PUBLIC API!!! @@ -333,13 +315,6 @@ class ModeOfInheritanceInfo: description="Homozygous alternate or compound heterozygous", ), ) - HEMI = Categorization( - PatientCategory( - cat_id=3, - name="HEMI", - description="Hemizygous", - ), - ) @staticmethod def autosomal_dominant() -> "ModeOfInheritanceInfo": @@ -356,7 +331,6 @@ def autosomal_dominant() -> "ModeOfInheritanceInfo": ), ) return ModeOfInheritanceInfo( - mendelian_inheritance_aspect=MendelianInheritanceAspect.AUTOSOMAL, groups=groups, ) @@ -380,62 +354,11 @@ def autosomal_recessive() -> "ModeOfInheritanceInfo": ), ) return ModeOfInheritanceInfo( - mendelian_inheritance_aspect=MendelianInheritanceAspect.AUTOSOMAL, - groups=groups, - ) - - @staticmethod - def x_dominant() -> "ModeOfInheritanceInfo": - groups = ( - GenotypeGroup( - allele_count=0, - sex=None, - categorization=ModeOfInheritanceInfo.HOM_REF, - ), - GenotypeGroup( - allele_count=1, - sex=None, - categorization=ModeOfInheritanceInfo.HET, - ), - ) - return ModeOfInheritanceInfo( - mendelian_inheritance_aspect=MendelianInheritanceAspect.GONOSOMAL, - groups=groups, - ) - - @staticmethod - def x_recessive() -> "ModeOfInheritanceInfo": - groups = ( - GenotypeGroup( - allele_count=0, - sex=None, - categorization=ModeOfInheritanceInfo.HOM_REF, - ), - GenotypeGroup( - allele_count=1, - sex=Sex.FEMALE, - categorization=ModeOfInheritanceInfo.HET, - ), - GenotypeGroup( - allele_count=2, - sex=Sex.FEMALE, - categorization=ModeOfInheritanceInfo.BIALLELIC_ALT, - ), - GenotypeGroup( - allele_count=1, - sex=Sex.MALE, - categorization=ModeOfInheritanceInfo.HEMI, - ), - ) - - return ModeOfInheritanceInfo( - mendelian_inheritance_aspect=MendelianInheritanceAspect.GONOSOMAL, groups=groups, ) def __init__( self, - mendelian_inheritance_aspect: MendelianInheritanceAspect, groups: typing.Iterable[GenotypeGroup], ): # We want this to be hashable but also keep a non-hashable dict @@ -443,10 +366,6 @@ def __init__( # The correctness depends on two default dicts with same keys and values # comparing equal. hash_value = 17 - assert isinstance(mendelian_inheritance_aspect, MendelianInheritanceAspect) - self._aspect = mendelian_inheritance_aspect - - hash_value += 31 * hash(self._aspect) self._groups = defaultdict(list) for group in groups: @@ -461,10 +380,6 @@ def groups(self) -> typing.Iterator[GenotypeGroup]: # Flatten `values()` which is an iterable of lists. return (group for meta_group in self._groups.values() for group in meta_group) - @property - def mendelian_inheritance_aspect(self) -> MendelianInheritanceAspect: - return self._aspect - def get_groups_for_allele_count( self, allele_count: int, @@ -475,19 +390,9 @@ def get_groups_for_allele_count( # No group for this allele count is OK return () - def is_autosomal(self) -> bool: - return self._aspect == MendelianInheritanceAspect.AUTOSOMAL - - def is_gonosomal(self) -> bool: - return self._aspect == MendelianInheritanceAspect.GONOSOMAL - - def is_mitochondrial(self) -> bool: - return self._aspect == MendelianInheritanceAspect.MITOCHONDRIAL - def __eq__(self, value: object) -> bool: return ( isinstance(value, ModeOfInheritanceInfo) - and self._aspect == value._aspect and self._groups == value._groups ) @@ -495,7 +400,7 @@ def __hash__(self) -> int: return self._hash def __str__(self) -> str: - return f"ModeOfInheritanceInfo(aspect={self._aspect}, groups={self._groups})" + return f"ModeOfInheritanceInfo(groups={self._groups})" def __repr__(self) -> str: return str(self) @@ -518,7 +423,7 @@ def autosomal_dominant( :param variant_predicate: a predicate for choosing the variants for testing. """ - return ModeOfInheritancePredicate.from_moi_info( + return ModeOfInheritancePredicate._from_moi_info( variant_predicate=variant_predicate, mode_of_inheritance_data=ModeOfInheritanceInfo.autosomal_dominant(), ) @@ -535,46 +440,13 @@ def autosomal_recessive( :param variant_predicate: a predicate for choosing the variants for testing. """ - return ModeOfInheritancePredicate.from_moi_info( + return ModeOfInheritancePredicate._from_moi_info( variant_predicate=variant_predicate, mode_of_inheritance_data=ModeOfInheritanceInfo.autosomal_recessive(), ) @staticmethod - def x_dominant( - variant_predicate: VariantPredicate, - ) -> "ModeOfInheritancePredicate": - """ - Create a predicate that assigns the patient either into - homozygous reference or heterozygous - group in line with the X-linked dominant mode of inheritance. - - :param variant_predicate: a predicate for choosing the variants for testing. - """ - return ModeOfInheritancePredicate.from_moi_info( - variant_predicate=variant_predicate, - mode_of_inheritance_data=ModeOfInheritanceInfo.x_dominant(), - ) - - @staticmethod - def x_recessive( - variant_predicate: VariantPredicate, - ) -> "ModeOfInheritancePredicate": - """ - Create a predicate that assigns the patient either into - homozygous reference, heterozygous, biallelic alternative allele - (homozygous alternative or compound heterozygous), or hemizygous - group in line with the X-linked recessive mode of inheritance. - - :param variant_predicate: a predicate for choosing the variants for testing. - """ - return ModeOfInheritancePredicate.from_moi_info( - variant_predicate=variant_predicate, - mode_of_inheritance_data=ModeOfInheritanceInfo.x_recessive(), - ) - - @staticmethod - def from_moi_info( + def _from_moi_info( variant_predicate: VariantPredicate, mode_of_inheritance_data: ModeOfInheritanceInfo, ) -> "ModeOfInheritancePredicate": @@ -620,41 +492,12 @@ def test( ) -> typing.Optional[Categorization]: self._check_patient(patient) - if self._moi_info.is_autosomal(): - allele_count = self._allele_counter.count(patient) - groups = self._moi_info.get_groups_for_allele_count(allele_count) - if len(groups) == 1: - return groups[0].categorization - else: - return None - elif self._moi_info.is_gonosomal(): - if patient.sex.is_provided(): - allele_count = self._allele_counter.count(patient) - groups = self._moi_info.get_groups_for_allele_count(allele_count) - if len(groups) == 0: - # Unable to assign the individual. - return None - elif len(groups) == 1: - # We can only assign into one category no matter what the individual's sex is. - return groups[0].categorization - else: - # We choose depending on the sex. - for group in groups: - if group.sex is not None and group.sex == patient.sex: - return group.categorization - return None - else: - # We must have patient's sex - # to do any meaningful analysis - # in the non-autosomal scenario. - return None - - elif self._moi_info.is_mitochondrial(): - # Cannot deal with mitochondrial inheritance right now. - return None + allele_count = self._allele_counter.count(patient) + groups = self._moi_info.get_groups_for_allele_count(allele_count) + if len(groups) == 1: + return groups[0].categorization else: - # Bug, please report to the developers - raise ValueError("Unexpected mode of inheritance condition") + return None def __eq__(self, value: object) -> bool: return ( diff --git a/tests/analysis/predicate/genotype/test_gt_predicates.py b/tests/analysis/predicate/genotype/test_gt_predicates.py index 61ff29c3..6d82201c 100644 --- a/tests/analysis/predicate/genotype/test_gt_predicates.py +++ b/tests/analysis/predicate/genotype/test_gt_predicates.py @@ -136,62 +136,13 @@ def test_autosomal_recessive( assert categorization.category.name == name - @pytest.mark.parametrize( - "patient_name,name", - [ - ("adam", "HOM_REF"), - ("eve", "HET"), - ("cain", "HET"), - ], - ) - def test_x_dominant( - self, - patient_name: str, - name: str, - variant_predicate: VariantPredicate, - request: pytest.FixtureRequest, - ): - patient = request.getfixturevalue(patient_name) - predicate = ModeOfInheritancePredicate.x_dominant(variant_predicate) - - categorization = predicate.test(patient) - - assert categorization is not None - - assert categorization.category.name == name - - @pytest.mark.parametrize( - "patient_name,name", - [ - ("anakin", "HOM_REF"), - ("padme", "HET"), - ("luke", "HEMI"), - ("leia", "HET"), - ], - ) - def test_x_recessive( - self, - patient_name: str, - name: str, - variant_predicate: VariantPredicate, - request: pytest.FixtureRequest, - ): - patient = request.getfixturevalue(patient_name) - predicate = ModeOfInheritancePredicate.x_recessive(variant_predicate) - - categorization = predicate.test(patient) - - assert categorization is not None - - assert categorization.category.name == name - class TestFilteringPredicate: @pytest.fixture(scope="class") def x_recessive_gt_predicate(self) -> GenotypePolyPredicate: affects_suox = VariantPredicates.gene("SUOX") - return ModeOfInheritancePredicate.x_recessive( + return ModeOfInheritancePredicate.autosomal_recessive( variant_predicate=affects_suox, ) @@ -232,17 +183,18 @@ def test_filtering_predicate__explodes_when_not_subsetting( assert ( ve.value.args[0] - == "It makes no sense to subset the a predicate with 4 categorizations with the same number (4) of targets" + == "It makes no sense to subset the a predicate with 3 categorizations with the same number (3) of targets" ) def test_filtering_predicate__explodes_when_using_random_junk( self, x_recessive_gt_predicate: GenotypePolyPredicate, ): + random_junk = (0, 1) with pytest.raises(ValueError) as ve: filtering_predicate( predicate=x_recessive_gt_predicate, - targets=(0, 1), + targets=random_junk, # type: ignore ) assert ( From 134f85eabe6f62a407d132f26f9bb3b9a5b21de3 Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Wed, 11 Sep 2024 15:03:45 +0200 Subject: [PATCH 2/8] Implement monoallelic and biallelic predicates. --- .../analysis/predicate/genotype/__init__.py | 2 + .../predicate/genotype/_gt_predicates.py | 168 ++++++++++++++++-- tests/analysis/predicate/genotype/conftest.py | 86 +++++++-- .../predicate/genotype/test_gt_predicates.py | 81 +++++++++ 4 files changed, 312 insertions(+), 25 deletions(-) diff --git a/src/gpsea/analysis/predicate/genotype/__init__.py b/src/gpsea/analysis/predicate/genotype/__init__.py index 23ad8cfb..d684972c 100644 --- a/src/gpsea/analysis/predicate/genotype/__init__.py +++ b/src/gpsea/analysis/predicate/genotype/__init__.py @@ -2,12 +2,14 @@ from ._api import VariantPredicate from ._counter import AlleleCounter from ._gt_predicates import boolean_predicate, groups_predicate, filtering_predicate, sex_predicate, diagnosis_predicate +from ._gt_predicates import monoallelic_predicate, biallelic_predicate from ._gt_predicates import ModeOfInheritancePredicate from ._variant import VariantPredicates, ProteinPredicates __all__ = [ 'GenotypePolyPredicate', 'boolean_predicate', 'groups_predicate', 'filtering_predicate', 'sex_predicate', 'diagnosis_predicate', + 'monoallelic_predicate', 'biallelic_predicate', 'ModeOfInheritancePredicate', 'AlleleCounter', 'VariantPredicate', 'VariantPredicates', 'ProteinPredicates', diff --git a/src/gpsea/analysis/predicate/genotype/_gt_predicates.py b/src/gpsea/analysis/predicate/genotype/_gt_predicates.py index a9972044..d21be631 100644 --- a/src/gpsea/analysis/predicate/genotype/_gt_predicates.py +++ b/src/gpsea/analysis/predicate/genotype/_gt_predicates.py @@ -15,7 +15,7 @@ # TODO: implement __hash__, __eq__ on predicates -class AlleleCountingGenotypeBooleanPredicate(GenotypePolyPredicate): +class AlleleCountingGenotypePredicate(GenotypePolyPredicate): # NOT PART OF THE PUBLIC API """ The predicate tests presence of at least one matching allele in the patient. @@ -26,7 +26,7 @@ class AlleleCountingGenotypeBooleanPredicate(GenotypePolyPredicate): @staticmethod def for_variant_predicate(predicate: VariantPredicate): allele_counter = AlleleCounter(predicate=predicate) - return AlleleCountingGenotypeBooleanPredicate(allele_counter=allele_counter) + return AlleleCountingGenotypePredicate(allele_counter=allele_counter) def __init__(self, allele_counter: AlleleCounter): self._allele_counter = allele_counter @@ -34,12 +34,12 @@ def __init__(self, allele_counter: AlleleCounter): def get_categorizations(self) -> typing.Sequence[Categorization]: """ The predicate bins a patient into - :attr:`AlleleCountingGenotypeBooleanPredicate.NO` - or :class:`AlleleCountingGenotypeBooleanPredicate.YES` category. + :attr:`AlleleCountingGenotypePredicate.NO` + or :class:`AlleleCountingGenotypePredicate.YES` category. """ return ( - AlleleCountingGenotypeBooleanPredicate.YES, - AlleleCountingGenotypeBooleanPredicate.NO, + AlleleCountingGenotypePredicate.YES, + AlleleCountingGenotypePredicate.NO, ) def get_question_base(self) -> str: @@ -50,9 +50,9 @@ def test(self, patient: Patient) -> typing.Optional[Categorization]: allele_count = self._allele_counter.count(patient) if allele_count > 0: - return AlleleCountingGenotypeBooleanPredicate.YES + return AlleleCountingGenotypePredicate.YES elif allele_count == 0: - return AlleleCountingGenotypeBooleanPredicate.NO + return AlleleCountingGenotypePredicate.NO else: raise ValueError( f"Allele counter should return a non-negative allele count: {allele_count}" @@ -60,7 +60,7 @@ def test(self, patient: Patient) -> typing.Optional[Categorization]: def __eq__(self, value: object) -> bool: return ( - isinstance(value, AlleleCountingGenotypeBooleanPredicate) + isinstance(value, AlleleCountingGenotypePredicate) and self._allele_counter == value._allele_counter ) @@ -68,7 +68,7 @@ def __hash__(self) -> int: return hash((self._allele_counter,)) def __str__(self) -> str: - return f"AlleleCountingGenotypeBooleanPredicate(allele_counter={self._allele_counter})" + return f"AlleleCountingGenotypePredicate(allele_counter={self._allele_counter})" def __repr__(self) -> str: return str(self) @@ -79,7 +79,7 @@ def boolean_predicate(variant_predicate: VariantPredicate) -> GenotypePolyPredic Create a genotype boolean predicate from given `variant_predicate` to test for presence of at least one matching allele in the patient. """ - return AlleleCountingGenotypeBooleanPredicate.for_variant_predicate( + return AlleleCountingGenotypePredicate.for_variant_predicate( predicate=variant_predicate, ) @@ -186,6 +186,144 @@ def groups_predicate( ) +class MonoallelicGenotypePredicate(GenotypePolyPredicate): + # NOT PART OF THE PUBLIC API + + ZERO = Categorization( + PatientCategory(cat_id=0, name="Zero", description="Zero alleles") + ) + ONE = Categorization( + PatientCategory(cat_id=1, name="One", description="One allele") + ) + + @staticmethod + def for_variant_predicate(predicate: VariantPredicate): + allele_counter = AlleleCounter(predicate=predicate) + return MonoallelicGenotypePredicate(allele_counter=allele_counter) + + def __init__( + self, + allele_counter: AlleleCounter, + ): + self._allele_counter = allele_counter + + def get_categorizations(self) -> typing.Sequence[Categorization]: + return (MonoallelicGenotypePredicate.ZERO, MonoallelicGenotypePredicate.ONE) + + def get_question_base(self) -> str: + return "The variant allele count" + + def test(self, patient: Patient) -> typing.Optional[Categorization]: + self._check_patient(patient) + + allele_count = self._allele_counter.count(patient) + if allele_count == 0: + return MonoallelicGenotypePredicate.ZERO + elif allele_count == 1: + return MonoallelicGenotypePredicate.ONE + else: + return None + + def __eq__(self, value: object) -> bool: + return ( + isinstance(value, MonoallelicGenotypePredicate) + and self._allele_counter == value._allele_counter + ) + + def __hash__(self) -> int: + return hash((self._allele_counter,)) + + def __str__(self) -> str: + return f"MonoallelicGenotypePredicate(allele_counter={self._allele_counter})" + + def __repr__(self) -> str: + return str(self) + + +def monoallelic_predicate(variant_predicate: VariantPredicate) -> GenotypePolyPredicate: + return MonoallelicGenotypePredicate.for_variant_predicate(variant_predicate) + + +class BiallelicGenotypePredicate(GenotypePolyPredicate): + # NOT PART OF THE PUBLIC API + + AA = Categorization(PatientCategory(cat_id=0, name="AA", description="Biallelic A")) + AB = Categorization( + PatientCategory( + cat_id=1, name="AB", description="One copy of A and one copy of B" + ) + ) + BB = Categorization(PatientCategory(cat_id=2, name="BB", description="Biallelic B")) + + @staticmethod + def for_variant_predicates( + a_predicate: VariantPredicate, + b_predicate: VariantPredicate, + ): + return BiallelicGenotypePredicate( + a_counter=AlleleCounter(a_predicate), + b_counter=AlleleCounter(b_predicate), + ) + + def __init__( + self, + a_counter: AlleleCounter, + b_counter: AlleleCounter, + ): + self._a_counter = a_counter + self._b_counter = b_counter + + def get_categorizations(self) -> typing.Sequence[Categorization]: + return ( + BiallelicGenotypePredicate.AA, + BiallelicGenotypePredicate.AB, + BiallelicGenotypePredicate.BB, + ) + + def get_question_base(self) -> str: + return "The variant allele count" + + def test(self, patient: Patient) -> typing.Optional[Categorization]: + self._check_patient(patient) + + a_count = self._a_counter.count(patient) + b_count = self._b_counter.count(patient) + if a_count == 2 and b_count == 0: + return BiallelicGenotypePredicate.AA + elif a_count == 1 and b_count == 1: + return BiallelicGenotypePredicate.AB + elif a_count == 0 and b_count == 2: + return BiallelicGenotypePredicate.BB + else: + return None + + def __eq__(self, value: object) -> bool: + return ( + isinstance(value, BiallelicGenotypePredicate) + and self._a_counter == value._a_counter + and self._b_counter == value._b_counter + ) + + def __hash__(self) -> int: + return hash((self._a_counter, self._b_counter)) + + def __str__(self) -> str: + return f"BiallelicGenotypePredicate(a_counter={self._a_counter}, b_counter={self._b_counter})" + + def __repr__(self) -> str: + return str(self) + + +def biallelic_predicate( + a_predicate: VariantPredicate, + b_predicate: VariantPredicate, +) -> GenotypePolyPredicate: + return BiallelicGenotypePredicate.for_variant_predicates( + a_predicate=a_predicate, + b_predicate=b_predicate, + ) + + class FilteringGenotypePolyPredicate(GenotypePolyPredicate): # NOT PART OF THE PUBLIC API @@ -592,7 +730,7 @@ def create( pass else: raise ValueError(f"{d} is neither `str` nor `hpotk.TermId`") - + diagnosis_ids.append(d) if labels is None: @@ -631,7 +769,7 @@ def get_categorizations(self) -> typing.Sequence[Categorization]: return self._categorizations def get_question_base(self) -> str: - return 'What disease was diagnosed' + return "What disease was diagnosed" def test(self, patient: Patient) -> typing.Optional[Categorization]: categorization = None @@ -641,14 +779,14 @@ def test(self, patient: Patient) -> typing.Optional[Categorization]: except KeyError: # No match for this disease, no problem. continue - + if categorization is None: # First time we found a candidate disease categorization = candidate else: # Ambiguous match. We found several matching diagnoses! return None - + return categorization diff --git a/tests/analysis/predicate/genotype/conftest.py b/tests/analysis/predicate/genotype/conftest.py index 7138e93f..88fb5537 100644 --- a/tests/analysis/predicate/genotype/conftest.py +++ b/tests/analysis/predicate/genotype/conftest.py @@ -336,7 +336,7 @@ def cain( @pytest.fixture(scope="package") -def white_mutation( +def white_missense_mutation( genome_build: GenomeBuild, walt_label: SampleLabels, skyler_label: SampleLabels, @@ -364,7 +364,7 @@ def white_mutation( gene_id="a_gene", tx_id="tx:xyz", hgvs_cdna=None, - is_preferred=False, + is_preferred=True, variant_effects=( VariantEffect.MISSENSE_VARIANT, VariantEffect.SPLICE_DONOR_VARIANT, @@ -386,6 +386,56 @@ def white_mutation( ) +@pytest.fixture(scope="package") +def white_stop_gain_mutation( + genome_build: GenomeBuild, + walt_label: SampleLabels, + skyler_label: SampleLabels, + flynn_label: SampleLabels, + holly_label: SampleLabels, +) -> Variant: + chr22 = genome_build.contig_by_name("chr22") + assert chr22 is not None + return Variant( + variant_info=VariantInfo( + variant_coordinates=VariantCoordinates( + region=GenomicRegion( + contig=chr22, + start=200, + end=201, + strand=Strand.POSITIVE, + ), + ref="T", + alt="G", + change_length=0, + ) + ), + tx_annotations=( + TranscriptAnnotation( + gene_id="a_gene", + tx_id="tx:xyz", + hgvs_cdna=None, + is_preferred=True, + variant_effects=( + VariantEffect.STOP_GAINED, + ), + affected_exons=(5,), + protein_id="pt:xyz", + hgvsp=None, + protein_effect_coordinates=Region(80, 81), + ), + ), + genotypes=Genotypes.from_mapping( + { + walt_label: Genotype.HETEROZYGOUS, + skyler_label: Genotype.HETEROZYGOUS, + flynn_label: Genotype.HOMOZYGOUS_REFERENCE, + holly_label: Genotype.HOMOZYGOUS_ALTERNATE, + } + ), + ) + + @pytest.fixture(scope="package") def walt_label() -> SampleLabels: return SampleLabels("Walt") @@ -394,14 +444,18 @@ def walt_label() -> SampleLabels: @pytest.fixture(scope="package") def walt( walt_label: SampleLabels, - white_mutation: Variant, + white_missense_mutation: Variant, + white_stop_gain_mutation: Variant, ) -> Patient: return Patient( walt_label, sex=Sex.MALE, phenotypes=(), diseases=(), - variants=(white_mutation,), + variants=( + white_missense_mutation, + white_stop_gain_mutation, + ), ) @@ -413,14 +467,18 @@ def skyler_label() -> SampleLabels: @pytest.fixture(scope="package") def skyler( skyler_label: SampleLabels, - white_mutation: Variant, + white_missense_mutation: Variant, + white_stop_gain_mutation: Variant, ) -> Patient: return Patient( skyler_label, sex=Sex.FEMALE, phenotypes=(), diseases=(), - variants=(white_mutation,), + variants=( + white_missense_mutation, + white_stop_gain_mutation, + ), ) @@ -432,14 +490,18 @@ def flynn_label() -> SampleLabels: @pytest.fixture(scope="package") def flynn( flynn_label: SampleLabels, - white_mutation: Variant, + white_missense_mutation: Variant, + white_stop_gain_mutation: Variant, ) -> Patient: return Patient( flynn_label, sex=Sex.MALE, phenotypes=(), diseases=(), - variants=(white_mutation,), + variants=( + white_missense_mutation, + white_stop_gain_mutation, + ), ) @@ -451,14 +513,18 @@ def holly_label() -> SampleLabels: @pytest.fixture(scope="package") def holly( holly_label: SampleLabels, - white_mutation: Variant, + white_missense_mutation: Variant, + white_stop_gain_mutation: Variant, ) -> Patient: return Patient( holly_label, sex=Sex.FEMALE, phenotypes=(), diseases=(), - variants=(white_mutation,), + variants=( + white_missense_mutation, + white_stop_gain_mutation, + ), ) diff --git a/tests/analysis/predicate/genotype/test_gt_predicates.py b/tests/analysis/predicate/genotype/test_gt_predicates.py index 6d82201c..529f5fed 100644 --- a/tests/analysis/predicate/genotype/test_gt_predicates.py +++ b/tests/analysis/predicate/genotype/test_gt_predicates.py @@ -8,6 +8,8 @@ groups_predicate, filtering_predicate, sex_predicate, + monoallelic_predicate, + biallelic_predicate, VariantPredicates, VariantPredicate, ModeOfInheritancePredicate, @@ -137,6 +139,85 @@ def test_autosomal_recessive( assert categorization.category.name == name +class TestAllelePredicates: + + @pytest.mark.parametrize( + "individual_name,expected_name", + [ + ("adam", "Zero"), # 0/0 + ("eve", "One"), # 0/1 + ("cain", "One"), # 0/1 + ], + ) + def test_monoallelic_predicate_ad_family( + self, + individual_name: str, + expected_name: str, + request: pytest.FixtureRequest, + ): + is_missense = VariantPredicates.variant_effect(VariantEffect.MISSENSE_VARIANT, TX_ID) + gt_predicate = monoallelic_predicate(is_missense) + individual = request.getfixturevalue(individual_name) + + actual_cat = gt_predicate.test(individual) + + assert actual_cat is not None + assert actual_cat.category.name == expected_name + + @pytest.mark.parametrize( + "individual_name,not_none,expected_name", + [ + ("walt", True, "One"), # 1/2 + ("skyler", True, "One"), # 1/2 + ("flynn", False, None), # 1/1, hence 2 alleles which is too much to work with monoallelic predicate. + ("holly", True, "Zero"), # 2/2 + ], + ) + def test_monoallelic_predicate_ar_family( + self, + individual_name: str, + not_none: bool, + expected_name: str, + request: pytest.FixtureRequest, + ): + is_missense = VariantPredicates.variant_effect(VariantEffect.MISSENSE_VARIANT, TX_ID) + gt_predicate = monoallelic_predicate(is_missense) + individual = request.getfixturevalue(individual_name) + + actual_cat = gt_predicate.test(individual) + + if not_none: + assert actual_cat is not None + assert actual_cat.category.name == expected_name + else: + assert actual_cat is None + + @pytest.mark.parametrize( + "individual_name,expected_name", + [ + ("walt", "AB"), # 1/2 + ("skyler", "AB"), # 1/2 + ("flynn", "AA"), # 1/1 + ("holly", "BB"), # 2/2 + ], + ) + def test_biallelic_predicate( + self, + individual_name: str, + expected_name: str, + request: pytest.FixtureRequest, + ): + is_missense = VariantPredicates.variant_effect(VariantEffect.MISSENSE_VARIANT, TX_ID) + is_stop_gained = VariantPredicates.variant_effect(VariantEffect.STOP_GAINED, TX_ID) + gt_predicate = biallelic_predicate(is_missense, is_stop_gained) + individual = request.getfixturevalue(individual_name) + + actual_cat = gt_predicate.test(individual) + + assert actual_cat is not None + assert actual_cat.category.name == expected_name + + class TestFilteringPredicate: @pytest.fixture(scope="class") From 81e3d31b0791053282e93c6ff1c96b341d8ebcde Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Wed, 11 Sep 2024 15:09:01 +0200 Subject: [PATCH 3/8] Remove filtering predicate. --- .../predicates/filtering_predicate.rst | 56 ----------- .../predicates/genotype_predicates.rst | 1 - .../analysis/predicate/genotype/__init__.py | 4 +- .../predicate/genotype/_gt_predicates.py | 97 ------------------- .../predicate/genotype/test_gt_predicates.py | 84 ---------------- 5 files changed, 2 insertions(+), 240 deletions(-) delete mode 100644 docs/user-guide/predicates/filtering_predicate.rst diff --git a/docs/user-guide/predicates/filtering_predicate.rst b/docs/user-guide/predicates/filtering_predicate.rst deleted file mode 100644 index a5acef31..00000000 --- a/docs/user-guide/predicates/filtering_predicate.rst +++ /dev/null @@ -1,56 +0,0 @@ -.. _filtering-predicate: - - -=================== -Filtering predicate -=================== - -Sometimes a predicate can bin individuals into more genotype groups than necessary and there may be need -to consider only a subset of the groups. A `GenotypePolyPredicate` -created by :class:`~gpsea.analysis.predicate.genotype.filtering_predicate` can retain only a subset -of the target categorizations of interest. - -Example -------- - -Let's suppose we want test the genotype-phenotype association between variants -that lead to frameshift or a stop gain in a fictional transcript `NM_1234.5`, -and we are specifically interested in comparing the heterozygous variants -in a biallelic alternative allele genotypes (homozygous alternate and compound heterozygous). - -First, we set up a :class:`~gpsea.analysis.predicate.genotype.VariantPredicate` -for testing if a variant introduces a premature stop codon or leads to the shift of the reading frame: - ->>> from gpsea.model import VariantEffect ->>> from gpsea.analysis.predicate.genotype import VariantPredicates ->>> tx_id = 'NM_1234.5' ->>> is_frameshift_or_stop_gain = VariantPredicates.variant_effect(VariantEffect.FRAMESHIFT_VARIANT, tx_id) \ -... | VariantPredicates.variant_effect(VariantEffect.STOP_GAINED, tx_id) ->>> is_frameshift_or_stop_gain.get_question() -'(FRAMESHIFT_VARIANT on NM_1234.5 OR STOP_GAINED on NM_1234.5)' - -Then, we create :class:`~gpsea.analysis.predicate.genotype.ModeOfInheritancePredicate.autosomal_recessive` -to bin according to a genotype group: - ->>> from gpsea.analysis.predicate.genotype import ModeOfInheritancePredicate ->>> gt_predicate = ModeOfInheritancePredicate.autosomal_recessive(is_frameshift_or_stop_gain) ->>> gt_predicate.display_question() -'What is the genotype group: HOM_REF, HET, BIALLELIC_ALT' - -We see that the `gt_predicate` bins the patients into three groups: - ->>> cats = gt_predicate.get_categorizations() ->>> cats -(Categorization(category=HOM_REF), Categorization(category=HET), Categorization(category=BIALLELIC_ALT)) - -We wrap the categorizations of interest along with the `gt_predicate` by the `filtering_predicate` function, -and we will get a :class:`~gpsea.analysis.predicate.genotype.GenotypePolyPredicate` -that includes only the categories of interest: - ->>> from gpsea.analysis.predicate.genotype import filtering_predicate ->>> fgt_predicate = filtering_predicate( -... predicate=gt_predicate, -... targets=(cats[1], cats[2]), -... ) ->>> fgt_predicate.display_question() -'What is the genotype group: HET, BIALLELIC_ALT' \ No newline at end of file diff --git a/docs/user-guide/predicates/genotype_predicates.rst b/docs/user-guide/predicates/genotype_predicates.rst index 70fe391d..f1ff3bf7 100644 --- a/docs/user-guide/predicates/genotype_predicates.rst +++ b/docs/user-guide/predicates/genotype_predicates.rst @@ -24,7 +24,6 @@ the :class:`~gpsea.analysis.predicate.genotype.VariantPredicate` is all about. variant_predicates mode_of_inheritance_predicate - filtering_predicate male_female_predicate diagnosis_predicate groups_predicate diff --git a/src/gpsea/analysis/predicate/genotype/__init__.py b/src/gpsea/analysis/predicate/genotype/__init__.py index d684972c..14036885 100644 --- a/src/gpsea/analysis/predicate/genotype/__init__.py +++ b/src/gpsea/analysis/predicate/genotype/__init__.py @@ -1,14 +1,14 @@ from ._api import GenotypePolyPredicate from ._api import VariantPredicate from ._counter import AlleleCounter -from ._gt_predicates import boolean_predicate, groups_predicate, filtering_predicate, sex_predicate, diagnosis_predicate +from ._gt_predicates import boolean_predicate, groups_predicate, sex_predicate, diagnosis_predicate from ._gt_predicates import monoallelic_predicate, biallelic_predicate from ._gt_predicates import ModeOfInheritancePredicate from ._variant import VariantPredicates, ProteinPredicates __all__ = [ 'GenotypePolyPredicate', - 'boolean_predicate', 'groups_predicate', 'filtering_predicate', 'sex_predicate', 'diagnosis_predicate', + 'boolean_predicate', 'groups_predicate', 'sex_predicate', 'diagnosis_predicate', 'monoallelic_predicate', 'biallelic_predicate', 'ModeOfInheritancePredicate', 'AlleleCounter', 'VariantPredicate', diff --git a/src/gpsea/analysis/predicate/genotype/_gt_predicates.py b/src/gpsea/analysis/predicate/genotype/_gt_predicates.py index d21be631..89abe068 100644 --- a/src/gpsea/analysis/predicate/genotype/_gt_predicates.py +++ b/src/gpsea/analysis/predicate/genotype/_gt_predicates.py @@ -324,103 +324,6 @@ def biallelic_predicate( ) -class FilteringGenotypePolyPredicate(GenotypePolyPredicate): - # NOT PART OF THE PUBLIC API - - @staticmethod - def create( - predicate: "GenotypePolyPredicate", - targets: typing.Collection[Categorization], - ) -> "FilteringGenotypePolyPredicate": - # At least 2 target categorizations must be provided - if len(targets) <= 1: - raise ValueError( - f"At least 2 target categorizations must be provided but got {len(targets)}" - ) - - good_boys = tuple(isinstance(cat, Categorization) for cat in targets) - if not all(good_boys): - offenders = ", ".join( - str(i) for i, is_instance in enumerate(good_boys) if not is_instance - ) - raise ValueError( - f"The targets at following indices are not categorizations: [{offenders}]" - ) - - # All `allowed` categorizations must in fact be present in the `base` predicate. - cats_are_in_fact_present = tuple( - cat in predicate.get_categorizations() for cat in targets - ) - if not all(cats_are_in_fact_present): - missing = ", ".join( - c.category.name - for c, is_present in zip(targets, cats_are_in_fact_present) - if not is_present - ) - raise ValueError(f"Some from the categories are not present: {missing}") - - if len(targets) == predicate.n_categorizations(): - raise ValueError( - f"It makes no sense to subset the a predicate with {predicate.n_categorizations()} categorizations " - f"with the same number ({len(targets)}) of targets" - ) - - return FilteringGenotypePolyPredicate( - predicate=predicate, - allowed=targets, - ) - - def __init__( - self, - predicate: "GenotypePolyPredicate", - allowed: typing.Iterable[Categorization], - ): - self._predicate = predicate - self._allowed = tuple(allowed) - - def get_categorizations(self) -> typing.Sequence[Categorization]: - return self._allowed - - def get_question_base(self) -> str: - return self._predicate.get_question_base() - - def test(self, patient: Patient) -> typing.Optional[Categorization]: - cat = self._predicate.test(patient) - if cat in self._allowed: - return cat - else: - return None - - def __repr__(self): - return f"FilteringGenotypePolyPredicate(predicate={self._predicate}, allowed={self._allowed})" - - -def filtering_predicate( - predicate: GenotypePolyPredicate, - targets: typing.Collection[Categorization], -) -> GenotypePolyPredicate: - """ - Filtering predicate applies the base `predicate` but only returns the categorizations - from the provided `targets` collection. - - This can be useful if only some of the categorizations are interesting. - For instance, if we only seek to compare the differences between heterozygous and hemizygous variants, - but the predicate also bins the patients into homozygous reference, and biallelic alt genotype groups. - - See the :ref:`filtering-predicate` section for an example. - - The `predicate` is checked for being able to produce the all items in `targets` - and the `targets` must include at least 2 categorizations. - - :param predicate: the base predicate whose categorizations are subject to filteration. - :param targets: the categorizations to retain - """ - return FilteringGenotypePolyPredicate.create( - predicate=predicate, - targets=targets, - ) - - @dataclasses.dataclass(eq=True, frozen=True) class GenotypeGroup: allele_count: int diff --git a/tests/analysis/predicate/genotype/test_gt_predicates.py b/tests/analysis/predicate/genotype/test_gt_predicates.py index 529f5fed..cc2e50f6 100644 --- a/tests/analysis/predicate/genotype/test_gt_predicates.py +++ b/tests/analysis/predicate/genotype/test_gt_predicates.py @@ -1,12 +1,9 @@ -import typing - import pytest from gpsea.model import * from gpsea.analysis.predicate.genotype import ( GenotypePolyPredicate, groups_predicate, - filtering_predicate, sex_predicate, monoallelic_predicate, biallelic_predicate, @@ -218,87 +215,6 @@ def test_biallelic_predicate( assert actual_cat.category.name == expected_name -class TestFilteringPredicate: - - @pytest.fixture(scope="class") - def x_recessive_gt_predicate(self) -> GenotypePolyPredicate: - affects_suox = VariantPredicates.gene("SUOX") - return ModeOfInheritancePredicate.autosomal_recessive( - variant_predicate=affects_suox, - ) - - @pytest.mark.parametrize( - "indices, expected", - [ - ((0, 1), 2), - ((1, 0), 2), - ((1, 2), 2), - ], - ) - def test_filtering_predicate( - self, - indices: typing.Sequence[int], - expected: int, - x_recessive_gt_predicate: GenotypePolyPredicate, - ): - cats = x_recessive_gt_predicate.get_categorizations() - targets = [cats[i] for i in indices] - predicate = filtering_predicate( - predicate=x_recessive_gt_predicate, - targets=targets, - ) - - actual = len(predicate.get_categorizations()) - - assert actual == expected - - def test_filtering_predicate__explodes_when_not_subsetting( - self, - x_recessive_gt_predicate: GenotypePolyPredicate, - ): - with pytest.raises(ValueError) as ve: - filtering_predicate( - predicate=x_recessive_gt_predicate, - targets=x_recessive_gt_predicate.get_categorizations(), - ) - - assert ( - ve.value.args[0] - == "It makes no sense to subset the a predicate with 3 categorizations with the same number (3) of targets" - ) - - def test_filtering_predicate__explodes_when_using_random_junk( - self, - x_recessive_gt_predicate: GenotypePolyPredicate, - ): - random_junk = (0, 1) - with pytest.raises(ValueError) as ve: - filtering_predicate( - predicate=x_recessive_gt_predicate, - targets=random_junk, # type: ignore - ) - - assert ( - ve.value.args[0] - == "The targets at following indices are not categorizations: [0, 1]" - ) - - def test_filtering_predicate__explodes_when_using_one_category( - self, - x_recessive_gt_predicate: GenotypePolyPredicate, - ): - with pytest.raises(ValueError) as ve: - filtering_predicate( - predicate=x_recessive_gt_predicate, - targets=(x_recessive_gt_predicate.get_categorizations()[0],), - ) - - assert ( - ve.value.args[0] - == "At least 2 target categorizations must be provided but got 1" - ) - - class TestSexPredicate: def test_sex_predicate( From 328ec489d3c206d18c89476c74af6a79f025f589 Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Wed, 11 Sep 2024 15:21:54 +0200 Subject: [PATCH 4/8] Add docs. --- .../predicate/genotype/_gt_predicates.py | 36 +++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/src/gpsea/analysis/predicate/genotype/_gt_predicates.py b/src/gpsea/analysis/predicate/genotype/_gt_predicates.py index 89abe068..420110d8 100644 --- a/src/gpsea/analysis/predicate/genotype/_gt_predicates.py +++ b/src/gpsea/analysis/predicate/genotype/_gt_predicates.py @@ -241,6 +241,22 @@ def __repr__(self) -> str: def monoallelic_predicate(variant_predicate: VariantPredicate) -> GenotypePolyPredicate: + """ + + The predicate bins patient into one of two groups: `Zero` and `One`: + + +-----------+------------------+ + | Group | Allele count | + +===========+==================+ + | Zero | 0 | + +-----------+------------------+ + | One | 1 | + +-----------+------------------+ + + Individuals with different allele counts (e.g. `2) + are assigned the ``None`` group and, thus, omitted from the analysis. + """ + return MonoallelicGenotypePredicate.for_variant_predicate(variant_predicate) @@ -318,6 +334,26 @@ def biallelic_predicate( a_predicate: VariantPredicate, b_predicate: VariantPredicate, ) -> GenotypePolyPredicate: + """ + Get a predicate for binning the individuals into groups, + with respect to allele counts of variants selected by `a_predicate` and `b_predicate`. + + The predicate bins patient into one of three groups: `AA`, `AB` and `BB`: + + +-----------+------------------+------------------+ + | Group | `A` allele count | `B` allele count | + +===========+==================+==================+ + | AA | 2 | 0 | + +-----------+------------------+------------------+ + | AB | 1 | 1 | + +-----------+------------------+------------------+ + | AA | 0 | 2 | + +-----------+------------------+------------------+ + + Individuals with different allele counts (e.g. :math:`count_{A} = 0` and :math:`count_{B} = 1`) + are assigned the ``None`` group and, thus, omitted from the analysis. + + """ return BiallelicGenotypePredicate.for_variant_predicates( a_predicate=a_predicate, b_predicate=b_predicate, From 7422eb2285235de4f601355d3a1f88b35322fa85 Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Wed, 11 Sep 2024 15:54:06 +0200 Subject: [PATCH 5/8] Simplify the "allelic" predicates. --- .../predicate/genotype/_gt_predicates.py | 178 +++++++----------- tests/analysis/predicate/genotype/conftest.py | 94 +++++++-- .../predicate/genotype/test_gt_predicates.py | 49 ++--- 3 files changed, 159 insertions(+), 162 deletions(-) diff --git a/src/gpsea/analysis/predicate/genotype/_gt_predicates.py b/src/gpsea/analysis/predicate/genotype/_gt_predicates.py index 420110d8..c2bbb5f1 100644 --- a/src/gpsea/analysis/predicate/genotype/_gt_predicates.py +++ b/src/gpsea/analysis/predicate/genotype/_gt_predicates.py @@ -186,148 +186,112 @@ def groups_predicate( ) -class MonoallelicGenotypePredicate(GenotypePolyPredicate): - # NOT PART OF THE PUBLIC API +class PolyCountingGenotypePredicate(GenotypePolyPredicate): - ZERO = Categorization( - PatientCategory(cat_id=0, name="Zero", description="Zero alleles") - ) - ONE = Categorization( - PatientCategory(cat_id=1, name="One", description="One allele") - ) + A = Categorization(PatientCategory(cat_id=0, name="A", description="Monoallelic A")) + B = Categorization(PatientCategory(cat_id=1, name="B", description="Monoallelic B")) + + AA = Categorization(PatientCategory(cat_id=0, name="AA", description="Biallelic A")) + AB = Categorization(PatientCategory(cat_id=1, name="AB", description="Monoallelic A + monoallelic B")) + BB = Categorization(PatientCategory(cat_id=2, name="BB", description="Biallelic B")) @staticmethod - def for_variant_predicate(predicate: VariantPredicate): - allele_counter = AlleleCounter(predicate=predicate) - return MonoallelicGenotypePredicate(allele_counter=allele_counter) - - def __init__( - self, - allele_counter: AlleleCounter, + def monoallelic( + a_predicate: VariantPredicate, + b_predicate: VariantPredicate, ): - self._allele_counter = allele_counter - - def get_categorizations(self) -> typing.Sequence[Categorization]: - return (MonoallelicGenotypePredicate.ZERO, MonoallelicGenotypePredicate.ONE) - - def get_question_base(self) -> str: - return "The variant allele count" - - def test(self, patient: Patient) -> typing.Optional[Categorization]: - self._check_patient(patient) - - allele_count = self._allele_counter.count(patient) - if allele_count == 0: - return MonoallelicGenotypePredicate.ZERO - elif allele_count == 1: - return MonoallelicGenotypePredicate.ONE - else: - return None + count2cat = { + (0, 1): PolyCountingGenotypePredicate.A, + (1, 0): PolyCountingGenotypePredicate.B, + } - def __eq__(self, value: object) -> bool: - return ( - isinstance(value, MonoallelicGenotypePredicate) - and self._allele_counter == value._allele_counter + return PolyCountingGenotypePredicate._for_predicates_and_categories( + count2cat=count2cat, + a_predicate=a_predicate, + b_predicate=b_predicate, ) - def __hash__(self) -> int: - return hash((self._allele_counter,)) - - def __str__(self) -> str: - return f"MonoallelicGenotypePredicate(allele_counter={self._allele_counter})" - - def __repr__(self) -> str: - return str(self) - - -def monoallelic_predicate(variant_predicate: VariantPredicate) -> GenotypePolyPredicate: - """ - - The predicate bins patient into one of two groups: `Zero` and `One`: - - +-----------+------------------+ - | Group | Allele count | - +===========+==================+ - | Zero | 0 | - +-----------+------------------+ - | One | 1 | - +-----------+------------------+ - - Individuals with different allele counts (e.g. `2) - are assigned the ``None`` group and, thus, omitted from the analysis. - """ - - return MonoallelicGenotypePredicate.for_variant_predicate(variant_predicate) - - -class BiallelicGenotypePredicate(GenotypePolyPredicate): - # NOT PART OF THE PUBLIC API + @staticmethod + def biallelic( + a_predicate: VariantPredicate, + b_predicate: VariantPredicate, + ): + count2cat = { + (2, 0): PolyCountingGenotypePredicate.AA, + (1, 1): PolyCountingGenotypePredicate.AB, + (0, 2): PolyCountingGenotypePredicate.BB, + } - AA = Categorization(PatientCategory(cat_id=0, name="AA", description="Biallelic A")) - AB = Categorization( - PatientCategory( - cat_id=1, name="AB", description="One copy of A and one copy of B" + return PolyCountingGenotypePredicate._for_predicates_and_categories( + count2cat=count2cat, + a_predicate=a_predicate, + b_predicate=b_predicate, ) - ) - BB = Categorization(PatientCategory(cat_id=2, name="BB", description="Biallelic B")) - + @staticmethod - def for_variant_predicates( + def _for_predicates_and_categories( + count2cat: typing.Mapping[typing.Tuple[int, int], Categorization], a_predicate: VariantPredicate, b_predicate: VariantPredicate, - ): - return BiallelicGenotypePredicate( + ) -> "PolyCountingGenotypePredicate": + return PolyCountingGenotypePredicate( a_counter=AlleleCounter(a_predicate), b_counter=AlleleCounter(b_predicate), + count2cat=count2cat, ) def __init__( self, + count2cat: typing.Mapping[typing.Tuple[int, int], Categorization], a_counter: AlleleCounter, b_counter: AlleleCounter, ): + self._count2cat = dict(count2cat) + self._categorizations = tuple(count2cat.values()) self._a_counter = a_counter self._b_counter = b_counter - + def get_categorizations(self) -> typing.Sequence[Categorization]: - return ( - BiallelicGenotypePredicate.AA, - BiallelicGenotypePredicate.AB, - BiallelicGenotypePredicate.BB, - ) + return self._categorizations def get_question_base(self) -> str: - return "The variant allele count" + raise NotImplementedError def test(self, patient: Patient) -> typing.Optional[Categorization]: self._check_patient(patient) a_count = self._a_counter.count(patient) b_count = self._b_counter.count(patient) - if a_count == 2 and b_count == 0: - return BiallelicGenotypePredicate.AA - elif a_count == 1 and b_count == 1: - return BiallelicGenotypePredicate.AB - elif a_count == 0 and b_count == 2: - return BiallelicGenotypePredicate.BB - else: - return None + counts = (a_count, b_count) + + return self._count2cat.get(counts, None) - def __eq__(self, value: object) -> bool: - return ( - isinstance(value, BiallelicGenotypePredicate) - and self._a_counter == value._a_counter - and self._b_counter == value._b_counter - ) + # TODO: implement __hash__, __eq__ - def __hash__(self) -> int: - return hash((self._a_counter, self._b_counter)) - def __str__(self) -> str: - return f"BiallelicGenotypePredicate(a_counter={self._a_counter}, b_counter={self._b_counter})" +def monoallelic_predicate( + a_predicate: VariantPredicate, + b_predicate: VariantPredicate, +) -> GenotypePolyPredicate: + """ + + The predicate bins patient into one of two groups: `Zero` and `One`: - def __repr__(self) -> str: - return str(self) + +-----------+------------------+------------------+ + | Group | `A` allele count | `B` allele count | + +===========+==================+==================+ + | A | 1 | 0 | + +-----------+------------------+------------------+ + | B | 0 | 1 | + +-----------+------------------+------------------+ + + Individuals with different allele counts (e.g. :math:`count_{A} = 0` and :math:`count_{B} = 1`) + are assigned the ``None`` group and, thus, omitted from the analysis. + """ + return PolyCountingGenotypePredicate.monoallelic( + a_predicate=a_predicate, + b_predicate=b_predicate, + ) def biallelic_predicate( @@ -354,7 +318,7 @@ def biallelic_predicate( are assigned the ``None`` group and, thus, omitted from the analysis. """ - return BiallelicGenotypePredicate.for_variant_predicates( + return PolyCountingGenotypePredicate.biallelic( a_predicate=a_predicate, b_predicate=b_predicate, ) diff --git a/tests/analysis/predicate/genotype/conftest.py b/tests/analysis/predicate/genotype/conftest.py index 88fb5537..037e3712 100644 --- a/tests/analysis/predicate/genotype/conftest.py +++ b/tests/analysis/predicate/genotype/conftest.py @@ -220,7 +220,7 @@ def patient_w_frameshift( @pytest.fixture(scope="package") -def genesis_mutation( +def genesis_missense_mutation( genome_build: GenomeBuild, adam_label: SampleLabels, eve_label: SampleLabels, @@ -268,6 +268,54 @@ def genesis_mutation( ) +@pytest.fixture(scope="package") +def genesis_synonymous_mutation( + genome_build: GenomeBuild, + adam_label: SampleLabels, + eve_label: SampleLabels, + cain_label: SampleLabels, +) -> Variant: + chr22 = genome_build.contig_by_name("chr22") + assert chr22 is not None + return Variant( + variant_info=VariantInfo( + variant_coordinates=VariantCoordinates( + region=GenomicRegion( + contig=chr22, + start=200, + end=201, + strand=Strand.POSITIVE, + ), + ref="T", + alt="G", + change_length=0, + ) + ), + tx_annotations=( + TranscriptAnnotation( + gene_id="a_gene", + tx_id="tx:xyz", + hgvs_cdna=None, + is_preferred=True, + variant_effects=( + VariantEffect.SYNONYMOUS_VARIANT, + ), + affected_exons=(5,), + protein_id="pt:xyz", + hgvsp=None, + protein_effect_coordinates=Region(80, 81), + ), + ), + genotypes=Genotypes.from_mapping( + { + adam_label: Genotype.HETEROZYGOUS, + eve_label: Genotype.HOMOZYGOUS_REFERENCE, + cain_label: Genotype.HOMOZYGOUS_REFERENCE, + } + ), + ) + + @pytest.fixture(scope="package") def adam_label() -> SampleLabels: return SampleLabels("Adam") @@ -276,14 +324,18 @@ def adam_label() -> SampleLabels: @pytest.fixture(scope="package") def adam( adam_label: SampleLabels, - genesis_mutation: Variant, + genesis_missense_mutation: Variant, + genesis_synonymous_mutation: Variant, ) -> Patient: return Patient( adam_label, sex=Sex.MALE, phenotypes=(), diseases=(), - variants=(genesis_mutation,), + variants=( + genesis_missense_mutation, + genesis_synonymous_mutation, + ), ) @@ -295,14 +347,18 @@ def eve_label() -> SampleLabels: @pytest.fixture(scope="package") def eve( eve_label: SampleLabels, - genesis_mutation: Variant, + genesis_missense_mutation: Variant, + genesis_synonymous_mutation: Variant, ) -> Patient: return Patient( eve_label, sex=Sex.FEMALE, phenotypes=(), diseases=(), - variants=(genesis_mutation,), + variants=( + genesis_missense_mutation, + genesis_synonymous_mutation, + ), ) @@ -314,14 +370,18 @@ def cain_label() -> SampleLabels: @pytest.fixture(scope="package") def cain( cain_label: SampleLabels, - genesis_mutation: Variant, + genesis_missense_mutation: Variant, + genesis_synonymous_mutation: Variant, ) -> Patient: return Patient( cain_label, sex=Sex.MALE, phenotypes=(), diseases=(), - variants=(genesis_mutation,), + variants=( + genesis_missense_mutation, + genesis_synonymous_mutation, + ), ) @@ -387,7 +447,7 @@ def white_missense_mutation( @pytest.fixture(scope="package") -def white_stop_gain_mutation( +def white_synonymous_mutation( genome_build: GenomeBuild, walt_label: SampleLabels, skyler_label: SampleLabels, @@ -417,7 +477,7 @@ def white_stop_gain_mutation( hgvs_cdna=None, is_preferred=True, variant_effects=( - VariantEffect.STOP_GAINED, + VariantEffect.SYNONYMOUS_VARIANT, ), affected_exons=(5,), protein_id="pt:xyz", @@ -445,7 +505,7 @@ def walt_label() -> SampleLabels: def walt( walt_label: SampleLabels, white_missense_mutation: Variant, - white_stop_gain_mutation: Variant, + white_synonymous_mutation: Variant, ) -> Patient: return Patient( walt_label, @@ -454,7 +514,7 @@ def walt( diseases=(), variants=( white_missense_mutation, - white_stop_gain_mutation, + white_synonymous_mutation, ), ) @@ -468,7 +528,7 @@ def skyler_label() -> SampleLabels: def skyler( skyler_label: SampleLabels, white_missense_mutation: Variant, - white_stop_gain_mutation: Variant, + white_synonymous_mutation: Variant, ) -> Patient: return Patient( skyler_label, @@ -477,7 +537,7 @@ def skyler( diseases=(), variants=( white_missense_mutation, - white_stop_gain_mutation, + white_synonymous_mutation, ), ) @@ -491,7 +551,7 @@ def flynn_label() -> SampleLabels: def flynn( flynn_label: SampleLabels, white_missense_mutation: Variant, - white_stop_gain_mutation: Variant, + white_synonymous_mutation: Variant, ) -> Patient: return Patient( flynn_label, @@ -500,7 +560,7 @@ def flynn( diseases=(), variants=( white_missense_mutation, - white_stop_gain_mutation, + white_synonymous_mutation, ), ) @@ -514,7 +574,7 @@ def holly_label() -> SampleLabels: def holly( holly_label: SampleLabels, white_missense_mutation: Variant, - white_stop_gain_mutation: Variant, + white_synonymous_mutation: Variant, ) -> Patient: return Patient( holly_label, @@ -523,7 +583,7 @@ def holly( diseases=(), variants=( white_missense_mutation, - white_stop_gain_mutation, + white_synonymous_mutation, ), ) diff --git a/tests/analysis/predicate/genotype/test_gt_predicates.py b/tests/analysis/predicate/genotype/test_gt_predicates.py index cc2e50f6..1afa391a 100644 --- a/tests/analysis/predicate/genotype/test_gt_predicates.py +++ b/tests/analysis/predicate/genotype/test_gt_predicates.py @@ -141,9 +141,9 @@ class TestAllelePredicates: @pytest.mark.parametrize( "individual_name,expected_name", [ - ("adam", "Zero"), # 0/0 - ("eve", "One"), # 0/1 - ("cain", "One"), # 0/1 + ("adam", "A"), # 0/0 & 0/1 + ("eve", "B"), # 0/1 & 0/0 + ("cain", "B"), # 0/1 & 0/0 ], ) def test_monoallelic_predicate_ad_family( @@ -153,7 +153,8 @@ def test_monoallelic_predicate_ad_family( request: pytest.FixtureRequest, ): is_missense = VariantPredicates.variant_effect(VariantEffect.MISSENSE_VARIANT, TX_ID) - gt_predicate = monoallelic_predicate(is_missense) + is_synonymous = VariantPredicates.variant_effect(VariantEffect.SYNONYMOUS_VARIANT, TX_ID) + gt_predicate = monoallelic_predicate(is_missense, is_synonymous) individual = request.getfixturevalue(individual_name) actual_cat = gt_predicate.test(individual) @@ -161,41 +162,13 @@ def test_monoallelic_predicate_ad_family( assert actual_cat is not None assert actual_cat.category.name == expected_name - @pytest.mark.parametrize( - "individual_name,not_none,expected_name", - [ - ("walt", True, "One"), # 1/2 - ("skyler", True, "One"), # 1/2 - ("flynn", False, None), # 1/1, hence 2 alleles which is too much to work with monoallelic predicate. - ("holly", True, "Zero"), # 2/2 - ], - ) - def test_monoallelic_predicate_ar_family( - self, - individual_name: str, - not_none: bool, - expected_name: str, - request: pytest.FixtureRequest, - ): - is_missense = VariantPredicates.variant_effect(VariantEffect.MISSENSE_VARIANT, TX_ID) - gt_predicate = monoallelic_predicate(is_missense) - individual = request.getfixturevalue(individual_name) - - actual_cat = gt_predicate.test(individual) - - if not_none: - assert actual_cat is not None - assert actual_cat.category.name == expected_name - else: - assert actual_cat is None - @pytest.mark.parametrize( "individual_name,expected_name", [ - ("walt", "AB"), # 1/2 - ("skyler", "AB"), # 1/2 - ("flynn", "AA"), # 1/1 - ("holly", "BB"), # 2/2 + ("walt", "AB"), # 0/1 & 0/1 + ("skyler", "AB"), # 0/1 & 0/1 + ("flynn", "AA"), # 1/1 & 0/0 + ("holly", "BB"), # 0/0 & 1/1 ], ) def test_biallelic_predicate( @@ -205,8 +178,8 @@ def test_biallelic_predicate( request: pytest.FixtureRequest, ): is_missense = VariantPredicates.variant_effect(VariantEffect.MISSENSE_VARIANT, TX_ID) - is_stop_gained = VariantPredicates.variant_effect(VariantEffect.STOP_GAINED, TX_ID) - gt_predicate = biallelic_predicate(is_missense, is_stop_gained) + is_synonymous = VariantPredicates.variant_effect(VariantEffect.SYNONYMOUS_VARIANT, TX_ID) + gt_predicate = biallelic_predicate(is_missense, is_synonymous) individual = request.getfixturevalue(individual_name) actual_cat = gt_predicate.test(individual) From 5016de1dd6cb1bbaea96dbc238627c3344246182 Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Wed, 11 Sep 2024 16:25:15 +0200 Subject: [PATCH 6/8] Use `monoallelic_predicate` in tutorial. --- docs/report/tbx5_frameshift_vs_missense.csv | 4 +- docs/tutorial.rst | 16 +++--- docs/user-guide/stats.rst | 2 +- .../predicate/genotype/_gt_predicates.py | 55 +++++++++++-------- .../predicate/genotype/test_gt_predicates.py | 26 ++++++++- 5 files changed, 66 insertions(+), 37 deletions(-) diff --git a/docs/report/tbx5_frameshift_vs_missense.csv b/docs/report/tbx5_frameshift_vs_missense.csv index 76666c2a..4bffc6ea 100644 --- a/docs/report/tbx5_frameshift_vs_missense.csv +++ b/docs/report/tbx5_frameshift_vs_missense.csv @@ -1,4 +1,4 @@ -Genotype group,Missense,Missense,Frameshift,Frameshift,, +Allele group,Missense,Missense,Frameshift,Frameshift,, ,Count,Percent,Count,Percent,Corrected p values,p values Ventricular septal defect [HP:0001629],31/60,52%,19/19,100%,0.0008990549794102921,5.6190936213143254e-05 Abnormal atrioventricular conduction [HP:0005150],0/22,0%,3/3,100%,0.003478260869565217,0.00043478260869565214 @@ -12,7 +12,7 @@ Muscular ventricular septal defect [HP:0011623],6/59,10%,6/25,24%,0.299992259972 Pulmonary arterial hypertension [HP:0002092],4/6,67%,0/2,0%,0.6857142857142857,0.42857142857142855 Hypoplasia of the ulna [HP:0003022],1/12,8%,2/10,20%,0.831168831168831,0.5714285714285713 Hypoplasia of the radius [HP:0002984],30/62,48%,6/14,43%,1.0,0.7735491022101784 -Atrial septal defect [HP:0001631],42/44,95%,20/20,100%,1.0,1.0 Short thumb [HP:0009778],11/41,27%,8/30,27%,1.0,1.0 +Atrial septal defect [HP:0001631],42/44,95%,20/20,100%,1.0,1.0 Absent radius [HP:0003974],7/32,22%,6/25,24%,1.0,1.0 Short humerus [HP:0005792],7/17,41%,4/9,44%,1.0,1.0 diff --git a/docs/tutorial.rst b/docs/tutorial.rst index 35b0d278..5f8d7a06 100644 --- a/docs/tutorial.rst +++ b/docs/tutorial.rst @@ -189,21 +189,19 @@ Prepare genotype and phenotype predicates ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ We will create a predicate to bin patients into group -depending on presence of a missense and frameshift variant to test +depending on presence of a single allele of a missense or frameshift variant to test if there is a difference between frameshift and non-frameshift variants in the individuals of the *TBX5* cohort. >>> from gpsea.model import VariantEffect ->>> from gpsea.analysis.predicate.genotype import VariantPredicates, groups_predicate ->>> gt_predicate = groups_predicate( -... predicates=( -... VariantPredicates.variant_effect(VariantEffect.MISSENSE_VARIANT, tx_id), -... VariantPredicates.variant_effect(VariantEffect.FRAMESHIFT_VARIANT, tx_id) -... ), -... group_names=('Missense', 'Frameshift'), +>>> from gpsea.analysis.predicate.genotype import VariantPredicates, monoallelic_predicate +>>> gt_predicate = monoallelic_predicate( +... a_predicate=VariantPredicates.variant_effect(VariantEffect.MISSENSE_VARIANT, tx_id), +... b_predicate=VariantPredicates.variant_effect(VariantEffect.FRAMESHIFT_VARIANT, tx_id), +... names=('Missense', 'Frameshift') ... ) >>> gt_predicate.display_question() -'Genotype group: Missense, Frameshift' +'Allele group: Missense, Frameshift' .. note:: diff --git a/docs/user-guide/stats.rst b/docs/user-guide/stats.rst index 5e9c8316..3247a121 100644 --- a/docs/user-guide/stats.rst +++ b/docs/user-guide/stats.rst @@ -127,7 +127,7 @@ First, we create a :class:`~gpsea.analysis.predicate.genotype.VariantPredicate` to test if the variant leads to a frameshift (in this case): >>> from gpsea.model import VariantEffect ->>> from gpsea.analysis.predicate.genotype import VariantPredicates, boolean_predicate +>>> from gpsea.analysis.predicate.genotype import VariantPredicates >>> tx_id = 'NM_181486.4' >>> is_frameshift = VariantPredicates.variant_effect(VariantEffect.FRAMESHIFT_VARIANT, tx_id) >>> is_frameshift.get_question() diff --git a/src/gpsea/analysis/predicate/genotype/_gt_predicates.py b/src/gpsea/analysis/predicate/genotype/_gt_predicates.py index c2bbb5f1..e6645473 100644 --- a/src/gpsea/analysis/predicate/genotype/_gt_predicates.py +++ b/src/gpsea/analysis/predicate/genotype/_gt_predicates.py @@ -188,21 +188,15 @@ def groups_predicate( class PolyCountingGenotypePredicate(GenotypePolyPredicate): - A = Categorization(PatientCategory(cat_id=0, name="A", description="Monoallelic A")) - B = Categorization(PatientCategory(cat_id=1, name="B", description="Monoallelic B")) - - AA = Categorization(PatientCategory(cat_id=0, name="AA", description="Biallelic A")) - AB = Categorization(PatientCategory(cat_id=1, name="AB", description="Monoallelic A + monoallelic B")) - BB = Categorization(PatientCategory(cat_id=2, name="BB", description="Biallelic B")) - @staticmethod def monoallelic( a_predicate: VariantPredicate, b_predicate: VariantPredicate, + names: typing.Tuple[str, str], ): count2cat = { - (0, 1): PolyCountingGenotypePredicate.A, - (1, 0): PolyCountingGenotypePredicate.B, + (1, 0): Categorization(PatientCategory(cat_id=0, name=names[0], description=f"Monoallelic {names[0]}")), + (0, 1): Categorization(PatientCategory(cat_id=1, name=names[1], description=f"Monoallelic {names[1]}")), } return PolyCountingGenotypePredicate._for_predicates_and_categories( @@ -215,11 +209,24 @@ def monoallelic( def biallelic( a_predicate: VariantPredicate, b_predicate: VariantPredicate, + names: typing.Tuple[str, str], ): count2cat = { - (2, 0): PolyCountingGenotypePredicate.AA, - (1, 1): PolyCountingGenotypePredicate.AB, - (0, 2): PolyCountingGenotypePredicate.BB, + (2, 0): Categorization( + PatientCategory( + cat_id=0, name=f'{names[0]}/{names[0]}', description=f"Biallelic {names[0]}", + ) + ), + (1, 1): Categorization( + PatientCategory( + cat_id=1, name=f'{names[0]}/{names[1]}', description=f"{names[0]}/{names[1]}", + ), + ), + (0, 2): Categorization( + PatientCategory( + cat_id=2, name=f'{names[1]}/{names[1]}', description=f"Biallelic {names[1]}" + ), + ), } return PolyCountingGenotypePredicate._for_predicates_and_categories( @@ -255,7 +262,7 @@ def get_categorizations(self) -> typing.Sequence[Categorization]: return self._categorizations def get_question_base(self) -> str: - raise NotImplementedError + return 'Allele group' def test(self, patient: Patient) -> typing.Optional[Categorization]: self._check_patient(patient) @@ -272,31 +279,34 @@ def test(self, patient: Patient) -> typing.Optional[Categorization]: def monoallelic_predicate( a_predicate: VariantPredicate, b_predicate: VariantPredicate, + names: typing.Tuple[str, str] = ('A', 'B'), ) -> GenotypePolyPredicate: """ - The predicate bins patient into one of two groups: `Zero` and `One`: + The predicate bins patient into one of two groups: `A` and `B`: - +-----------+------------------+------------------+ - | Group | `A` allele count | `B` allele count | - +===========+==================+==================+ - | A | 1 | 0 | - +-----------+------------------+------------------+ - | B | 0 | 1 | - +-----------+------------------+------------------+ + +-----------+------------+------------+ + | Group | `A` count | `B` count | + +===========+============+============+ + | A | 1 | 0 | + +-----------+------------+------------+ + | B | 0 | 1 | + +-----------+------------+------------+ - Individuals with different allele counts (e.g. :math:`count_{A} = 0` and :math:`count_{B} = 1`) + Individuals with different allele counts (e.g. :math:`count_{A} = 0` and :math:`count_{B} = 2`) are assigned the ``None`` group and, thus, omitted from the analysis. """ return PolyCountingGenotypePredicate.monoallelic( a_predicate=a_predicate, b_predicate=b_predicate, + names=names, ) def biallelic_predicate( a_predicate: VariantPredicate, b_predicate: VariantPredicate, + names: typing.Tuple[str, str] = ('A', 'B'), ) -> GenotypePolyPredicate: """ Get a predicate for binning the individuals into groups, @@ -321,6 +331,7 @@ def biallelic_predicate( return PolyCountingGenotypePredicate.biallelic( a_predicate=a_predicate, b_predicate=b_predicate, + names=names, ) diff --git a/tests/analysis/predicate/genotype/test_gt_predicates.py b/tests/analysis/predicate/genotype/test_gt_predicates.py index 1afa391a..222660ad 100644 --- a/tests/analysis/predicate/genotype/test_gt_predicates.py +++ b/tests/analysis/predicate/genotype/test_gt_predicates.py @@ -141,9 +141,9 @@ class TestAllelePredicates: @pytest.mark.parametrize( "individual_name,expected_name", [ - ("adam", "A"), # 0/0 & 0/1 - ("eve", "B"), # 0/1 & 0/0 - ("cain", "B"), # 0/1 & 0/0 + ("adam", "B"), # 0/0 & 0/1 + ("eve", "A"), # 0/1 & 0/0 + ("cain", "A"), # 0/1 & 0/0 ], ) def test_monoallelic_predicate_ad_family( @@ -162,6 +162,16 @@ def test_monoallelic_predicate_ad_family( assert actual_cat is not None assert actual_cat.category.name == expected_name + def test_monoallelic_predicate__general_stuff( + self, + ): + is_missense = VariantPredicates.variant_effect(VariantEffect.MISSENSE_VARIANT, TX_ID) + is_synonymous = VariantPredicates.variant_effect(VariantEffect.SYNONYMOUS_VARIANT, TX_ID) + + gt_predicate = monoallelic_predicate(is_missense, is_synonymous) + + assert gt_predicate.display_question() == 'Allele group: A, B' + @pytest.mark.parametrize( "individual_name,expected_name", [ @@ -187,6 +197,16 @@ def test_biallelic_predicate( assert actual_cat is not None assert actual_cat.category.name == expected_name + def test_biallelic_predicate__general_stuff( + self, + ): + is_missense = VariantPredicates.variant_effect(VariantEffect.MISSENSE_VARIANT, TX_ID) + is_synonymous = VariantPredicates.variant_effect(VariantEffect.SYNONYMOUS_VARIANT, TX_ID) + + gt_predicate = biallelic_predicate(is_missense, is_synonymous) + + assert gt_predicate.display_question() == 'Allele group: A/A, A/B, B/B' + class TestSexPredicate: From 640cf406c917b10ad0e01d3e9ed93561fde4352b Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Wed, 11 Sep 2024 16:53:24 +0200 Subject: [PATCH 7/8] Remove `boolean_predicate`. --- .../analysis/predicate/genotype/__init__.py | 4 +- .../predicate/genotype/_gt_predicates.py | 71 +------------- .../analysis/pcats/test_hpo_term_analysis.py | 18 ++-- .../predicate/genotype/test_gt_predicates.py | 8 +- tests/analysis/test_mtc_filter.py | 20 ++-- tests/conftest.py | 6 +- tests/test_predicates.py | 96 +------------------ 7 files changed, 29 insertions(+), 194 deletions(-) diff --git a/src/gpsea/analysis/predicate/genotype/__init__.py b/src/gpsea/analysis/predicate/genotype/__init__.py index 14036885..b9bd8e8a 100644 --- a/src/gpsea/analysis/predicate/genotype/__init__.py +++ b/src/gpsea/analysis/predicate/genotype/__init__.py @@ -1,14 +1,14 @@ from ._api import GenotypePolyPredicate from ._api import VariantPredicate from ._counter import AlleleCounter -from ._gt_predicates import boolean_predicate, groups_predicate, sex_predicate, diagnosis_predicate +from ._gt_predicates import groups_predicate, sex_predicate, diagnosis_predicate from ._gt_predicates import monoallelic_predicate, biallelic_predicate from ._gt_predicates import ModeOfInheritancePredicate from ._variant import VariantPredicates, ProteinPredicates __all__ = [ 'GenotypePolyPredicate', - 'boolean_predicate', 'groups_predicate', 'sex_predicate', 'diagnosis_predicate', + 'groups_predicate', 'sex_predicate', 'diagnosis_predicate', 'monoallelic_predicate', 'biallelic_predicate', 'ModeOfInheritancePredicate', 'AlleleCounter', 'VariantPredicate', diff --git a/src/gpsea/analysis/predicate/genotype/_gt_predicates.py b/src/gpsea/analysis/predicate/genotype/_gt_predicates.py index e6645473..9bc8c6bd 100644 --- a/src/gpsea/analysis/predicate/genotype/_gt_predicates.py +++ b/src/gpsea/analysis/predicate/genotype/_gt_predicates.py @@ -7,7 +7,7 @@ from gpsea.model import Patient, Sex -from .._api import Categorization, PatientCategory, PatientCategories +from .._api import Categorization, PatientCategory from ._api import GenotypePolyPredicate from ._api import VariantPredicate from ._counter import AlleleCounter @@ -15,75 +15,6 @@ # TODO: implement __hash__, __eq__ on predicates -class AlleleCountingGenotypePredicate(GenotypePolyPredicate): - # NOT PART OF THE PUBLIC API - """ - The predicate tests presence of at least one matching allele in the patient. - """ - YES = Categorization(PatientCategories.YES) - NO = Categorization(PatientCategories.NO) - - @staticmethod - def for_variant_predicate(predicate: VariantPredicate): - allele_counter = AlleleCounter(predicate=predicate) - return AlleleCountingGenotypePredicate(allele_counter=allele_counter) - - def __init__(self, allele_counter: AlleleCounter): - self._allele_counter = allele_counter - - def get_categorizations(self) -> typing.Sequence[Categorization]: - """ - The predicate bins a patient into - :attr:`AlleleCountingGenotypePredicate.NO` - or :class:`AlleleCountingGenotypePredicate.YES` category. - """ - return ( - AlleleCountingGenotypePredicate.YES, - AlleleCountingGenotypePredicate.NO, - ) - - def get_question_base(self) -> str: - return self._allele_counter.get_question() - - def test(self, patient: Patient) -> typing.Optional[Categorization]: - self._check_patient(patient) - - allele_count = self._allele_counter.count(patient) - if allele_count > 0: - return AlleleCountingGenotypePredicate.YES - elif allele_count == 0: - return AlleleCountingGenotypePredicate.NO - else: - raise ValueError( - f"Allele counter should return a non-negative allele count: {allele_count}" - ) - - def __eq__(self, value: object) -> bool: - return ( - isinstance(value, AlleleCountingGenotypePredicate) - and self._allele_counter == value._allele_counter - ) - - def __hash__(self) -> int: - return hash((self._allele_counter,)) - - def __str__(self) -> str: - return f"AlleleCountingGenotypePredicate(allele_counter={self._allele_counter})" - - def __repr__(self) -> str: - return str(self) - - -def boolean_predicate(variant_predicate: VariantPredicate) -> GenotypePolyPredicate: - """ - Create a genotype boolean predicate from given `variant_predicate` - to test for presence of at least one matching allele in the patient. - """ - return AlleleCountingGenotypePredicate.for_variant_predicate( - predicate=variant_predicate, - ) - - class AlleleCountingGroupsPredicate(GenotypePolyPredicate): # NOT PART OF THE PUBLIC API diff --git a/tests/analysis/pcats/test_hpo_term_analysis.py b/tests/analysis/pcats/test_hpo_term_analysis.py index dbd8d508..9d18f87a 100644 --- a/tests/analysis/pcats/test_hpo_term_analysis.py +++ b/tests/analysis/pcats/test_hpo_term_analysis.py @@ -57,25 +57,25 @@ def test_compare_genotype_vs_phenotypes( assert results is not None assert results.total_tests == 4 - assert results.n_usable == (35, 18, 13, 25, 23) + assert results.n_usable == (17, 7, 5, 13, 12) assert results.pvals == pytest.approx( [ - 0.0721291631224236, - 1.0, + 0.35294117647058815, + 0.48571428571428565, float("nan"), - 0.35921595820909313, - 0.6668461434917216, + 0.1048951048951049, + 1., ], nan_ok=True, ) assert results.corrected_pvals is not None assert results.corrected_pvals == pytest.approx( [ - 0.2885166524896944, - 1.0, + 0.6476190476190475, + 0.6476190476190475, float("nan"), - 0.7184319164181863, - 0.8891281913222954, + 0.4195804195804196, + 1.0, ], nan_ok=True, ) diff --git a/tests/analysis/predicate/genotype/test_gt_predicates.py b/tests/analysis/predicate/genotype/test_gt_predicates.py index 222660ad..6d1b0988 100644 --- a/tests/analysis/predicate/genotype/test_gt_predicates.py +++ b/tests/analysis/predicate/genotype/test_gt_predicates.py @@ -175,10 +175,10 @@ def test_monoallelic_predicate__general_stuff( @pytest.mark.parametrize( "individual_name,expected_name", [ - ("walt", "AB"), # 0/1 & 0/1 - ("skyler", "AB"), # 0/1 & 0/1 - ("flynn", "AA"), # 1/1 & 0/0 - ("holly", "BB"), # 0/0 & 1/1 + ("walt", "A/B"), # 0/1 & 0/1 + ("skyler", "A/B"), # 0/1 & 0/1 + ("flynn", "A/A"), # 1/1 & 0/0 + ("holly", "B/B"), # 0/0 & 1/1 ], ) def test_biallelic_predicate( diff --git a/tests/analysis/test_mtc_filter.py b/tests/analysis/test_mtc_filter.py index 88e085dd..b0ab43de 100644 --- a/tests/analysis/test_mtc_filter.py +++ b/tests/analysis/test_mtc_filter.py @@ -67,8 +67,8 @@ def prepare_counts_df( ph_predicate: PhenotypePolyPredicate[hpotk.TermId], ): values = np.array(counts).reshape((2, 2)) - index = pd.Index(gt_predicate.get_categories()) - columns = pd.Index(ph_predicate.get_categories()) + index = pd.Index(ph_predicate.get_categories()) + columns = pd.Index(gt_predicate.get_categories()) return pd.DataFrame(data=values, index=index, columns=columns) @pytest.mark.parametrize( @@ -82,7 +82,7 @@ def prepare_counts_df( ) def test_one_genotype_has_zero_hpo_observations( self, - counts: typing.Tuple[int], + counts: typing.Sequence[int], expected: bool, gt_predicate: GenotypePolyPredicate, ph_predicate: PhenotypePolyPredicate[hpotk.TermId], @@ -200,7 +200,7 @@ def test_filter_terms_to_test( reasons = [r.reason for r in mtc_report] assert reasons == [ None, None, - 'Skipping term because all genotypes have same HPO observed proportions', + 'Skipping term with less than 7 observations (not powered for 2x2)', None, None, ] @@ -263,7 +263,7 @@ def test_min_observed_HPO_threshold( for i, p in enumerate(suox_pheno_predicates) } - # Ectopia lentis HP:0001083 (6 9 1 2), freqs are 6/15=0.4 and 1/3=0.33 + # Ectopia lentis HP:0001083 (1 2 3 1), freqs are 1/4=0.25 and 3/4=0.75 idx = curie2idx["HP:0001083"] ectopia = patient_counts[idx] ectopia_predicate = suox_pheno_predicates[idx] @@ -271,9 +271,9 @@ def test_min_observed_HPO_threshold( ectopia, ph_predicate=ectopia_predicate, ) - assert max_f == pytest.approx(0.4, abs=EPSILON) + assert max_f == pytest.approx(0.75, abs=EPSILON) - # Seizure HP:0001250 (17 7 11 0), freqs are 17/24=0.7083 and 11/11=1 + # Seizure HP:0001250 (11 5 0 1), freqs are 11/11=1.0 and 5/6=0.8333333 idx = curie2idx["HP:0001250"] seizure = patient_counts[idx] seizure_predicate = suox_pheno_predicates[idx] @@ -283,7 +283,7 @@ def test_min_observed_HPO_threshold( ) assert max_f == pytest.approx(1.0, abs=EPSILON) - # Sulfocysteinuria HP:0032350 (11 0 2 0), freqs are both 1 + # Sulfocysteinuria HP:0032350 (2 3 0 0), freqs are both 1 idx = curie2idx["HP:0032350"] sulfocysteinuria = patient_counts[idx] sulfocysteinuria_predicate = suox_pheno_predicates[idx] @@ -293,7 +293,7 @@ def test_min_observed_HPO_threshold( ) assert max_f == pytest.approx(1.0, abs=EPSILON) - # Neurodevelopmental delay HP:0012758 (4 13 4 4), freqs are 4/17 = 0.235 and 4/8=0.5 + # Neurodevelopmental delay HP:0012758 (4 0 4 5), freqs are 4/8 = 0.5 and 0/5=0.0 idx = curie2idx["HP:0012758"] ndelay = patient_counts[idx] ndelay_predicate = suox_pheno_predicates[idx] @@ -303,7 +303,7 @@ def test_min_observed_HPO_threshold( ) assert max_f == pytest.approx(0.5, abs=EPSILON) - # Hypertonia HP:0001276 (7 9 4 3) fresa are 7/16=0.4375 and 4/7=0.5714 + # Hypertonia HP:0001276 (4 2 3 3) freqs are 4/7=0.4375 and 2/5=0.5714 idx = curie2idx["HP:0001276"] hypertonia = patient_counts[idx] hypertonia_predicate = suox_pheno_predicates[idx] diff --git a/tests/conftest.py b/tests/conftest.py index 907b7d83..5459ce3f 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -9,7 +9,7 @@ from gpsea.analysis.mtc_filter import PhenotypeMtcResult from gpsea.analysis.pcats import HpoTermAnalysisResult -from gpsea.analysis.predicate.genotype import GenotypePolyPredicate, VariantPredicates, boolean_predicate +from gpsea.analysis.predicate.genotype import GenotypePolyPredicate, ModeOfInheritancePredicate, VariantPredicates from gpsea.analysis.predicate.phenotype import PhenotypePolyPredicate, HpoPredicate from gpsea.io import GpseaJSONDecoder from gpsea.model import * @@ -119,9 +119,7 @@ def suox_mane_tx_id() -> str: def suox_gt_predicate( suox_mane_tx_id: str, ) -> GenotypePolyPredicate: - # To bin the patients to a group with >1 MISSENSE variant or 0 MISSENSE variants. - - return boolean_predicate( + return ModeOfInheritancePredicate.autosomal_dominant( variant_predicate=VariantPredicates.variant_effect( effect=VariantEffect.MISSENSE_VARIANT, tx_id=suox_mane_tx_id diff --git a/tests/test_predicates.py b/tests/test_predicates.py index 467a5df1..71308b8e 100644 --- a/tests/test_predicates.py +++ b/tests/test_predicates.py @@ -4,8 +4,7 @@ from gpsea.analysis.predicate import PatientCategory, PatientCategories from gpsea.analysis.predicate.phenotype import HpoPredicate, DiseasePresencePredicate from gpsea.analysis.predicate.genotype import * -from gpsea.model import Cohort, Patient, FeatureType, VariantEffect -from gpsea.model.genome import Region +from gpsea.model import Cohort, Patient from gpsea.preprocessing import ProteinMetadataService @@ -89,100 +88,7 @@ def test_disease_predicate( assert actual.category == patient_category -@pytest.mark.parametrize('patient_id, variant_effect, expected_result', - (['HetSingleVar', VariantEffect.FRAMESHIFT_VARIANT, PatientCategories.YES], - ['HetSingleVar', VariantEffect.MISSENSE_VARIANT, PatientCategories.NO], - ['HetDoubleVar1', VariantEffect.STOP_GAINED, PatientCategories.YES], - ['HomoVar', VariantEffect.FRAMESHIFT_VARIANT, PatientCategories.YES], - ['LargeCNV', VariantEffect.STOP_LOST, PatientCategories.YES], - ['LargeCNV', VariantEffect.FEATURE_TRUNCATION, PatientCategories.YES])) -def test_VariantEffectPredicate(patient_id: str, - variant_effect: VariantEffect, - expected_result: PatientCategory, - toy_cohort: Cohort): - patient = find_patient(patient_id, toy_cohort) - predicate = boolean_predicate(VariantPredicates.variant_effect(effect=variant_effect, tx_id='NM_013275.6')) - result = predicate.test(patient) - assert result.category == expected_result - - -@pytest.mark.parametrize('patient_id, variant, hasVarResult', - (['HetSingleVar', '16_89279850_89279850_G_GC', PatientCategories.YES], - # the `HetSingleVar` patient does NOT have the variant. - ['HetSingleVar', '16_89279708_89279725_AGTGTTCGGGGCGGGGCC_A', PatientCategories.NO], - # but `HetDoubleVar2` below DOES have the variant. - ['HetDoubleVar2', '16_89279708_89279725_AGTGTTCGGGGCGGGGCC_A', PatientCategories.YES], - ['HetDoubleVar1', '16_89284601_89284602_GG_A', PatientCategories.YES], - ['HetDoubleVar1', '16_89280752_89280752_G_T', PatientCategories.YES], - # the `HomoVar` patient does NOT have the variant - ['HomoVar', '16_89280752_89280752_G_T', PatientCategories.NO], - ['HomoVar', '16_89279458_89279459_TG_T', PatientCategories.YES], - ['LargeCNV', '16_89190071_89439815_DEL', PatientCategories.YES])) -def test_VariantKeyPredicate(patient_id, variant, hasVarResult, toy_cohort): - predicate = boolean_predicate(VariantPredicates.variant_key(key=variant)) - patient = find_patient(patient_id, toy_cohort) - result = predicate.test(patient) - assert result.category == hasVarResult - - -@pytest.mark.parametrize('patient_id, exon, hasVarResult', - (['HetSingleVar', 9, PatientCategories.YES], - ['HetSingleVar', 13, PatientCategories.NO], - ['HetDoubleVar1', 9, PatientCategories.YES], - ['HetDoubleVar2', 10, PatientCategories.YES], - ['HetDoubleVar2', 9, PatientCategories.YES], - ['HomoVar', 10, PatientCategories.NO], - ['HomoVar', 9, PatientCategories.YES], - ['LargeCNV', 1, PatientCategories.NO], - ['LargeCNV', 13, PatientCategories.YES])) -def test_ExonPredicate(patient_id, exon, hasVarResult, toy_cohort): - patient = find_patient(patient_id, toy_cohort) - predicate = VariantPredicates.exon(exon=exon, tx_id='NM_013275.6') - predicate = boolean_predicate(predicate) - result = predicate.test(patient) - assert result.category == hasVarResult - @pytest.fixture(scope='module') def protein_predicates(protein_test_service: ProteinMetadataService) -> ProteinPredicates: return ProteinPredicates(protein_metadata_service=protein_test_service) - - -@pytest.mark.parametrize('patient_id, feature_type, hasVarResult', - (['HetDoubleVar2', FeatureType.REGION, PatientCategories.YES], - ['HetDoubleVar2', FeatureType.REPEAT, PatientCategories.NO], - ['HetSingleVar', FeatureType.REGION, PatientCategories.YES], - ['HomoVar', FeatureType.REGION, PatientCategories.YES], - ['HetDoubleVar1', FeatureType.REPEAT, PatientCategories.NO])) -## TODO Why do CNV not show as affecting a feature? -##['LargeCNV', FeatureType.REGION , HETEROZYGOUS])) -def test_ProteinFeatureTypePredicate(patient_id, feature_type, hasVarResult, toy_cohort, protein_predicates): - patient = find_patient(patient_id, toy_cohort) - predicate = boolean_predicate(protein_predicates.protein_feature_type(feature_type=feature_type, tx_id='NM_013275.6')) - result = predicate.test(patient) - assert result.category == hasVarResult - - -@pytest.mark.parametrize('patient_id, feature, hasVarResult', - (['HetDoubleVar2', 'Disordered', PatientCategories.YES], - ['HetDoubleVar2', 'BadFeature', PatientCategories.NO], - ['HetSingleVar', 'Disordered', PatientCategories.YES], - ['HomoVar', 'Disordered', PatientCategories.YES], - ['HetDoubleVar1', 'Disordered', PatientCategories.YES])) -def test_ProteinFeaturePredicate(patient_id, feature, hasVarResult, toy_cohort, protein_predicates): - predicate = boolean_predicate(protein_predicates.protein_feature(feature_id=feature, tx_id='NM_013275.6')) - patient = find_patient(patient_id, toy_cohort) - result = predicate.test(patient) - assert result.category == hasVarResult - -@pytest.mark.parametrize('patient_id, region, hasVarResult', - (['HetDoubleVar2', Region(2000, 2500), PatientCategories.YES], - ['HetDoubleVar2', Region(1000, 1500), PatientCategories.NO], - ['HomoVar', Region(2000, 2500), PatientCategories.YES], - ['HetSingleVar', Region(2000, 2500), PatientCategories.YES], - ['HetDoubleVar1', Region(600, 650), PatientCategories.YES])) -def test_ProteinRegionPredicate(patient_id, region, hasVarResult, toy_cohort): - predicate = boolean_predicate(VariantPredicates.region(region=region, tx_id='NM_013275.6')) - patient = find_patient(patient_id, toy_cohort) - result = predicate.test(patient) - assert result.category == hasVarResult From b0b157c0f462fbabaf7a123e008fea7d516c9d73 Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Wed, 11 Sep 2024 17:01:24 +0200 Subject: [PATCH 8/8] Fix bug in `HpoPredicate`. --- src/gpsea/analysis/predicate/phenotype/_pheno.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/gpsea/analysis/predicate/phenotype/_pheno.py b/src/gpsea/analysis/predicate/phenotype/_pheno.py index 01e692d8..004ea672 100644 --- a/src/gpsea/analysis/predicate/phenotype/_pheno.py +++ b/src/gpsea/analysis/predicate/phenotype/_pheno.py @@ -198,7 +198,7 @@ def test( return None def __eq__(self, value: object) -> bool: - return isinstance(value, PropagatingPhenotypePredicate) \ + return isinstance(value, HpoPredicate) \ and self._hpo.version == value._hpo.version \ and self._query == value._query \ and self._missing_implies_phenotype_excluded == value._missing_implies_phenotype_excluded @@ -207,7 +207,7 @@ def __hash__(self) -> int: return hash((self._hpo.version, self._query, self._missing_implies_phenotype_excluded)) def __repr__(self): - return f"PropagatingPhenotypeBooleanPredicate(query={self._query})" + return f"HpoPredicate(query={self._query})" class DiseasePresencePredicate(PhenotypePolyPredicate[hpotk.TermId]):