From 58bf50aa2578d2aaed957871f54c040832c30071 Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Thu, 26 Oct 2023 11:51:35 -0400 Subject: [PATCH] Fix branches in parsing GenomicInterpretation. Test parsing HGVS expression. --- .../preprocessing/_phenopacket.py | 32 +++++++++++++++---- .../preprocessing/_test_variant.py | 30 ++++++++++++----- 2 files changed, 48 insertions(+), 14 deletions(-) diff --git a/src/genophenocorr/preprocessing/_phenopacket.py b/src/genophenocorr/preprocessing/_phenopacket.py index 159e3d8d..9f13dec6 100644 --- a/src/genophenocorr/preprocessing/_phenopacket.py +++ b/src/genophenocorr/preprocessing/_phenopacket.py @@ -47,7 +47,7 @@ def find_coordinates(self, item: GenomicInterpretation) -> typing.Tuple[VariantC variant_descriptor = item.variant_interpretation.variation_descriptor vc = None - if variant_descriptor.vcf_record.IsInitialized(): + if self._vcf_is_available(variant_descriptor.vcf_record): # We have a VCF record. contig = self._build.contig_by_name(variant_descriptor.vcf_record.chrom) start = int(variant_descriptor.vcf_record.pos) - 1 @@ -58,9 +58,10 @@ def find_coordinates(self, item: GenomicInterpretation) -> typing.Tuple[VariantC region = GenomicRegion(contig, start, end, Strand.POSITIVE) vc = VariantCoordinates(region, ref, alt, change_length) - elif variant_descriptor.variation.IsInitialized(): + elif self._cnv_is_available(variant_descriptor.variation): # We have a CNV. - seq_location = variant_descriptor.variation.copy_number.allele.sequence_location + variation = variant_descriptor.variation + seq_location = variation.copy_number.allele.sequence_location refseq_contig_name = seq_location.sequence_id.split(':')[1] contig = self._build.contig_by_name(refseq_contig_name) @@ -69,7 +70,7 @@ def find_coordinates(self, item: GenomicInterpretation) -> typing.Tuple[VariantC start = int(seq_location.sequence_interval.start_number.value) - 1 end = int(seq_location.sequence_interval.end_number.value) ref = 'N' - number = int(variant_descriptor.variation.copy_number.number.value) + number = int(variation.copy_number.number.value) if number > 2: alt = '' else: @@ -84,8 +85,8 @@ def find_coordinates(self, item: GenomicInterpretation) -> typing.Tuple[VariantC if expression.syntax == 'hgvs.c': vc = self._hgvs_finder.find_coordinates(expression.value) break - else: - # No way we can get something from this `GenomicInterpretation`! + + if vc is None: raise ValueError('Expected a VCF record, a VRS CNV, or an expression with `hgvs.c` ' 'but did not find one') @@ -95,6 +96,25 @@ def find_coordinates(self, item: GenomicInterpretation) -> typing.Tuple[VariantC return vc, gt + @staticmethod + def _vcf_is_available(vcf_record) -> bool: + """ + Check if we can parse data out of VCF record. + """ + return (vcf_record.genome_assembly != '' + and vcf_record.chrom != '' + and vcf_record.pos >= 0 + and vcf_record.ref != '' + and vcf_record.alt != '') + + @staticmethod + def _cnv_is_available(variation): + seq_location = variation.copy_number.allele.sequence_location + return (seq_location.sequence_id != '' + and seq_location.sequence_interval.start_number.value >= 0 + and seq_location.sequence_interval.end_number.value >= 0 + and variation.copy_number.number.value != '') + @staticmethod def _map_geno_genotype_label(genotype: str) -> Genotype: """ diff --git a/src/genophenocorr/preprocessing/_test_variant.py b/src/genophenocorr/preprocessing/_test_variant.py index 4682373c..d78abc01 100644 --- a/src/genophenocorr/preprocessing/_test_variant.py +++ b/src/genophenocorr/preprocessing/_test_variant.py @@ -7,28 +7,42 @@ # pyright: reportGeneralTypeIssues=false from phenopackets import Phenopacket, GenomicInterpretation -from genophenocorr.model.genome import GRCh38 +from genophenocorr.model.genome import GenomeBuild, GRCh38 +from ._api import VariantCoordinateFinder from ._phenopacket import PhenopacketVariantCoordinateFinder from ._protein import ProteinAnnotationCache, ProtCachingFunctionalAnnotator from ._uniprot import UniprotProteinMetadataService from ._variant import VariantAnnotationCache, VarCachingFunctionalAnnotator from ._vep import VepFunctionalAnnotator +from ._vv import VVHgvsVariantCoordinateFinder @pytest.fixture -def pp_vc_finder() -> PhenopacketVariantCoordinateFinder: - return PhenopacketVariantCoordinateFinder(GRCh38) +def build() -> GenomeBuild: + return GRCh38 + + +@pytest.fixture +def hgvs_vc_finder(build: GenomeBuild) -> VariantCoordinateFinder: + return VVHgvsVariantCoordinateFinder(build) + + +@pytest.fixture +def pp_vc_finder(build: GenomeBuild, + hgvs_vc_finder: VariantCoordinateFinder) -> PhenopacketVariantCoordinateFinder: + return PhenopacketVariantCoordinateFinder(build, hgvs_vc_finder) @pytest.mark.parametrize("pp_path, expected", [('test_data/deletion_test.json', '16_89284129_89284134_CTTTTT_C'), - ('test_data/insertion_test.json', '16_89280829_89280830_C_CA'), + ('test_data/insertion_test.json', '16_89280829_89280829_C_CA'), ('test_data/missense_test.json', '16_89279135_89279135_G_C'), - ('test_data/duplication_test.json', '16_89279850_89279851_G_GC'), + ('test_data/missense_hgvs_test.json', '16_89279135_89279135_G_C'), + ('test_data/duplication_test.json', '16_89279850_89279850_G_GC'), ('test_data/delinsert_test.json', '16_89284601_89284602_GG_A'), - ('test_data/CVDup_test.json', '16_89284524_89373231_DUP'), - ('test_data/CVDel_test.json', '16_89217282_89506042_DEL') + ('test_data/CVDup_test.json', '16_89284523_89373231_DUP'), + ('test_data/CVDel_test.json', '16_89217281_89506042_DEL') ]) def test_find_coordinates(pp_path, expected, pp_vc_finder): fname = resource_filename(__name__, pp_path) @@ -36,7 +50,7 @@ def test_find_coordinates(pp_path, expected, pp_vc_finder): vc, gt = pp_vc_finder.find_coordinates(gi) - assert expected == vc.variant_key + assert vc.variant_key == expected def read_genomic_interpretation_json(fpath: str) -> GenomicInterpretation: