Skip to content

Commit

Permalink
Merge branch 'fix-change-length-bug' into work-on-predicates
Browse files Browse the repository at this point in the history
  • Loading branch information
ielis committed Oct 26, 2023
2 parents bc91a9a + 58bf50a commit 6ff4fc8
Show file tree
Hide file tree
Showing 13 changed files with 395 additions and 327 deletions.
4 changes: 3 additions & 1 deletion src/genophenocorr/preprocessing/_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from ._uniprot import UniprotProteinMetadataService
from ._variant import VarCachingFunctionalAnnotator, VariantAnnotationCache
from ._vep import VepFunctionalAnnotator
from ._vv import VVHgvsVariantCoordinateFinder


def configure_caching_patient_creator(hpo: hpotk.MinimalOntology,
Expand Down Expand Up @@ -48,7 +49,8 @@ def configure_caching_patient_creator(hpo: hpotk.MinimalOntology,

phenotype_creator = _setup_phenotype_creator(hpo, validation_runner)
functional_annotator = _configure_functional_annotator(cache_dir, variant_fallback, protein_fallback)
return PhenopacketPatientCreator(build, phenotype_creator, functional_annotator)
hgvs_annotator = VVHgvsVariantCoordinateFinder(build)
return PhenopacketPatientCreator(build, phenotype_creator, functional_annotator, hgvs_annotator)


def _setup_phenotype_creator(hpo: hpotk.MinimalOntology,
Expand Down
120 changes: 76 additions & 44 deletions src/genophenocorr/preprocessing/_phenopacket.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import logging
import re
import os
import typing

Expand All @@ -20,15 +19,19 @@


class PhenopacketVariantCoordinateFinder(VariantCoordinateFinder[GenomicInterpretation]):
"""A class that creates VariantCoordinates from a Phenopacket
"""
`PhenopacketVariantCoordinateFinder` figures out :class:`genophenocorr.model.VariantCoordinates`
and :class:`genophenocorr.model.Genotype` from `GenomicInterpretation` element of Phenopacket Schema.
Methods:
find_coordinates(item:GenomicInterpretation): Creates VariantCoordinates from the data in a given Phenopacket
:param build: genome build to use in `VariantCoordinates
:param hgvs_coordinate_finder: the coordinate finder to use for parsing HGVS expressions
"""
def __init__(self, build: GenomeBuild):
"""Constructs all necessary attributes for a PhenopacketVariantCoordinateFinder object"""
def __init__(self, build: GenomeBuild,
hgvs_coordinate_finder: VariantCoordinateFinder[str]):
self._logger = logging.getLogger(__name__)
self._build = build
self._build = hpotk.util.validate_instance(build, GenomeBuild, 'build')
self._hgvs_finder = hpotk.util.validate_instance(hgvs_coordinate_finder, VariantCoordinateFinder,
'hgvs_coordinate_finder')

def find_coordinates(self, item: GenomicInterpretation) -> typing.Tuple[VariantCoordinates, Genotype]:
"""Creates a VariantCoordinates object from the data in a given Phenopacket
Expand All @@ -40,41 +43,78 @@ def find_coordinates(self, item: GenomicInterpretation) -> typing.Tuple[VariantC
"""
if not isinstance(item, GenomicInterpretation):
raise ValueError(f"item must be a Phenopacket GenomicInterpretation but was type {type(item)}")

variant_descriptor = item.variant_interpretation.variation_descriptor
if len(variant_descriptor.vcf_record.chrom) == 0 and len(
variant_descriptor.variation.copy_number.allele.sequence_location.sequence_id) != 0:

vc = None
if self._vcf_is_available(variant_descriptor.vcf_record):
# We have a VCF record.
contig = self._build.contig_by_name(variant_descriptor.vcf_record.chrom)
start = int(variant_descriptor.vcf_record.pos) - 1
ref = variant_descriptor.vcf_record.ref
alt = variant_descriptor.vcf_record.alt
end = start + len(ref)
change_length = end - start

region = GenomicRegion(contig, start, end, Strand.POSITIVE)
vc = VariantCoordinates(region, ref, alt, change_length)
elif self._cnv_is_available(variant_descriptor.variation):
# We have a CNV.
variation = variant_descriptor.variation
seq_location = variation.copy_number.allele.sequence_location
refseq_contig_name = seq_location.sequence_id.split(':')[1]
contig = self._build.contig_by_name(refseq_contig_name)

# Assuming SV coordinates are 1-based (VCF style),
# so we subtract 1 to transform to 0-based coordinate system
start = int(seq_location.sequence_interval.start_number.value) - 1
end = int(seq_location.sequence_interval.end_number.value)
ref = 'N'
start = int(
variant_descriptor.variation.copy_number.allele.sequence_location.sequence_interval.start_number.value)
end = int(
variant_descriptor.variation.copy_number.allele.sequence_location.sequence_interval.end_number.value)
number = int(variant_descriptor.variation.copy_number.number.value)
number = int(variation.copy_number.number.value)
if number > 2:
alt = '<DUP>'
else:
alt = '<DEL>'
refseq_contig_name = variant_descriptor.variation.copy_number.allele.sequence_location.sequence_id.split(':')[1]
contig = self._build.contig_by_name(refseq_contig_name)
elif len(variant_descriptor.vcf_record.chrom) != 0 and len(
variant_descriptor.variation.copy_number.allele.sequence_location.sequence_id) == 0:
ref = variant_descriptor.vcf_record.ref
alt = variant_descriptor.vcf_record.alt
start = int(variant_descriptor.vcf_record.pos) - 1
end = int(variant_descriptor.vcf_record.pos) + abs(len(alt) - len(ref))
contig = self._build.contig_by_name(variant_descriptor.vcf_record.chrom[3:])
else:
raise ValueError('Expected a VCF record or a VRS CNV but did not find one')
change_length = end - start

region = GenomicRegion(contig, start, end, Strand.POSITIVE)
vc = VariantCoordinates(region, ref, alt, change_length)
elif len(variant_descriptor.expressions) > 0:
# We have some expressions. Let's try to find the 1st expression with `hgvs.c` syntax.
for expression in variant_descriptor.expressions:
if expression.syntax == 'hgvs.c':
vc = self._hgvs_finder.find_coordinates(expression.value)
break

if vc is None:
raise ValueError('Expected a VCF record, a VRS CNV, or an expression with `hgvs.c` '
'but did not find one')

# Last, parse genotype.
genotype = variant_descriptor.allelic_state.label

if any(field is None for field in (contig, ref, alt, genotype)):
raise ValueError(f'Cannot determine variant coordinate from genomic interpretation {item}')
region = GenomicRegion(contig, start, end, Strand.POSITIVE)

vc = VariantCoordinates(region, ref, alt, len(alt) - len(ref))
gt = self._map_geno_genotype_label(genotype)

return vc, gt

@staticmethod
def _vcf_is_available(vcf_record) -> bool:
"""
Check if we can parse data out of VCF record.
"""
return (vcf_record.genome_assembly != ''
and vcf_record.chrom != ''
and vcf_record.pos >= 0
and vcf_record.ref != ''
and vcf_record.alt != '')

@staticmethod
def _cnv_is_available(variation):
seq_location = variation.copy_number.allele.sequence_location
return (seq_location.sequence_id != ''
and seq_location.sequence_interval.start_number.value >= 0
and seq_location.sequence_interval.end_number.value >= 0
and variation.copy_number.number.value != '')

@staticmethod
def _map_geno_genotype_label(genotype: str) -> Genotype:
"""
Expand All @@ -91,25 +131,17 @@ def _map_geno_genotype_label(genotype: str) -> Genotype:


class PhenopacketPatientCreator(PatientCreator[Phenopacket]):
"""A class that creates a Patient object
Methods:
create_patient(item:Phenopacket): Creates a Patient from the data in a given Phenopacket
"""
`PhenopacketPatientCreator` transforms `Phenopacket` into :class:`genophenocorr.model.Patient`.
"""

def __init__(self, build: GenomeBuild,
phenotype_creator: PhenotypeCreator,
var_func_ann: FunctionalAnnotator):
"""Constructs all necessary attributes for a PhenopacketPatientCreator object
Args:
build (GenomeBuild): A genome build to use to load variant coordinates.
phenotype_creator (PhenotypeCreator): A PhenotypeCreator object for Phenotype creation
var_func_ann (FunctionalAnnotator): A FunctionalAnnotator object for Variant creation
"""
var_func_ann: FunctionalAnnotator,
hgvs_coordinate_finder: VariantCoordinateFinder[str]):
self._logger = logging.getLogger(__name__)
# Violates DI, but it is specific to this class, so I'll leave it "as is".
self._coord_finder = PhenopacketVariantCoordinateFinder(build)
self._coord_finder = PhenopacketVariantCoordinateFinder(build, hgvs_coordinate_finder)
self._phenotype_creator = hpotk.util.validate_instance(phenotype_creator, PhenotypeCreator, 'phenotype_creator')
self._func_ann = hpotk.util.validate_instance(var_func_ann, FunctionalAnnotator, 'var_func_ann')

Expand Down
30 changes: 22 additions & 8 deletions src/genophenocorr/preprocessing/_test_variant.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,36 +7,50 @@
# pyright: reportGeneralTypeIssues=false
from phenopackets import Phenopacket, GenomicInterpretation

from genophenocorr.model.genome import GRCh38
from genophenocorr.model.genome import GenomeBuild, GRCh38

from ._api import VariantCoordinateFinder
from ._phenopacket import PhenopacketVariantCoordinateFinder
from ._protein import ProteinAnnotationCache, ProtCachingFunctionalAnnotator
from ._uniprot import UniprotProteinMetadataService
from ._variant import VariantAnnotationCache, VarCachingFunctionalAnnotator
from ._vep import VepFunctionalAnnotator
from ._vv import VVHgvsVariantCoordinateFinder


@pytest.fixture
def pp_vc_finder() -> PhenopacketVariantCoordinateFinder:
return PhenopacketVariantCoordinateFinder(GRCh38)
def build() -> GenomeBuild:
return GRCh38


@pytest.fixture
def hgvs_vc_finder(build: GenomeBuild) -> VariantCoordinateFinder:
return VVHgvsVariantCoordinateFinder(build)


@pytest.fixture
def pp_vc_finder(build: GenomeBuild,
hgvs_vc_finder: VariantCoordinateFinder) -> PhenopacketVariantCoordinateFinder:
return PhenopacketVariantCoordinateFinder(build, hgvs_vc_finder)


@pytest.mark.parametrize("pp_path, expected",
[('test_data/deletion_test.json', '16_89284129_89284134_CTTTTT_C'),
('test_data/insertion_test.json', '16_89280829_89280830_C_CA'),
('test_data/insertion_test.json', '16_89280829_89280829_C_CA'),
('test_data/missense_test.json', '16_89279135_89279135_G_C'),
('test_data/duplication_test.json', '16_89279850_89279851_G_GC'),
('test_data/missense_hgvs_test.json', '16_89279135_89279135_G_C'),
('test_data/duplication_test.json', '16_89279850_89279850_G_GC'),
('test_data/delinsert_test.json', '16_89284601_89284602_GG_A'),
('test_data/CVDup_test.json', '16_89284524_89373231_DUP'),
('test_data/CVDel_test.json', '16_89217282_89506042_DEL')
('test_data/CVDup_test.json', '16_89284523_89373231_DUP'),
('test_data/CVDel_test.json', '16_89217281_89506042_DEL')
])
def test_find_coordinates(pp_path, expected, pp_vc_finder):
fname = resource_filename(__name__, pp_path)
gi = read_genomic_interpretation_json(fname)

vc, gt = pp_vc_finder.find_coordinates(gi)

assert expected == vc.variant_key
assert vc.variant_key == expected


def read_genomic_interpretation_json(fpath: str) -> GenomicInterpretation:
Expand Down
5 changes: 5 additions & 0 deletions src/genophenocorr/preprocessing/_test_vv.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,11 @@ def test_load_hgvs_SURF2_ins(self, coordinate_finder: VVHgvsVariantCoordinateFin
assert vc.alt == 'TAGC'
assert vc.change_length == 3

@pytest.mark.skip('Online tests are disabled by default')
def test_find_coordinates(self, coordinate_finder: VVHgvsVariantCoordinateFinder):
vc = coordinate_finder.find_coordinates('NM_013275.6:c.7407C>G')
print(vc)


def load_response_json(path: str):
with open(path) as fh:
Expand Down
34 changes: 14 additions & 20 deletions src/genophenocorr/preprocessing/_vv.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,25 +5,20 @@
import hpotk
import requests

from genophenocorr.model import VariantCoordinates, Genotype
from genophenocorr.model import VariantCoordinates
from genophenocorr.model.genome import GenomeBuild, GenomicRegion, Strand
from ._api import VariantCoordinateFinder


class VVHgvsVariantCoordinateFinder(VariantCoordinateFinder[typing.Tuple[str, str]]):
class VVHgvsVariantCoordinateFinder(VariantCoordinateFinder[str]):
"""
`VVHgvsVariantCoordinateFinder` uses Variant Validator's REST API to build :class:`VariantCoordinates`
from an HGVS string.
The finder takes a tuple with two strings:
The finder takes an HGVS `str` (e.g. `NM_005912.3:c.253A>G`) and extracts the variant coordinates from the response.
* HGVS `str` (e.g. `NM_005912.3:c.253A>G`)
* genotype `str` (e.g. `heterozygous`)
and extracts the variant coordinates from the response.
:param genome_build: the genome build to use to construct :class:`VariantCoordinates`.
:param timeout: the REST API request timeout (default: 10).
:param genome_build: the genome build to use to construct :class:`VariantCoordinates`
:param timeout: the REST API request timeout
"""

def __init__(self, genome_build: GenomeBuild, timeout: int = 10):
Expand All @@ -33,33 +28,32 @@ def __init__(self, genome_build: GenomeBuild, timeout: int = 10):
self._headers = {'Content-type': 'application/json'}
self._hgvs_pattern = re.compile(r'^(?P<tx>NM_\d+\.\d+):c.\d+(_\d+)?.*')

def find_coordinates(self, item: typing.Tuple[str, str]) -> typing.Tuple[VariantCoordinates, Genotype]:
def find_coordinates(self, item: str) -> VariantCoordinates:
"""
Extracts variant coordinates from an HGVS string using Variant Validator's REST API.
:param item: Tuple of hgvs string and genotype string
:return: variant coordinates and genotype
:param item: a hgvs string
:return: variant coordinates
"""
hgvs, genotype = item
matcher = self._hgvs_pattern.match(hgvs)
matcher = self._hgvs_pattern.match(item)
if matcher:
transcript = matcher.group('tx')
request_url = self._url % (self._build.genome_build_id.major_assembly, hgvs, transcript)
request_url = self._url % (self._build.genome_build_id.major_assembly, item, transcript)

try:
response = requests.get(request_url, headers=self._headers, timeout=self._timeout)
response.raise_for_status()
response = response.json()
variant_coordinates = self._extract_variant_coordinates(response)
except (requests.exceptions.RequestException, VariantValidatorDecodeException) as e:
raise ValueError(f'Error processing {hgvs}', e)
raise ValueError(f'Error processing {item}', e)
else:
# The HGVS did not pass validation by a regular expression.
# Please submit an issue to our GitHub tracker if you believe
# the HGVS expression is correct.
raise ValueError(f'Invalid HGVS string: {hgvs}')
raise ValueError(f'Invalid HGVS string: {item}')

genotype = Genotype[genotype.upper()]
return variant_coordinates, genotype
return variant_coordinates

def _extract_variant_coordinates(self, response: typing.Dict) -> typing.Optional[VariantCoordinates]:
"""
Expand Down
Loading

0 comments on commit 6ff4fc8

Please sign in to comment.