2.0 model changes #226

Merged · 1 commit · Aug 3, 2023
7 changes: 4 additions & 3 deletions src/ga4gh/core/_internal/enderef.py
@@ -10,6 +10,7 @@
"""

from .identifiers import ga4gh_identify, is_ga4gh_identifier
from .pydantic import is_pydantic_instance, is_list, is_curie_type, is_identifiable, pydantic_copy


def ga4gh_enref(o, cra_map, object_store=None):
@@ -39,7 +40,7 @@ def _enref(o):
o[ran] = [_enref(o2) for o2 in v]
elif isinstance(v, str):
pass
elif is_curie(v): # already a reference
elif is_curie_type(v): # already a reference
assert is_ga4gh_identifier(v), "Identifiable attribute CURIE contains an invalid identifier"
elif v is not None:
_id = _id_and_store(v)
@@ -54,7 +55,7 @@ def _enref(o):
raise ValueError("Called ga4gh_enref() with non-identifiable object")

# in-place replacement on object copy
o = pjs_copy(o)
o = pydantic_copy(o)
_enref(o)
return o

@@ -92,6 +93,6 @@ def _deref(o):
raise ValueError("Called ga4gh_deref() with non-identifiable object")

# in-place replacement on object copy
o = pjs_copy(o)
o = pydantic_copy(o)
_deref(o)
return o
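
A minimal sketch of the copy step these helpers rely on after this change (the Allele construction and the `from ga4gh.vrs import models` import path are illustrative assumptions, not part of this diff):

    from ga4gh.core._internal.pydantic import pydantic_copy  # helper imported above
    from ga4gh.vrs import models  # assumed import path

    allele = models.Allele(
        location=models.SequenceLocation(sequence="refseq:NM_012345.6", start=21, end=22),
        state=models.LiteralSequenceExpression(sequence="T"),
    )

    # pydantic_copy replaces the old pjs_copy: _enref/_deref mutate the copy in place,
    # leaving the caller's original object untouched.
    copied = pydantic_copy(allele)
    assert copied is not allele
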
10 changes: 5 additions & 5 deletions src/ga4gh/vrs/extras/localizer.py
@@ -48,7 +48,7 @@ def __init__(self):
def localize_allele(self, allele):
# copy input variant and replace location
# N.B. deepcopy leads to recursion errors
allele_sl = ga4gh.vrs.models.Variation(**allele.as_dict())
allele_sl = ga4gh.vrs.models.Variation(**allele.model_dump())
del allele_sl.id
allele_sl.location = self.localize(allele.location)
return allele_sl
@@ -59,7 +59,7 @@ def localize_named_feature(self, loc, assembly_name):

"""

assert loc.type._value == "ChromosomeLocation", "Expected a ChromosomeLocation object"
assert loc.type == "ChromosomeLocation", "Expected a ChromosomeLocation object"

def _get_coords(m, cb):
"""return (start,end) of band `cb` in map `m`"""
@@ -103,14 +103,14 @@ def _get_coords(m, cb):
return ga4gh.vrs.models.SequenceLocation(
sequence_id=coerce_namespace(ac),
interval=ga4gh.vrs.models.SequenceInterval(
start=ga4gh.vrs.models.Number(value=start),
end=ga4gh.vrs.models.Number(value=end))
start=start,
end=end)
)





if __name__ == "__main__":
cbl = ga4gh.vrs.models.ChromosomeLocation(chr="11", start="q22.3", end="q23.1")
cbl = ga4gh.vrs.models.ChromosomeLocation(chr="11", start="q22.3", end="q23.1") # TODO non-existent ChromosomeLocation
lr = Localizer()
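
For context, the copy idiom used in localize_allele above is the Pydantic v2 replacement for as_dict(); a hedged sketch, with variable names invented for illustration:

    # model_dump() returns a plain dict of the model's fields; rebuilding the model from
    # that dict yields an independent copy without the recursion errors noted for deepcopy.
    payload = allele.model_dump()
    allele_sl = ga4gh.vrs.models.Variation(**payload)
    allele_sl.location = localizer.localize(allele.location)
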
109 changes: 53 additions & 56 deletions src/ga4gh/vrs/extras/translator.py
@@ -93,11 +93,11 @@ def _from_beacon(self, beacon_expr, assembly_name=None):
"""Parse beacon expression into VRS Allele

#>>> a = tlr.from_beacon("13 : 32936732 G > C")
#>>> a.as_dict()
#>>> a.model_dump()
{'location': {
'end': {'value': 32936732, 'type': Number},
'start': {'value': 32936731, 'type': Number},
'sequence_id': 'GRCh38:13 ',
'end': 32936732,
'start': 32936731,
'sequence': 'GRCh38:13 ',
'type': 'SequenceLocation'},
'state': {'sequence': 'C', 'type': 'LiteralSequenceExpression'},
'type': 'Allele'}
@@ -113,16 +113,14 @@ def _from_beacon(self, beacon_expr, assembly_name=None):
g = m.groupdict()
if assembly_name is None:
assembly_name = self.default_assembly_name
sequence_id = assembly_name + ":" + g["chr"]
sequence = assembly_name + ":" + g["chr"]
start = int(g["pos"]) - 1
ref = g["ref"]
alt = g["alt"]
end = start + len(ref)
ins_seq = alt

location = models.SequenceLocation(sequence_id=sequence_id,
start=models.Number(value=start),
end=models.Number(value=end))
location = models.SequenceLocation(sequence=sequence, start=start, end=end)
state = models.LiteralSequenceExpression(sequence=ins_seq)
allele = models.Allele(location=location, state=state)
allele = self._post_process_imported_allele(allele)
@@ -133,14 +131,14 @@ def _from_gnomad(self, gnomad_expr, assembly_name=None):
"""Parse gnomAD-style VCF expression into VRS Allele

#>>> a = tlr.from_gnomad("1-55516888-G-GA")
#>>> a.as_dict()
#>>> a.model_dump()
{'location': {
'end': {'value': 55516888, 'type': Number},
'start': {'value': 55516887, 'type': Number},
'sequence_id': 'GRCh38:1',
'end': 55516888,
'start': 55516887,
'sequence': 'GRCh38:1',
'type': 'SequenceLocation'},
'state': {'sequence': 'GA', 'type': 'LiteralSequenceExpression'},
'type': 'Allele'}

"""

@@ -153,16 +151,14 @@ def _from_gnomad(self, gnomad_expr, assembly_name=None):
g = m.groupdict()
if assembly_name is None:
assembly_name = self.default_assembly_name
sequence_id = assembly_name + ":" + g["chr"]
sequence = assembly_name + ":" + g["chr"]
start = int(g["pos"]) - 1
ref = g["ref"]
alt = g["alt"]
end = start + len(ref)
ins_seq = alt

location = models.SequenceLocation(sequence_id=sequence_id,
start=models.Number(value=start),
end=models.Number(value=end))
location = models.SequenceLocation(sequence=sequence, start=start, end=end)
sstate = models.LiteralSequenceExpression(sequence=ins_seq)
allele = models.Allele(location=location, state=sstate)
allele = self._post_process_imported_allele(allele)
@@ -173,12 +169,12 @@ def _from_hgvs(self, hgvs_expr):
"""parse hgvs into a VRS object (typically an Allele)

#>>> a = tlr.from_hgvs("NM_012345.6:c.22A>T")
#>>> a.as_dict()
#>>> a.model_dump()
{
'location': {
'end': {'value': 22, 'type': Number},
'start': {'value': 21, 'type': Number},
'sequence_id': 'refseq:NM_012345.6',
'end': 22,
'start': 21,
'sequence': 'refseq:NM_012345.6',
'type': 'SequenceLocation'
},
'state': {'sequence': 'T', 'type': 'LiteralSequenceExpression'},
@@ -195,36 +191,36 @@ def _from_hgvs(self, hgvs_expr):
sv = self._hgvs_parser.parse_hgvs_variant(hgvs_expr)

# prefix accession with namespace
sequence_id = coerce_namespace(sv.ac)
sequence = coerce_namespace(sv.ac)

if isinstance(sv.posedit.pos, hgvs.location.BaseOffsetInterval):
if sv.posedit.pos.start.is_intronic or sv.posedit.pos.end.is_intronic:
raise ValueError("Intronic HGVS variants are not supported ({sv.posedit})")

if sv.posedit.edit.type == "ins":
start=models.Number(value=sv.posedit.pos.start.base)
end=models.Number(value=sv.posedit.pos.start.base)
start = sv.posedit.pos.start.base
end = sv.posedit.pos.start.base
state = sv.posedit.edit.alt
elif sv.posedit.edit.type in ("sub", "del", "delins", "identity"):
start=models.Number(value=sv.posedit.pos.start.base - 1)
end=models.Number(value=sv.posedit.pos.end.base)
start = sv.posedit.pos.start.base - 1
end = sv.posedit.pos.end.base
if sv.posedit.edit.type == "identity":
state = self.data_proxy.get_sequence(sv.ac,
sv.posedit.pos.start.base - 1,
sv.posedit.pos.end.base)
else:
state = sv.posedit.edit.alt or ""
elif sv.posedit.edit.type == "dup":
start=models.Number(value=sv.posedit.pos.start.base - 1)
end=models.Number(value=sv.posedit.pos.end.base)
start = sv.posedit.pos.start.base - 1
end = sv.posedit.pos.end.base
ref = self.data_proxy.get_sequence(sv.ac,
sv.posedit.pos.start.base - 1,
sv.posedit.pos.end.base)
state = ref + ref
else:
raise ValueError(f"HGVS variant type {sv.posedit.edit.type} is unsupported")

location = models.SequenceLocation(sequence_id=sequence_id,
location = models.SequenceLocation(sequence=sequence,
start=start,
end=end)
sstate = models.LiteralSequenceExpression(sequence=state)
@@ -238,12 +234,12 @@ def _from_spdi(self, spdi_expr):
"""Parse SPDI expression in to a GA4GH Allele

#>>> a = tlr.from_spdi("NM_012345.6:21:1:T")
#>>> a.as_dict()
#>>> a.model_dump()
{
'location': {
'end': {'value': 22, 'type': Number},
'start': {'value': 21, 'type': Number},
'sequence_id': 'refseq:NM_012345.6',
'end': 22,
'start': 21,
'sequence': 'refseq:NM_012345.6',
'type': 'SequenceLocation'
},
'state': {'sequence': 'T', 'type': 'LiteralSequenceExpression'},
@@ -258,7 +254,7 @@ def _from_spdi(self, spdi_expr):
return None

g = m.groupdict()
sequence_id = coerce_namespace(g["ac"])
sequence = coerce_namespace(g["ac"])
start = int(g["pos"])
try:
del_len = int(g["del_len_or_seq"])
@@ -267,9 +263,9 @@ def _from_spdi(self, spdi_expr):
end = start + del_len
ins_seq = g["ins_seq"]

location = models.SequenceLocation(sequence_id=sequence_id,
start=models.Number(value=start),
end=models.Number(value=end))
location = models.SequenceLocation(sequence=sequence,
start=start,
end=end)
sstate = models.LiteralSequenceExpression(sequence=ins_seq)
allele = models.Allele(location=location, state=sstate)
allele = self._post_process_imported_allele(allele)
@@ -328,14 +324,14 @@ def ir_stype(a):
if not self.is_valid_allele(vo):
raise ValueError("_to_hgvs requires a VRS Allele with SequenceLocation and LiteralSequenceExpression")

sequence_id = str(vo.location.sequence_id)
aliases = self.data_proxy.translate_sequence_identifier(sequence_id, namespace)
sequence = str(vo.location.sequence)
aliases = self.data_proxy.translate_sequence_identifier(sequence, namespace)

# infer type of sequence based on accession
# TODO: move to bioutils
stypes = list(set(t for t in (ir_stype(a) for a in aliases) if t))
if len(stypes) != 1:
raise ValueError(f"Couldn't infer sequence type for {sequence_id} ({stypes})")
raise ValueError(f"Couldn't infer sequence type for {sequence} ({stypes})")
stype = stypes[0]

# build interval and edit depending on sequence type
@@ -344,18 +340,18 @@ def ir_stype(a):
# ival = hgvs.location.Interval(start=start, end=end)
# edit = hgvs.edit.AARefAlt(ref=None, alt=vo.state.sequence)
else: # pylint: disable=no-else-raise
start, end = vo.location.start.value, vo.location.end.value
start, end = vo.location.start, vo.location.end
# ib: 0 1 2 3 4 5
# h: 1 2 3 4 5
if start == end: # insert: hgvs uses *exclusive coords*
ref = None
end += 1
else: # else: hgvs uses *inclusive coords*
ref = self.data_proxy.get_sequence(sequence_id, start, end)
ref = self.data_proxy.get_sequence(sequence, start, end)
start += 1
ival = hgvs.location.Interval(
start=hgvs.location.SimplePosition(base=start),
end=hgvs.location.SimplePosition(base=end))
start=hgvs.location.SimplePosition(base=start),
end=hgvs.location.SimplePosition(base=end))
alt = str(vo.state.sequence) or None # "" => None
edit = hgvs.edit.NARefAlt(ref=ref, alt=alt)

@@ -415,10 +411,10 @@ def _to_spdi(self, vo, namespace="refseq"):
if not self.is_valid_allele(vo):
raise ValueError("_to_spdi requires a VRS Allele with SequenceLocation and LiteralSequenceExpression")

sequence_id = str(vo.location.sequence_id)
aliases = self.data_proxy.translate_sequence_identifier(sequence_id, namespace)
sequence = str(vo.location.sequence)
aliases = self.data_proxy.translate_sequence_identifier(sequence, namespace)
aliases = [a.split(":")[1] for a in aliases]
start, end = vo.location.start.value, vo.location.end.value
start, end = vo.location.start, vo.location.end
spdi_tail = f":{start}:{end-start}:{vo.state.sequence}"
spdis = [a + spdi_tail for a in aliases]
return spdis
@@ -443,8 +439,8 @@ def _post_process_imported_allele(self, allele):
"""

if self.translate_sequence_identifiers:
seq_id = self.data_proxy.translate_sequence_identifier(allele.location.sequence_id._value, "ga4gh")[0]
allele.location.sequence_id = seq_id
seq_id = self.data_proxy.translate_sequence_identifier(allele.location.sequence.root, "ga4gh")[0]
allele.location.sequence = seq_id

if self.normalize:
allele = normalize(allele, self.data_proxy)
@@ -486,7 +482,8 @@ def _seq_id_mapper(self, ir):
coloredlogs.install(level="INFO")

from ga4gh.vrs.dataproxy import create_dataproxy
dp = create_dataproxy("seqrepo+file:///usr/local/share/seqrepo/latest")
# dp = create_dataproxy("seqrepo+file:///usr/local/share/seqrepo/latest")
dp = create_dataproxy("seqrepo + http://localhost:5555/seqrepo")
tlr = Translator(data_proxy=dp)

expressions = [
@@ -496,9 +493,9 @@ def _seq_id_mapper(self, ir):
"NC_000013.11:g.32936732G>C",
"NM_000551.3:21:1:T", {
"location": {
"end": {"value": 22, "type": "Number"},
"start": {"value": 21, "type": "Number"},
"sequence_id": "ga4gh:SQ.v_QTc1p-MUYdgrRv4LMT6ByXIOsdw3C_",
"end": 22,
"start": 21,
"sequence": "ga4gh:SQ.v_QTc1p-MUYdgrRv4LMT6ByXIOsdw3C_",
"type": "SequenceLocation"
},
"state": {
Expand All @@ -507,8 +504,8 @@ def _seq_id_mapper(self, ir):
},
"type": "Allele"
}, {
"end": {"value": 22, "type": "Number"},
"start": {"value": 21, "type": "Number"}
"end": 22,
"start": 21,
}
]
formats = ["hgvs", "gnomad", "beacon", "spdi", "vrs", None]
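
Taken together, the translator changes drop the 1.x Number wrappers and rename sequence_id to sequence. A hedged before/after sketch (field names follow this diff; the final 2.0 schema may differ):

    # 1.x style (removed):
    #   models.SequenceLocation(sequence_id="GRCh38:13",
    #                           start=models.Number(value=32936731),
    #                           end=models.Number(value=32936732))
    # 2.0 style (added):
    loc = models.SequenceLocation(sequence="GRCh38:13", start=32936731, end=32936732)
    state = models.LiteralSequenceExpression(sequence="C")
    allele = models.Allele(location=loc, state=state)
    allele.model_dump()  # replaces as_dict(); produces the nested dicts shown in the docstrings above
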
2 changes: 1 addition & 1 deletion src/ga4gh/vrs/extras/variation_normalizer_rest_dp.py
@@ -16,7 +16,7 @@ def to_hgvs(self, vo, namespace="refseq"):
Use this method if you don't have UTA installed locally or are unable
to reach the public UTA database due to port issues.
"""
vo = vo.as_dict()
vo = vo.model_dump()
data = dict(
variation = vo,
namespace = namespace
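
A hedged sketch of the resulting REST payload (anything beyond the variation/namespace keys shown above is an assumption):

    import json
    payload = dict(variation=vo.model_dump(), namespace="refseq")
    # model_dump() returns plain Python types; model_dump(mode="json") would guarantee
    # JSON-safe values if any field were not natively serializable.
    body = json.dumps(payload)
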
8 changes: 4 additions & 4 deletions src/ga4gh/vrs/extras/vcf_annotation.py
@@ -77,16 +77,16 @@ def _record_digests(self, record, vrs_data):
# payloads like ['20:14369:1', '20:14369:1:G', '20:14369:1:A']
reference_allele = f"{gnomad_loc}-{record.ref}-{record.ref}"
vrs_ref_object = self.tlr.translate_from(reference_allele, "gnomad")
vrs_data[reference_allele] = str(vrs_ref_object.as_dict())
vrs_data[reference_allele] = str(vrs_ref_object.model_dump())
alleles = [f"{gnomad_loc}-{record.ref}-{a}" for a in [*alts]] # using gnomad format
vrs_allele_ids = [vrs_ref_object.id._value]
vrs_allele_ids = [vrs_ref_object.id]
for allele in alleles:
if "*" in allele:
vrs_allele_ids.append("")
else:
vrs_object = self.tlr.translate_from(allele, "gnomad")
vrs_allele_ids.append(vrs_object.id._value)
vrs_data[data] = str(vrs_object.as_dict())
vrs_allele_ids.append(vrs_object.id)
vrs_data[data] = str(vrs_object.model_dump())

return vrs_allele_ids

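
In the 2.0 models the computed identifier is exposed as a plain string, so the ._value unwrapping disappears. A hedged usage sketch (tlr is assumed to be a configured Translator, mirroring the calls above):

    vrs_object = tlr.translate_from("1-55516888-G-GA", "gnomad")
    allele_id = vrs_object.id                    # was vrs_object.id._value in 1.x
    serialized = str(vrs_object.model_dump())    # stored in vrs_data, as above
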