Skip to content

Commit

Permalink
Merge pull request #2617 from hyanwong/variant-strings
Browse files Browse the repository at this point in the history
Add `states()` method to return actual genotype states as characters
  • Loading branch information
benjeffery authored Oct 16, 2024
2 parents 05f8bef + 0e3f1fa commit edf6a3c
Show file tree
Hide file tree
Showing 3 changed files with 93 additions and 1 deletion.
6 changes: 5 additions & 1 deletion python/CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,10 @@
- Edges now have an ``.interval`` attribute returning a ``tskit.Interval`` object.
(:user:`hyanwong`, :pr:`2531`)

- Variants now have a `states()` method that returns the genotypes as an
(inefficient) array of strings, rather than integer indexes, to
aid comparison of genetic variation (:user:`hyanwong`, :pr:`2617`)

- Added ``distance_between`` that calculates the total distance between two nodes in a tree.
(:user:`Billyzhang1229`, :pr:`2771`)

Expand Down Expand Up @@ -96,6 +100,7 @@
- Add ``Tree.rf_distance`` method to calculate the unweighted Robinson-Foulds distance
between two trees. (:user:`Billyzhang1229`, :issue:`995`, :pr:`2643`, :pr:`3032`)


--------------------
[0.5.8] - 2024-06-27
--------------------
Expand Down Expand Up @@ -217,7 +222,6 @@
to ``None``, which is treated as ``True``. Previously, passing ``None``
would result in an error. (:user:`hyanwong`, :pr:`2609`, :issue:`2608`)


--------------------
[0.5.3] - 2022-10-03
--------------------
Expand Down
59 changes: 59 additions & 0 deletions python/tests/test_genotypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -655,6 +655,65 @@ def test_snipped_tree_sequence_mutations_over_isolated(self):
assert non_missing_found
assert missing_found

def get_missing_data_ts(self):
tables = tskit.TableCollection(1.0)
tables.nodes.add_row(tskit.NODE_IS_SAMPLE, 0)
tables.nodes.add_row(tskit.NODE_IS_SAMPLE, 0)
tables.nodes.add_row(tskit.NODE_IS_SAMPLE, 0)
s = tables.sites.add_row(0, "A")
tables.mutations.add_row(site=s, derived_state="B", node=1)
tables.mutations.add_row(site=s, derived_state="C", node=2)
s = tables.sites.add_row(0.5, "")
tables.mutations.add_row(site=s, derived_state="A long string", node=2)
return tables.tree_sequence()

def test_states(self):
ts = self.get_missing_data_ts()
v_iter = ts.variants(isolated_as_missing=False)
v = next(v_iter)
assert np.array_equal(v.states(), np.array(["A", "B", "C"]))
v = next(v_iter)
assert np.array_equal(v.states(), np.array(["", "", "A long string"]))
# With no mssing data, it shouldn't matter if the missing string = an allele
assert np.array_equal(
v.states(missing_data_string=""), np.array(["", "", "A long string"])
)

v_iter = ts.variants(isolated_as_missing=True)
v = next(v_iter)
assert np.array_equal(v.states(), np.array(["N", "B", "C"]))
v = next(v_iter)
assert np.array_equal(v.states(), np.array(["N", "N", "A long string"]))
assert np.array_equal(
v.states(missing_data_string="MISSING"),
np.array(["MISSING", "MISSING", "A long string"]),
)

@pytest.mark.parametrize("missing", [True, False])
def test_states_haplotypes_equiv(self, missing):
ts = msprime.sim_ancestry(2, sequence_length=20, random_seed=1)
ts = msprime.sim_mutations(ts, rate=0.1, random_seed=1)
assert ts.num_sites > 5
tables = ts.dump_tables()
tables.delete_intervals([[0, ts.site(4).position]])
tables.sites.replace_with(ts.tables.sites)
ts = tables.tree_sequence()
states = np.array(
[v.states() for v in ts.variants(isolated_as_missing=missing)]
)
for h1, h2 in zip(ts.haplotypes(isolated_as_missing=missing), states.T):
assert h1 == "".join(h2)

@pytest.mark.parametrize("s", ["", "A long string", True, np.nan, 0, -1])
def test_bad_states(self, s):
ts = self.get_missing_data_ts()
v_iter = ts.variants(isolated_as_missing=True)
v = next(v_iter)
v = next(v_iter)
match = "existing allele" if isinstance(s, str) else "not a string"
with pytest.raises(ValueError, match=match):
v.states(missing_data_string=s)


class TestLimitInterval:
def test_simple_case(self, ts_fixture):
Expand Down
29 changes: 29 additions & 0 deletions python/tskit/genotypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -245,6 +245,35 @@ def copy(self) -> Variant:
variant_copy._ll_variant = self._ll_variant.restricted_copy()
return variant_copy

def states(self, missing_data_string=None) -> np.ndarray:
"""
Returns the allelic states at this site as an array of strings.
.. warning::
Using this method is inefficient compared to working with the
underlying integer representation of genotypes as returned by
the :attr:`~Variant.genotypes` property.
:param str missing_data_string: A string that will be used to represent missing
data. If any normal allele contains this character, an error is raised.
Default: `None`, treated as `'N'`.
:return: An numpy array of strings of length ``num_sites``.
"""
if missing_data_string is None:
missing_data_string = "N"
elif not isinstance(missing_data_string, str):
# Must explicitly test here, otherwise we output a numpy object array
raise ValueError("Missing data string is not a string")
alleles = self.alleles
if alleles[-1] is None:
if missing_data_string in alleles:
raise ValueError(
"An existing allele is equal to the "
f"missing data string '{missing_data_string}'"
)
alleles = alleles[:-1] + (missing_data_string,)
return np.array(alleles)[self.genotypes]

def counts(self) -> typing.Counter[str | None]:
"""
Returns a :class:`python:collections.Counter` object providing counts for each
Expand Down

0 comments on commit edf6a3c

Please sign in to comment.