diff --git a/python/CHANGELOG.rst b/python/CHANGELOG.rst index c56924323f..8c0ba3fcab 100644 --- a/python/CHANGELOG.rst +++ b/python/CHANGELOG.rst @@ -42,6 +42,10 @@ - Edges now have an ``.interval`` attribute returning a ``tskit.Interval`` object. (:user:`hyanwong`, :pr:`2531`) +- Variants now have a `states()` method that returns the genotypes as an + (inefficient) array of strings, rather than integer indexes, to + aid comparison of genetic variation (:user:`hyanwong`, :pr:`2617`) + - Added ``distance_between`` that calculates the total distance between two nodes in a tree. (:user:`Billyzhang1229`, :pr:`2771`) @@ -96,6 +100,7 @@ - Add ``Tree.rf_distance`` method to calculate the unweighted Robinson-Foulds distance between two trees. (:user:`Billyzhang1229`, :issue:`995`, :pr:`2643`, :pr:`3032`) + -------------------- [0.5.8] - 2024-06-27 -------------------- @@ -217,7 +222,6 @@ to ``None``, which is treated as ``True``. Previously, passing ``None`` would result in an error. (:user:`hyanwong`, :pr:`2609`, :issue:`2608`) - -------------------- [0.5.3] - 2022-10-03 -------------------- diff --git a/python/tests/test_genotypes.py b/python/tests/test_genotypes.py index 329867b600..8f1194d90a 100644 --- a/python/tests/test_genotypes.py +++ b/python/tests/test_genotypes.py @@ -655,6 +655,65 @@ def test_snipped_tree_sequence_mutations_over_isolated(self): assert non_missing_found assert missing_found + def get_missing_data_ts(self): + tables = tskit.TableCollection(1.0) + tables.nodes.add_row(tskit.NODE_IS_SAMPLE, 0) + tables.nodes.add_row(tskit.NODE_IS_SAMPLE, 0) + tables.nodes.add_row(tskit.NODE_IS_SAMPLE, 0) + s = tables.sites.add_row(0, "A") + tables.mutations.add_row(site=s, derived_state="B", node=1) + tables.mutations.add_row(site=s, derived_state="C", node=2) + s = tables.sites.add_row(0.5, "") + tables.mutations.add_row(site=s, derived_state="A long string", node=2) + return tables.tree_sequence() + + def test_states(self): + ts = self.get_missing_data_ts() + v_iter = ts.variants(isolated_as_missing=False) + v = next(v_iter) + assert np.array_equal(v.states(), np.array(["A", "B", "C"])) + v = next(v_iter) + assert np.array_equal(v.states(), np.array(["", "", "A long string"])) + # With no mssing data, it shouldn't matter if the missing string = an allele + assert np.array_equal( + v.states(missing_data_string=""), np.array(["", "", "A long string"]) + ) + + v_iter = ts.variants(isolated_as_missing=True) + v = next(v_iter) + assert np.array_equal(v.states(), np.array(["N", "B", "C"])) + v = next(v_iter) + assert np.array_equal(v.states(), np.array(["N", "N", "A long string"])) + assert np.array_equal( + v.states(missing_data_string="MISSING"), + np.array(["MISSING", "MISSING", "A long string"]), + ) + + @pytest.mark.parametrize("missing", [True, False]) + def test_states_haplotypes_equiv(self, missing): + ts = msprime.sim_ancestry(2, sequence_length=20, random_seed=1) + ts = msprime.sim_mutations(ts, rate=0.1, random_seed=1) + assert ts.num_sites > 5 + tables = ts.dump_tables() + tables.delete_intervals([[0, ts.site(4).position]]) + tables.sites.replace_with(ts.tables.sites) + ts = tables.tree_sequence() + states = np.array( + [v.states() for v in ts.variants(isolated_as_missing=missing)] + ) + for h1, h2 in zip(ts.haplotypes(isolated_as_missing=missing), states.T): + assert h1 == "".join(h2) + + @pytest.mark.parametrize("s", ["", "A long string", True, np.nan, 0, -1]) + def test_bad_states(self, s): + ts = self.get_missing_data_ts() + v_iter = ts.variants(isolated_as_missing=True) + v = next(v_iter) + v = next(v_iter) + match = "existing allele" if isinstance(s, str) else "not a string" + with pytest.raises(ValueError, match=match): + v.states(missing_data_string=s) + class TestLimitInterval: def test_simple_case(self, ts_fixture): diff --git a/python/tskit/genotypes.py b/python/tskit/genotypes.py index cf82ca8902..15fffba0bf 100644 --- a/python/tskit/genotypes.py +++ b/python/tskit/genotypes.py @@ -245,6 +245,35 @@ def copy(self) -> Variant: variant_copy._ll_variant = self._ll_variant.restricted_copy() return variant_copy + def states(self, missing_data_string=None) -> np.ndarray: + """ + Returns the allelic states at this site as an array of strings. + + .. warning:: + Using this method is inefficient compared to working with the + underlying integer representation of genotypes as returned by + the :attr:`~Variant.genotypes` property. + + :param str missing_data_string: A string that will be used to represent missing + data. If any normal allele contains this character, an error is raised. + Default: `None`, treated as `'N'`. + :return: An numpy array of strings of length ``num_sites``. + """ + if missing_data_string is None: + missing_data_string = "N" + elif not isinstance(missing_data_string, str): + # Must explicitly test here, otherwise we output a numpy object array + raise ValueError("Missing data string is not a string") + alleles = self.alleles + if alleles[-1] is None: + if missing_data_string in alleles: + raise ValueError( + "An existing allele is equal to the " + f"missing data string '{missing_data_string}'" + ) + alleles = alleles[:-1] + (missing_data_string,) + return np.array(alleles)[self.genotypes] + def counts(self) -> typing.Counter[str | None]: """ Returns a :class:`python:collections.Counter` object providing counts for each