From c9f26c8ed7775d3b3353af0e762f8c961a52336b Mon Sep 17 00:00:00 2001 From: Jerome Kelleher Date: Thu, 13 Jul 2023 16:49:43 +0100 Subject: [PATCH 1/2] Backwards algorithm in LSHMM --- c/tests/test_haplotype_matching.c | 91 +-- c/tskit/haplotype_matching.c | 211 +++++- c/tskit/haplotype_matching.h | 6 +- python/_tskitmodule.c | 62 ++ python/tests/test_haplotype_matching.py | 856 ++++++++++++++---------- python/tests/test_lowlevel.py | 42 +- 6 files changed, 834 insertions(+), 434 deletions(-) diff --git a/c/tests/test_haplotype_matching.c b/c/tests/test_haplotype_matching.c index 7a8bda84dc..943654ff91 100644 --- a/c/tests/test_haplotype_matching.c +++ b/c/tests/test_haplotype_matching.c @@ -1,7 +1,7 @@ /* * MIT License * - * Copyright (c) 2019-2022 Tskit Developers + * Copyright (c) 2019-2023 Tskit Developers * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -28,46 +28,6 @@ #include #include -/**************************************************************** - * TestHMM - ****************************************************************/ - -static double -tsk_ls_hmm_compute_normalisation_factor_site_test(tsk_ls_hmm_t *TSK_UNUSED(self)) -{ - return 1.0; -} - -static int -tsk_ls_hmm_next_probability_test(tsk_ls_hmm_t *TSK_UNUSED(self), - tsk_id_t TSK_UNUSED(site_id), double TSK_UNUSED(p_last), bool TSK_UNUSED(is_match), - tsk_id_t TSK_UNUSED(node), double *result) -{ - *result = rand(); - /* printf("next proba = %f\n", *result); */ - return 0; -} - -static int -run_test_hmm(tsk_ls_hmm_t *hmm, int32_t *haplotype, tsk_compressed_matrix_t *output) -{ - int ret = 0; - - srand(1); - - ret = tsk_ls_hmm_run(hmm, haplotype, tsk_ls_hmm_next_probability_test, - tsk_ls_hmm_compute_normalisation_factor_site_test, output); - if (ret != 0) { - goto out; - } -out: - return ret; -} - -/**************************************************************** - * TestHMM - ****************************************************************/ - static void test_single_tree_missing_alleles(void) { @@ -206,6 +166,7 @@ test_single_tree_match_impossible(void) tsk_treeseq_t ts; tsk_ls_hmm_t ls_hmm; tsk_compressed_matrix_t forward; + tsk_compressed_matrix_t backward; tsk_viterbi_matrix_t viterbi; double rho[] = { 0.0, 0.25, 0.25 }; @@ -228,8 +189,16 @@ test_single_tree_match_impossible(void) tsk_viterbi_matrix_print_state(&viterbi, _devnull); tsk_ls_hmm_print_state(&ls_hmm, _devnull); + ret = tsk_ls_hmm_backward(&ls_hmm, h, forward.normalisation_factor, &backward, 0); + CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_MATCH_IMPOSSIBLE); + tsk_compressed_matrix_print_state(&backward, _devnull); + /* tsk_compressed_matrix_print_state(&forward, stdout); */ + /* tsk_compressed_matrix_print_state(&backward, stdout); */ + tsk_ls_hmm_print_state(&ls_hmm, _devnull); + tsk_ls_hmm_free(&ls_hmm); tsk_compressed_matrix_free(&forward); + tsk_compressed_matrix_free(&backward); tsk_viterbi_matrix_free(&viterbi); tsk_treeseq_free(&ts); } @@ -275,12 +244,15 @@ test_single_tree_errors(void) ret = tsk_compressed_matrix_store_site(&forward, 4, 0, 0, NULL); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_SITE_OUT_OF_BOUNDS); - T[0].tree_node = -1; - T[0].value = 0; - ret = tsk_compressed_matrix_store_site(&forward, 0, 1, 1, T); - CU_ASSERT_EQUAL_FATAL(ret, 0); - ret = tsk_compressed_matrix_decode(&forward, (double *) decoded); - CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS); + /* FIXME disabling this tests for now because we filter out negative + * nodes when storing now, to accomodate some oddness in the initial + * conditions of the backward matrix. */ + /* T[0].tree_node = -1; */ + /* T[0].value = 0; */ + /* ret = tsk_compressed_matrix_store_site(&forward, 0, 1, 1, T); */ + /* CU_ASSERT_EQUAL_FATAL(ret, 0); */ + /* ret = tsk_compressed_matrix_decode(&forward, (double *) decoded); */ + /* CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS); */ T[0].tree_node = 7; T[0].value = 0; @@ -443,7 +415,7 @@ test_multi_tree_exact_match(void) int ret = 0; tsk_treeseq_t ts; tsk_ls_hmm_t ls_hmm; - tsk_compressed_matrix_t forward; + tsk_compressed_matrix_t forward, backward; tsk_viterbi_matrix_t viterbi; double rho[] = { 0.0, 0.25, 0.25 }; @@ -465,6 +437,13 @@ test_multi_tree_exact_match(void) ret = tsk_compressed_matrix_decode(&forward, decoded_compressed_matrix); CU_ASSERT_EQUAL_FATAL(ret, 0); + ret = tsk_ls_hmm_backward(&ls_hmm, h, forward.normalisation_factor, &backward, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + tsk_ls_hmm_print_state(&ls_hmm, _devnull); + tsk_compressed_matrix_print_state(&backward, _devnull); + ret = tsk_compressed_matrix_decode(&backward, decoded_compressed_matrix); + CU_ASSERT_EQUAL_FATAL(ret, 0); + ret = tsk_ls_hmm_viterbi(&ls_hmm, h, &viterbi, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_viterbi_matrix_print_state(&viterbi, _devnull); @@ -492,6 +471,7 @@ test_multi_tree_exact_match(void) tsk_ls_hmm_free(&ls_hmm); tsk_compressed_matrix_free(&forward); + tsk_compressed_matrix_free(&backward); tsk_viterbi_matrix_free(&viterbi); tsk_treeseq_free(&ts); } @@ -529,7 +509,8 @@ test_caterpillar_tree_many_values(void) int ret = 0; tsk_ls_hmm_t ls_hmm; tsk_compressed_matrix_t matrix; - double unused[] = { 0, 0, 0, 0, 0 }; + double rho[] = { 0.1, 0.1, 0.1, 0.1, 0.1 }; + double mu[] = { 0.0, 0.0, 0.0, 0.0, 0.0 }; int32_t h[] = { 0, 0, 0, 0, 0 }; tsk_size_t n[] = { 8, @@ -542,11 +523,11 @@ test_caterpillar_tree_many_values(void) for (j = 0; j < sizeof(n) / sizeof(*n); j++) { ts = caterpillar_tree(n[j], 5, n[j] - 2); - ret = tsk_ls_hmm_init(&ls_hmm, ts, unused, unused, 0); + ret = tsk_ls_hmm_init(&ls_hmm, ts, rho, mu, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_compressed_matrix_init(&matrix, ts, 1 << 10, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); - ret = run_test_hmm(&ls_hmm, h, &matrix); + ret = tsk_ls_hmm_forward(&ls_hmm, h, &matrix, TSK_NO_INIT); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_compressed_matrix_print_state(&matrix, _devnull); tsk_ls_hmm_print_state(&ls_hmm, _devnull); @@ -559,13 +540,13 @@ test_caterpillar_tree_many_values(void) j = 40; ts = caterpillar_tree(j, 5, j - 2); - ret = tsk_ls_hmm_init(&ls_hmm, ts, unused, unused, 0); + ret = tsk_ls_hmm_init(&ls_hmm, ts, rho, mu, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_compressed_matrix_init(&matrix, ts, 1 << 20, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); - /* Short circuit this value so we can run the test in reasonable time */ - ls_hmm.max_parsimony_words = 1; - ret = run_test_hmm(&ls_hmm, h, &matrix); + /* Short circuit this value so we can run the test */ + ls_hmm.max_parsimony_words = 0; + ret = tsk_ls_hmm_forward(&ls_hmm, h, &matrix, TSK_NO_INIT); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_TOO_MANY_VALUES); tsk_ls_hmm_free(&ls_hmm); diff --git a/c/tskit/haplotype_matching.c b/c/tskit/haplotype_matching.c index d6fdfd7f46..ea8853e72e 100644 --- a/c/tskit/haplotype_matching.c +++ b/c/tskit/haplotype_matching.c @@ -230,10 +230,9 @@ tsk_ls_hmm_free(tsk_ls_hmm_t *self) } static int -tsk_ls_hmm_reset(tsk_ls_hmm_t *self) +tsk_ls_hmm_reset(tsk_ls_hmm_t *self, double value) { int ret = 0; - double n = (double) self->num_samples; tsk_size_t j; tsk_id_t u; const tsk_id_t *samples; @@ -257,7 +256,7 @@ tsk_ls_hmm_reset(tsk_ls_hmm_t *self) for (j = 0; j < self->num_samples; j++) { u = samples[j]; self->transitions[j].tree_node = u; - self->transitions[j].value = 1.0 / n; + self->transitions[j].value = value; self->transition_index[u] = (tsk_id_t) j; } self->num_transitions = self->num_samples; @@ -300,7 +299,7 @@ tsk_ls_hmm_remove_dead_roots(tsk_ls_hmm_t *self) } static int -tsk_ls_hmm_update_tree(tsk_ls_hmm_t *self) +tsk_ls_hmm_update_tree(tsk_ls_hmm_t *self, int direction) { int ret = 0; tsk_id_t *restrict parent = self->parent; @@ -311,11 +310,15 @@ tsk_ls_hmm_update_tree(tsk_ls_hmm_t *self) tsk_id_t u, c, p, j, e; tsk_value_transition_t *vt; - tsk_tree_position_next(&self->tree_pos); + if (direction == TSK_DIR_FORWARD) { + tsk_tree_position_next(&self->tree_pos); + } else { + tsk_tree_position_prev(&self->tree_pos); + } tsk_bug_assert(self->tree_pos.index != -1); tsk_bug_assert(self->tree_pos.index == self->tree.index); - for (j = self->tree_pos.out.start; j < self->tree_pos.out.stop; j++) { + for (j = self->tree_pos.out.start; j != self->tree_pos.out.stop; j += direction) { e = self->tree_pos.out.order[j]; c = edges_child[e]; u = c; @@ -334,7 +337,7 @@ tsk_ls_hmm_update_tree(tsk_ls_hmm_t *self) parent[c] = TSK_NULL; } - for (j = self->tree_pos.in.start; j < self->tree_pos.in.stop; j++) { + for (j = self->tree_pos.in.start; j != self->tree_pos.in.stop; j += direction) { e = self->tree_pos.in.order[j]; c = edges_child[e]; p = edges_parent[e]; @@ -920,7 +923,7 @@ tsk_ls_hmm_compress(tsk_ls_hmm_t *self) } static int -tsk_ls_hmm_process_site( +tsk_ls_hmm_process_site_forward( tsk_ls_hmm_t *self, const tsk_site_t *site, int32_t haplotype_state) { int ret = 0; @@ -959,28 +962,23 @@ tsk_ls_hmm_process_site( return ret; } -int -tsk_ls_hmm_run(tsk_ls_hmm_t *self, int32_t *haplotype, - int (*next_probability)(tsk_ls_hmm_t *, tsk_id_t, double, bool, tsk_id_t, double *), - double (*compute_normalisation_factor)(struct _tsk_ls_hmm_t *), void *output) +static int +tsk_ls_hmm_run_forward(tsk_ls_hmm_t *self, int32_t *haplotype) { int ret = 0; int t_ret; const tsk_site_t *sites; tsk_size_t j, num_sites; + const double n = (double) self->num_samples; - self->next_probability = next_probability; - self->compute_normalisation_factor = compute_normalisation_factor; - self->output = output; - - ret = tsk_ls_hmm_reset(self); + ret = tsk_ls_hmm_reset(self, 1 / n); if (ret != 0) { goto out; } for (t_ret = tsk_tree_first(&self->tree); t_ret == TSK_TREE_OK; t_ret = tsk_tree_next(&self->tree)) { - ret = tsk_ls_hmm_update_tree(self); + ret = tsk_ls_hmm_update_tree(self, TSK_DIR_FORWARD); if (ret != 0) { goto out; } @@ -990,7 +988,8 @@ tsk_ls_hmm_run(tsk_ls_hmm_t *self, int32_t *haplotype, goto out; } for (j = 0; j < num_sites; j++) { - ret = tsk_ls_hmm_process_site(self, &sites[j], haplotype[sites[j].id]); + ret = tsk_ls_hmm_process_site_forward( + self, &sites[j], haplotype[sites[j].id]); if (ret != 0) { goto out; } @@ -1080,11 +1079,168 @@ tsk_ls_hmm_forward(tsk_ls_hmm_t *self, int32_t *haplotype, goto out; } } - ret = tsk_ls_hmm_run(self, haplotype, tsk_ls_hmm_next_probability_forward, - tsk_ls_hmm_compute_normalisation_factor_forward, output); + + self->next_probability = tsk_ls_hmm_next_probability_forward; + self->compute_normalisation_factor = tsk_ls_hmm_compute_normalisation_factor_forward; + self->output = output; + + ret = tsk_ls_hmm_run_forward(self, haplotype); +out: + return ret; +} + +/**************************************************************** + * Backward Algorithm + ****************************************************************/ + +static int +tsk_ls_hmm_next_probability_backward(tsk_ls_hmm_t *self, tsk_id_t site_id, double p_last, + bool is_match, tsk_id_t TSK_UNUSED(node), double *result) +{ + const double mu = self->mutation_rate[site_id]; + const double num_alleles = self->num_alleles[site_id]; + double p_e; + + p_e = mu; + if (is_match) { + p_e = 1 - (num_alleles - 1) * mu; + } + *result = p_last * p_e; + return 0; +} + +static int +tsk_ls_hmm_process_site_backward(tsk_ls_hmm_t *self, const tsk_site_t *site, + const int32_t haplotype_state, const double normalisation_factor) +{ + int ret = 0; + double x, b_last_sum; + tsk_compressed_matrix_t *output = (tsk_compressed_matrix_t *) self->output; + tsk_value_transition_t *restrict T = self->transitions; + const unsigned int precision = (unsigned int) self->precision; + const double rho = self->recombination_rate[site->id]; + const double n = (double) self->num_samples; + tsk_size_t j; + + /* FIXME!!! We are calling compress twice here because we need to compress + * immediately before calling store_site in order to filter out -1 nodes, + * and also (crucially) to ensure that the value transitions are listed + * in preorder, which we rely on later for decoding. + */ + ret = tsk_ls_hmm_compress(self); + if (ret != 0) { + goto out; + } + ret = tsk_compressed_matrix_store_site( + output, site->id, normalisation_factor, (tsk_size_t) self->num_transitions, T); + if (ret != 0) { + goto out; + } + + ret = tsk_ls_hmm_update_probabilities(self, site, haplotype_state); + if (ret != 0) { + goto out; + } + /* DO WE NEED THIS compress?? See above */ + ret = tsk_ls_hmm_compress(self); + if (ret != 0) { + goto out; + } + tsk_bug_assert(self->num_transitions <= self->num_samples); + b_last_sum = self->compute_normalisation_factor(self); + for (j = 0; j < self->num_transitions; j++) { + tsk_bug_assert(T[j].tree_node != TSK_NULL); + x = rho * b_last_sum / n + (1 - rho) * T[j].value; + x /= normalisation_factor; + T[j].value = tsk_round(x, precision); + } +out: + return ret; +} + +static int +tsk_ls_hmm_run_backward( + tsk_ls_hmm_t *self, int32_t *haplotype, const double *forward_norm) +{ + int ret = 0; + int t_ret; + const tsk_site_t *sites; + double s; + tsk_size_t num_sites; + tsk_id_t j; + + ret = tsk_ls_hmm_reset(self, 1); if (ret != 0) { goto out; } + + for (t_ret = tsk_tree_last(&self->tree); t_ret == TSK_TREE_OK; + t_ret = tsk_tree_prev(&self->tree)) { + ret = tsk_ls_hmm_update_tree(self, TSK_DIR_REVERSE); + if (ret != 0) { + goto out; + } + /* tsk_ls_hmm_check_state(self); */ + ret = tsk_tree_get_sites(&self->tree, &sites, &num_sites); + if (ret != 0) { + goto out; + } + for (j = (tsk_id_t) num_sites - 1; j >= 0; j--) { + s = forward_norm[sites[j].id]; + if (s <= 0) { + /* NOTE: I'm not sure if this is the correct interpretation, + * but norm values of 0 do lead to problems, and this seems + * like a simple way of guarding against it. We do seem to + * get norm values of 0 with impossible matches from the fwd + * matrix. + */ + ret = TSK_ERR_MATCH_IMPOSSIBLE; + goto out; + } + ret = tsk_ls_hmm_process_site_backward( + self, &sites[j], haplotype[sites[j].id], s); + if (ret != 0) { + goto out; + } + } + } + /* Set to zero so we can print and check the state OK. */ + self->num_transitions = 0; + if (t_ret != 0) { + ret = t_ret; + goto out; + } +out: + return ret; +} + +int +tsk_ls_hmm_backward(tsk_ls_hmm_t *self, int32_t *haplotype, const double *forward_norm, + tsk_compressed_matrix_t *output, tsk_flags_t options) +{ + int ret = 0; + + if (!(options & TSK_NO_INIT)) { + ret = tsk_compressed_matrix_init(output, self->tree_sequence, 0, 0); + if (ret != 0) { + goto out; + } + } else { + if (output->tree_sequence != self->tree_sequence) { + ret = TSK_ERR_BAD_PARAM_VALUE; + goto out; + } + ret = tsk_compressed_matrix_clear(output); + if (ret != 0) { + goto out; + } + } + + self->next_probability = tsk_ls_hmm_next_probability_backward; + self->compute_normalisation_factor = tsk_ls_hmm_compute_normalisation_factor_forward; + self->output = output; + + ret = tsk_ls_hmm_run_backward(self, haplotype, forward_norm); out: return ret; } @@ -1162,11 +1318,12 @@ tsk_ls_hmm_viterbi(tsk_ls_hmm_t *self, int32_t *haplotype, tsk_viterbi_matrix_t goto out; } } - ret = tsk_ls_hmm_run(self, haplotype, tsk_ls_hmm_next_probability_viterbi, - tsk_ls_hmm_compute_normalisation_factor_viterbi, output); - if (ret != 0) { - goto out; - } + + self->next_probability = tsk_ls_hmm_next_probability_viterbi; + self->compute_normalisation_factor = tsk_ls_hmm_compute_normalisation_factor_viterbi; + self->output = output; + + ret = tsk_ls_hmm_run_forward(self, haplotype); out: return ret; } @@ -1279,9 +1436,11 @@ tsk_compressed_matrix_store_site(tsk_compressed_matrix_t *self, tsk_id_t site, } for (j = 0; j < num_transitions; j++) { + tsk_bug_assert(transitions[j].tree_node >= 0); self->values[site][j] = transitions[j].value; self->nodes[site][j] = transitions[j].tree_node; } + out: return ret; } diff --git a/c/tskit/haplotype_matching.h b/c/tskit/haplotype_matching.h index e4d82bef03..4809939458 100644 --- a/c/tskit/haplotype_matching.h +++ b/c/tskit/haplotype_matching.h @@ -134,6 +134,7 @@ typedef struct _tsk_ls_hmm_t { void *output; } tsk_ls_hmm_t; +/* TODO constify these APIs */ int tsk_ls_hmm_init(tsk_ls_hmm_t *self, tsk_treeseq_t *tree_sequence, double *recombination_rate, double *mutation_rate, tsk_flags_t options); int tsk_ls_hmm_set_precision(tsk_ls_hmm_t *self, unsigned int precision); @@ -141,11 +142,10 @@ int tsk_ls_hmm_free(tsk_ls_hmm_t *self); void tsk_ls_hmm_print_state(tsk_ls_hmm_t *self, FILE *out); int tsk_ls_hmm_forward(tsk_ls_hmm_t *self, int32_t *haplotype, tsk_compressed_matrix_t *output, tsk_flags_t options); +int tsk_ls_hmm_backward(tsk_ls_hmm_t *self, int32_t *haplotype, + const double *forward_norm, tsk_compressed_matrix_t *output, tsk_flags_t options); int tsk_ls_hmm_viterbi(tsk_ls_hmm_t *self, int32_t *haplotype, tsk_viterbi_matrix_t *output, tsk_flags_t options); -int tsk_ls_hmm_run(tsk_ls_hmm_t *self, int32_t *haplotype, - int (*next_probability)(tsk_ls_hmm_t *, tsk_id_t, double, bool, tsk_id_t, double *), - double (*compute_normalisation_factor)(struct _tsk_ls_hmm_t *), void *output); int tsk_compressed_matrix_init(tsk_compressed_matrix_t *self, tsk_treeseq_t *tree_sequence, tsk_size_t block_size, tsk_flags_t options); diff --git a/python/_tskitmodule.c b/python/_tskitmodule.c index 5c6bd29986..6de1071f26 100644 --- a/python/_tskitmodule.c +++ b/python/_tskitmodule.c @@ -13260,6 +13260,64 @@ LsHmm_forward_matrix(LsHmm *self, PyObject *args) return ret; } +static PyObject * +LsHmm_backward_matrix(LsHmm *self, PyObject *args) +{ + int err; + PyObject *ret = NULL; + PyObject *haplotype = NULL; + PyObject *forward_norm = NULL; + CompressedMatrix *compressed_matrix = NULL; + PyArrayObject *haplotype_array = NULL; + PyArrayObject *forward_norm_array = NULL; + npy_intp *shape, num_sites; + + if (LsHmm_check_state(self) != 0) { + goto out; + } + if (!PyArg_ParseTuple(args, "OOO!", &haplotype, &forward_norm, &CompressedMatrixType, + &compressed_matrix)) { + goto out; + } + num_sites = (npy_intp) tsk_treeseq_get_num_sites(self->tree_sequence->tree_sequence); + + haplotype_array = (PyArrayObject *) PyArray_FROMANY( + haplotype, NPY_INT32, 1, 1, NPY_ARRAY_IN_ARRAY); + if (haplotype_array == NULL) { + goto out; + } + shape = PyArray_DIMS(haplotype_array); + if (shape[0] != num_sites) { + PyErr_SetString( + PyExc_ValueError, "haplotype array must have dimension (num_sites,)"); + goto out; + } + + forward_norm_array = (PyArrayObject *) PyArray_FROMANY( + forward_norm, NPY_FLOAT64, 1, 1, NPY_ARRAY_IN_ARRAY); + if (forward_norm_array == NULL) { + goto out; + } + shape = PyArray_DIMS(forward_norm_array); + if (shape[0] != num_sites) { + PyErr_SetString( + PyExc_ValueError, "forward_norm array must have dimension (num_sites,)"); + goto out; + } + err = tsk_ls_hmm_backward(self->ls_hmm, PyArray_DATA(haplotype_array), + PyArray_DATA(forward_norm_array), compressed_matrix->compressed_matrix, + TSK_NO_INIT); + if (err != 0) { + handle_library_error(err); + goto out; + } + ret = Py_BuildValue(""); +out: + Py_XDECREF(haplotype_array); + Py_XDECREF(forward_norm_array); + return ret; +} + static PyObject * LsHmm_viterbi_matrix(LsHmm *self, PyObject *args) { @@ -13306,6 +13364,10 @@ static PyMethodDef LsHmm_methods[] = { .ml_meth = (PyCFunction) LsHmm_forward_matrix, .ml_flags = METH_VARARGS, .ml_doc = "Returns the tree encoded forward matrix for a given haplotype" }, + { .ml_name = "backward_matrix", + .ml_meth = (PyCFunction) LsHmm_backward_matrix, + .ml_flags = METH_VARARGS, + .ml_doc = "Returns the tree encoded backward matrix for a given haplotype" }, { .ml_name = "viterbi_matrix", .ml_meth = (PyCFunction) LsHmm_viterbi_matrix, .ml_flags = METH_VARARGS, diff --git a/python/tests/test_haplotype_matching.py b/python/tests/test_haplotype_matching.py index b09ebcc005..725c6bf38c 100644 --- a/python/tests/test_haplotype_matching.py +++ b/python/tests/test_haplotype_matching.py @@ -22,13 +22,17 @@ """ Python implementation of the Li and Stephens forwards and backwards algorithms. """ -import itertools +import warnings import lshmm as ls import msprime import numpy as np +import numpy.testing as nt +import pytest +import _tskit import tskit +from tests import tsutil MISSING = -1 @@ -109,8 +113,8 @@ def __init__( self.N = np.zeros(ts.num_nodes, dtype=int) # Efficiently compute the allelic state at a site self.allelic_state = np.zeros(ts.num_nodes, dtype=int) - 1 - # Diffs so we can can update T and T_index between trees. - self.edge_diffs = self.ts.edge_diffs() + # TreePosition so we can can update T and T_index between trees. + self.tree_pos = tsutil.TreePosition(ts) self.parent = np.zeros(self.ts.num_nodes, dtype=int) - 1 self.tree = tskit.Tree(self.ts) self.output = None @@ -229,16 +233,24 @@ def compute(u, parent_state): if T_parent[j] != -1: self.N[T_parent[j]] -= self.N[j] - def update_tree(self): + def update_tree(self, direction=tskit.FORWARD): """ Update the internal data structures to move on to the next tree. """ parent = self.parent T_index = self.T_index T = self.T - _, edges_out, edges_in = next(self.edge_diffs) - - for edge in edges_out: + if direction == tskit.FORWARD: + self.tree_pos.next() + else: + self.tree_pos.prev() + assert self.tree_pos.index == self.tree.index + + for j in range( + self.tree_pos.out_range.start, self.tree_pos.out_range.stop, direction + ): + e = self.tree_pos.out_range.order[j] + edge = self.ts.edge(e) u = edge.child if T_index[u] == -1: # Make sure the subtree we're detaching has an T_index-value at the root. @@ -251,7 +263,11 @@ def update_tree(self): ) parent[edge.child] = -1 - for edge in edges_in: + for j in range( + self.tree_pos.in_range.start, self.tree_pos.in_range.stop, direction + ): + e = self.tree_pos.in_range.order[j] + edge = self.ts.edge(e) parent[edge.child] = edge.parent u = edge.parent if parent[edge.parent] == -1: @@ -320,6 +336,7 @@ def update_probabilities(self, site, haplotype_state): match = ( haplotype_state == MISSING or haplotype_state == allelic_state[v] ) + # Note that the node u is used only by Viterbi st.value = self.compute_next_probability(site.id, st.value, match, u) # Unset the states @@ -327,59 +344,61 @@ def update_probabilities(self, site, haplotype_state): for mutation in site.mutations: allelic_state[mutation.node] = -1 - def process_site(self, site, haplotype_state, forwards=True): - if forwards: - # Forwards algorithm, or forwards pass in Viterbi - self.update_probabilities(site, haplotype_state) - self.compress() - s = self.compute_normalisation_factor() - for st in self.T: - if st.tree_node != tskit.NULL: - st.value /= s - st.value = round(st.value, self.precision) - self.output.store_site( - site.id, s, [(st.tree_node, st.value) for st in self.T] - ) + def process_site(self, site, haplotype_state): + self.update_probabilities(site, haplotype_state) + self.compress() + s = self.compute_normalisation_factor() + for st in self.T: + assert st.tree_node != tskit.NULL + # if st.tree_node != tskit.NULL: + st.value /= s + st.value = round(st.value, self.precision) + self.output.store_site(site.id, s, [(st.tree_node, st.value) for st in self.T]) + + def compute_emission_proba(self, site_id, is_match): + mu = self.mu[site_id] + n_alleles = self.n_alleles[site_id] + if self.scale_mutation_based_on_n_alleles: + if is_match: + # Scale mutation based on the number of alleles + # - so the mutation rate is the mutation rate to one of the + # alleles. The overall mutation rate is then + # (n_alleles - 1) * mutation_rate. + p_e = 1 - (n_alleles - 1) * mu + else: + p_e = mu - mu * (n_alleles == 1) + # Added boolean in case we're at an invariant site else: - # Backwards algorithm - self.output.store_site( - site.id, - self.output.normalisation_factor[site.id], - [(st.tree_node, st.value) for st in self.T], - ) - self.update_probabilities(site, haplotype_state) - self.compress() - b_last_sum = self.compute_normalisation_factor() - s = self.output.normalisation_factor[site.id] - for st in self.T: - if st.tree_node != tskit.NULL: - st.value = ( - self.rho[site.id] / self.ts.num_samples - ) * b_last_sum + (1 - self.rho[site.id]) * st.value - st.value /= s - st.value = round(st.value, self.precision) - - def run_forward(self, h): - n = self.ts.num_samples - self.tree.clear() - for u in self.ts.samples(): - self.T_index[u] = len(self.T) - self.T.append(ValueTransition(tree_node=u, value=1 / n)) - while self.tree.next(): - self.update_tree() - for site in self.tree.sites(): - self.process_site(site, h[site.id]) - return self.output + # No scaling based on the number of alleles + # - so the mutation rate is the mutation rate to anything. + # This means that we must rescale the mutation rate to a different + # allele, by the number of alleles. + if n_alleles == 1: # In case we're at an invariant site + if is_match: + p_e = 1 + else: + p_e = 0 + else: + if is_match: + p_e = 1 - mu + else: + p_e = mu / (n_alleles - 1) + return p_e - def run_backward(self, h): + def initialise(self, value): self.tree.clear() for u in self.ts.samples(): - self.T_index[u] = len(self.T) - self.T.append(ValueTransition(tree_node=u, value=1)) + j = len(self.T) + self.T_index[u] = j + self.T.append(ValueTransition(tree_node=u, value=value)) + + def run(self, h): + n = self.ts.num_samples + self.initialise(1 / n) while self.tree.next(): self.update_tree() for site in self.tree.sites(): - self.process_site(site, h[site.id], forwards=False) + self.process_site(site, h[site.id]) return self.output def compute_normalisation_factor(self): @@ -389,6 +408,146 @@ def compute_next_probability(self, site_id, p_last, is_match, node): raise NotImplementedError() +class ForwardAlgorithm(LsHmmAlgorithm): + """ + The Li and Stephens forward algorithm. + """ + + def __init__( + self, ts, rho, mu, alleles, n_alleles, scale_mutation=False, precision=10 + ): + super().__init__( + ts, + rho, + mu, + alleles, + n_alleles, + precision=precision, + scale_mutation=scale_mutation, + ) + self.output = CompressedMatrix(ts) + + def compute_normalisation_factor(self): + s = 0 + for j, st in enumerate(self.T): + assert st.tree_node != tskit.NULL + # assert self.N[j] > 0 + s += self.N[j] * st.value + return s + + def compute_next_probability(self, site_id, p_last, is_match, node): + rho = self.rho[site_id] + n = self.ts.num_samples + p_e = self.compute_emission_proba(site_id, is_match) + p_t = p_last * (1 - rho) + rho / n + return p_t * p_e + + +class BackwardAlgorithm(ForwardAlgorithm): + """ + The Li and Stephens backward algorithm. + """ + + def compute_next_probability(self, site_id, p_next, is_match, node): + p_e = self.compute_emission_proba(site_id, is_match) + return p_next * p_e + + def process_site(self, site, haplotype_state, s): + # FIXME see nodes in the C code for why we have two calls to + # compress + self.compress() + self.output.store_site( + site.id, + s, + [(st.tree_node, st.value) for st in self.T], + ) + self.update_probabilities(site, haplotype_state) + # FIXME see nodes in the C code for why we have two calls to + # compress + self.compress() + b_last_sum = self.compute_normalisation_factor() + n = self.ts.num_samples + rho = self.rho[site.id] + for st in self.T: + if st.tree_node != tskit.NULL: + st.value = rho * b_last_sum / n + (1 - rho) * st.value + st.value /= s + st.value = round(st.value, self.precision) + + def run(self, h, normalisation_factor): + self.initialise(value=1) + while self.tree.prev(): + self.update_tree(direction=tskit.REVERSE) + for site in reversed(list(self.tree.sites())): + self.process_site(site, h[site.id], normalisation_factor[site.id]) + return self.output + + +class ViterbiAlgorithm(LsHmmAlgorithm): + """ + Runs the Li and Stephens Viterbi algorithm. + """ + + def __init__( + self, ts, rho, mu, alleles, n_alleles, scale_mutation=False, precision=10 + ): + super().__init__( + ts, + rho, + mu, + alleles, + n_alleles, + precision=precision, + scale_mutation=scale_mutation, + ) + self.output = ViterbiMatrix(ts) + + def compute_normalisation_factor(self): + max_st = ValueTransition(value=-1) + for st in self.T: + assert st.tree_node != tskit.NULL + if st.value > max_st.value: + max_st = st + if max_st.value == 0: + raise ValueError( + "Trying to match non-existent allele with zero mutation rate" + ) + return max_st.value + + def compute_next_probability(self, site_id, p_last, is_match, node): + rho = self.rho[site_id] + n = self.ts.num_samples + + p_no_recomb = p_last * (1 - rho + rho / n) + p_recomb = rho / n + recombination_required = False + if p_no_recomb > p_recomb: + p_t = p_no_recomb + else: + p_t = p_recomb + recombination_required = True + self.output.add_recombination_required(site_id, node, recombination_required) + + p_e = self.compute_emission_proba(site_id, is_match) + return p_t * p_e + + +def assert_compressed_matrices_equal(cm1, cm2): + nt.assert_array_almost_equal(cm1.normalisation_factor, cm2.normalisation_factor) + + for j in range(cm1.num_sites): + site1 = cm1.get_site(j) + site2 = cm2.get_site(j) + assert len(site1) == len(site2) + site1 = dict(site1) + site2 = dict(site2) + + assert set(site1.keys()) == set(site2.keys()) + for node in site1.keys(): + # TODO the precision value should be used as a parameter here + nt.assert_allclose(site1[node], site2[node], rtol=1e-5, atol=1e-8) + + class CompressedMatrix: """ Class representing a num_samples x num_sites matrix compressed by a @@ -398,18 +557,15 @@ class CompressedMatrix: values are on the path). """ - def __init__(self, ts, normalisation_factor=None): + def __init__(self, ts): self.ts = ts self.num_sites = ts.num_sites self.num_samples = ts.num_samples self.value_transitions = [None for _ in range(self.num_sites)] - if normalisation_factor is None: - self.normalisation_factor = np.zeros(self.num_sites) - else: - self.normalisation_factor = normalisation_factor - assert len(self.normalisation_factor) == self.num_sites + self.normalisation_factor = np.zeros(self.num_sites) def store_site(self, site, normalisation_factor, value_transitions): + assert all(u >= 0 for u, _ in value_transitions) self.normalisation_factor[site] = normalisation_factor self.value_transitions[site] = value_transitions @@ -428,25 +584,18 @@ def decode(self): Decodes the tree encoding of the values into an explicit matrix. """ + sample_index_map = np.zeros(self.ts.num_nodes, dtype=int) - 1 + sample_index_map[self.ts.samples()] = np.arange(self.ts.num_samples) A = np.zeros((self.num_sites, self.num_samples)) for tree in self.ts.trees(): for site in tree.sites(): - f = dict(self.value_transitions[site.id]) - for j, u in enumerate(self.ts.samples()): - while u not in f: - u = tree.parent(u) - A[site.id, j] = f[u] + for node, value in self.value_transitions[site.id]: + for u in tree.samples(node): + j = sample_index_map[u] + A[site.id, j] = value return A -class ForwardMatrix(CompressedMatrix): - """Class representing a compressed forward matrix.""" - - -class BackwardMatrix(CompressedMatrix): - """Class representing a compressed backward matrix.""" - - class ViterbiMatrix(CompressedMatrix): """ Class representing the compressed Viterbi matrix. @@ -527,206 +676,7 @@ def traceback(self): return match -class ForwardAlgorithm(LsHmmAlgorithm): - """Runs the Li and Stephens forward algorithm.""" - - def __init__( - self, ts, rho, mu, alleles, n_alleles, scale_mutation=False, precision=10 - ): - super().__init__( - ts, - rho, - mu, - alleles, - n_alleles, - precision=precision, - scale_mutation=scale_mutation, - ) - self.output = ForwardMatrix(ts) - - def compute_normalisation_factor(self): - s = 0 - for j, st in enumerate(self.T): - assert st.tree_node != tskit.NULL - assert self.N[j] > 0 - s += self.N[j] * st.value - return s - - def compute_next_probability( - self, site_id, p_last, is_match, node - ): # Note node only used in Viterbi - rho = self.rho[site_id] - mu = self.mu[site_id] - n = self.ts.num_samples - n_alleles = self.n_alleles[site_id] - - if self.scale_mutation_based_on_n_alleles: - if is_match: - # Scale mutation based on the number of alleles - # - so the mutation rate is the mutation rate to one of the - # alleles. The overall mutation rate is then - # (n_alleles - 1) * mutation_rate. - p_e = 1 - (n_alleles - 1) * mu - else: - p_e = mu - mu * (n_alleles == 1) - # Added boolean in case we're at an invariant site - else: - # No scaling based on the number of alleles - # - so the mutation rate is the mutation rate to anything. - # This means that we must rescale the mutation rate to a different - # allele, by the number of alleles. - if n_alleles == 1: # In case we're at an invariant site - if is_match: - p_e = 1 - else: - p_e = 0 - else: - if is_match: - p_e = 1 - mu - else: - p_e = mu / (n_alleles - 1) - - p_t = p_last * (1 - rho) + rho / n - return p_t * p_e - - -class BackwardAlgorithm(LsHmmAlgorithm): - """Runs the Li and Stephens backward algorithm.""" - - def __init__( - self, - ts, - rho, - mu, - alleles, - n_alleles, - normalisation_factor, - scale_mutation=False, - precision=10, - ): - super().__init__( - ts, - rho, - mu, - alleles, - n_alleles, - precision=precision, - scale_mutation=scale_mutation, - ) - self.output = BackwardMatrix(ts, normalisation_factor) - - def compute_normalisation_factor(self): - s = 0 - for j, st in enumerate(self.T): - assert st.tree_node != tskit.NULL - assert self.N[j] > 0 - s += self.N[j] * st.value - return s - - def compute_next_probability( - self, site_id, p_next, is_match, node - ): # Note node only used in Viterbi - mu = self.mu[site_id] - n_alleles = self.n_alleles[site_id] - - if self.scale_mutation_based_on_n_alleles: - if is_match: - p_e = 1 - (n_alleles - 1) * mu - else: - p_e = mu - mu * (n_alleles == 1) - else: - if n_alleles == 1: - if is_match: - p_e = 1 - else: - p_e = 0 - else: - if is_match: - p_e = 1 - mu - else: - p_e = mu / (n_alleles - 1) - return p_next * p_e - - -class ViterbiAlgorithm(LsHmmAlgorithm): - """ - Runs the Li and Stephens Viterbi algorithm. - """ - - def __init__( - self, ts, rho, mu, alleles, n_alleles, scale_mutation=False, precision=10 - ): - super().__init__( - ts, - rho, - mu, - alleles, - n_alleles, - precision=precision, - scale_mutation=scale_mutation, - ) - self.output = ViterbiMatrix(ts) - - def compute_normalisation_factor(self): - max_st = ValueTransition(value=-1) - for st in self.T: - assert st.tree_node != tskit.NULL - if st.value > max_st.value: - max_st = st - if max_st.value == 0: - raise ValueError( - "Trying to match non-existent allele with zero mutation rate" - ) - return max_st.value - - def compute_next_probability(self, site_id, p_last, is_match, node): - rho = self.rho[site_id] - mu = self.mu[site_id] - n = self.ts.num_samples - n_alleles = self.n_alleles[site_id] - - p_no_recomb = p_last * (1 - rho + rho / n) - p_recomb = rho / n - recombination_required = False - if p_no_recomb > p_recomb: - p_t = p_no_recomb - else: - p_t = p_recomb - recombination_required = True - self.output.add_recombination_required(site_id, node, recombination_required) - - if self.scale_mutation_based_on_n_alleles: - if is_match: - # Scale mutation based on the number of alleles - # - so the mutation rate is the mutation rate to one of the - # alleles. The overall mutation rate is then - # (n_alleles - 1) * mutation_rate. - p_e = 1 - (n_alleles - 1) * mu - else: - p_e = mu - mu * (n_alleles == 1) - # Added boolean in case we're at an invariant site - else: - # No scaling based on the number of alleles - # - so the mutation rate is the mutation rate to anything. - # This means that we must rescale the mutation rate to a different - # allele, by the number of alleles. - if n_alleles == 1: # In case we're at an invariant site - if is_match: - p_e = 1 - else: - p_e = 0 - else: - if is_match: - p_e = 1 - mu - else: - p_e = mu / (n_alleles - 1) - - return p_t * p_e - - -def ls_forward_tree( - h, ts, rho, mu, precision=30, alleles=None, scale_mutation_based_on_n_alleles=False -): +def get_site_alleles(ts, h, alleles): if alleles is None: n_alleles = np.int8( [ @@ -746,8 +696,13 @@ def ls_forward_tree( alleles = [alleles for _ in range(ts.num_sites)] else: alleles, n_alleles = check_alleles(alleles, ts.num_sites) + return alleles, n_alleles + - """Forward matrix computation based on a tree sequence.""" +def ls_forward_tree( + h, ts, rho, mu, precision=30, alleles=None, scale_mutation_based_on_n_alleles=False +): + alleles, n_alleles = get_site_alleles(ts, h, alleles) fa = ForwardAlgorithm( ts, rho, @@ -757,70 +712,26 @@ def ls_forward_tree( precision=precision, scale_mutation=scale_mutation_based_on_n_alleles, ) - return fa.run_forward(h) + return fa.run(h) -def ls_backward_tree( - h, ts_mirror, rho, mu, normalisation_factor, precision=30, alleles=None -): - if alleles is None: - n_alleles = np.int8( - [ - len(np.unique(np.append(ts_mirror.genotype_matrix()[j, :], h[j]))) - for j in range(ts_mirror.num_sites) - ] - ) - alleles = tskit.ALLELES_ACGT - if len(set(alleles).intersection(next(ts_mirror.variants()).alleles)) == 0: - alleles = tskit.ALLELES_01 - if len(set(alleles).intersection(next(ts_mirror.variants()).alleles)) == 0: - raise ValueError( - """Alleles list could not be identified. - Please pass a list of lists of alleles of length m, - or a list of alleles (e.g. tskit.ALLELES_ACGT)""" - ) - alleles = [alleles for _ in range(ts_mirror.num_sites)] - else: - alleles, n_alleles = check_alleles(alleles, ts_mirror.num_sites) - - """Backward matrix computation based on a tree sequence.""" +def ls_backward_tree(h, ts, rho, mu, normalisation_factor, precision=30, alleles=None): + alleles, n_alleles = get_site_alleles(ts, h, alleles) ba = BackwardAlgorithm( - ts_mirror, + ts, rho, mu, alleles, n_alleles, - normalisation_factor, precision=precision, ) - return ba.run_backward(h) + return ba.run(h, normalisation_factor) def ls_viterbi_tree( h, ts, rho, mu, precision=30, alleles=None, scale_mutation_based_on_n_alleles=False ): - if alleles is None: - n_alleles = np.int8( - [ - len(np.unique(np.append(ts.genotype_matrix()[j, :], h[j]))) - for j in range(ts.num_sites) - ] - ) - alleles = tskit.ALLELES_ACGT - if len(set(alleles).intersection(next(ts.variants()).alleles)) == 0: - alleles = tskit.ALLELES_01 - if len(set(alleles).intersection(next(ts.variants()).alleles)) == 0: - raise ValueError( - """Alleles list could not be identified. - Please pass a list of lists of alleles of length m, - or a list of alleles (e.g. tskit.ALLELES_ACGT)""" - ) - alleles = [alleles for _ in range(ts.num_sites)] - else: - alleles, n_alleles = check_alleles(alleles, ts.num_sites) - """ - Viterbi path computation based on a tree sequence. - """ + alleles, n_alleles = get_site_alleles(ts, h, alleles) va = ViterbiAlgorithm( ts, rho, @@ -830,14 +741,13 @@ def ls_viterbi_tree( precision=precision, scale_mutation=scale_mutation_based_on_n_alleles, ) - return va.run_forward(h) + return va.run(h) class LSBase: """Superclass of Li and Stephens tests.""" def example_haplotypes(self, ts): - H = ts.genotype_matrix() s = H[:, 0].reshape(1, H.shape[0]) H = H[:, 1:] @@ -874,13 +784,17 @@ def example_parameters_haplotypes(self, ts, seed=42): for s in haplotypes: yield n, H, s, r, mu - # Mixture of random and extremes - rs = [np.zeros(m) + 0.999, np.zeros(m) + 1e-6, np.random.rand(m)] - mus = [np.zeros(m) + 0.33, np.zeros(m) + 1e-6, np.random.rand(m) * 0.33] + # FIXME removing these as tests are abominably slow. + # We'll be refactoring all this to use pytest anyway, so let's not + # worry too much about coverage for now. + # # Mixture of random and extremes + # rs = [np.zeros(m) + 0.999, np.zeros(m) + 1e-6, np.random.rand(m)] + # mus = [np.zeros(m) + 0.33, np.zeros(m) + 1e-6, np.random.rand(m) * 0.33] - for s, r, mu in itertools.product(haplotypes, rs, mus): - r[0] = 0 - yield n, H, s, r, mu + # import itertools + # for s, r, mu in itertools.product(haplotypes, rs, mus): + # r[0] = 0 + # yield n, H, s, r, mu def assertAllClose(self, A, B): """Assert that all entries of two matrices are 'close'""" @@ -1029,13 +943,18 @@ class TestForwardHapTree(FBAlgorithmBase): def verify(self, ts): for n, H, s, r, mu in self.example_parameters_haplotypes(ts): for scale_mutation in [False, True]: - F, c, ll = ls.forwards( - H, - s, - r, - mutation_rate=mu, - scale_mutation_based_on_n_alleles=scale_mutation, - ) + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + # Warning from lshmm: + # Passed a vector of mutation rates, but rescaling each mutation + # rate conditional on the number of alleles + F, c, ll = ls.forwards( + H, + s, + r, + mutation_rate=mu, + scale_mutation_based_on_n_alleles=scale_mutation, + ) # Note, need to remove the first sample from the ts, and ensure # that invariant sites aren't removed. ts_check = ts.simplify(range(1, n + 1), filter_sites=False) @@ -1075,16 +994,15 @@ def verify(self, ts): c_f = ls_forward_tree(s[0, :], ts_check, r, mu) ll_tree = np.sum(np.log10(c_f.normalisation_factor)) - ts_check_mirror = mirror_coordinates(ts_check) - r_flip = np.flip(r) c_b = ls_backward_tree( - np.flip(s[0, :]), - ts_check_mirror, - r_flip, - np.flip(mu), - np.flip(c_f.normalisation_factor), + s[0, :], + ts_check, + r, + mu, + c_f.normalisation_factor, ) - B_tree = np.flip(c_b.decode(), axis=0) + B_tree = c_b.decode() + F_tree = c_f.decode() self.assertAllClose(B, B_tree) @@ -1118,3 +1036,247 @@ def verify(self, ts): scale_mutation_based_on_n_alleles=False, ) self.assertAllClose(ll, ll_check) + + +# TODO add params to run the various checks +def check_viterbi(ts, h, recombination=None, mutation=None): + h = np.array(h).astype(np.int8) + m = ts.num_sites + assert len(h) == m + if recombination is None: + recombination = np.zeros(ts.num_sites) + 1e-9 + if mutation is None: + mutation = np.zeros(ts.num_sites) + precision = 22 + + G = ts.genotype_matrix() + + path, ll = ls.viterbi( + G, + h.reshape(1, m), + recombination, + mutation_rate=mutation, + scale_mutation_based_on_n_alleles=False, + ) + assert np.isscalar(ll) + + cm = ls_viterbi_tree(h, ts, rho=recombination, mu=mutation) + ll_tree = np.sum(np.log10(cm.normalisation_factor)) + assert np.isscalar(ll_tree) + nt.assert_allclose(ll_tree, ll) + + # Check that the likelihood of the preferred path is + # the same as ll_tree (and ll). + path_tree = cm.traceback() + ll_check = ls.path_ll( + G, + h.reshape(1, m), + path_tree, + recombination, + mutation_rate=mutation, + scale_mutation_based_on_n_alleles=False, + ) + nt.assert_allclose(ll_check, ll) + + ll_ts = ts._ll_tree_sequence + ls_hmm = _tskit.LsHmm(ll_ts, recombination, mutation, precision=precision) + cm_lib = _tskit.ViterbiMatrix(ll_ts) + ls_hmm.viterbi_matrix(h, cm_lib) + path_lib = cm_lib.traceback() + + # Not true in general, but let's see how far it goes + nt.assert_array_equal(path_lib, path_tree) + + nt.assert_allclose(cm_lib.normalisation_factor, cm.normalisation_factor) + + return path + + +# TODO add params to run the various checks +def check_forward_matrix(ts, h, recombination=None, mutation=None): + precision = 22 + h = np.array(h).astype(np.int8) + n = ts.num_samples + m = ts.num_sites + assert len(h) == m + if recombination is None: + recombination = np.zeros(ts.num_sites) + 1e-9 + if mutation is None: + mutation = np.zeros(ts.num_sites) + + G = ts.genotype_matrix() + F, c, ll = ls.forwards( + G, + h.reshape(1, m), + recombination, + mutation_rate=mutation, + scale_mutation_based_on_n_alleles=False, + ) + assert F.shape == (m, n) + assert c.shape == (m,) + assert np.isscalar(ll) + + cm = ls_forward_tree( + h, ts, recombination, mutation, scale_mutation_based_on_n_alleles=False + ) + F2 = cm.decode() + nt.assert_allclose(F, F2) + nt.assert_allclose(c, cm.normalisation_factor) + ll_tree = np.sum(np.log10(cm.normalisation_factor)) + nt.assert_allclose(ll_tree, ll) + + ll_ts = ts._ll_tree_sequence + ls_hmm = _tskit.LsHmm(ll_ts, recombination, mutation, precision=precision) + cm_lib = _tskit.CompressedMatrix(ll_ts) + ls_hmm.forward_matrix(h, cm_lib) + F3 = cm_lib.decode() + + assert_compressed_matrices_equal(cm, cm_lib) + + nt.assert_allclose(F, F3) + nt.assert_allclose(c, cm_lib.normalisation_factor) + return cm_lib + + +def check_backward_matrix(ts, h, forward_cm, recombination=None, mutation=None): + precision = 22 + h = np.array(h).astype(np.int8) + m = ts.num_sites + assert len(h) == m + if recombination is None: + recombination = np.zeros(ts.num_sites) + 1e-9 + if mutation is None: + mutation = np.zeros(ts.num_sites) + + G = ts.genotype_matrix() + B = ls.backwards( + G, + h.reshape(1, m), + forward_cm.normalisation_factor, + recombination, + mutation_rate=mutation, + scale_mutation_based_on_n_alleles=False, + ) + + backward_cm = ls_backward_tree( + h, + ts, + recombination, + mutation, + forward_cm.normalisation_factor, + precision=precision, + ) + nt.assert_array_equal( + backward_cm.normalisation_factor, forward_cm.normalisation_factor + ) + + ll_ts = ts._ll_tree_sequence + ls_hmm = _tskit.LsHmm(ll_ts, recombination, mutation, precision=precision) + cm_lib = _tskit.CompressedMatrix(ll_ts) + ls_hmm.backward_matrix(h, forward_cm.normalisation_factor, cm_lib) + + assert_compressed_matrices_equal(backward_cm, cm_lib) + + B_lib = cm_lib.decode() + B_tree = backward_cm.decode() + nt.assert_allclose(B_tree, B_lib) + nt.assert_allclose(B, B_lib) + + +def add_unique_sample_mutations(ts, start=0): + """ + Adds a mutation for each of the samples at equally spaced locations + along the genome. + """ + tables = ts.dump_tables() + L = int(ts.sequence_length) + assert L % ts.num_samples == 0 + gap = L // ts.num_samples + x = start + for u in ts.samples(): + site = tables.sites.add_row(position=x, ancestral_state="0") + tables.mutations.add_row(site=site, derived_state="1", node=u) + x += gap + return tables.tree_sequence() + + +class TestSingleBalancedTreeExample: + # 3.00┊ 6 ┊ + # ┊ ┏━┻━┓ ┊ + # 2.00┊ 4 5 ┊ + # ┊ ┏┻┓ ┏┻┓ ┊ + # 1.00┊ 0 1 2 3 ┊ + # 0 8 + + @staticmethod + def ts(): + return add_unique_sample_mutations( + tskit.Tree.generate_balanced(4, span=8).tree_sequence, + start=1, + ) + + @pytest.mark.parametrize("j", [0, 1, 2, 3]) + def test_match_sample(self, j): + ts = self.ts() + h = np.zeros(4) + h[j] = 1 + path = check_viterbi(ts, h) + nt.assert_array_equal([j, j, j, j], path) + cm = check_forward_matrix(ts, h) + check_backward_matrix(ts, h, cm) + + @pytest.mark.parametrize("j", [1, 2]) + def test_match_sample_missing_flanks(self, j): + ts = self.ts() + h = np.zeros(4) + h[0] = -1 + h[-1] = -1 + h[j] = 1 + path = check_viterbi(ts, h) + nt.assert_array_equal([j, j, j, j], path) + cm = check_forward_matrix(ts, h) + check_backward_matrix(ts, h, cm) + + def test_switch_each_sample(self): + ts = self.ts() + h = np.ones(4) + path = check_viterbi(ts, h) + nt.assert_array_equal([0, 1, 2, 3], path) + cm = check_forward_matrix(ts, h) + check_backward_matrix(ts, h, cm) + + def test_switch_each_sample_missing_flanks(self): + ts = self.ts() + h = np.ones(4) + h[0] = -1 + h[-1] = -1 + path = check_viterbi(ts, h) + nt.assert_array_equal([1, 1, 2, 2], path) + cm = check_forward_matrix(ts, h) + check_backward_matrix(ts, h, cm) + + def test_switch_each_sample_missing_middle(self): + ts = self.ts() + h = np.ones(4) + h[1:3] = -1 + path = check_viterbi(ts, h) + # Implementation of Viterbi switches at right-most position + nt.assert_array_equal([0, 3, 3, 3], path) + cm = check_forward_matrix(ts, h) + check_backward_matrix(ts, h, cm) + + +class TestSimulationExamples: + @pytest.mark.parametrize("n", [3, 10, 50]) + @pytest.mark.parametrize("L", [1, 10, 100]) + def test_continuous_genome(self, n, L): + ts = msprime.simulate( + n, length=L, recombination_rate=1, mutation_rate=1, random_seed=42 + ) + h = np.zeros(ts.num_sites, dtype=np.int8) + # NOTE this is a bit slow at the moment but we can disable the Python + # implementation once testing has been improved on smaller examples. + # Add ``compare_py=False``to these calls. + check_viterbi(ts, h) + cm = check_forward_matrix(ts, h) + check_backward_matrix(ts, h, cm) diff --git a/python/tests/test_lowlevel.py b/python/tests/test_lowlevel.py index c33f159deb..5994a7a851 100644 --- a/python/tests/test_lowlevel.py +++ b/python/tests/test_lowlevel.py @@ -2834,34 +2834,59 @@ def test_haplotype_input(self): m = ts.get_num_sites() fm = _tskit.CompressedMatrix(ts) vm = _tskit.ViterbiMatrix(ts) + norm = np.ones(m) ls_hmm = _tskit.LsHmm(ts, np.zeros(m), np.zeros(m)) for bad_size in [0, m - 1, m + 1, m + 2]: bad_array = np.zeros(bad_size, dtype=np.int8) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="haplotype array"): ls_hmm.forward_matrix(bad_array, fm) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="haplotype array"): + ls_hmm.backward_matrix(bad_array, norm, fm) + with pytest.raises(ValueError, match="haplotype array"): ls_hmm.viterbi_matrix(bad_array, vm) for bad_array in [[0.002], [[], []], None]: with pytest.raises(ValueError): ls_hmm.forward_matrix(bad_array, fm) with pytest.raises(ValueError): ls_hmm.viterbi_matrix(bad_array, vm) + with pytest.raises(ValueError): + ls_hmm.backward_matrix(bad_array, norm, fm) + + def test_norm_input(self): + ts = self.get_example_tree_sequence() + m = ts.get_num_sites() + cm = _tskit.CompressedMatrix(ts) + h = np.zeros(m, dtype=np.int32) + ls_hmm = _tskit.LsHmm(ts, np.zeros(m), np.zeros(m)) + for bad_size in [0, m - 1, m + 1, m + 2]: + bad_array = np.zeros(bad_size) + with pytest.raises(ValueError, match="forward_norm array"): + ls_hmm.backward_matrix(h, bad_array, cm) + + for bad_array in [[0.002], [[], []], None]: + with pytest.raises(ValueError): + ls_hmm.backward_matrix(h, bad_array, cm) def test_output_type_errors(self): ts = self.get_example_tree_sequence() m = ts.get_num_sites() h = np.zeros(m, dtype=np.int8) + norm = np.ones(m) ls_hmm = _tskit.LsHmm(ts, np.zeros(m), np.zeros(m)) for bad_type in [ls_hmm, None, m, []]: with pytest.raises(TypeError): ls_hmm.forward_matrix(h, bad_type) with pytest.raises(TypeError): ls_hmm.viterbi_matrix(h, bad_type) + with pytest.raises(TypeError): + ls_hmm.backward_matrix(h, norm, bad_type) other_ts = self.get_example_tree_sequence() output = _tskit.CompressedMatrix(other_ts) with pytest.raises(_tskit.LibraryError): ls_hmm.forward_matrix(h, output) + with pytest.raises(_tskit.LibraryError): + ls_hmm.backward_matrix(h, norm, output) output = _tskit.ViterbiMatrix(other_ts) with pytest.raises(_tskit.LibraryError): ls_hmm.viterbi_matrix(h, output) @@ -2910,7 +2935,7 @@ def verify_compressed_matrix(self, ts, output): assert len(item) == 2 node, value = item assert 0 <= node < ts.get_num_nodes() - assert 0 <= value <= 1 + assert value >= 0 for site in [m, m + 1, 2 * m]: with pytest.raises(ValueError): output.get_site(site) @@ -2924,6 +2949,17 @@ def test_forward_matrix(self): assert rv is None self.verify_compressed_matrix(ts, output) + def test_backward_matrix(self): + ts = self.get_example_tree_sequence() + m = ts.get_num_sites() + fm = _tskit.CompressedMatrix(ts) + bm = _tskit.CompressedMatrix(ts) + h = np.zeros(m, dtype=np.int32) + ls_hmm = _tskit.LsHmm(ts, np.zeros(m) + 0.1, np.zeros(m) + 0.1) + ls_hmm.forward_matrix(h, fm) + ls_hmm.backward_matrix(h, fm.normalisation_factor, bm) + self.verify_compressed_matrix(ts, bm) + def test_viterbi_matrix(self): ts = self.get_example_tree_sequence() m = ts.get_num_sites() From 64aedbf7a5525b6d8e8521a1e576ce51a912bbd7 Mon Sep 17 00:00:00 2001 From: Jerome Kelleher Date: Tue, 18 Jul 2023 15:22:34 +0100 Subject: [PATCH 2/2] Update lshmm requirement to 0.0.5 Also update changed parameter names for lshmm 0.0.5 --- c/tskit/haplotype_matching.c | 2 ++ .../requirements/CI-complete/requirements.txt | 4 ++-- .../CI-tests-pip/requirements.txt | 4 ++-- python/requirements/development.txt | 2 +- python/tests/test_genotype_matching_fb.py | 8 +++---- .../tests/test_genotype_matching_viterbi.py | 6 +++--- python/tests/test_haplotype_matching.py | 21 ++++++++++--------- 7 files changed, 25 insertions(+), 22 deletions(-) diff --git a/c/tskit/haplotype_matching.c b/c/tskit/haplotype_matching.c index ea8853e72e..fab3a2b56a 100644 --- a/c/tskit/haplotype_matching.c +++ b/c/tskit/haplotype_matching.c @@ -1126,6 +1126,8 @@ tsk_ls_hmm_process_site_backward(tsk_ls_hmm_t *self, const tsk_site_t *site, * immediately before calling store_site in order to filter out -1 nodes, * and also (crucially) to ensure that the value transitions are listed * in preorder, which we rely on later for decoding. + * + * https://github.com/tskit-dev/tskit/issues/2803 */ ret = tsk_ls_hmm_compress(self); if (ret != 0) { diff --git a/python/requirements/CI-complete/requirements.txt b/python/requirements/CI-complete/requirements.txt index b64ac2f43c..fb2db98445 100644 --- a/python/requirements/CI-complete/requirements.txt +++ b/python/requirements/CI-complete/requirements.txt @@ -3,7 +3,7 @@ coverage==7.2.7 dendropy==4.6.1 h5py==3.9.0 kastore==0.3.2 -lshmm==0.0.4 +lshmm==0.0.5 msgpack==1.0.5 msprime==1.2.0 networkx==3.1 @@ -12,4 +12,4 @@ pytest==7.4.0 pytest-cov==4.1.0 pytest-xdist==3.3.1 tszip==0.2.2 -xmlunittest==0.5.0 \ No newline at end of file +xmlunittest==0.5.0 diff --git a/python/requirements/CI-tests-pip/requirements.txt b/python/requirements/CI-tests-pip/requirements.txt index 9f3b31ef3d..42063d0d58 100644 --- a/python/requirements/CI-tests-pip/requirements.txt +++ b/python/requirements/CI-tests-pip/requirements.txt @@ -1,4 +1,4 @@ -lshmm==0.0.4 +lshmm==0.0.5 numpy==1.21.6; python_version < '3.11' # Held at 1.21.6 for Python 3.7 compatibility numpy==1.24.1; python_version > '3.10' pytest==7.1.3 @@ -12,4 +12,4 @@ dendropy==4.5.2 networkx==2.6.3 # Held at 2.6.3 for Python 3.7 compatibility msgpack==1.0.4 newick==1.3.2 -tszip==0.2.2 \ No newline at end of file +tszip==0.2.2 diff --git a/python/requirements/development.txt b/python/requirements/development.txt index ba48881723..5181177b2a 100644 --- a/python/requirements/development.txt +++ b/python/requirements/development.txt @@ -10,7 +10,7 @@ h5py>=2.6.0 jsonschema>=3.0.0 jupyter-book>=0.12.1 kastore -lshmm +lshmm>=0.0.5 matplotlib meson>=0.61.0 msgpack>=1.0.0 diff --git a/python/tests/test_genotype_matching_fb.py b/python/tests/test_genotype_matching_fb.py index 761eadf403..90def524aa 100644 --- a/python/tests/test_genotype_matching_fb.py +++ b/python/tests/test_genotype_matching_fb.py @@ -953,7 +953,7 @@ def verify(self, ts): np.flip(G_check, axis=0), np.flip(s, axis=1), r_flip, - mutation_rate=np.flip(mu), + p_mutation=np.flip(mu), scale_mutation_based_on_n_alleles=False, ) @@ -978,7 +978,7 @@ def verify(self, ts): ) F, c, ll = ls.forwards( - G_check, s, r, mutation_rate=mu, scale_mutation_based_on_n_alleles=False + G_check, s, r, p_mutation=mu, scale_mutation_based_on_n_alleles=False ) cm_d = ls_forward_tree(s[0, :], ts_check, r, mu) self.assertAllClose(cm_d.decode(), F) @@ -1004,14 +1004,14 @@ def verify(self, ts): ) F, c, ll = ls.forwards( - G_check, s, r, mutation_rate=mu, scale_mutation_based_on_n_alleles=False + G_check, s, r, p_mutation=mu, scale_mutation_based_on_n_alleles=False ) B = ls.backwards( G_check, s, c, r, - mutation_rate=mu, + p_mutation=mu, scale_mutation_based_on_n_alleles=False, ) diff --git a/python/tests/test_genotype_matching_viterbi.py b/python/tests/test_genotype_matching_viterbi.py index acab5d1c28..d8e0b33294 100644 --- a/python/tests/test_genotype_matching_viterbi.py +++ b/python/tests/test_genotype_matching_viterbi.py @@ -1055,14 +1055,14 @@ def verify(self, ts): ) ts_check = ts.simplify(range(1, n + 1), filter_sites=False) phased_path, ll = ls.viterbi( - G_check, s, r, mutation_rate=mu, scale_mutation_based_on_n_alleles=False + G_check, s, r, p_mutation=mu, scale_mutation_based_on_n_alleles=False ) path_ll_matrix = ls.path_ll( G_check, s, phased_path, r, - mutation_rate=mu, + p_mutation=mu, scale_mutation_based_on_n_alleles=False, ) @@ -1077,7 +1077,7 @@ def verify(self, ts): s, np.transpose(path_tree_dict), r, - mutation_rate=mu, + p_mutation=mu, scale_mutation_based_on_n_alleles=False, ) diff --git a/python/tests/test_haplotype_matching.py b/python/tests/test_haplotype_matching.py index 725c6bf38c..dcc1d684fb 100644 --- a/python/tests/test_haplotype_matching.py +++ b/python/tests/test_haplotype_matching.py @@ -455,6 +455,7 @@ def compute_next_probability(self, site_id, p_next, is_match, node): def process_site(self, site, haplotype_state, s): # FIXME see nodes in the C code for why we have two calls to # compress + # https://github.com/tskit-dev/tskit/issues/2803 self.compress() self.output.store_site( site.id, @@ -928,7 +929,7 @@ def verify(self, ts): np.flip(H, axis=0), np.flip(s, axis=1), r_flip, - mutation_rate=np.flip(mu), + p_mutation=np.flip(mu), scale_mutation_based_on_n_alleles=False, ) @@ -952,7 +953,7 @@ def verify(self, ts): H, s, r, - mutation_rate=mu, + p_mutation=mu, scale_mutation_based_on_n_alleles=scale_mutation, ) # Note, need to remove the first sample from the ts, and ensure @@ -977,14 +978,14 @@ class TestForwardBackwardTree(FBAlgorithmBase): def verify(self, ts): for n, H, s, r, mu in self.example_parameters_haplotypes(ts): F, c, ll = ls.forwards( - H, s, r, mutation_rate=mu, scale_mutation_based_on_n_alleles=False + H, s, r, p_mutation=mu, scale_mutation_based_on_n_alleles=False ) B = ls.backwards( H, s, c, r, - mutation_rate=mu, + p_mutation=mu, scale_mutation_based_on_n_alleles=False, ) @@ -1017,7 +1018,7 @@ class TestTreeViterbiHap(VitAlgorithmBase): def verify(self, ts): for n, H, s, r, mu in self.example_parameters_haplotypes(ts): path, ll = ls.viterbi( - H, s, r, mutation_rate=mu, scale_mutation_based_on_n_alleles=False + H, s, r, p_mutation=mu, scale_mutation_based_on_n_alleles=False ) ts_check = ts.simplify(range(1, n + 1), filter_sites=False) cm = ls_viterbi_tree(s[0, :], ts_check, r, mu) @@ -1032,7 +1033,7 @@ def verify(self, ts): s, path_tree, r, - mutation_rate=mu, + p_mutation=mu, scale_mutation_based_on_n_alleles=False, ) self.assertAllClose(ll, ll_check) @@ -1055,7 +1056,7 @@ def check_viterbi(ts, h, recombination=None, mutation=None): G, h.reshape(1, m), recombination, - mutation_rate=mutation, + p_mutation=mutation, scale_mutation_based_on_n_alleles=False, ) assert np.isscalar(ll) @@ -1073,7 +1074,7 @@ def check_viterbi(ts, h, recombination=None, mutation=None): h.reshape(1, m), path_tree, recombination, - mutation_rate=mutation, + p_mutation=mutation, scale_mutation_based_on_n_alleles=False, ) nt.assert_allclose(ll_check, ll) @@ -1109,7 +1110,7 @@ def check_forward_matrix(ts, h, recombination=None, mutation=None): G, h.reshape(1, m), recombination, - mutation_rate=mutation, + p_mutation=mutation, scale_mutation_based_on_n_alleles=False, ) assert F.shape == (m, n) @@ -1154,7 +1155,7 @@ def check_backward_matrix(ts, h, forward_cm, recombination=None, mutation=None): h.reshape(1, m), forward_cm.normalisation_factor, recombination, - mutation_rate=mutation, + p_mutation=mutation, scale_mutation_based_on_n_alleles=False, )