diff --git a/python/tests/test_beagle.py b/python/tests/test_beagle.py new file mode 100644 index 0000000000..3f6adc6a04 --- /dev/null +++ b/python/tests/test_beagle.py @@ -0,0 +1,471 @@ +""" +Implementation of the BEAGLE 4.1 algorithm to impute genotypes. + +This was implemented while closely consulting the BEAGLE 4.1 paper: +Browning & Browning (2016). Genotype imputation with millions of reference samples. +Am J Hum Genet 98:116-126. doi:10.1016/j.ajhg.2015.11.020 + +The source code of BEAGLE 4.1 (particularly `LSHapBaum.java`) was also consulted: +https://faculty.washington.edu/browning/beagle/b4_1.html + +Here are some notations used throughout this implementation: +h = Number of reference haplotypes. +m = Number of genotyped markers. +x = Number of imputed markers. + +This implementation takes the following inputs: +1. Panel of reference haplotypes in a matrix of size (m + x, h). +2. One query haplotype in an array of size (m + x). +3. Genomic site positions of all the markers in an array of size (m + x). + +For simplicity, we assume that: +1. All the sites biallelic (0/1 encoding). +2. The genetic map is defined by equating 1 Mbp to 1 cM. + +In the query haplotypes: +1. The genotyped markers take values of 0 or 1. +2. The imputed markers take -1. + +The forward, backward, and HMM state probability matrices are of size (m, h). +The interpolated haplotype probability matrix is of size (x, 2), +The imputed alleles in the query haplotypes are the maximum a posteriori (MAP) alleles. + +For efficiency, BEAGLE uses aggregated markers, which are clusters of markers +within a 0.005 cM interval (default). Because the genotypes are phased, +the alleles in the aggregated markers form distinct allele sequences. +Below, we do not use aggregated markers or allele sequences, which simplifies +the implementation. + +Rather than trying to faithfully reimplementing the original BEAGLE 4.1 algorithm, +the following computes Equation 1 of BB2016. +""" +import numpy as np + + +def convert_to_genetic_map_position(pos): + """ + Convert genomic site positions (bp) to genetic map positions (cM). + + When a genetic map is not specified, it is assumed that 1 Mbp = 1cM. + + This trivial function is meant for documentation. + + :param numpy.ndarray pos: Site positions. + :return: Genetic map positions. + :rtype: numpy.ndarray + """ + assert 0 <= np.min(pos) + return pos / 1e6 + + +def get_weights(genotyped_pos, imputed_pos): + """ + Get weights at imputed markers in the query haplotype. + + These weights are used in linear interpolation of HMM state probabilities + at imputed markers. + + In BB2016 (see below Equation 1), a weight between genotyped markers m and m + 1 + at imputed marker x is denoted lambda_m,x. + + lambda_m,x = [g(m + 1) - g(x)] / [g(m + 1) - g(m)], + where g(i) = genetic map position of marker i. + + :param numpy.ndarray genotyped_pos: Site positions of genotyped markers. + :param numpy.ndarray imputed_pos: Site positions of imputed markers. + :return: Weights used in linear interpolation. + :rtype: numpy.ndarray + """ + # Check that the two sets are mutually exclusive. + assert not np.any(np.isin(genotyped_pos, imputed_pos)) + # Set min genetic distance to avoid division by zero. + MIN_CM_DIST = 0.005 + genotyped_cm = convert_to_genetic_map_position(genotyped_pos) + imputed_cm = convert_to_genetic_map_position(imputed_pos) + # Get weights for each target marker. + weights = np.zeros(len(imputed_cm), dtype=np.float64) + for i in np.arange(imputed_cm): + if imputed_cm[i] < genotyped_cm[0]: + m = 0 + elif imputed_cm[i] > genotyped_cm[-1]: + m = -2 + else: + m = np.min(np.where(genotyped_cm > imputed_cm[i])) + weights[i] = 0 + cm_mP1_x = genotyped_cm[m + 1] - imputed_cm[i] + cm_mP1_m = np.max(genotyped_cm[m + 1] - genotyped_cm[m], MIN_CM_DIST) + weights[i] = cm_mP1_x / cm_mP1_m + return weights + + +def get_mismatch_prob(pos, miscall_rate=0.0001): + """ + Set mismatch probabilities at genotyped markers. + + In BEAGLE, the mismatch probability is dominated by the allelic miscall rate. + By default, it is set to 0.0001. Also, it is capped at 0.5. + + :param numpy.ndarray pos: Site positions. + :param float miscall_rate: Allelic miscall rate. + :return: Mismatch probabilities. + :rtype: numpy.ndarray + """ + assert 0 <= miscall_rate <= 0.5 + mu = np.repeat(miscall_rate, len(pos)) + return mu + + +def get_switch_prob(pos, h, ne=1e6): + """ + Set switch probabilities at genotyped markers. + + :param numpy.ndarray pos: Site positions. + :param numpy.ndarray h: Number of reference haplotypes. + :param float ne: Effective population size. + :return: Switch probabilities. + :rtype: numpy.ndarray + """ + assert pos[0] > 0, "Site positions are not 1-based." + # Get genetic distance between adjacent markers. + cm = np.concatenate([[0], convert_to_genetic_map_position(pos)]) + gen_dist = cm[1:] - cm[:-1] + assert len(gen_dist) == len(pos) + rho = -np.expm1(-(4 * ne * gen_dist / h)) + return rho + + +def compute_forward_probability_matrix(ref_h, query_h, rho, mu): + """ + Implement the forwards algorithm. + + The forward probablity matrix is of size (m, h), + where m is the number of genotyped markers + and h is the number of reference haplotypes. + + `ref_h` is a panel of reference haplotypes of size m x h. + Here, it is subsetted to genotyped markers. + + `query_h` is also subsetted to genotyped markers (length m). + + :param numpy.ndarray ref_h: Reference haplotypes. + :param numpy.ndarray query_h: One query haplotype. + :param numpy.ndarray rho: Switch probabilities. + :param numpy.ndarray mu: Mismatch probabilities. + :return: Forward probability matrix. + :rtype: numpy.ndarray + """ + m = ref_h.shape[0] + h = ref_h.shape[1] + assert m == len(query_h) + assert m == len(rho) + assert m == len(mu) + assert 0 <= np.min(rho) and np.max(rho) <= 1 + # BEAGLE caps mismatch probabilities at 0.5. + assert 0 <= np.min(mu) and np.max(mu) <= 0.5 + fm = np.zeros((m, h), dtype=np.float64) + last_sum = 1.0 # Part of normalization factor + for i in np.arange(m): + # Get site-specific parameters + shift = rho[i] / h + scale = (1 - rho[i]) / last_sum + query_a = query_h[i] + for j in np.arange(h): + ref_a = ref_h[i, j] + # Get emission probability + em_prob = mu[i] if query_a != ref_a else 1.0 - mu[i] + if m == 0: + fm[i, j] = em_prob + else: + fm[i, j] = em_prob * (scale * fm[i - 1, j] + shift) + site_sum = np.sum(fm[i, :]) + last_sum = site_sum / h if m == 0 else site_sum + return fm + + +def compute_backward_probability_matrix(ref_h, query_h, rho, mu): + """ + Implement the backwards algorithm. + + The backward probablity matrix is of size (m, h), + where m is the number of genotyped markers + and h is the number of reference haplotypes. + + `ref_h` is a panel of reference haplotypes of size m x h. + Here, it is subsetted to genotyped markers. + + `query_h` is also subsetted to genotyped markers (length m). + + In BEAGLE, the values are kept one site at a time. + Here, we keep the values at all the sites. + + :param numpy.ndarray ref_h: Reference haplotypes. + :param numpy.ndarray query_h: One query haplotype. + :param numpy.ndarray rho: Switch probabilities. + :param numpy.ndarray mu: Mismatch probabilities. + :return: Backward probability matrix. + :rtype: numpy.ndarray + """ + m = ref_h.shape[0] + h = ref_h.shape[1] + assert m == len(query_h) + assert m == len(rho) + assert m == len(mu) + assert 0 <= np.min(rho) and np.max(rho) <= 1 + # BEAGLE caps mismatch probabilities at 0.5. + assert 0 <= np.min(mu) and np.max(mu) <= 0.5 + bm = np.zeros((m, h), dtype=np.float64) + # Initialise the last column + bm[-1, :] = 1.0 / h + for i in np.arange(m - 2, -1, -1): + sum_site = 0.0 + query_a = query_h[i + 1] + for j in np.arange(h): + ref_a = ref_h[i + 1, j] + em_prob = mu[i + 1] if ref_a != query_a else 1.0 - mu[i + 1] + bm[i, j] *= em_prob + sum_site += bm[i, j] + scale = (1 - rho[i + 1]) / sum_site + shift = rho[i + 1] / h + for j in np.arange(h): + bm[i, j] = scale * bm[i, j] + shift + return bm + + +def _get_ref_hap_seg(a, b): + """ + Assuming all biallelic sites, get the index of a reference haplotype segment (i) + defined by the alleles a and b at two adjacent genotyped markers. + + #i, a, b + 0, 0, 0 + 1, 1, 0 + 2, 0, 1 + 3, 1, 1 + + See https://github.com/tskit-dev/tskit/issues/2802#issuecomment-1698767169 + + This is a helper function for `compute_state_probability_matrix`. + """ + ref_hap_idx = a * 1 + b * 2 + return ref_hap_idx + + +def compute_state_probability_matrix(fm, bm, ref_h, query_h, rho, mu): + """ + Implement the HMM forward-backward algorithm to compute: + 1) A HMM state probability matrix across genotyped markers; + 2) "Forward haplotype probabilities"; and + 3) "Backward haplotype probabilities". + + A HMM state is an index of a reference haplotype. + + The state probability matrix is of size (m, h), + where m is the number of genotyped markers in the query haplotype + and h is the number of reference haplotypes. + + Both the forward and backward haplotype probability matrices + are of size (m, 2), assuming only biallelic sites. + + `ref_h` is a panel of reference haplotypes of size m x h. + Here, it is subsetted to genotyped markers. + + `query_h` is also subsetted to genotyped markers (length m). + + This implementation is based on the function `setStateProbs` + of `LSHapBaum.java` in BEAGLE. It performs the computation + described in Equation 2 of BB2016. + + :param numpy.ndarray fm: Forward probability matrix. + :param numpy.ndarray bm: Backward probability matrix. + :param numpy.ndarray ref_h: Reference haplotypes. + :param numpy.ndarray query_h: One query haplotype. + :param numpy.ndarray rho: Switch probabilities. + :param numpy.ndarray mu: Mismatch probabilities. + :return: HMM state probability matrix, + forward haplotype probability matrix, and + backward haplotype probability matrix. + :rtype: tuple + """ + m = ref_h.shape[0] + h = ref_h.shape[1] + assert m == len(query_h) + assert m == len(rho) + assert m == len(mu) + assert 0 <= np.min(rho) and np.max(rho) <= 1 + # BEAGLE caps mismatch probabilities at 0.5. + assert 0 <= np.min(mu) and np.max(mu) <= 0.5 + assert fm.shape == (m, h) + assert bm.shape == (m, h) + # Check all biallelic sites + assert np.all(np.isin(np.unique(ref_h), [0, 1])) + assert np.all(np.isin(np.unique(query_h), [-1, 0, 1])) + sm = np.zeros((m, h), dtype=np.float64) # HMM state probability matrix + fwd_hap_probs = np.zeros((m, 4), dtype=np.float64) + bwd_hap_probs = np.zeros((m, 4), dtype=np.float64) + for i in np.arange(m - 1, 0, -1): + for j in np.arange(h): + sm[i, j] = fm[i, j] * bm[i, j] + ref_hap_seg_mP1 = _get_ref_hap_seg(ref_h[i, j], ref_h[i + 1, j]) + ref_hap_seg_m = _get_ref_hap_seg(ref_h[i - 1, j], ref_h[i, j]) + fwd_hap_probs[i, ref_hap_seg_mP1] += sm[i, j] + bwd_hap_probs[i, ref_hap_seg_m] += sm[i, j] + sum_fwd_hap_probs = np.sum(fwd_hap_probs[i, :]) + fwd_hap_probs[i, :] /= sum_fwd_hap_probs + bwd_hap_probs[i, :] /= sum_fwd_hap_probs # Sum of forward hap probs + return (sm, fwd_hap_probs, bwd_hap_probs) + + +def compute_state_probability_matrix_eqn1(fm, bm, ref_h, query_h, rho, mu): + """ + Implement the HMM forward-backward algorithm following Equation 1 of BB2016. + + This is simpler than faithfully reimplementing the original BEAGLE 4.1 algorithm. + + :param numpy.ndarray fm: Forward probability matrix. + :param numpy.ndarray bm: Backward probability matrix. + :param numpy.ndarray ref_h: Reference haplotypes. + :param numpy.ndarray query_h: One query haplotype. + :param numpy.ndarray rho: Switch probabilities. + :param numpy.ndarray mu: Mismatch probabilities. + :return: HMM state probability matrix. + :rtype: numpy.ndarray + """ + pass + + +def interpolate_haplotype_probability_matrix( + fwd_hap_probs, bwd_hap_probs, genotyped_pos, imputed_pos +): + """ + Compute the interpolated haplotype probability matrix of a query haplotype. + + The forward and backward haplotype probability matrices are of size (m, 2), + where m is the number of genotyped markers in the query haplotype. + Here, we assume all biallelic sites, hence the 2. + + The interpolated haplotype probability matrix is of size (x, 2), + where x is the number of imputed markers in the query haplotype. + Again, we assume all biallelic sites, hence the 2. + + This implementation is based on Equation 2 (the rearranged one) in BB2016. + + :param fwd_hap_probs: Forward haplotype probability matrix. + :param bwd_hap_probs: Backward haplotype probability matrix. + :param genotyped_pos: Site positions of genotyped markers. + :param imputed_pos: Site positions of imputed markers. + :return: Interpolated haplotype probability matrix. + :rtype: numpy.ndarray + """ + m = len(genotyped_pos) + x = len(imputed_pos) + assert fwd_hap_probs.shape == (m, 2) + assert bwd_hap_probs.shape == (m, 2) + genotyped_cm = convert_to_genetic_map_position(genotyped_pos) + assert len(genotyped_cm) == m + imputed_cm = convert_to_genetic_map_position(imputed_pos) + assert len(imputed_cm) == x + weights = get_weights(genotyped_cm, imputed_cm) + assert len(weights) == x + i_hap_probs = np.zeros((x, 2), dtype=np.float64) + for i in np.arange(x): + if imputed_cm[i] < genotyped_cm[0]: + m = 0 + elif imputed_cm[i] > genotyped_cm[-1]: + m = -2 + else: + m = np.min(np.where(genotyped_cm > imputed_cm[i])) + # Vectorised over the allelic states + i_hap_probs[i, :] = weights[i] * bwd_hap_probs[m, :] + i_hap_probs[i, :] += (1 - weights[i]) * fwd_hap_probs[m, :] + return i_hap_probs + + +def get_map_alleles(i_hap_probs): + """ + Compute the maximum a posteriori alleles at the imputed markers of a query haplotype. + + Assuming all biallelic sites, it is a matrix of size (x, 2), + where x is the number of imputed markers. + + This implementation is based on Equation 2 (the rearranged one) in BB2016. + + :param numpy.ndarray i_hap_probs: Interpolated haplotype probability matrix. + :return: MAP alleles at imputed markers. + :rtype: numpy.ndarray + """ + x = len(i_hap_probs) + imputed_alleles = np.zeros((x, 2)) + # TODO: Vectorise over the imputed markers + for i in np.arange(x): + imputed_alleles[i, 0] = np.argmax(i_hap_probs[i, :]) + imputed_alleles[i, 1] = np.argmax(i_hap_probs[i, :]) + return imputed_alleles + + +def run_beagle(ref_h, query_h, pos): + """ + Run the BEAGLE 4.1 imputation algorithm. + + `ref_h` and `query_h` span all genotyped and imputed markers. + + :param numpy.ndarray ref_h: Reference haplotypes. + :param numpy.ndarray query_h: One query haplotype. + :param numpy.ndarray pos: Site positions of all the markers. + :return: MAP alleles at imputed markers in the query haplotype. + :rtype: numpy.ndarray + """ + assert ref_h.shape[0] == len(pos) + assert query_h.shape[0] == len(pos) + # Index of genotyped markers in the query haplotype + genotyped_pos_idx = np.where(query_h != -1)[0] + # Index of imputed markers in the query haplotype + imputed_pos_idx = np.where(query_h == -1)[0] + assert len(genotyped_pos_idx) > 0 + assert len(imputed_pos_idx) > 0 + # Site positions of genotyped markers + genotyped_pos = pos[genotyped_pos_idx] + # Site positions of imputed markers + imputed_pos = pos[imputed_pos_idx] + m = len(genotyped_pos) + x = len(imputed_pos) + assert m + x == len(pos) + h = ref_h.shape[1] + # Subset the reference haplotypes to genotyped markers + ref_h_genotyped = ref_h[genotyped_pos_idx, :] + assert ref_h_genotyped.shape == (m, h) + # Subset the query haplotype to genotyped markers + query_h_genotyped = query_h[genotyped_pos_idx] + assert len(query_h_genotyped) == m + # Set mismatch probabilities at genotyped markers + mu = get_mismatch_prob(genotyped_pos) + assert len(mu) == m + # Set switch probabilities at genotyped markers + rho = get_switch_prob(genotyped_pos, h, ne=10) # Small ref. panel + assert len(rho) == m + # Compute forward probability matrix over genotyped markers + fm = compute_forward_probability_matrix(ref_h_genotyped, query_h_genotyped, rho, mu) + assert fm.shape == (m, h) + # Compute backward probability matrix over genotyped markers + bm = compute_backward_probability_matrix( + ref_h_genotyped, query_h_genotyped, rho, mu + ) + assert bm.shape == (m, h) + # TODO: Replace this with the simpler implementation. + # Compute HMM state probability matrix over genotyped markers + # and forward and backward haplotype probability matrices + sm, fwd_hap_probs, bwd_hap_probs = compute_state_probability_matrix( + fm, bm, ref_h_genotyped, query_h_genotyped, rho, mu + ) + assert sm.shape == (m, h) # sm not used further + assert fwd_hap_probs.shape == (m, 2) + assert bwd_hap_probs.shape == (m, 2) + # Interpolate haplotype probabilities + # from genotype markers to imputed markers + i_hap_probs = interpolate_haplotype_probability_matrix( + fwd_hap_probs, bwd_hap_probs, genotyped_pos, imputed_pos + ) + assert i_hap_probs.shape == (x, 2) + # Get MAP alleles at imputed markers + imputed_alleles = get_map_alleles(i_hap_probs) + assert len(imputed_alleles) == x + return imputed_alleles diff --git a/python/tests/test_imputation.py b/python/tests/test_imputation.py new file mode 100644 index 0000000000..1267ee92fb --- /dev/null +++ b/python/tests/test_imputation.py @@ -0,0 +1,308 @@ +""" +Tests for genotype imputation (forward and Baum-Welsh algorithms). +""" +import io + +import numpy as np + +import _tskit +import tskit + + +# A toy tree sequence containing 3 diploid individuals with 5 sites and 5 mutations. +# Two toy query haplotypes are targets for imputation. + +toy_ts_nodes_text = """\ +id is_sample time population individual metadata +0 1 0.000000 0 0 +1 1 0.000000 0 0 +2 1 0.000000 0 1 +3 1 0.000000 0 1 +4 1 0.000000 0 2 +5 1 0.000000 0 2 +6 0 0.029768 0 -1 +7 0 0.133017 0 -1 +8 0 0.223233 0 -1 +9 0 0.651586 0 -1 +10 0 0.698831 0 -1 +11 0 2.114867 0 -1 +12 0 4.322031 0 -1 +13 0 7.432311 0 -1 +""" + +toy_ts_edges_text = """\ +left right parent child metadata +0.000000 1000000.000000 6 0 +0.000000 1000000.000000 6 3 +0.000000 1000000.000000 7 2 +0.000000 1000000.000000 7 5 +0.000000 1000000.000000 8 1 +0.000000 1000000.000000 8 4 +0.000000 781157.000000 9 6 +0.000000 781157.000000 9 7 +0.000000 505438.000000 10 8 +0.000000 505438.000000 10 9 +505438.000000 549484.000000 11 8 +505438.000000 549484.000000 11 9 +781157.000000 1000000.000000 12 6 +781157.000000 1000000.000000 12 7 +549484.000000 1000000.000000 13 8 +549484.000000 781157.000000 13 9 +781157.000000 1000000.000000 13 12 +""" + +toy_ts_sites_text = """\ +position ancestral_state metadata +200000.000000 A +300000.000000 C +520000.000000 G +600000.000000 T +900000.000000 A +""" + +toy_ts_mutations_text = """\ +site node time derived_state parent metadata +0 9 unknown G -1 +1 8 unknown A -1 +2 9 unknown T -1 +3 9 unknown C -1 +4 12 unknown C -1 +""" + +toy_ts_individuals_text = """\ +flags +0 +0 +0 +""" + +toy_query_haplotypes_01 = np.array( + [ + [ + 0, + 1, + -1, + 1, + 1, + ], + [ + 0, + 1, + -1, + 0, + 1, + ], + ], + dtype=np.int32, +) + +toy_query_haplotypes_ACGT = np.array( + [ + [0, 0, -1, 2, 2], # AACC + [0, 0, -1, 3, 2], # AATC + ], + dtype=np.int32, +) + + +def get_toy_data(): + ref_ts = tskit.load_text( + nodes=io.StringIO(toy_ts_nodes_text), + edges=io.StringIO(toy_ts_edges_text), + sites=io.StringIO(toy_ts_sites_text), + mutations=io.StringIO(toy_ts_mutations_text), + individuals=io.StringIO(toy_ts_individuals_text), + strict=False, + ) + query_h = toy_query_haplotypes_ACGT + return [ref_ts, query_h] + + +def get_tskit_forward_backward_matrices(ts, h): + m = ts.num_sites + fm = _tskit.CompressedMatrix(ts._ll_tree_sequence) + bm = _tskit.CompressedMatrix(ts._ll_tree_sequence) + ls_hmm = _tskit.LsHmm( + ts._ll_tree_sequence, np.zeros(m) + 0.1, np.zeros(m) + 0.1, acgt_alleles=True + ) + ls_hmm.forward_matrix(h, fm) + ls_hmm.backward_matrix(h, fm.normalisation_factor, bm) + return [fm.decode(), bm.decode()] + + +# BEAGLE 4.1 was run on the toy data set above using default parameters. +# +# In the query VCF, the site at position 520,000 was redacted and then imputed. +# Note that the ancestral allele in the simulated tree sequence is +# treated as the REF in the VCFs. +# +# Three matrices were printed and kept below: +# 1. Forward probability matrix +# 2. Backward probability matrix +# 3. HMM state probability matrix +# +# There are two sets of matrices, one for each query haplotype. +# +# Notes about calculations: +# h = index of reference haplotype +# H = number of reference haplotypes +# m = index of genotyped marker +# M = number of genotyped markers +# +# In forward probability matrix, +# fwd[m][h] = emission prob., if m = 0 (first marker) +# fwd[m][h] = emission prob. * (scale * fwd[m - 1][h] + shift), otherwise +# +# where scale = (1 - switch prob.)/sum of fwd[m - 1], +# and shift = switch prob. / H. +# +# In backward probability matrix, +# bwd[m][h] = 1/n, if m = M - 1 (last marker) +# unadj. bwd[m][h] = emission prob. / H +# bwd[m][h] = (unadj. bwd[m][h] + shift) * scale, otherwise +# +# where scale = (1 - switch prob.)/sum of unadj. bwd[m], +# and shift = switch prob. / H. +# +# For each site, the sum of backward values over all haplotypes was calculated +# before scaling and shifting. +# +# The matrices below were obtained by setting Ne to 10. + +beagle_fwd_matrix_text_0 = """ +m,h,probRec,probNoRec,noErrProb,errProb,refAl,queryAl,shiftFac,scaleFac,sumSite,val, +0,0,0.000000,1.000000,0.999900,0.000100,1,0,0.000000,1.000000,0.000100,0.000100, +0,1,0.000000,1.000000,0.999900,0.000100,0,0,0.000000,1.000000,1.000000,0.999900, +0,2,0.000000,1.000000,0.999900,0.000100,1,0,0.000000,1.000000,1.000100,0.000100, +0,3,0.000000,1.000000,0.999900,0.000100,1,0,0.000000,1.000000,1.000200,0.000100, +0,4,0.000000,1.000000,0.999900,0.000100,0,0,0.000000,1.000000,2.000100,0.999900, +0,5,0.000000,1.000000,0.999900,0.000100,1,0,0.000000,1.000000,2.000200,0.000100, +1,0,0.064493,0.935507,0.999900,0.000100,0,1,0.010749,2.806240,0.000001,0.000001, +1,1,0.064493,0.935507,0.999900,0.000100,1,1,0.010749,2.806240,2.816428,2.816427, +1,2,0.064493,0.935507,0.999900,0.000100,0,1,0.010749,2.806240,2.816429,0.000001, +1,3,0.064493,0.935507,0.999900,0.000100,0,1,0.010749,2.806240,2.816430,0.000001, +1,4,0.064493,0.935507,0.999900,0.000100,1,1,0.010749,2.806240,5.632857,2.816427, +1,5,0.064493,0.935507,0.999900,0.000100,0,1,0.010749,2.806240,5.632858,0.000001, +2,0,0.283469,0.716531,0.999900,0.000100,1,1,0.047245,0.127206,0.047240,0.047240, +2,1,0.283469,0.716531,0.999900,0.000100,0,1,0.047245,0.127206,0.047281,0.000041, +2,2,0.283469,0.716531,0.999900,0.000100,1,1,0.047245,0.127206,0.094521,0.047240, +2,3,0.283469,0.716531,0.999900,0.000100,1,1,0.047245,0.127206,0.141761,0.047240, +2,4,0.283469,0.716531,0.999900,0.000100,0,1,0.047245,0.127206,0.141802,0.000041, +2,5,0.283469,0.716531,0.999900,0.000100,1,1,0.047245,0.127206,0.189042,0.047240, +3,0,0.064493,0.935507,0.999900,0.000100,1,1,0.010749,4.948676,0.244501,0.244501, +3,1,0.064493,0.935507,0.999900,0.000100,0,1,0.010749,4.948676,0.244502,0.000001, +3,2,0.064493,0.935507,0.999900,0.000100,1,1,0.010749,4.948676,0.489003,0.244501, +3,3,0.064493,0.935507,0.999900,0.000100,1,1,0.010749,4.948676,0.733504,0.244501, +3,4,0.064493,0.935507,0.999900,0.000100,0,1,0.010749,4.948676,0.733505,0.000001, +3,5,0.064493,0.935507,0.999900,0.000100,1,1,0.010749,4.948676,0.978005,0.244501, +""" + +beagle_bwd_matrix_text_0 = """ +m,h,val, +3,0,0.166667, +3,1,0.166667, +3,2,0.166667, +3,3,0.166667, +3,4,0.166667, +3,5,0.166667, +2,0,0.244614, +2,1,0.010772, +2,2,0.244614, +2,3,0.244614, +2,4,0.010772, +2,5,0.244614, +1,0,0.226377, +1,1,0.047246, +1,2,0.226377, +1,3,0.226377, +1,4,0.047246, +1,5,0.226377, +0,0,0.010973, +0,1,0.478054, +0,2,0.010973, +0,3,0.010973, +0,4,0.478054, +0,5,0.010973, +""" + +beagle_fwd_matrix_text_1 = """ +m,h,probRec,probNoRec,noErrProb,errProb,refAl,queryAl,shiftFac,scaleFac,sumSite,val, +0,0,0.000000,1.000000,0.999900,0.000100,1,0,0.000000,1.000000,0.000100,0.000100, +0,1,0.000000,1.000000,0.999900,0.000100,0,0,0.000000,1.000000,1.000000,0.999900, +0,2,0.000000,1.000000,0.999900,0.000100,1,0,0.000000,1.000000,1.000100,0.000100, +0,3,0.000000,1.000000,0.999900,0.000100,1,0,0.000000,1.000000,1.000200,0.000100, +0,4,0.000000,1.000000,0.999900,0.000100,0,0,0.000000,1.000000,2.000100,0.999900, +0,5,0.000000,1.000000,0.999900,0.000100,1,0,0.000000,1.000000,2.000200,0.000100, +1,0,0.064493,0.935507,0.999900,0.000100,0,1,0.010749,2.806240,0.000001,0.000001, +1,1,0.064493,0.935507,0.999900,0.000100,1,1,0.010749,2.806240,2.816428,2.816427, +1,2,0.064493,0.935507,0.999900,0.000100,0,1,0.010749,2.806240,2.816429,0.000001, +1,3,0.064493,0.935507,0.999900,0.000100,0,1,0.010749,2.806240,2.816430,0.000001, +1,4,0.064493,0.935507,0.999900,0.000100,1,1,0.010749,2.806240,5.632857,2.816427, +1,5,0.064493,0.935507,0.999900,0.000100,0,1,0.010749,2.806240,5.632858,0.000001, +2,0,0.283469,0.716531,0.999900,0.000100,1,0,0.047245,0.127206,0.000005,0.000005, +2,1,0.283469,0.716531,0.999900,0.000100,0,0,0.047245,0.127206,0.405474,0.405470, +2,2,0.283469,0.716531,0.999900,0.000100,1,0,0.047245,0.127206,0.405479,0.000005, +2,3,0.283469,0.716531,0.999900,0.000100,1,0,0.047245,0.127206,0.405484,0.000005, +2,4,0.283469,0.716531,0.999900,0.000100,0,0,0.047245,0.127206,0.810953,0.405470, +2,5,0.283469,0.716531,0.999900,0.000100,1,0,0.047245,0.127206,0.810958,0.000005, +3,0,0.064493,0.935507,0.999900,0.000100,1,1,0.010749,1.153582,0.010753,0.010753, +3,1,0.064493,0.935507,0.999900,0.000100,0,1,0.010749,1.153582,0.010801,0.000048, +3,2,0.064493,0.935507,0.999900,0.000100,1,1,0.010749,1.153582,0.021554,0.010753, +3,3,0.064493,0.935507,0.999900,0.000100,1,1,0.010749,1.153582,0.032307,0.010753, +3,4,0.064493,0.935507,0.999900,0.000100,0,1,0.010749,1.153582,0.032355,0.000048, +3,5,0.064493,0.935507,0.999900,0.000100,1,1,0.010749,1.153582,0.043109,0.010753, +""" + +beagle_bwd_matrix_text_1 = """ +m,h,val, +,0,0.166667, +3,1,0.166667, +3,2,0.166667, +3,3,0.166667, +3,4,0.166667, +3,5,0.166667, +2,0,0.244614, +2,1,0.010772, +2,2,0.244614, +2,3,0.244614, +2,4,0.010772, +2,5,0.244614, +1,0,0.048055, +1,1,0.403891, +1,2,0.048055, +1,3,0.048055, +1,4,0.403891, +1,5,0.048055, +0,0,0.010754, +0,1,0.478491, +0,2,0.010754, +0,3,0.010754, +0,4,0.478491, +0,5,0.010754, +""" + + +def parse_matrix(csv_text): + # This returns a record array, which is essentially the same as a + # pandas dataframe, which we can access via df["m"] etc. + return np.lib.npyio.recfromcsv(io.StringIO(csv_text)) + + +def test_toy_example(): + ref_ts, query = get_toy_data() + print(list(ref_ts.haplotypes())) + print(ref_ts) + print(query) + tskit_fwd, tskit_bwd = get_tskit_forward_backward_matrices(ref_ts, query[0]) + beagle_fwd = parse_matrix(beagle_fwd_matrix_text_1) + beagle_bwd = parse_matrix(beagle_bwd_matrix_text_1) + print("Forward probability matrix") + print("tskit") + print(tskit_fwd) + print("beagle") + print(beagle_fwd["val"].reshape((4, 6))) + print("Backward probability matrix") + print("tskit") + print(tskit_bwd) + print("beagle") + print(beagle_bwd["val"].reshape((4, 6)))