diff --git a/python/tests/beagle_numba.py b/python/tests/beagle_numba.py new file mode 100644 index 0000000000..d7440061f6 --- /dev/null +++ b/python/tests/beagle_numba.py @@ -0,0 +1,1085 @@ +""" +Implementation of the BEAGLE 4.1 algorithm to impute alleles by linear interpolation +of state probabilities at ungenotyped positions in the hidden state probability matrix. + +This was implemented while closely consulting the BEAGLE 4.1 paper and source code: +* Browning & Browning (2016). Am J Hum Genet 98:116-126. doi:10.1016/j.ajhg.2015.11.020 +* Source code: https://faculty.washington.edu/browning/beagle/b4_1.html + +These notations are used throughout: +h = number of reference haplotypes. +m = number of genotyped positions. +x = number of ungenotyped positions. + +This implementation takes the following inputs: +* Panel of reference haplotypes in a matrix of size (m + x, h). +* Query haplotype in an array of size (m + x). +* Physical positions of all the markers in an array of size (m + x). +* Genetic map. + +In the query haplotype: +* Genotyped positions take values of 0, 1, 2, or 3 (i.e. ACGT encoding). +* Ungenotyped positions take -1. + +The following objects are computed: +* Forward and backward probability matrices of size (m, h). +* Hidden state probability matrix of size (m, h). +* Interpolated state probability matrix of size (x, h). +* Imputed allele probability matrix of size (x, 4), +* Imputed alleles as the maximum a posteriori alleles. + +The following evaluation metrics are produced in VCF format: +* Estimated allelic R-squared (AR2). +* Dosage R-squared (DR2). +* Estimated allele frequency (AF). +* Genotype probabilities of 00, 01/10, and 11 (GP). +* Estimated dosage (DS). + +To improve computational efficiency, BEAGLE uses aggregated markers, which are clusters +of markers within a 0.005 cM interval (default). Because the genotypes are phased, +the alleles in the aggregated markers form distinct "allele sequences". Below, we do not +use aggregated markers or allele sequences, which would complicate the implementation. + +Rather than exactly replicate the original BEAGLE algorithm, this implementation uses +Equation 1 of BB2016. +""" +import warnings +from dataclasses import dataclass + +import numpy as np +from numba import njit + +import _tskit +import tskit + + +__VERSION__ = "0.0.0" +__DATE__ = "20XXXXXX" + +_ACGT_ALLELES_INT = [0, 1, 2, 3, tskit.MISSING_DATA] + + +@dataclass(frozen=True) +class GeneticMap: + """ + A genetic map containing: + * Physical positions (bp). + * Genetic map positions (cM). + """ + + base_pos: np.ndarray + gen_pos: np.ndarray + + def __post_init__(self): + assert len(self.base_pos) == len( + self.gen_pos + ), "Lengths of physical positions and genetic map positions differ." + assert np.all( + self.base_pos[1:] > self.base_pos[:-1] + ), "Physical positions are not in strict ascending order." + + +@dataclass(frozen=True) +class ImpData: + """ + Imputation data containing: + * Individual names. + * Physical positions of imputed sites (bp). + * Designated REF allele at each site. + * Designated ALT allele at each site. + * Imputed alleles at each site. + * Imputed allele probabilities at each site. + + Assume that all the sites are biallelic. + + Let x = number of imputed sites and q = number of query haplotypes. + Since the query haplotypes are from diploid individuals, q is equal to + twice the number of individuals. + + Imputed alleles is a matrix of size (q, x). + Imputed allele probabilities is a matrix of size (q, x). + """ + + individual_names: list + site_pos: np.ndarray + refs: np.ndarray + alts: np.ndarray + alleles: np.ndarray + allele_probs: np.ndarray + + def __post_init__(self): + assert len(self.individual_names) > 0, "There must be at least one individual." + assert len(self.site_pos) > 0, "There must be at least one site." + assert self.alleles.shape[0] / 2 == len( + self.individual_names + ), "Number of query haplotypes is not equal to twice the number of individuals." + assert len(self.site_pos) == len( + self.alts + ), "Number of sites in refs is not equal to the number of site positions." + assert len(self.site_pos) == len( + self.refs + ), "Number of sites in alts != number of site positions." + assert ( + len(self.site_pos) == self.alleles.shape[1] + ), "Number of sites in alleles != number of site positions." + assert ( + self.alleles.shape == self.allele_probs.shape + ), "Dimensions of alleles != dimensions of allele probabilities." + for i in range(self.alleles.shape[1]): + assert np.all(np.isin(self.alleles[:, i], [self.refs[i], self.alts[i]])) + assert np.all( + np.isin(np.unique(self.refs), _ACGT_ALLELES_INT) + ), "Unrecognized alleles are in REF alleles." + assert np.all( + np.isin(np.unique(self.alts), _ACGT_ALLELES_INT) + ), "Unrecognized alleles are in ALT alleles." + assert np.all( + np.isin(np.unique(self.alleles), _ACGT_ALLELES_INT) + ), "Unrecognized alleles are in alleles." + assert ~np.array_equal(self.refs, self.alts), "Some REFs are identical to ALTs." + + @property + def num_sites(self): + return len(self.site_pos) + + @property + def num_samples(self): + return self.alleles.shape[0] + + @property + def num_individuals(self): + return len(self.individual_names) + + def get_ref_allele_at_site(self, i): + return self.refs[i] + + def get_alt_allele_at_site(self, i): + return self.alts[i] + + def get_alleles_at_site(self, i): + idx_hap1 = np.arange(0, self.num_samples, 2) + idx_hap2 = np.arange(1, self.num_samples, 2) + a1 = self.alleles[idx_hap1, i] + ap1 = self.allele_probs[idx_hap1, i] + a2 = self.alleles[idx_hap2, i] + ap2 = self.allele_probs[idx_hap2, i] + return a1, ap1, a2, ap2 + + +""" Helper functions. """ + + +def remap_alleles(a): + """ + Map an array of alleles encoded as characters to integers. + + :param np.ndarray a: Alleles. + :return: Recoded alleles. + :rtype: np.ndarray(dtype=np.int8) + """ + b = np.zeros(len(a), dtype=np.int8) - 1 # Encoded as missing by default. + for i in range(len(a)): + if a[i] in [None, ""]: + continue + try: + b[i] = tskit.ALLELES_ACGT.index(a[i]) + except ValueError as err: + err.args = (f"Allele {a[i]} is not recognised.",) + raise + return b + + +def check_data(ref_h, query_h): + """ + For each position, check whether the alleles in the query haplotype + are represented in the reference haplotypes. + + Missing data (i.e. -1) are ignored. + + :param numpy.ndarray ref_h: Reference haplotypes. + :param numpy.ndarray query_h: One query haplotype. + :return: True if alleles in query are in references at all the positions. + :rtype: bool + """ + m = ref_h.shape[0] # Number of genotyped positions. + assert m == len(query_h), "Reference and query haplotypes differ in length." + for i in range(m): + if query_h[i] != tskit.MISSING_DATA: + ref_a = np.unique(ref_h[i, :]) + if query_h[i] not in ref_a: + raise AssertionError( + f"Allele {query_h[i]} at the {i}-th position is not in reference." + ) + return True + + +def convert_to_genetic_map_positions(pos, *, genetic_map=None): + """ + Convert physical positions (bp) to genetic map positions (cM). + + In BEAGLE 4.1, when a genetic map is not specified, it is assumed + that the recombination rate is constant (1 cM / 1 Mb). + + If a genetic map is specified, then the genetic map positions are + either taken straight from it or interpolated using it. The genetic map + needs to contain physical positions and corresponding genetic map positions. + See `PlinkGenMap.java` in the BEAGLE 4.1 source code for details. + + :param numpy.ndarray pos: Physical positions (bp). + :param GeneticMap genetic_map: Genetic map. + :return: Genetic map positions (cM). + :rtype: numpy.ndarray + """ + # See 'cumPos' in 'ImputationData.java' in BEAGLE 4.1. + _MIN_CM_DIST = 1e-7 + if genetic_map is None: + return pos / 1e6 # 1 cM / 1 Mb + assert np.all(pos >= genetic_map.base_pos[0]) and np.all( + pos < genetic_map.base_pos[-1] + ), "Some physical positions are outside of genetic map." + # Approximate genetic map distances by linear interpolation. + # Note np.searchsorted(a, v, side='right') returns i s.t. a[i-1] <= v < a[i]. + right_idx = np.searchsorted(genetic_map.base_pos, pos, side="right") + est_cm = np.zeros(len(pos), dtype=np.float64) # BEAGLE 4.1 uses double in Java. + for i in range(len(pos)): + a = genetic_map.base_pos[right_idx[i] - 1] + b = genetic_map.base_pos[right_idx[i]] + fa = genetic_map.gen_pos[right_idx[i] - 1] + fb = genetic_map.gen_pos[right_idx[i]] + assert ( + pos[i] >= a + ), f"Query position is not >= left-bound position: {pos[i]}, {a}." + assert ( + fb >= fa + ), f"Genetic map positions are not monotonically ascending: {fb}, {fa}." + est_cm[i] = fa + est_cm[i] += (fb - fa) * (pos[i] - a) / (b - a) + # Ensure that adjacent positions are not identical in cM. + if i > 0: + if est_cm[i] - est_cm[i - 1] < _MIN_CM_DIST: + est_cm[i] = est_cm[i - 1] + _MIN_CM_DIST + return est_cm + + +""" Li & Stephens HMM. """ + + +def get_mismatch_probs(num_sites, error_rate): + """ + Compute mismatch probabilities at genotyped positions. + + Mutation rates should be dominated by the rate of allele error, + which should be the main source of mismatch between query and + reference haplotypes. + + In BEAGLE 4.1/5.4, error rate is 1e-4 by default, and capped at 0.5. + In IMPUTE5, the default value is also 1e-4. + + This corresponds to `mu` in `_tskit.LsHmm`. + + :param numpy.ndarray num_sites: Number of sites. + :param float error_rate: Allele error rate. + :return: Mismatch probabilities. + :rtype: numpy.ndarray + """ + MAX_ERROR_RATE = 0.50 + if error_rate >= MAX_ERROR_RATE: + error_rate = MAX_ERROR_RATE + mismatch_probs = np.zeros(num_sites, dtype=np.float64) + error_rate + return mismatch_probs + + +def get_transition_probs(cm, h, ne): + """ + Compute probabilities of transitioning to a random state at genotyped positions. + + In BEAGLE 4.1, the default value of `ne` is set to 1e6, + whereas in BEAGLE 5.4, the default value of `ne` is set to 1e5. + In BEAGLE 4.1/5.4, this value was optimized empirically on datasets + from large outbred human populations. + + In IMPUTE5, the default value of `ne` is set to 1e6. + + If `h` is small and `ne` is large, the transition probabilities are ~1. + In such cases, it may be desirable to set `ne` to a small value + to avoid switching between haplotypes too frequently. + + This corresponds to `rho` in `_tskit.LsHmm`. + + :param numpy.ndarray cm: Genetic map positions (cM). + :param int h: Number of reference haplotypes. + :param float ne: Effective population size. + :return: Transition probabilities. + :rtype: numpy.ndarray + """ + # E(number of crossover events) at first site is always 0. + r = np.zeros(len(cm), dtype=np.float64) + r[1:] = np.diff(cm) + c = -0.04 * (ne / h) + trans_probs = -1 * np.expm1(c * r) + return trans_probs + + +@njit +def compute_emission_probability(mismatch_prob, is_match, *, num_alleles=2): + """ + Compute the emission probability at a site based on whether the alleles + carried by a query haplotype and a reference haplotype match at the site. + + Emission probability may be scaled according to the number of distinct + segregating alleles. + + :param float mismatch_prob: Mismatch probability. + :param bool is_match: True if matched, otherwise False. + :param int num_alleles: Number of distinct alleles (default = 2). + :return: Emission probability. + :rtype: float + """ + em_prob = mismatch_prob + if is_match: + em_prob = 1.0 - (num_alleles - 1) * mismatch_prob + return em_prob + + +""" Replication of BEAGLE's implementation of the Li and Stephens HMM. """ + + +@njit +def compute_forward_matrix( + ref_h, query_h, trans_probs, mismatch_probs, *, num_alleles=2 +): + """ + Implement Li and Stephens forward algorithm. + + Reference haplotypes and query haplotype are subsetted to genotyped positions. + So, they are a matrix of size (m, h) and an array of size m, respectively. + + This computes a forward probablity matrix of size (m, h). + + :param numpy.ndarray ref_h: Reference haplotypes. + :param numpy.ndarray query_h: One query haplotype. + :param numpy.ndarray trans_probs: Transition probabilities. + :param numpy.ndarray mismatch_probs: Mismatch probabilities. + :param int num_alleles: Number of distinct alleles (default = 2). + :return: Forward probability matrix. + :rtype: numpy.ndarray + """ + h = ref_h.shape[1] # Number of reference haplotypes. + m = ref_h.shape[0] # Number of genotyped positions. + fwd_mat = np.zeros((m, h), dtype=np.float64) + last_sum = 1.0 # Normalization factor. + for i in range(m): + # Get site-specific parameters. + shift = trans_probs[i] / h + scale = (1 - trans_probs[i]) / last_sum + # Get allele at genotyped position i on query haplotype. + query_a = query_h[i] + for j in range(h): + # Get allele at genotyped position i on reference haplotype j. + ref_a = ref_h[i, j] + # Get emission probability. + em_prob = mismatch_probs[i] + if query_a == ref_a: + em_prob = 1.0 - (num_alleles - 1) * mismatch_probs[i] + fwd_mat[i, j] = em_prob + if i > 0: + fwd_mat[i, j] *= scale * fwd_mat[i - 1, j] + shift + site_sum = np.sum(fwd_mat[i, :]) + # Prior probabilities are multiplied when i = 0. + last_sum = site_sum / h if i == 0 else site_sum + return fwd_mat + + +@njit +def compute_backward_matrix( + ref_h, query_h, trans_probs, mismatch_probs, *, num_alleles=2 +): + """ + Implement Li and Stephens backward algorithm. + + Reference haplotypes and query haplotype are subsetted to genotyped positions. + So, they are a matrix of size (m, h) and an array of size m, respectively. + + This computes a backward probablity matrix of size (m, h). + + In BEAGLE 4.1, the values are kept one position at a time. Here, we keep the values + at all the positions. + + :param numpy.ndarray ref_h: Reference haplotypes. + :param numpy.ndarray query_h: One query haplotype. + :param numpy.ndarray trans_probs: Transition probabilities. + :param numpy.ndarray mismatch_probs: Mismatch probabilities. + :param int num_alleles: Number of distinct alleles (default = 2). + :return: Backward probability matrix. + :rtype: numpy.ndarray + """ + h = ref_h.shape[1] # Number of reference haplotypes. + m = ref_h.shape[0] # Number of genotyped positions. + bwd_mat = np.zeros((m, h), dtype=np.float64) + bwd_mat[-1, :] = 1.0 / h # Initialise the last column. + for i in range(m - 2, -1, -1): + iP1 = i + 1 + query_a = query_h[iP1] + for j in range(h): + ref_a = ref_h[iP1, j] + em_prob = mismatch_probs[iP1] + if query_a == ref_a: + em_prob = 1.0 - (num_alleles - 1) * mismatch_probs[iP1] + bwd_mat[iP1, j] *= em_prob + site_sum = np.sum(bwd_mat[iP1, :]) + scale = (1 - trans_probs[iP1]) / site_sum + shift = trans_probs[iP1] / h + bwd_mat[i, :] = scale * bwd_mat[iP1, :] + shift + return bwd_mat + + +def compute_state_prob_matrix(fwd_mat, bwd_mat): + """ + Implement Li and Stephens forward-backward algorithm. + + The forward and backward probability matrices are of size (m, h). + + This computes a hidden state probability matrix of size (m, h), + in which each element is the probability of copying from + a reference haplotype at a genotyped position. + + Implementing this is simpler than faithfully reproducing BEAGLE 4.1. + + :param numpy.ndarray fwd_mat: Forward probability matrix. + :param numpy.ndarray bwd_mat: Backward probability matrix. + :return: Hidden state probability matrix. + :rtype: numpy.ndarray + """ + assert ( + fwd_mat.shape == bwd_mat.shape + ), "Forward and backward matrices differ in shape." + state_mat = np.multiply(fwd_mat, bwd_mat) + # Normalise each column. + for i in range(len(state_mat)): + state_mat[i, :] /= np.sum(state_mat[i, :]) + return state_mat + + +""" Imputation of ungenotyped positions. """ + + +@njit +def get_weights(typed_pos, untyped_pos, typed_cm, untyped_cm): + """ + Compute weights for the ungenotyped positions in a query haplotype, which are used in + linear interpolation of hidden state probabilities at the ungenotyped positions. + + In BB2016 (see below Equation 1), a weight between genotyped positions m and (m + 1) + bounding ungenotyped position x is denoted lambda_m,x. + + lambda_m,x = [g(m + 1) - g(x)] / [g(m + 1) - g(m)], + where g(i) = genetic map position of marker i. + + This looks for the right-bounding position instead of the left-bounding. + + :param numpy.ndarray typed_pos: Physical positions of genotyped markers (bp). + :param numpy.ndarray untyped_pos: Physical positions of ungenotyped markers (bp). + :param numpy.ndarray typed_cm: Genetic map positions of genotyped markers (cM). + :param numpy.ndarray untyped_cm: Genetic map positions of ungenotyped markers (cM). + :return: Weights for ungenotyped positions and indices of right-bounding positions. + :rtype: tuple(numpy.ndarray, numpy.ndarray) + """ + m = len(typed_pos) # Number of genotyped positions. + x = len(untyped_pos) # Number of ungenotyped positions. + # Identify genotype positions (m - 1) and m bounding ungenotyped position i. + # Note np.searchsorted(a, v, side='right') returns i s.t. a[i-1] <= v < a[i]. + right_idx = np.searchsorted(typed_pos, untyped_pos, side="right") + # Calculate weights for ungenotyped positions. + weights = np.zeros(x, dtype=np.float64) + for i in range(len(right_idx)): + k = right_idx[i] + if k == 0: + # Left of the first genotyped position. + weights[i] = 1.0 + elif k == m: + # Right of the last genotyped position. + weights[i] = 0.0 + else: + # Between two genotyped positions. + cm_m2x = typed_cm[k] - untyped_cm[i] + # Avoid negative weights. + if cm_m2x < 0: + cm_m2x = 0 + cm_m2mM1 = typed_cm[k] - typed_cm[k - 1] + weights[i] = cm_m2x / cm_m2mM1 + return (weights, right_idx) + + +@njit +def interpolate_allele_probs( + state_mat, + ref_h, + pos_typed, + pos_untyped, + cm_typed, + cm_untyped, + *, + use_threshold=False, + return_weights=False, +): + """ + Interpolate allele probabilities at the ungenotyped positions of a query haplotype + following Equation 1 of BB2016. + + The interpolated allele probability matrix is of size (x, a), + where a is the number of alleles. + + Note that this function takes: + 1. Hidden state probability matrix at genotyped positions of size (m, h). + 2. Reference haplotypes subsetted to ungenotyped positions of size (x, h). + + If thresholding is employed, it replicates BEAGLE's way to approximate calculations. + See 'setFirstAlleleProbs', 'setAlleleProbs', and 'setLastAlleleProbs' + in 'LSHapBaum.java' in BEAGLE 4.1 source code. + + :param numpy.ndarray state_mat: State probability matrix at genotyped positions. + :param numpy.ndarray ref_h: Reference haplotypes subsetted to ungenotyped positions. + :param numpy.ndarray pos_typed: Physical positions of genotyped markers (bp). + :param numpy.ndarray pos_untyped: Physical positions of ungenotyped markers (bp). + :param numpy.ndarray cm_typed: Genetic map positions at genotyped markers (cM). + :param numpy.ndarray cm_untyped: Genetic map positions at ungenotyped markers (cM). + :param bool use_threshold: Set trivial probabilities to 0 if True (default = False). + :param bool return_weights: Return weights if True (default = False). + :return: Imputed allele probabilities and weights. + :rtype: tuple(numpy.ndarray, numpy.ndarray) + """ + # TODO: Allow for biallelic site matrix. Work with `_tskit.lshmm` properly. + alleles = np.arange(len(tskit.ALLELES_ACGT)) + m = state_mat.shape[0] # Number of genotyped positions. + x = ref_h.shape[0] # Number of ungenotyped positions. + # Set threshold to set trivial probabilities to zero. + _MIN_THRESHOLD = 0 + weights, right_idx = get_weights(pos_typed, pos_untyped, cm_typed, cm_untyped) + probs = np.zeros((x, len(alleles)), dtype=np.float64) + for i in range(x): + k = right_idx[i] + w = weights[i] + for a in alleles: + is_a_in_ref_h = ref_h[i, :] == a + if np.sum(is_a_in_ref_h) == 0: + # This avoids division by zero when getting a threshold adaptively below. + continue + if use_threshold: + # TODO: Check whether this is implemented correctly. Not used by default. + # Threshold based on "the number of subsets in the partition Am of H". + threshold_Am = 1 / np.sum(is_a_in_ref_h) + _MIN_THRESHOLD = min(0.005, threshold_Am) + if k == 0: + # See 'setFirstAlleleProbs' in 'LSHapBaum.java'. + assert w == 1.0, "Weight should be 1.0." + sum_probs_a_k = np.sum(state_mat[k, is_a_in_ref_h]) + if sum_probs_a_k > _MIN_THRESHOLD: + probs[i, a] += sum_probs_a_k + elif k == m: + # See 'setLastAlleleProbs' in 'LSHapBaum.java'. + assert w == 0.0, "Weight should be 0.0." + sum_probs_a_kM1 = np.sum(state_mat[k - 1, is_a_in_ref_h]) + if sum_probs_a_kM1 > _MIN_THRESHOLD: + probs[i, a] += sum_probs_a_kM1 + else: + # See 'setAlleleProbs' in 'LSHapBaum.java'. + sum_probs_a_k = np.sum(state_mat[k, is_a_in_ref_h]) + sum_probs_a_kM1 = np.sum(state_mat[k - 1, is_a_in_ref_h]) + if max(sum_probs_a_k, sum_probs_a_kM1) > _MIN_THRESHOLD: + probs[i, a] += w * sum_probs_a_kM1 + probs[i, a] += (1 - w) * sum_probs_a_k + site_sums = np.sum(probs, axis=1) + assert np.all(site_sums > 0), "Some site sums of allele probabilities is <= 0." + probs_rescaled = probs / site_sums[:, np.newaxis] + if return_weights: + return (probs_rescaled, weights) + return (probs_rescaled, None) + + +def get_map_alleles(allele_probs): + """ + Compute maximum a posteriori (MAP) alleles at the ungenotyped positions + of a query haplotype, based on posterior marginal allele probabilities. + + The imputed alleles and their probabilities are arrays of size x. + + WARN: If the allele probabilities are equal, then allele 0 is arbitrarily chosen. + TODO: Investigate how often this happens and the effect of this arbitrary choice. + + :param numpy.ndarray allele_probs: Imputed allele probabilities. + :return: Imputed alleles and their probabilities. + :rtype: tuple(numpy.ndarray, numpy.ndarray) + """ + imputed_alleles = np.argmax(allele_probs, axis=1) + max_allele_probs = np.max(allele_probs, axis=1) + return (imputed_alleles, max_allele_probs) + + +def run_interpolation_beagle( + ref_h, + query_h, + pos_all, + *, + ne=1e6, + error_rate=1e-4, + genetic_map=None, + use_threshold=False, +): + """ + Perform a simplified version of the procedure of interpolation-style imputation + based on Equation 1 of BB2016. + + Reference haplotypes and query haplotype are of size (m + x, h) and (m + x). + + The physical positions of all the markers are an array of size (m + x). + + This produces imputed alleles and their probabilities at the ungenotyped positions + of the query haplotype. + + The default values of `ne` and `error_rate` are taken from BEAGLE 4.1, not 5.4. + In BEAGLE 5.4, the default value of `ne` is 1e5, and `error_rate` is data-dependent. + + :param numpy.ndarray ref_h: Reference haplotypes. + :param numpy.ndarray query_h: One query haplotype. + :param numpy.ndarray pos_all: Physical positions of all the markers (bp). + :param int ne: Effective population size (default = 1e6). + :param float error_rate: Allele error rate (default = 1e-4). + :param GeneticMap genetic_map: Genetic map (default = None). + :param bool use_threshold: Set trivial probabilities to 0 if True (default = False). + :return: Imputed alleles and their probabilities. + :rtype: tuple(numpy.ndarray, numpy.ndarray) + """ + warnings.warn("This function is experimental and not fully tested.", stacklevel=1) + warnings.warn( + "Check the reference and query haplotypes use the same allele encoding.", + stacklevel=1, + ) + num_alleles = len(tskit.ALLELES_ACGT) + h = ref_h.shape[1] # Number of reference haplotypes. + # Separate indices of genotyped and ungenotyped positions. + idx_typed = np.where(query_h != tskit.MISSING_DATA)[0] + idx_untyped = np.where(query_h == tskit.MISSING_DATA)[0] + # Get physical positions of of genotyped and ungenotyped markers. + pos_typed = pos_all[idx_typed] + pos_untyped = pos_all[idx_untyped] + # Get genetic map positions of of genotyped and ungenotyped markers. + cm_typed = convert_to_genetic_map_positions(pos_typed, genetic_map=genetic_map) + cm_untyped = convert_to_genetic_map_positions(pos_untyped, genetic_map=genetic_map) + # Get HMM probabilities at genotyped positions. + trans_probs = get_transition_probs(cm_typed, h=h, ne=ne) + mismatch_probs = get_mismatch_probs(len(pos_typed), error_rate=error_rate) + # Subset haplotypes. + ref_h_typed = ref_h[idx_typed, :] + ref_h_untyped = ref_h[idx_untyped, :] + query_h_typed = query_h[idx_typed] + # Compute matrices at genotyped positions. + fwd_mat = compute_forward_matrix( + ref_h_typed, + query_h_typed, + trans_probs, + mismatch_probs, + num_alleles=num_alleles, + ) + bwd_mat = compute_backward_matrix( + ref_h_typed, + query_h_typed, + trans_probs, + mismatch_probs, + num_alleles=num_alleles, + ) + state_mat = compute_state_prob_matrix(fwd_mat, bwd_mat) + # Interpolate allele probabilities. + imputed_allele_probs, _ = interpolate_allele_probs( + state_mat=state_mat, + ref_h=ref_h_untyped, + pos_typed=pos_typed, + pos_untyped=pos_untyped, + cm_typed=cm_typed, + cm_untyped=cm_untyped, + use_threshold=use_threshold, + return_weights=False, + ) + imputed_alleles, max_allele_probs = get_map_alleles(imputed_allele_probs) + return (imputed_alleles, max_allele_probs) + + +def run_tsimpute( + ref_ts, + query_h, + pos_all, + *, + ne=1e6, + error_rate=1e-4, + precision=10, + genetic_map=None, + use_threshold=False, +): + """ + Perform interpolation-style imputation, except that the forward and backward + probability matrices are computed from a tree sequence. + + Reference haplotypes and query haplotype are of size (m + x, h) and (m + x). + + The physical positions of all the markers are an array of size (m + x). + + This produces imputed alleles and their probabilities at the ungenotyped positions + of the query haplotype. + + The default values for `ne` and `error_rate` are taken from BEAGLE 4.1. + + In an analysis comparing imputation accuracy from precision 6 to 24 + using the FinnGen SiSu dataset (~1k genotyped positions in query haplotypes), + accuracy was highly similar from 8 to 24 and only drastically worsened at 6. + Also, in an informal benchmark experiment, the runtime per query haplotype + improved ~8x, going from precision 22 to 8. This indicates that there is + a large boost in speed with very little loss in accuracy when precision is 8. + To be on the safe side, the default value of precision is set to 10. + + Note that BEAGLE 4.1 uses Java float (32-bit) when calculating + the forward, backward, and hidden state probability matrices. + + TODO: Handle `acgt_alleles` properly. + + :param numpy.ndarray ref_ts: Tree sequence with reference haplotypes. + :param numpy.ndarray query_h: One query haplotype. + :param numpy.ndarray pos_all: Physical positions of all the markers (bp). + :param int ne: Effective population size (default = 1e6). + :param float error_rate: Allelic error rate (default = 1e-4). + :param int precision: Precision for running LS HMM (default = 10). + :param GeneticMap genetic_map: Genetic map (default = None). + :param bool use_threshold: Set trivial probabilities to 0 if True (default = False). + :return: Imputed alleles and their probabilities. + :rtype: tuple(numpy.ndarray, numpy.ndarray) + """ + warnings.warn( + "Check the reference and query haplotypes use the same allele encoding.", + stacklevel=1, + ) + h = ref_ts.num_samples # Number of reference haplotypes. + # Separate indices of genotyped and ungenotyped positions. + idx_typed = np.where(query_h != tskit.MISSING_DATA)[0] + idx_untyped = np.where(query_h == tskit.MISSING_DATA)[0] + # Set physical positions of genotyped and ungenotyped markers. + pos_typed = pos_all[idx_typed] + pos_untyped = pos_all[idx_untyped] + # Get genetic map positions of genotyped and ungenotyped markers. + cm_typed = convert_to_genetic_map_positions(pos_typed, genetic_map=genetic_map) + cm_untyped = convert_to_genetic_map_positions(pos_untyped, genetic_map=genetic_map) + # Get HMM probabilities at genotyped positions. + trans_probs = get_transition_probs(cm_typed, h=h, ne=ne) + mismatch_probs = get_mismatch_probs(len(pos_typed), error_rate=error_rate) + # Subset haplotypes. + ref_ts_typed = ref_ts.delete_sites(site_ids=idx_untyped) + ref_ts_untyped = ref_ts.delete_sites(site_ids=idx_typed) + ref_h_untyped = ref_ts_untyped.genotype_matrix(alleles=tskit.ALLELES_ACGT) + query_h_typed = query_h[idx_typed] + # Get matrices at genotyped positions from tree sequence. + fwd_mat = _tskit.CompressedMatrix(ref_ts_typed._ll_tree_sequence) + bwd_mat = _tskit.CompressedMatrix(ref_ts_typed._ll_tree_sequence) + # WARN: Be careful with argument `acgt_alleles`!!! + ls_hmm = _tskit.LsHmm( + ref_ts_typed._ll_tree_sequence, + recombination_rate=trans_probs, # Transition probabilities. + mutation_rate=mismatch_probs, # Mismatch probabilities. + acgt_alleles=True, # TODO: Handle allele encoding properly. + precision=precision, + ) + ls_hmm.forward_matrix(query_h_typed.T, fwd_mat) + ls_hmm.backward_matrix(query_h_typed.T, fwd_mat.normalisation_factor, bwd_mat) + # TODO: Check that these state probabilities align. + state_mat = state_mat = np.multiply(fwd_mat.decode(), bwd_mat.decode()) + # Interpolate allele probabilities. + imputed_allele_probs, _ = interpolate_allele_probs( + state_mat=state_mat, + ref_h=ref_h_untyped, + pos_typed=pos_typed, + pos_untyped=pos_untyped, + cm_typed=cm_typed, + cm_untyped=cm_untyped, + use_threshold=use_threshold, + return_weights=False, + ) + imputed_alleles, max_allele_probs = get_map_alleles(imputed_allele_probs) + return (imputed_alleles, max_allele_probs) + + +""" Evaluation metrics and printing of results. """ + + +# Individual-level data. +def compute_individual_scores( + alleles_1, allele_probs_1, alleles_2, allele_probs_2, ref +): + """ + Compute genotype probabilities and allele dosages of diploid individuals + at a position based on posterior marginal allele probabilities. + + Assume that all sites are biallelic. Otherwise, the calculation below is incorrect. + Note 0 refers to the REF allele and 1 the ALT allele. + + Unphased genotype (or dosage) probabilities are: P(RR), P(RA or AR), P(AA). + Dosages of the ALT allele are: RR = 0, RA or AR = 1, AA = 2. + + In BEAGLE 4.1 output, + GP: "Estimated Genotype Probability", and + DS: "Estimated ALT dose [P(RA) + P(AA)]". + + :param numpy.ndarray alleles_1: Imputed alleles for haplotype 1. + :param numpy.ndarray allele_probs_1: Imputed allele probabilities for haplotype 1. + :param numpy.ndarray alleles_2: Imputed alleles for haplotype 2. + :param numpy.ndarray allele_probs_2: Imputed allele probabilities for haplotype 2. + :param int ref: Specified REF allele (ACGT encoding). + :return: Dosage probabilities and dosage scores. + :rtype: tuple(numpy.ndarray, numpy.ndarray) + """ + n = len(alleles_1) # Number of individuals. + assert len(alleles_2) == n, "Lengths of alleles differ." + assert n > 0, "There must be at least one individual." + assert len(allele_probs_1) == n, "Lengths of alleles and probabilities differ." + assert len(allele_probs_2) == n, "Lengths of alleles and probabilities differ." + dosage_probs = np.zeros((n, 3), dtype=np.float64) + dosage_scores = np.zeros(n, dtype=np.float64) + for i in range(n): + ap_hap1_ref = ( + allele_probs_1[i] if alleles_1[i] == ref else 1 - allele_probs_1[i] + ) + ap_hap1_alt = 1 - ap_hap1_ref + ap_hap2_ref = ( + allele_probs_2[i] if alleles_2[i] == ref else 1 - allele_probs_2[i] + ) + ap_hap2_alt = 1 - ap_hap2_ref + dosage_probs[i, 0] = ap_hap1_ref * ap_hap2_ref # P(RR) + dosage_probs[i, 1] = ap_hap1_ref * ap_hap2_alt # P(RA) + dosage_probs[i, 1] += ap_hap1_alt * ap_hap2_ref # P(AR) + dosage_probs[i, 2] = ap_hap1_alt * ap_hap2_alt # P(AA) + dosage_scores[i] = dosage_probs[i, 1] + 2 * dosage_probs[i, 2] + return (dosage_probs, dosage_scores) + + +# Site-level data. +def compute_allelic_r_squared(dosage_probs): + """ + Compute the estimated allelic R^2 at a position from the unphased genotype + (or dosage) probabilities of a set of diploid individuals. + + Assume that site is biallelic. Otherwise, the calculation below is incorrect. + Note that 0 refers to REF allele and 1 the ALT allele. + + It is not the true allelic R^2, which needs access to true genotypes to compute. + The true allelic R^s is the squared correlation between true and imputed ALT dosages. + It has been shown the true and estimated allelic R-squared are highly correlated. + + In BEAGLE 4.1, it is AR2: "Allelic R-Squared: estimated squared correlation + between most probable REF dose and true REF dose". + See `allelicR2` in `R2Estimator.java` of the BEAGLE 4.1 source code. + + See the formulation in the Appendix 1 of Browning & Browning (2009). + Am J Hum Genet. 84(2): 210–223. doi: 10.1016/j.ajhg.2009.01.005. + + :return: Dosage probabilities and dosage scores. + :return: Estimated allelic R-squared. + :rtype: float + """ + _MIN_R2_DEN = 1e-8 + n = len(dosage_probs) # Number of individuals. + assert n > 0, "There must be at least one individual." + assert dosage_probs.shape[1] == 3, "Three genotypes are considered." + f = 1 / n + z = np.argmax(dosage_probs, axis=1) # Most likely imputed dosage. + u = dosage_probs[:, 1] + 2 * dosage_probs[:, 2] # E[X | y_i] + w = dosage_probs[:, 1] + 4 * dosage_probs[:, 2] # E[X^2 | y_i] + cov = np.sum(z * u) - np.sum(z) * np.sum(u) * f + var_best = np.sum(z**2) - np.sum(z) ** 2 * f + var_exp = np.sum(w) - np.sum(u) ** 2 * f + den = var_best * var_exp + # Minimum of allelic R^2 is zero. + allelic_rsq = 0 if den < _MIN_R2_DEN else cov**2 / den + return allelic_rsq + + +def compute_dosage_r_squared(dosage_probs): + """ + Compute the dosage R^2 for a position from the unphased genotype (or dosage) + probabilities of a set of diploid individuals. + + Assume that site is biallelic. Otherwise, the calculation below is incorrect. + Note that 0 refers to REF allele and 1 the ALT allele. + + In BEAGLE 4.1, DR2: "Dosage R-Squared: estimated squared correlation + between estimated REF dose [P(RA) + 2 * P(RR)] and true REF dose". + See `doseR2` in `R2Estimator.java` of the BEAGLE 4.1 source code. + + :return: Dosage probabilities and dosage scores. + :return: Dosage R-squared. + :rtype: float + """ + _MIN_R2_DEN = 1e-8 + n = len(dosage_probs) # Number of individuals. + assert n > 0, "There must be at least one individual." + assert dosage_probs.shape[1] == 3, "Three genotypes are considered." + f = 1 / n + u = dosage_probs[:, 1] + 2 * dosage_probs[:, 2] # E[X | y_i]. + w = dosage_probs[:, 1] + 4 * dosage_probs[:, 2] # E[X^2 | y_i]. + c = np.sum(u) ** 2 * f + num = np.sum(u**2) - c + if num < 0: + num = 0 + den = np.sum(w) - c + dosage_rsq = 0 if den < _MIN_R2_DEN else num / den + return dosage_rsq + + +def compute_allele_frequency( + alleles_1, + allele_probs_1, + alleles_2, + allele_probs_2, + allele, +): + """ + Estimate the frequency of a specified allele at a position from allele probabilities + of a set of diploid individuals. + + Assume that site is biallelic. Otherwise, the calculation below is incorrect. + + Input are the imputed alleles and their probabilities at a position. + + In BEAGLE 4.1, AF: "Estimated ALT Allele Frequencies". + See `printInfo` in `VcfRecBuilder.java` of the BEAGLE 4.1 source code. + + See the note in "Standardized Allele-Frequency Error" in Browning & Browning (2009). + Am J Hum Genet. 84(2): 210–223. doi: 10.1016/j.ajhg.2009.01.005. + + :param numpy.ndarray alleles_1: Imputed alleles for haplotype 1. + :param numpy.ndarray allele_probs_1: Imputed allele probabilities for haplotype 1. + :param numpy.ndarray alleles_2: Imputed alleles for haplotype 2. + :param numpy.ndarray allele_probs_2: Imputed allele probabilities for haplotype 2. + :param int allele: Specified allele (ACGT encoding). + :return: Estimated allele frequency. + :rtype: float + """ + n = len(alleles_1) # Number of individuals. + assert len(alleles_2) == n, "Lengths of alleles differ." + assert n > 0, "There must be at least one individual." + assert len(allele_probs_1) == n, "Lengths of alleles and probabilities differ." + assert len(allele_probs_2) == n, "Lengths of alleles and probabilities differ." + cum_ap_hap1 = np.sum(allele_probs_1[alleles_1 == allele]) + cum_ap_hap2 = np.sum(allele_probs_2[alleles_2 == allele]) + # See `printInfo` in `VcfRecBuilder.java` in BEAGLE 4.1 source code. + est_af = (cum_ap_hap1 + cum_ap_hap2) / (2 * n) + return est_af + + +def write_vcf(impdata, out_file, *, chr_name="1", print_gp=False, decimals=2): + """ + Print imputation results in VCF format, following the output of BEAGLE 4.1. + + TODO: Print VCF records for genotyped sites. + + :param ImpData impdata: Object containing imputation data. + :param str out_file: Path to output VCF file. + :param str chr_name: Chromosome name (default = "1"). + :param bool print_gp: Print genotype probabilities if True (default = False). + :param int decimals: Number of decimal places to print (default = 2). + :return: None + :rtype: None + """ + _HEADER = [ + "##fileformat=VCFv4.2", + f"##filedata={__DATE__}", + f"##source=tsimpute (version {__VERSION__})", + "##INFO=', + "##INFO=', + "##INFO=', + '##INFO=', + '##FORMAT=', + "##FORMAT=', + "##FORMAT=', + "##FORMAT=', + ] + _COL_NAMES = [ + "CHROM", + "POS", + "ID", + "REF", + "ALT", + "QUAL", + "FILTER", + "INFO", + "FORMAT", + ] + with open(out_file, "w") as f: + # Add header with metadata and definitions. + for line in _HEADER: + f.write(line + "\n") + # Add column names. + col_str = "#" + col_str += "\t".join(_COL_NAMES) + col_str += "\t" + col_str += "\t".join(impdata.individual_names) + f.write(col_str + "\n") + # Add VCF records. + is_imputed = True + for i in range(impdata.num_sites): + a1, ap1, a2, ap2 = impdata.get_alleles_at_site(i) + gt_probs, dosages = compute_individual_scores(a1, ap1, a2, ap2) + line_str = chr_name + line_str += "\t" + line_str += str(int(impdata.site_pos[i])) + line_str += "\t" + line_str += str(i) + line_str += "\t" + REF = impdata.get_ref_allele_at_site(i) + ALT = impdata.get_alt_allele_at_site(i) + line_str += tskit.ALLELES_ACGT[REF] + line_str += "\t" + line_str += tskit.ALLELES_ACGT[ALT] + line_str += "\t" + # QUAL field + # '.' denotes missing. + line_str += "." + line_str += "\t" + # FILTER field + line_str += "PASS" + line_str += "\t" + # INFO field + ar2 = compute_allelic_r_squared(gt_probs) + dr2 = compute_dosage_r_squared(gt_probs) + af = compute_allele_frequency(a1, ap1, a2, ap2, allele=1) + ar2 = round(ar2, decimals) + dr2 = round(dr2, decimals) + af = round(af, decimals) + info_str = f"AR2={ar2};DR2={dr2};AF={af}" + if is_imputed: + info_str += ";" + "IMP" + line_str += info_str + line_str += "\t" + # FORMAT field + line_str += "GT:DS" + if print_gp: + line_str += ":" + "GP" + line_str += "\t" + # DATA fields + data_str = "" + for j in range(impdata.num_individuals): + gt_a1 = "0" if a1[j] == REF else "1" + gt_a2 = "0" if a2[j] == REF else "1" + data_str += gt_a1 + "|" + gt_a2 + ":" + data_str += str(round(dosages[j], decimals)) + if print_gp: + data_str += ":" + data_str += ",".join( + [str(round(gt_probs[j, k], decimals)) for k in range(3)] + ) + if j < impdata.num_individuals - 1: + data_str += "\t" + line_str += data_str + f.write(line_str + "\n") diff --git a/python/tests/test_imputation.py b/python/tests/test_imputation.py new file mode 100644 index 0000000000..0f806243d6 --- /dev/null +++ b/python/tests/test_imputation.py @@ -0,0 +1,718 @@ +""" +Tests for interpolation-style genotype imputation. +""" +import io + +import numpy as np +import pytest + +import tests.beagle_numba +import tskit + + +# The following toy query haplotypes were imputed from the toy reference haplotypes +# using BEAGLE 4.1 with Ne set to 10. +# +# There are 9 sites, starting from 10,000 to 90,000, with 10,000 increments. +# The REF is A and ALT is C for all the sites in the VCF input to BEAGLE. +# The ancestral state is A and derived state is C for all the sites. +# In this setup, A is encoded as 0 and C as 1 whether the allele encoding is +# ancestral/derived (0/1) or ACGT (0123). +# +# Case 0: +# Reference haplotypes and query haplotypes have 0|0 at all sites. +# First and last reference markers are missing in the query haplotypes. +# fmt: off +toy_ref_0 = np.array( + [ + [0, 0, 0, 0, 0, 0, 0, 0, 0], # ref_0, haplotype 0 + [0, 0, 0, 0, 0, 0, 0, 0, 0], # ref_0, haplotype 1 + ], dtype=np.int32 +).T +toy_query_0 = np.array( + [ + [-1, 0, -1, 0, -1, 0, -1, 0, -1], # query_0, haplotype 0 + [-1, 0, -1, 0, -1, 0, -1, 0, -1], # query_0, haplotype 1 + ], dtype=np.int32 +) +toy_query_0_beagle_imputed = np.array( + [ + [0, 0, 0, 0, 0, 0, 0, 0, 0], # query_0, haplotype 0 + [0, 0, 0, 0, 0, 0, 0, 0, 0], # query_0, haplotype 1 + ], dtype=np.int32 +) +toy_query_0_beagle_allele_probs = np.array( + # Ungenotyped markers only + [ + # query_0, haplotype 0 + [ + [1.0, 1.0, 1.0, 1.0, 1.0], + [0.0, 0.0, 0.0, 0.0, 0.0], + ], + # query_0, haplotype 1 + [ + [1.0, 1.0, 1.0, 1.0, 1.0], + [0.0, 0.0, 0.0, 0.0, 0.0], + ], + ], + dtype=np.float32, +) +# fmt: on +assert toy_ref_0.shape == (9, 2) +assert toy_query_0.shape == (2, 9) +assert toy_query_0_beagle_imputed.shape == (2, 9) +assert np.sum(toy_query_0[0] == 0) == 4 +assert np.sum(toy_query_0[0] == -1) == 5 +assert np.array_equal(toy_query_0[0], toy_query_0[1]) +assert np.all(toy_query_0_beagle_imputed[0] == 0) +assert np.all(toy_query_0_beagle_imputed[1] == 0) +assert toy_query_0_beagle_allele_probs.shape == (2, 2, 5) + +# Case 1: +# Reference haplotypes and query haplotypes have 1|1 at all sites. +# First and last reference markers are missing in the query haplotypes. +# fmt: off +toy_ref_1 = np.array( + [ + [1, 1, 1, 1, 1, 1, 1, 1, 1], # ref_0, haplotype 0 + [1, 1, 1, 1, 1, 1, 1, 1, 1], # ref_0, haplotype 1 + ], dtype=np.int32 +).T +toy_query_1 = np.array( + [ + [-1, 1, -1, 1, -1, 1, -1, 1, -1], # query_0, haplotype 0 + [-1, 1, -1, 1, -1, 1, -1, 1, -1], # query_0, haplotype 1 + ], dtype=np.int32 +) +toy_query_1_beagle_imputed = np.array( + [ + [1, 1, 1, 1, 1, 1, 1, 1, 1], # query_0, haplotype 0 + [1, 1, 1, 1, 1, 1, 1, 1, 1], # query_0, haplotype 1 + ], dtype=np.int32 +) +toy_query_1_beagle_allele_probs = np.array( + # Ungenotyped markers only + [ + # query_0, haplotype 0 + [ + [0.0, 0.0, 0.0, 0.0, 0.0], + [1.0, 1.0, 1.0, 1.0, 1.0], + ], + # query_0, haplotype 1 + [ + [0.0, 0.0, 0.0, 0.0, 0.0], + [1.0, 1.0, 1.0, 1.0, 1.0], + ], + ], + dtype=np.float32, +) +# fmt: on +assert toy_ref_1.shape == (9, 2) +assert toy_query_1.shape == (2, 9) +assert toy_query_1_beagle_imputed.shape == (2, 9) +assert np.sum(toy_query_1[0] == 1) == 4 +assert np.sum(toy_query_1[0] == -1) == 5 +assert np.array_equal(toy_query_1[0], toy_query_1[1]) +assert np.all(toy_query_1_beagle_imputed[0] == 1) +assert np.all(toy_query_1_beagle_imputed[1] == 1) +assert toy_query_1_beagle_allele_probs.shape == (2, 2, 5) + +# Case 2: +# Reference haplotypes and query haplotypes have 0|0 at all sites. +# First and last reference markers are genotyped in the query haplotypes. +# fmt: off +toy_ref_2 = np.copy(toy_ref_0) +toy_query_2 = np.array( + [ + [0, 0, -1, 0, -1, 0, -1, 0, 0], # query_0, haplotype 0 + [0, 0, -1, 0, -1, 0, -1, 0, 0], # query_0, haplotype 1 + ], dtype=np.int32 +) +toy_query_2_beagle_imputed = np.array( + [ + [0, 0, 0, 0, 0, 0, 0, 0, 0], # query_0, haplotype 0 + [0, 0, 0, 0, 0, 0, 0, 0, 0], # query_0, haplotype 1 + ], dtype=np.int32 +) +toy_query_2_beagle_allele_probs = np.array( + # Ungenotyped markers only + [ + # query_0, haplotype 0 + [ + [1.0, 1.0, 1.0], + [0.0, 0.0, 0.0], + ], + # query_0, haplotype 1 + [ + [1.0, 1.0, 1.0], + [0.0, 0.0, 0.0], + ], + ], + dtype=np.float32, +) +# fmt: on +assert toy_ref_2.shape == (9, 2) +assert toy_query_2.shape == (2, 9) +assert toy_query_2_beagle_imputed.shape == (2, 9) +assert np.sum(toy_query_2[0] == 0) == 6 +assert np.sum(toy_query_2[0] == -1) == 3 +assert np.array_equal(toy_query_2[0], toy_query_2[1]) +assert np.all(toy_query_2_beagle_imputed[0] == 0) +assert np.all(toy_query_2_beagle_imputed[1] == 0) +assert toy_query_2_beagle_allele_probs.shape == (2, 2, 3) + +# Case 3: +# Reference haplotypes and query haplotypes have 0|1 at all sites. +# fmt: off +toy_ref_3 = np.array( + [ + [0, 0, 0, 0, 0, 0, 0, 0, 0], # ref_0, haplotype 0 + [1, 1, 1, 1, 1, 1, 1, 1, 1], # ref_0, haplotype 1 + ], dtype=np.int32 +).T +toy_query_3 = np.array( + [ + [-1, 0, -1, 0, -1, 0, -1, 0, -1], # query_0, haplotype 0 + [-1, 1, -1, 1, -1, 1, -1, 1, -1], # query_0, haplotype 1 + ], dtype=np.int32 +) +toy_query_3_beagle_imputed = np.array( + [ + [0, 0, 0, 0, 0, 0, 0, 0, 0], # query_0, haplotype 0 + [1, 1, 1, 1, 1, 1, 1, 1, 1], # query_0, haplotype 1 + ], dtype=np.int32 +) +toy_query_3_beagle_allele_probs = np.array( + # Ungenotyped markers only + # See comment https://github.com/tskit-dev/tskit/pull/2815#issuecomment-1708178257 + [ + # query_0, haplotype 0 + [ + [0.9999998, 0.9999999, 1.0, 0.9999999, 0.9999998], + [0.0 , 0.0 , 0.0, 0.0 , 0.0], + ], + # query_0, haplotype 1 + [ + [0.0 , 0.0 , 0.0, 0.0 , 0.0], + [0.9999998, 0.9999999, 1.0, 0.9999999, 0.9999998], + ], + ], + dtype=np.float32, +) +# fmt: on +assert toy_ref_3.shape == (9, 2) +assert toy_query_3.shape == (2, 9) +assert toy_query_3_beagle_imputed.shape == (2, 9) +assert np.sum(toy_query_3[0] == 0) == 4 +assert np.sum(toy_query_3[1] == 1) == 4 +assert np.sum(toy_query_3[0] == -1) == np.sum(toy_query_3[1] == -1) == 5 +assert np.all( + np.invert(np.equal(toy_query_3_beagle_imputed[0], toy_query_3_beagle_imputed[1])) +) +assert toy_query_3_beagle_allele_probs.shape == (2, 2, 5) + +# Case 4: +# Reference haplotypes and query haplotypes have alternating 0|1 and 1|0 genotypes. +# Query haplotypes have 0|1 at all genotyped sites. +# fmt: off +toy_ref_4 = np.array( + [ + [0, 1, 0, 1, 0, 1, 0, 1, 0], # ref_0, haplotype 0 + [1, 0, 1, 0, 1, 0, 1, 0, 1], # ref_0, haplotype 1 + ], dtype=np.int32 +).T +toy_query_4 = np.array( + [ + [-1, 0, -1, 0, -1, 0, -1, 0, -1], # query_0, haplotype 0 + [-1, 1, -1, 1, -1, 1, -1, 1, -1], # query_0, haplotype 1 + ], dtype=np.int32 +) +toy_query_4_beagle_imputed = np.array( + [ + [1, 0, 1, 0, 1, 0, 1, 0, 1], # query_0, haplotype 0 + [0, 1, 0, 1, 0, 1, 0, 1, 0], # query_0, haplotype 1 + ], dtype=np.int32 +) +toy_query_4_beagle_allele_probs = np.array( + # Ungenotyped markers only + [ + # query_0, haplotype 0 + [ + [0.0 , 0.0 , 0.0, 0.0 , 0.9999998], + [0.9999998, 0.9999999, 1.0, 0.9999999, 0.0], + ], + # query_0, haplotype 1 + [ + [0.9999998, 0.9999999, 1.0, 0.9999999, 0.0], + [0.0 , 0.0 , 0.0, 0.0 , 0.9999998], + ], + ], + dtype=np.float32, +) +# fmt: on +assert toy_ref_4.shape == (9, 2) +assert toy_query_4.shape == (2, 9) +assert toy_query_4_beagle_imputed.shape == (2, 9) +assert np.sum(toy_query_4[0] == 0) == 4 +assert np.sum(toy_query_4[1] == 1) == 4 +assert np.sum(toy_query_4[0] == -1) == np.sum(toy_query_4[1] == -1) == 5 +assert np.all( + np.invert(np.equal(toy_query_4_beagle_imputed[0], toy_query_4_beagle_imputed[1])) +) +assert toy_query_3_beagle_allele_probs.shape == (2, 2, 5) + +# Case 5: +# The reference panel has two individuals. The first individual has 0|0 at all sites, +# and the second individual has 1|1 at all sites. +# The query haplotypes have one recombination breakpoint in the middle. +# fmt: off +toy_ref_5 = np.array( + [ + [0, 0, 0, 0, 0, 0, 0, 0, 0], # ref_0 + [0, 0, 0, 0, 0, 0, 0, 0, 0], # ref_0 + [1, 1, 1, 1, 1, 1, 1, 1, 1], # ref_1 + [1, 1, 1, 1, 1, 1, 1, 1, 1], # ref_1 + ], dtype=np.int32 +).T +toy_query_5 = np.array( + [ + [-1, 0, -1, 0, -1, 0, -1, 1, -1], # query_0, haplotype 0 + [-1, 1, -1, 1, -1, 1, -1, 0, -1], # query_0, haplotype 1 + ], dtype=np.int32 +) +toy_query_5_beagle_imputed = np.array( + [ + [0, 0, 0, 0, 0, 0, 0, 1, 1], # query_0, haplotype 0 + [1, 1, 1, 1, 1, 1, 1, 0, 0], # query_0, haplotype 1 + ], dtype=np.int32 +) +toy_query_5_beagle_allele_probs = np.array( + # Ungenotyped markers only + [ + # query_0, haplotype 0 + [ + [0.9999999, 0.99999994, 0.9999546, 0.5454091, 0.09090912], + [0.0 , 0.0 , 0.0 , 0.4545909, 0.9090909], + ], + # query_0, haplotype 1 + [ + [0.0 , 0.0 , 0.0 , 0.4545909, 0.9090909], + [0.9999999, 0.99999994, 0.9999546, 0.5454091, 0.09090912], + ], + ], + dtype=np.float32, +) +# fmt: on +assert toy_ref_5.shape == (9, 4) +assert toy_query_5.shape == (2, 9) +assert toy_query_5_beagle_imputed.shape == (2, 9) +assert np.sum(toy_query_5[0] == 0) == 3 +assert np.sum(toy_query_5[0] == 1) == 1 +assert np.sum(toy_query_5[0] == -1) == 5 +assert np.sum(toy_query_5[1] == 0) == 1 +assert np.sum(toy_query_5[1] == 1) == 3 +assert np.sum(toy_query_5[1] == -1) == 5 +assert np.all( + np.invert(np.equal(toy_query_5_beagle_imputed[0], toy_query_5_beagle_imputed[1])) +) +assert toy_query_5_beagle_allele_probs.shape == (2, 2, 5) + +# Case 6: +# The reference panel has two individuals. The first individual has 0|0 at all sites, +# and the second individual has 1|1 at all sites. +# The query haplotypes have two recombination breakpoints in the middle. +# fmt: off +toy_ref_6 = np.copy(toy_ref_5) +toy_query_6 = np.array( + [ + [-1, 0, -1, 1, -1, 1, -1, 0, -1], # query_0, haplotype 0 + [-1, 1, -1, 0, -1, 0, -1, 1, -1], # query_0, haplotype 1 + ], dtype=np.int32 +) +toy_query_6_beagle_imputed = np.array( + [ + [0, 0, 1, 1, 1, 1, 1, 0, 0], # query_0, haplotype 0 + [1, 1, 0, 0, 0, 0, 0, 1, 1], # query_0, haplotype 1 + ], dtype=np.int32 +) +toy_query_6_beagle_allele_probs = np.array( + # Ungenotyped markers only + [ + # query_0, haplotype 0 + [ + [0.90983605, 0.46770614, 0.025481388, 0.46838862, 0.91139066], + [0.09016396, 0.53229386, 0.97451866 , 0.5316114 , 0.088609315], + ], + # query_0, haplotype 1 + [ + [0.09016395, 0.53229386, 0.97451866 , 0.5316114 , 0.0886093], + [0.90983605, 0.46770614, 0.025481388, 0.46838862, 0.91139066], + ], + ], + dtype=np.float32, +) +# fmt: on +assert toy_ref_6.shape == (9, 4) +assert toy_query_6.shape == (2, 9) +assert toy_query_6_beagle_imputed.shape == (2, 9) +assert np.sum(toy_query_6[0] == 0) == 2 +assert np.sum(toy_query_6[0] == 1) == 2 +assert np.sum(toy_query_6[0] == -1) == 5 +assert np.sum(toy_query_6[1] == 0) == 2 +assert np.sum(toy_query_6[1] == 1) == 2 +assert np.sum(toy_query_6[1] == -1) == 5 +assert np.all( + np.invert(np.equal(toy_query_6_beagle_imputed[0], toy_query_6_beagle_imputed[1])) +) +assert toy_query_6_beagle_allele_probs.shape == (2, 2, 5) + +# Case 7: +# Hand-crafted example. +# fmt: off +toy_ref_7 = np.array( + [ + [0, 1, 2, 0, 1, 1, 2, 3, 1], # ref_0 + [1, 1, 2, 3, 1, 1, 2, 3, 1], # ref_0 + [0, 1, 3, 0, 1, 1, 2, 3, 1], # ref_1 + [0, 2, 2, 3, 0, 1, 2, 0, 0], # ref_1 + [0, 1, 2, 3, 0, 2, 3, 0, 0], # ref_2 + [1, 1, 2, 3, 1, 1, 2, 3, 1], # ref_2 + [0, 1, 3, 0, 1, 1, 2, 3, 1], # ref_3 + [0, 1, 2, 3, 0, 1, 2, 3, 1], # ref_3 + [0, 2, 2, 3, 0, 1, 2, 0, 0], # ref_4 + [0, 1, 2, 3, 0, 2, 3, 0, 0], # ref_4 + ], dtype=np.int32 +).T +toy_query_7 = np.array( + [ + [0, 1, -1, 0, -1, 2, -1, 3, 1], # query_0, haplotype 0 + [1, 2, -1, 3, -1, 1, -1, 0, 0], # query_0, haplotype 1 + ], dtype=np.int32 +) +toy_query_7_beagle_imputed = np.array( + [ + [0, 1, 3, 0, 1, 2, 2, 3, 1], # query_0, haplotype 0 + [1, 2, 2, 3, 0, 1, 2, 0, 0], # query_0, haplotype 1 + ], dtype=np.int32 +) +toy_query_7_beagle_allele_probs = np.array( + # Ungenotyped markers only + [ + # query_0, haplotype 0 + [ + [0.33317572, 0.0 , 0.99945664], + [0.66635144, 0.9991437, 0.0], + ], + # query_0, haplotype 1 + [ + [0.9995998, 0.99971414, 0.99974835], + [0.0 , 0.0 , 0.0], + ], + ], + dtype=np.float32, +) +# fmt: on +assert toy_ref_7.shape == (9, 10) +assert toy_query_7.shape == (2, 9) +assert toy_query_7_beagle_imputed.shape == (2, 9) +assert np.sum(toy_query_7[0] == -1) == np.sum(toy_query_7[1] == -1) == 3 +assert toy_query_7_beagle_allele_probs.shape == (2, 2, 3) + +# Case 8: +# Same as case 7 except the last genotyped marker is missing. +# fmt: off +toy_ref_8 = np.copy(toy_ref_7) +toy_query_8 = np.array( + [ + [-1, 1, -1, 0, -1, 2, -1, 3, -1], # query_0, haplotype 0 + [-1, 2, -1, 3, -1, 1, -1, 0, -1], # query_0, haplotype 1 + ], dtype=np.int32 +) +toy_query_8_beagle_imputed = np.array( + [ + [0, 1, 3, 0, 1, 2, 2, 3, 1], # query_0, haplotype 0 + [0, 2, 2, 3, 0, 1, 2, 0, 0], # query_0, haplotype 1 + ], dtype=np.int32 +) +toy_query_8_beagle_allele_probs = np.array( + # Ungenotyped markers only + [ + # query_0, haplotype 0 + [ + [0.9997735, 0.33310473, 0.0 , 0.9992305, 0.9997736], + [0.0 , 0.66620946, 0.99893093, 0.0 , 0.0], + ], + # query_0, haplotype 1 + [ + [0.9999998, 0.9999998, 0.9999998, 0.99991965, 0.0], + [0.0 , 0.0 , 0.0 , 0.0 , 0.9999998], + ], + ], + dtype=np.float32, +) +# fmt: on +assert toy_ref_8.shape == (9, 10) +assert toy_query_8.shape == (2, 9) +assert toy_query_8_beagle_imputed.shape == (2, 9) +assert np.sum(toy_query_8[0] == -1) == 5 +assert np.sum(toy_query_8[1] == -1) == 5 +assert toy_query_8_beagle_allele_probs.shape == (2, 2, 5) + + +@pytest.mark.parametrize( + "input_ref,input_query,expected", + [ + (toy_ref_0, toy_query_0, toy_query_0_beagle_imputed), + (toy_ref_1, toy_query_1, toy_query_1_beagle_imputed), + (toy_ref_2, toy_query_2, toy_query_2_beagle_imputed), + (toy_ref_3, toy_query_3, toy_query_3_beagle_imputed), + (toy_ref_4, toy_query_4, toy_query_4_beagle_imputed), + (toy_ref_5, toy_query_5, toy_query_5_beagle_imputed), + (toy_ref_6, toy_query_6, toy_query_6_beagle_imputed), + (toy_ref_7, toy_query_7, toy_query_7_beagle_imputed), + (toy_ref_8, toy_query_8, toy_query_8_beagle_imputed), + ], +) +def test_beagle_vanilla(input_ref, input_query, expected): + """Compare imputed alleles from Python BEAGLE implementation and BEAGLE 4.1.""" + assert input_query.shape == expected.shape + pos = (np.arange(9) + 1) * 1e4 + num_query_haps = input_query.shape[0] + num_ungenotyped_markers = np.sum(input_query[0] == -1) + imputed_alleles = np.zeros( + (num_query_haps, num_ungenotyped_markers), dtype=np.int32 + ) + expected_ungenotyped = expected[:, input_query[0] == -1] + assert imputed_alleles.shape == expected_ungenotyped.shape + for i in range(num_query_haps): + imputed_alleles[i], _ = tests.beagle.run_beagle( + input_ref, input_query[i], pos, miscall_rate=1e-4, ne=10.0 + ) + np.testing.assert_array_equal(imputed_alleles[i], expected_ungenotyped[i]) + + +@pytest.mark.parametrize( + "input_ref,input_query", + [ + (toy_ref_0, toy_query_0), + (toy_ref_1, toy_query_1), + (toy_ref_2, toy_query_2), + (toy_ref_3, toy_query_3), + (toy_ref_4, toy_query_4), + (toy_ref_5, toy_query_5), + (toy_ref_6, toy_query_6), + (toy_ref_7, toy_query_7), + (toy_ref_8, toy_query_8), + ], +) +def test_beagle_numba(input_ref, input_query): + """Compare imputed alleles from vanilla and numba Python BEAGLE implementations.""" + pos = (np.arange(9) + 1) * 1e4 + num_query_haps = input_query.shape[0] + for i in range(num_query_haps): + imputed_alleles_vanilla, _ = tests.beagle.run_beagle( + input_ref, + input_query[i], + pos, + ne=10.0, + miscall_rate=1e-4, + ) + imputed_alleles_numba, _ = tests.beagle_numba.run_interpolation_beagle( + input_ref, + input_query[i], + pos, + ne=10.0, + error_rate=1e-4, + ) + np.testing.assert_array_equal(imputed_alleles_vanilla, imputed_alleles_numba) + + +# Below is toy data set case 7 in tree sequence format. +toy_ts_nodes_text = """\ +id is_sample time population individual metadata +0 1 0.000000 0 0 +1 1 0.000000 0 0 +2 1 0.000000 0 1 +3 1 0.000000 0 1 +4 1 0.000000 0 2 +5 1 0.000000 0 2 +6 1 0.000000 0 3 +7 1 0.000000 0 3 +8 1 0.000000 0 4 +9 1 0.000000 0 4 +10 0 0.009923 0 -1 +11 0 0.038603 0 -1 +12 0 0.057935 0 -1 +13 0 0.145141 0 -1 +14 0 0.238045 0 -1 +15 0 0.528344 0 -1 +16 0 0.646418 0 -1 +17 0 1.462199 0 -1 +18 0 2.836600 0 -1 +19 0 3.142225 0 -1 +20 0 4.056253 0 -1 +""" + +toy_ts_edges_text = """\ +left right parent child metadata +0.000000 100000.000000 10 1 +0.000000 100000.000000 10 5 +0.000000 100000.000000 11 3 +0.000000 100000.000000 11 8 +0.000000 100000.000000 12 2 +0.000000 100000.000000 12 6 +0.000000 100000.000000 13 0 +0.000000 100000.000000 13 12 +0.000000 100000.000000 14 10 +0.000000 100000.000000 14 13 +0.000000 40443.000000 15 9 +0.000000 40443.000000 15 14 +40443.000000 100000.000000 16 4 +40443.000000 100000.000000 16 9 +0.000000 40443.000000 17 4 +0.000000 100000.000000 17 11 +40443.000000 100000.000000 17 16 +0.000000 12721.000000 18 7 +0.000000 12721.000000 18 17 +12721.000000 100000.000000 19 7 +40443.000000 100000.000000 19 14 +12721.000000 40443.000000 19 15 +0.000000 12721.000000 20 15 +12721.000000 100000.000000 20 17 +0.000000 12721.000000 20 18 +12721.000000 100000.000000 20 19 +""" + +toy_ts_sites_text = """\ +position ancestral_state metadata +10000.000000 A +20000.000000 C +30000.000000 G +40000.000000 T +50000.000000 A +60000.000000 C +70000.000000 G +80000.000000 T +90000.000000 A +""" + +toy_ts_mutations_text = """\ +site node time derived_state parent metadata +0 10 unknown C -1 +1 11 unknown G -1 +2 12 unknown T -1 +3 13 unknown A -1 +4 14 unknown C -1 +5 16 unknown G -1 +6 16 unknown T -1 +7 17 unknown A -1 +8 19 unknown C -1 +""" + +toy_ts_individuals_text = """\ +flags +0 +0 +0 +0 +0 +""" + + +def get_toy_data(): + """Get toy example 7 in tree sequence format.""" + ts = tskit.load_text( + nodes=io.StringIO(toy_ts_nodes_text), + edges=io.StringIO(toy_ts_edges_text), + sites=io.StringIO(toy_ts_sites_text), + mutations=io.StringIO(toy_ts_mutations_text), + individuals=io.StringIO(toy_ts_individuals_text), + strict=False, + ) + return ts + + +def parse_matrix(csv_text): + # TODO: Maybe remove. + # This returns a record array, which is essentially the same as a + # pandas dataframe, which we can access via df["m"] etc. + return np.lib.npyio.recfromcsv(io.StringIO(csv_text)) + + +@pytest.mark.parametrize( + "input_query,expected", + [ + (toy_query_7, toy_query_7_beagle_imputed), + (toy_query_8, toy_query_8_beagle_imputed), + ], +) +def test_tsimpute(input_query, expected): + """ + Compare imputed alleles from tsimpute and BEAGLE 4.1. + """ + toy_ref_ts = get_toy_data() # Same for both cases + pos = toy_ref_ts.sites_position + num_query_haps = input_query.shape[0] + mu = np.zeros(len(pos), dtype=np.float32) + 1e-8 + rho = np.zeros(len(pos), dtype=np.float32) + 1e-8 + expected_subset = expected[:, input_query[0] == -1] + for i in range(num_query_haps): + imputed_alleles, _ = tests.beagle_numba.run_tsimpute( + toy_ref_ts, + input_query[i], + pos, + rho, + mu, + ) + np.testing.assert_array_equal(imputed_alleles, expected_subset[i]) + + +# Tests for helper functions. +@pytest.mark.parametrize( + "typed_pos,untyped_pos,expected_weights,expected_idx", + [ + # All ungenotyped markers are between genotyped markers. + ( + np.array([10, 20]) * 1e6, + np.array([15]) * 1e6, + np.array([0.5]), + np.array([0]), + ), + # Same as above, but more genotyped markers. + ( + np.array([10, 20, 30, 40]) * 1e6, + np.array([15, 25, 35]) * 1e6, + np.array([0.5, 0.5, 0.5]), + np.array([0, 1, 2]), + ), + # Ungenotyped markers are left of the first genotyped marker. + ( + np.array([10, 20]) * 1e6, + np.array([5, 15]) * 1e6, + np.array([1.0, 0.5]), + np.array([0, 0]), + ), + # Ungenotyped markers are right of the last genotyped marker. + ( + np.array([10, 20]) * 1e6, + np.array([15, 25]) * 1e6, + np.array([0.5, 0.0]), + np.array([0, 1]), + ), + # Denominator below min threshold. + (np.array([10, 20]), np.array([15]), np.array([0.001]), np.array([0])), + ], +) +def test_get_weights(typed_pos, untyped_pos, expected_weights, expected_idx): + # beagle, no numba + actual_weights, actual_idx = tests.beagle.get_weights(typed_pos, untyped_pos) + np.testing.assert_allclose(actual_weights, expected_weights) + np.testing.assert_array_equal(actual_idx, expected_idx) + # beagle, numba + genotyped_cm = tests.beagle_numba.convert_to_genetic_map_positions(typed_pos) + ungenotyped_cm = tests.beagle_numba.convert_to_genetic_map_positions(untyped_pos) + actual_weights, actual_idx = tests.beagle_numba.get_weights( + typed_pos, untyped_pos, genotyped_cm, ungenotyped_cm + ) + np.testing.assert_allclose(actual_weights, expected_weights) + np.testing.assert_array_equal(actual_idx, expected_idx)