From 0e9e0aacc1179b4cb33d401e735b227d63ff9b2c Mon Sep 17 00:00:00 2001 From: Yan Wong Date: Fri, 26 Jul 2024 12:33:46 +0100 Subject: [PATCH] Update tutorial.md --- docs/tutorial.md | 166 +++++++++++++++++++++++++---------------------- 1 file changed, 87 insertions(+), 79 deletions(-) diff --git a/docs/tutorial.md b/docs/tutorial.md index 50241f35..2d38798c 100644 --- a/docs/tutorial.md +++ b/docs/tutorial.md @@ -23,72 +23,89 @@ kernelspec: ## Toy example -Suppose that we have observed the following data:: - - sample alignment - 0 A..G......C..G.....AT - 1 T..G......A..C.....AG - 2 A..G......A..C.....AC - 3 A..C......C..G.....CT - 4 A..C......C..G.....CT - | | | - 0 10 20 - -Here we have phased haplotype data for five samples at six variable sites. We wish to -infer the genealogies that gave rise to this data set. To import the data into `tsinfer` -we must know the *ancestral state* for each site we wish to use for inference; there are -many methods for achieving this: details are outside the scope of this manual, but we -have started a [discussion topic](https://github.com/tskit-dev/tsinfer/discussions/523) -on this issue to provide some recommendations. Assuming that we -know the ancestral states for most sites, we can then import our data into a `tsinfer` -{ref}`Sample data` file using the Python -{ref}`API`: +`Tsinfer` takes as input a [Zarr](https://zarr.readthedocs.io/) file, with phased variant data encoded in the +[VCF Zarr](https://github.com/sgkit-dev/vcf-zarr-spec/) (.vcz) format. The standard +route to create such a file is by conversion from a VCF file, e.g. using +[vcf2zarr](https://sgkit-dev.github.io/bio2zarr/vcf2zarr/overview.html) as described later in this +document. For a quick introduction, however, we will instead create an example file using +[sgkit](https://sgkit-dev.github.io/sgkit/latest/). + ```{code-cell} ipython3 +import sgkit +ds = sgkit.simulate_genotype_call_dataset(n_variant=8, n_sample=3, missing_pct=0, phased=True, seed=79) +sgkit.save_dataset(ds, "data.vcz") +``` + +This is what that generated data looks like: + +```{code-cell} +:"tags": ["remove-input"] +import numpy as np +assert all(len(np.unique(a)) == len(a) for a in ds['variant_allele'].values) +assert any([np.sum(g.values) == 1 for g in ds['call_genotype']]) # at least one singleton +assert any([np.sum(g.values) == 0 for g in ds['call_genotype']]) # at least one non-variable + +alleles = ds['variant_allele'].values.astype(str) +sites = np.arange(ds['call_genotype'].shape[0]) +for sample in range(ds['call_genotype'].shape[1]): + for genome in range(ds['call_genotype'].shape[2]): + genotypes = ds['call_genotype'][:,sample, genome] + print( + f"Diploid sample {sample} (genome {genome}):", + "".join(alleles[sites, genotypes.values]) + ) +``` + +We wish to infer a genealogy that could have given rise to this data set. To run `tsinfer` +we create a `tsinfer.VariantData` object from the .vcz file. This requires an +*ancestral allele* to be specified for each site; there are +many methods for calculating there: details are outside the scope of this manual, but we +have started a [discussion topic](https://github.com/tskit-dev/tsinfer/discussions/523) +on this issue to provide some recommendations. Sometimes VCF files will contain the +ancestral allele in the "AA" info field, in which case it will be encoded in the +`variant_ancestral_allele` field of the .vcz file. It's also possible to provide a numpy array +of ancestral alleles, of the same length as the number of variants. Ancestral +alleles that are not in the list of alleles for their respective site are treated as unknown +and not used for inference (with a warning given). + +```{code-cell} import tsinfer -from tskit import MISSING_DATA - -with tsinfer.SampleData(sequence_length=21) as sample_data: - sample_data.add_site(0, [0, 1, 0, 0, 0], ["A", "T"], ancestral_allele=0) - sample_data.add_site(3, [0, 0, 0, 1, 1], ["G", "C"], ancestral_allele=0) - sample_data.add_site(10, [0, 1, 1, 0, 0], ["C", "A"], ancestral_allele=0) - sample_data.add_site(13, [0, 1, 1, 0, 0], ["G", "C"], ancestral_allele=MISSING_DATA) - sample_data.add_site(19, [0, 0, 0, 1, 1], ["A", "C"], ancestral_allele=0) - sample_data.add_site(20, [0, 1, 2, 0, 0], ["T", "G", "C"], ancestral_allele=0) + +# In this example, take the REF allele (index 0) as ancestral +ancestral_alleles = ds['variant_allele'].values[:,0].astype(str) +# set the 5th site (index 4) as of unknown ancestral allele +ancestral_alleles[4] = "." + +vdata = tsinfer.VariantData( + "data.vcz", + ancestral_allele_name_or_array=ancestral_alleles +) ``` -Here we create a new {class}`.SampleData` object for five samples. We then -sequentially add the data for each site one-by-one using the -{meth}`.SampleData.add_site` method. The first argument for `add_site` is the -position of the site in genome coordinates. This can be any positive value -(even floating point), but site positions must be unique and sites must be -added in increasing order of position. Since `tskit` uses a zero-based notation, -we've added the first site at position 0, and the sequence covers 21 bp. The second -argument for `add_site` is a list of *genotypes* for the site. Each value in a -genotypes array `g` is an integer, giving an index into the list provided as the third -argument to ``add_site``: the list of *alleles* for this site. The fourth argument -is the index in the allele list which represents the ancestral state. -Each call to ``add_site`` thus stores a single variable site in the -original haplotype data above. For example, the ancestral and derived states for -the site at position 0 are "A" and "T" and the genotypes are 01000: this encodes -the first column, ATAAA. +Here we create a new {class}`.VariantData` object for the 3 diploid samples in our +dataset. Each diploid sample will correspond to an *individual* in the resulting tree +sequenece, and each of the 6 genomes will correspond to a sample node +(hence `ts.num_samples == 6`). Not all sites are used for genealogical inference: this includes non-variable (fixed) -sites, singletons (in this example, the site at position 0), sites -with more than 2 alleles (e.g. the site at position 20), and sites where the ancestral -state is unknown (marked as {data}`tskit.MISSING_DATA`, e.g. the site at position 10). -Additionally, during the next step, extra sites can be flagged as not for use in +sites, singleton sites, and sites where the ancestral allele is unknown: in this example, +these are seen at the seventh and third, and fifth sites respectively. In addition, +multiallelic sites, with more than 2 alleles, are not used for inference (but see +https://github.com/tskit-dev/tsinfer/issues/670 for a workaround). + +Additionally, during the inference step, extra sites can be flagged as not for use in inferring the genealogy, for example if they are deemed unreliable (this is done via the `exclude_positions` parameter). Note, however, that even if a site is not used -for genealogical inference, its genetic variation will still be encoded in the final +for genealogical inference, its genetic variation can still be encoded in the final tree sequence. -Once we have stored our data in a {class}`.SampleData` object, we can easily infer +Once we have stored our data in a {class}`.VariantData` object, we can easily infer a {ref}`tree sequence` using the Python API: ```{code-cell} ipython3 -inferred_ts = tsinfer.infer(sample_data) +inferred_ts = tsinfer.infer(vdata) ``` And that's it: we now have a fully functional {class}`tskit.TreeSequence` @@ -112,7 +129,7 @@ check this programatically if you want: ```{code-cell} ipython3 import numpy as np -for v_orig, v_inferred in zip(sample_data.variants(), inferred_ts.variants()): +for v_orig, v_inferred in zip(vdata.variants(), inferred_ts.variants()): if any( np.array(v_orig.alleles)[v_orig.genotypes] != np.array(v_inferred.alleles)[v_inferred.genotypes] @@ -141,10 +158,9 @@ inferred_ts.draw_svg( size=(500, 200), node_labels=node_labels, mutation_labels=mut_labels, y_axis=True) ``` -This small dataset is compatible with a single tree, so the inferred tree sequence -consists of only one tree. Note, however, that it contains a *polytomy* at the root -(i.e. the root node has more than two children) . This is a -common feature of trees inferred by `tsinfer` and signals that there was not +We have inferred 4 trees aong the genome. Note, however, that there are "polytomies" in the +trees, where some nodes have more than two children (e.g. the root node in the first tree). +This is a common feature of trees inferred by `tsinfer` and signals that there was not sufficient information to resolve the tree at this internal node. @@ -175,8 +191,8 @@ The previous example showed how we can infer a tree sequence using the Python AP toy example. However, for real data we will not prepare our data and infer the tree sequence all in one go; rather, we will usually split the process into at least two distinct steps. -The first step in any inference is to prepare your data and import it into a -{ref}`sample data` file. For simplicity here we'll use +The first step in any inference is to prepare your data and create the .vcz file. +For simplicity here we'll use Python to simulate some data under the coalescent with recombination, using [msprime](https://msprime.readthedocs.io/en/stable/api.html#msprime.simulate): @@ -184,6 +200,8 @@ Python to simulate some data under the coalescent with recombination, using import builtins import sys +from Bio import bgzf +import subprocess import msprime import tsinfer @@ -211,33 +229,23 @@ print( f"{ts.num_trees} trees and {ts.num_sites} sites" ) -with tsinfer.SampleData( - path=name + ".samples", - sequence_length=ts.sequence_length, - num_flush_threads=2 -) as sim_sample_data: - for var in tqdm(ts.variants(), total=ts.num_sites): - sim_sample_data.add_site(var.site.position, var.genotypes, var.alleles) -print("Stored output in", name + ".samples") +# Convert to a VCF: this should be easier once a tskit2zarr utility is made, see +# https://github.com/sgkit-dev/bio2zarr/issues/232 +vcf_name = f"{name}.vcf.gz" +with bgzf.open(vcf_name, "wt") as f: + ts.write_vcf(f) +subprocess.run(["tabix", vcf_name]) +subprocess.run(["python", "-m", "bio2zarr", "vcf2zarr", "convert", "--force", vcf_name, name+".vcz"]) +print(f"Converted to {name}.vcz") ``` -Here we first run a simulation then, as before, we create a {class}`.SampleData` -instance to store the data we have simulated. When doing this, we use the extra -parameters `path`, `sequence_length` and `num_flush_threads`. The first provides a -filename in which to permanently store the information. The second defines the -overall coordinate space for site positions, so that this value can -be recovered in the final tree sequence that we output later. Finally, -`num_flush_threads=2` tells `tsinfer` to use two background threads for -compressing data and flushing it to disk. - +Here we first run a simulation then we create a vcf file and convert it to .vcz format. If we store this code in a file named `simulate-data.py`, we can run it on the -command line, providing parameters to generate much bigger datasets (and showing -a progress meter using [tqdm](https://github.com/tqdm/tqdm) to keep track of how -the process of compressing and storing the sample data is progressing): +command line, providing parameters to generate much bigger datasets: ```{code} bash -% python3 simulate-data.py 5000 10_000_000 +% python simulate-data.py 5000 10_000_000 Simulated 10000 samples over 10.0 Mb: 36204 trees and 38988 sites 100%|██████████████████████████████████████████| 38988/38988 [00:08<00:00, 4617.55it/s] ```