diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 50f53c18..7dd0bb7b 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -29,32 +29,22 @@ jobs: - uses: actions/setup-python@v5 with: - python-version: "3.10" + python-version: "3.11" + cache: "pip" - - uses: actions/cache@v4 - id: venv-cache - with: - path: venv - key: docs-venv-v6-${{ hashFiles('requirements/CI-docs/requirements.txt') }} - - - name: Create venv and install deps (one by one to avoid conflict errors) - if: steps.venv-cache.outputs.cache-hit != 'true' + - name: Install deps (one by one to avoid conflict errors) run: | - python -m venv venv - . venv/bin/activate pip install --upgrade pip wheel - pip install -r requirements/CI-docs/requirements.txt - + pip install -r requirements/CI-docs/requirements.txt + sudo apt-get install -y tabix - name: Build C module if: env.MAKE_TARGET run: | - . venv/bin/activate make $MAKE_TARGET - name: Build Docs run: | - . venv/bin/activate cd docs && make dist - name: Trigger docs site rebuild diff --git a/docs/.gitignore b/docs/.gitignore index f3c3680f..f4b2a542 100644 --- a/docs/.gitignore +++ b/docs/.gitignore @@ -1,4 +1,7 @@ notebook-simulation.trees notebook-simulation.samples notebook-simulation-source.trees +notebook-simulation.vc* +notebook-simulation-AA.npy P_dom_chr24_phased.samples +sparrows.vcz diff --git a/docs/_config.yml b/docs/_config.yml index 75ed2ee2..e2025d9a 100644 --- a/docs/_config.yml +++ b/docs/_config.yml @@ -41,7 +41,7 @@ sphinx: config: html_theme: sphinx_book_theme html_theme_options: - pygment_dark_style: monokai + pygments_dark_style: monokai pygments_style: monokai myst_enable_extensions: - colon_fence diff --git a/docs/_static/example_data.vcz/.zattrs b/docs/_static/example_data.vcz/.zattrs new file mode 100644 index 00000000..f778611e --- /dev/null +++ b/docs/_static/example_data.vcz/.zattrs @@ -0,0 +1,6 @@ +{ + "contigs": [ + "0" + ], + "source": "sgkit-0.9.0" +} \ No newline at end of file diff --git a/docs/_static/example_data.vcz/.zgroup b/docs/_static/example_data.vcz/.zgroup new file mode 100644 index 00000000..3b7daf22 --- /dev/null +++ b/docs/_static/example_data.vcz/.zgroup @@ -0,0 +1,3 @@ +{ + "zarr_format": 2 +} \ No newline at end of file diff --git a/docs/_static/example_data.vcz/.zmetadata b/docs/_static/example_data.vcz/.zmetadata new file mode 100644 index 00000000..24b95bd0 --- /dev/null +++ b/docs/_static/example_data.vcz/.zmetadata @@ -0,0 +1,243 @@ +{ + "metadata": { + ".zattrs": { + "contigs": [ + "0" + ], + "source": "sgkit-0.9.0" + }, + ".zgroup": { + "zarr_format": 2 + }, + "call_genotype/.zarray": { + "chunks": [ + 8, + 3, + 2 + ], + "compressor": { + "blocksize": 0, + "clevel": 5, + "cname": "lz4", + "id": "blosc", + "shuffle": 1 + }, + "dtype": "|i1", + "fill_value": null, + "filters": null, + "order": "C", + "shape": [ + 8, + 3, + 2 + ], + "zarr_format": 2 + }, + "call_genotype/.zattrs": { + "_ARRAY_DIMENSIONS": [ + "variants", + "samples", + "ploidy" + ], + "comment": "Call genotype. Encoded as allele values (0 for the reference, 1 for\nthe first allele, 2 for the second allele), -1 to indicate a\nmissing value, or -2 to indicate a non allele in mixed ploidy datasets.", + "mixed_ploidy": false + }, + "call_genotype_mask/.zarray": { + "chunks": [ + 8, + 3, + 2 + ], + "compressor": { + "blocksize": 0, + "clevel": 5, + "cname": "lz4", + "id": "blosc", + "shuffle": 1 + }, + "dtype": "|i1", + "fill_value": null, + "filters": null, + "order": "C", + "shape": [ + 8, + 3, + 2 + ], + "zarr_format": 2 + }, + "call_genotype_mask/.zattrs": { + "_ARRAY_DIMENSIONS": [ + "variants", + "samples", + "ploidy" + ], + "comment": "A flag for each call indicating which values are missing.", + "dtype": "bool" + }, + "call_genotype_phased/.zarray": { + "chunks": [ + 8, + 3 + ], + "compressor": { + "blocksize": 0, + "clevel": 5, + "cname": "lz4", + "id": "blosc", + "shuffle": 1 + }, + "dtype": "|i1", + "fill_value": null, + "filters": null, + "order": "C", + "shape": [ + 8, + 3 + ], + "zarr_format": 2 + }, + "call_genotype_phased/.zattrs": { + "_ARRAY_DIMENSIONS": [ + "variants", + "samples" + ], + "comment": "A flag for each call indicating if it is phased or not. If omitted\nall calls are unphased.", + "dtype": "bool" + }, + "contig_id/.zarray": { + "chunks": [ + 1 + ], + "compressor": { + "blocksize": 0, + "clevel": 5, + "cname": "lz4", + "id": "blosc", + "shuffle": 1 + }, + "dtype": "` to allow the user to efficiently construct it from their own data. -An example of how to use this API is given in the {ref}`sec_tutorial`. +An example of how to use this API is given in the {ref}`sec_usage` documentation. We do not provide an automatic means of importing data from VCF (or any other format) intentionally, as we believe that this would be extremely difficult to do. diff --git a/docs/simulation-example.py b/docs/simulation-example.py index 45ab12aa..b72601c1 100644 --- a/docs/simulation-example.py +++ b/docs/simulation-example.py @@ -1,44 +1,42 @@ -import os +import builtins +import subprocess import sys import msprime -import tqdm - -sys.path.insert(0, os.path.abspath("..")) -import tsinfer # noqa - -if True: - ts = msprime.simulate( - sample_size=10000, - Ne=10**4, - recombination_rate=1e-8, - mutation_rate=1e-8, - length=10 * 10**6, - random_seed=42, - ) - ts.dump("simulation-source.trees") - print("Simulation done:", ts.num_trees, "trees and", ts.num_sites) - - with tsinfer.SampleData( - sequence_length=ts.sequence_length, - path="simulation.samples", - num_flush_threads=2, - ) as samples: - for var in tqdm.tqdm(ts.variants(), total=ts.num_sites): - samples.add_site(var.site.position, var.genotypes, var.alleles) - -else: - source = msprime.load("simulation-source.trees") - inferred = msprime.load("simulation.trees") - - subset = range(0, 6) - source_subset = source.simplify(subset) - inferred_subset = inferred.simplify(subset) - - tree = source_subset.first() - print("True tree: interval=", tree.interval) - print(tree.draw(format="unicode")) - - tree = inferred_subset.first() - print("Inferred tree: interval=", tree.interval) - print(tree.draw(format="unicode")) +import numpy as np +from Bio import bgzf + +if getattr(builtins, "__IPYTHON__", False): # if running IPython: e.g. in a notebook + num_diploids, seq_len = 100, 10_000 + name = "notebook-simulation" +else: # Take parameters from the command-line + num_diploids, seq_len = int(sys.argv[1]), float(sys.argv[2]) + name = "cli-simulation" + +ts = msprime.sim_ancestry( + num_diploids, + population_size=10**4, + recombination_rate=1e-8, + sequence_length=seq_len, + random_seed=6, +) +ts = msprime.sim_mutations(ts, rate=1e-8, random_seed=7) +ts.dump(name + "-source.trees") +print( + f"Simulated {ts.num_samples} samples over {seq_len/1e6} Mb:", + f"{ts.num_trees} trees and {ts.num_sites} sites", +) + +# Convert to a zarr file: this should be easier once a tskit2zarr utility is made, see +# https://github.com/sgkit-dev/bio2zarr/issues/232 +np.save(f"{name}-AA.npy", [s.ancestral_state for s in ts.sites()]) +vcf_name = f"{name}.vcf.gz" +with bgzf.open(vcf_name, "wt") as f: + ts.write_vcf(f) +subprocess.run(["tabix", vcf_name]) +ret = subprocess.run( + "python -m bio2zarr vcf2zarr convert --force".split() + [vcf_name, name + ".vcz"], + stderr=subprocess.DEVNULL if name == "notebook-simulation" else None, +) +if ret.returncode == 0: + print(f"Converted to {name}.vcz") diff --git a/docs/tutorial.md b/docs/tutorial.md deleted file mode 100644 index 50241f35..00000000 --- a/docs/tutorial.md +++ /dev/null @@ -1,735 +0,0 @@ ---- -jupytext: - text_representation: - extension: .md - format_name: myst - format_version: 0.12 - jupytext_version: 1.9.1 -kernelspec: - display_name: Python 3 - language: python - name: python3 ---- - -:::{currentmodule} tsinfer -::: - - -(sec_tutorial)= - -# Tutorial - -(sec_tutorial_toy_example)= - -## Toy example - -Suppose that we have observed the following data:: - - sample alignment - 0 A..G......C..G.....AT - 1 T..G......A..C.....AG - 2 A..G......A..C.....AC - 3 A..C......C..G.....CT - 4 A..C......C..G.....CT - | | | - 0 10 20 - -Here we have phased haplotype data for five samples at six variable sites. We wish to -infer the genealogies that gave rise to this data set. To import the data into `tsinfer` -we must know the *ancestral state* for each site we wish to use for inference; there are -many methods for achieving this: details are outside the scope of this manual, but we -have started a [discussion topic](https://github.com/tskit-dev/tsinfer/discussions/523) -on this issue to provide some recommendations. Assuming that we -know the ancestral states for most sites, we can then import our data into a `tsinfer` -{ref}`Sample data` file using the Python -{ref}`API`: - -```{code-cell} ipython3 -import tsinfer -from tskit import MISSING_DATA - -with tsinfer.SampleData(sequence_length=21) as sample_data: - sample_data.add_site(0, [0, 1, 0, 0, 0], ["A", "T"], ancestral_allele=0) - sample_data.add_site(3, [0, 0, 0, 1, 1], ["G", "C"], ancestral_allele=0) - sample_data.add_site(10, [0, 1, 1, 0, 0], ["C", "A"], ancestral_allele=0) - sample_data.add_site(13, [0, 1, 1, 0, 0], ["G", "C"], ancestral_allele=MISSING_DATA) - sample_data.add_site(19, [0, 0, 0, 1, 1], ["A", "C"], ancestral_allele=0) - sample_data.add_site(20, [0, 1, 2, 0, 0], ["T", "G", "C"], ancestral_allele=0) -``` - -Here we create a new {class}`.SampleData` object for five samples. We then -sequentially add the data for each site one-by-one using the -{meth}`.SampleData.add_site` method. The first argument for `add_site` is the -position of the site in genome coordinates. This can be any positive value -(even floating point), but site positions must be unique and sites must be -added in increasing order of position. Since `tskit` uses a zero-based notation, -we've added the first site at position 0, and the sequence covers 21 bp. The second -argument for `add_site` is a list of *genotypes* for the site. Each value in a -genotypes array `g` is an integer, giving an index into the list provided as the third -argument to ``add_site``: the list of *alleles* for this site. The fourth argument -is the index in the allele list which represents the ancestral state. -Each call to ``add_site`` thus stores a single variable site in the -original haplotype data above. For example, the ancestral and derived states for -the site at position 0 are "A" and "T" and the genotypes are 01000: this encodes -the first column, ATAAA. - -Not all sites are used for genealogical inference: this includes non-variable (fixed) -sites, singletons (in this example, the site at position 0), sites -with more than 2 alleles (e.g. the site at position 20), and sites where the ancestral -state is unknown (marked as {data}`tskit.MISSING_DATA`, e.g. the site at position 10). -Additionally, during the next step, extra sites can be flagged as not for use in -inferring the genealogy, for example if they are deemed unreliable (this is done -via the `exclude_positions` parameter). Note, however, that even if a site is not used -for genealogical inference, its genetic variation will still be encoded in the final -tree sequence. - -Once we have stored our data in a {class}`.SampleData` object, we can easily infer -a {ref}`tree sequence` using the Python -API: - -```{code-cell} ipython3 -inferred_ts = tsinfer.infer(sample_data) -``` - -And that's it: we now have a fully functional {class}`tskit.TreeSequence` -object that we can interrogate in the usual ways. For example, we can look -at the {meth}`aligned haplotypes` in the tree sequence: - -```{code-cell} ipython3 -print("Sample\tInferred sequence") -for sample_id, seq in zip( - inferred_ts.samples(), - inferred_ts.alignments(missing_data_character="."), -): - print(sample_id, seq, sep="\t") -``` - -You will notice that the sample sequences generated by this tree sequence are identical -to the input haplotype data: apart from the imputation of -{ref}`missing data`, `tsinfer` is guaranteed to -losslessly encode any given input data, regardless of the inferred topology. You can -check this programatically if you want: - -```{code-cell} ipython3 -import numpy as np -for v_orig, v_inferred in zip(sample_data.variants(), inferred_ts.variants()): - if any( - np.array(v_orig.alleles)[v_orig.genotypes] != - np.array(v_inferred.alleles)[v_inferred.genotypes] - ): - raise ValueError("Genotypes in original dataset and inferred tree seq not equal") -print("** Genotypes in original dataset and inferred tree sequence are the same **") -``` - -We can examine the inferred genetic genealogy, -in the form of {ref}`local trees`. `Tsinfer` has also -placed mutations on the genealogy to explain the observed genetic variation: - -```{code-cell} ipython3 -mut_labels = { - m.id: "{:g}: {}→{}".format( - s.position, - inferred_ts.mutation(m.parent).derived_state if m.parent >= 0 else s.ancestral_state, - m.derived_state, - ) - for s in inferred_ts.sites() - for m in s.mutations -} -node_labels = {u: u for u in inferred_ts.samples()} - -inferred_ts.draw_svg( - size=(500, 200), node_labels=node_labels, mutation_labels=mut_labels, y_axis=True) -``` - -This small dataset is compatible with a single tree, so the inferred tree sequence -consists of only one tree. Note, however, that it contains a *polytomy* at the root -(i.e. the root node has more than two children) . This is a -common feature of trees inferred by `tsinfer` and signals that there was not -sufficient information to resolve the tree at this internal node. - - -Each internal (non-sample) node in this inferred tree represents an ancestral sequence, -constructed on the basis of shared, derived alleles at one or more of the sites. By -default, the time of each such node is *not* measured in years or generations, but -is simply the frequency of the shared derived allele(s) on which the ancestral sequence -is based. For this reason, the time units are described as "uncalibrated" in the plot, -and trying to calculate statistics based on branch lengths will raise an error: - -```{code-cell} ipython3 -:tags: ["raises-exception"] -inferred_ts.diversity(mode="branch") -``` - -To add meaningful dates to an inferred tree sequence, allowing meaningful branch length -statistics to be calulated, you must use additional -software such as [tsdate](https://tskit.dev/software/tsdate.html): the `tsinfer` -algorithm is only intended to infer the genetic relationships between the samples -(i.e. the *topology* of the tree sequence). - - -(sec_tutorial_simulation_example)= - -## Simulation example - -The previous example showed how we can infer a tree sequence using the Python API for a trivial -toy example. However, for real data we will not prepare our data and infer the tree sequence all -in one go; rather, we will usually split the process into at least two distinct steps. - -The first step in any inference is to prepare your data and import it into a -{ref}`sample data` file. For simplicity here we'll use -Python to simulate some data under the coalescent with recombination, using -[msprime](https://msprime.readthedocs.io/en/stable/api.html#msprime.simulate): - -```{code-cell} ipython3 - -import builtins -import sys - -import msprime -import tsinfer - -if getattr(builtins, "__IPYTHON__", False): # if running IPython: e.g. in a notebook - from tqdm.notebook import tqdm - num_diploids, seq_len = 100, 10_000 - name = "notebook-simulation" -else: # Take parameters from the command-line - from tqdm import tqdm - num_diploids, seq_len = int(sys.argv[1]), float(sys.argv[2]) - name = "cli-simulation" - -ts = msprime.sim_ancestry( - num_diploids, - population_size=10**4, - recombination_rate=1e-8, - sequence_length=seq_len, - random_seed=6, -) -ts = msprime.sim_mutations(ts, rate=1e-8, random_seed=7) -ts.dump(name + "-source.trees") -print( - f"Simulated {ts.num_samples} samples over {seq_len/1e6} Mb:", - f"{ts.num_trees} trees and {ts.num_sites} sites" -) - -with tsinfer.SampleData( - path=name + ".samples", - sequence_length=ts.sequence_length, - num_flush_threads=2 -) as sim_sample_data: - for var in tqdm(ts.variants(), total=ts.num_sites): - sim_sample_data.add_site(var.site.position, var.genotypes, var.alleles) -print("Stored output in", name + ".samples") -``` - -Here we first run a simulation then, as before, we create a {class}`.SampleData` -instance to store the data we have simulated. When doing this, we use the extra -parameters `path`, `sequence_length` and `num_flush_threads`. The first provides a -filename in which to permanently store the information. The second defines the -overall coordinate space for site positions, so that this value can -be recovered in the final tree sequence that we output later. Finally, -`num_flush_threads=2` tells `tsinfer` to use two background threads for -compressing data and flushing it to disk. - -If we store this code in a file named `simulate-data.py`, we can run it on the -command line, providing parameters to generate much bigger datasets (and showing -a progress meter using [tqdm](https://github.com/tqdm/tqdm) to keep track of how -the process of compressing and storing the sample data is progressing): - - -```{code} bash -% python3 simulate-data.py 5000 10_000_000 -Simulated 10000 samples over 10.0 Mb: 36204 trees and 38988 sites -100%|██████████████████████████████████████████| 38988/38988 [00:08<00:00, 4617.55it/s] -``` - -Here, we simulated a sample of 10 thousand chromosomes, each 10Mb long, under -human-like parameters. That has created 39K segregating sites. The output -above shows the state of the progress meter at the end of this process, -indicating that it took about 8 seconds to import the data for this simulation into -`tsinfer`'s sample data format. - -:::{note} -If you already have a tree sequence file, and wish to create a sample data file from -it, you can use the {meth}`tsinfer.SampleData.from_tree_sequence` method. For example, -the following snippet is equivalent to the data file creation part of the code above: - -```{code} python -tsinfer.SampleData.from_tree_sequence( - ts, path="cli-simulation.samples", num_flush_threads=2) -``` -::: - -Examining the files on the command line, we then see the following:: - -```{code} bash -$ ls -lh cli-simulation* --rw-r--r-- 1 user group 8.2M 14 Oct 13:45 cli-simulation-source.trees --rw-r--r-- 1 user group 22M 14 Oct 13:45 cli-simulation.samples -``` - -The `cli-simulation.samples` file is quite small, being less than three times the size of the -original `msprime` tree sequence file. The {ref}`tsinfer command line interface ` -provides a useful way to examine files in more detail using the `list` (or `ls`) command: - -```{code} bash -$ tsinfer ls cli-simulation.samples -path = cli-simulation.samples -file_size = 21.6 MiB -format_name = tsinfer-sample-data -format_version = (5, 1) -finalised = True -uuid = 422b25be-fe6e-4c25-b302-af2715fd9c93 -num_provenances = 0 -provenances/timestamp = shape=(0,); dtype=object; -provenances/record = shape=(0,); dtype=object; -sequence_length = 10000000.0 -metadata_schema = {'codec': 'json'} -metadata = {} -num_populations = 0 -num_individuals = 10000 -num_samples = 10000 -num_sites = 38988 -populations/metadata_schema = None -populations/metadata = shape=(0,); dtype=object; -individuals/metadata_schema = None -individuals/metadata = shape=(10000,); dtype=object; -individuals/location = shape=(10000,); dtype=object; -individuals/time = shape=(10000,); dtype=float64;uncompressed size=80.0 kB -individuals/population = shape=(10000,); dtype=int32;uncompressed size=40.0 kB -individuals/flags = shape=(10000,); dtype=uint32;uncompressed size=40.0 kB -samples/individual = shape=(10000,); dtype=int32;uncompressed size=40.0 kB -sites/position = shape=(38988,); dtype=float64;uncompressed size=311.9 kB -sites/time = shape=(38988,); dtype=float64;uncompressed size=311.9 kB -sites/alleles = shape=(38988,); dtype=object; -sites/genotypes = shape=(38988, 10000); dtype=int8;uncompressed size=389.9 MB -sites/metadata_schema = None -sites/metadata = shape=(38988,); dtype=object; -``` - -Most of this output is not particularly interesting here, but we can see that the -`sites/genotypes` array which holds all of the sample genotypes (and thus the vast bulk of the -actual data) requires about 390MB uncompressed. The `tsinfer` sample data format is therefore -achieving a roughly 20X compression in this case. In practice this means we can keep such files -lying around without taking up too much space. - -Once we have our `.samples` file created, running the inference is straightforward. -We can do so within Python, or use `tsinfer` on -the command-line, which is useful when inference is expected to take a long time: - -```{code-cell} ipython3 -:"tags": ["hide-input"] -# Infer & save a ts from the notebook simulation. This could also be done -# from the command-line using `tsinfer infer notebook-simulation.samples` -tsinfer.infer(sim_sample_data).dump(name + ".trees") -``` - -```{code} bash -$ tsinfer infer cli-simulation.samples -p -t 4 -ga-add (1/6)100%|█████████████████████████████| 39.0k/39.0k [00:02, 16.0kit/s] -ga-gen (2/6)100%|█████████████████████████████| 26.3k/26.3k [00:17, 1.52kit/s] -ms-muts (4/6)100%|█████████████████████████████| 35.1k/35.1k [00:00, 48.2kit/s] -ma-match (3/6)100%|████████████████████████████▉| 26.3k/26.3k [07:21, 60.5it/s] -ms-match (5/6)100%|█████████████████████████████| 10.0k/10.0k [10:32, 15.8it/s] -ms-paths (6/6)100%|█████████████████████████████| 10.0k/10.0k [00:00, 40.8kit/s] -ms-muts (7/6)100%|█████████████████████████████| 35.1k/35.1k [00:00, 42.1kit/s] -ms-xsites (8/6)100%|█████████████████████████████| 3.85k/3.85k [00:05, 688it/s] -``` - -Running the `infer` command runs the full inference pipeline in one go (the individual steps -are explained {ref}`here `), writing the output, by default, to the tree sequence -file ``simulation.trees``. We provided two extra arguments to `infer`: the `-p` flag -(`--progress`) gives us the progress bars show above, and `-t 4` (``--num-threads=4``) tells -`tsinfer` to use four worker threads whenever it can use them. - -This inference was run on a Core i5 processor (launched 2009) with 4GiB of RAM, and took -about eighteen minutes. The maximum memory usage was about 600MiB. - -Looking at our output files, we see:: - -```{code} bash -$ ls -lh cli-simulation* --rw-r--r-- 1 user group 8.2M 14 Oct 13:45 cli-simulation-source.trees --rw-r--r-- 1 user group 22M 14 Oct 13:45 cli-simulation.samples --rw-r--r-- 1 user group 8.1M 14 Oct 13:50 cli-simulation.trees -``` - -Therefore our output tree sequence file that we have just inferred in a few minutes is -*even smaller* than the original ``msprime`` simulated tree sequence! -Because the output file is also a {class}`tskit.TreeSequence`, we can use the same API -to work with both. For example, to load up the original and inferred tree sequences -from their corresponding `.trees` files you can simply use the {func}`tskit.load` -function, as shown below. - -However, there are thousands of trees in these tree sequences, each of which has -10 thousand samples: much too much to easily visualise. Therefore, in the code below, we -we subset both tree sequences down to their minimal representations for the first six -sample nodes using {meth}`tskit.TreeSequence.simplify`, while keeping all sites, even if -they do not show genetic variation between the six selected samples. -:::{note} -Using this tiny subset of the overall data allows us to get an informal -feel for the trees that are inferred by `tsinfer`, but this is certainly -not a recommended approach for validating the inference!) -::: - -Once we've subsetted the tree sequences down to something that we can comfortably look -at, we can plot the trees in particular regions of interest, for example the first -2kb of genome. Below, for simplicity we use the smaller dataset that was simulated and -inferred within the the notebook, but exactly the same code will work with -the larger version run from the cli simulation. - - -```{code-cell} ipython3 -import tskit - -subset = range(0, 6) # show first 6 samples -limit = (0, 2_000) # show only the trees covering the first 2kb - -prefix = "notebook" # Or use "cli" for the larger example -source = tskit.load(prefix + "-simulation-source.trees") -source_subset = source.simplify(subset, filter_sites=False) -print(f"True tree seq, simplified to {len(subset)} sampled genomes") -source_subset.draw_svg(size=(800, 200), x_lim=limit, time_scale="rank") -``` - -```{code-cell} ipython3 -inferred = tskit.load(prefix + "-simulation.trees") -inferred_subset = inferred.simplify(subset, filter_sites=False) -print(f"Inferred tree seq, simplified to {len(subset)} sampled genomes") -inferred_subset.draw_svg(size=(800, 200), x_lim=limit) -``` - -There are a number of things to note when comparing the plots above. Most -obviously, the first tree in the inferred tree sequence is empty: by default, `tsinfer` -will not generate a genealogy before the first site in the genome (here, position 655) -or past the last site, as there is, by definition, no data in these regions. - -You can also see that the inferred tree sequence has fewer trees than the original, -simulated one. There are two reasons for this. First, there are some tree changes that -do not involve a change of topology (e.g. the first two trees in the original tree -sequence are topologically identical, as are two trees after that). -Even if we had different mutations on the -different edges in these trees, such tree changes are unlikely to be spotted. -Second, our inference depends on the mutational information that is present. If no -mutations fall on a particular edge in the tree sequence, then we have no way of -inferring that this edge existed. The fact that there are no mutations in the first, -third, and fifth trees shows that tree changes can occur without associated mutations. -As a result, there will be tree transitions that we cannot pick up. In the simulation -that we performed, the mutation rate is equal to the recombination rate, and so we -expect that many recombinations will be invisible to us. - -For similar reasons, there will be many nodes in the tree at which -polytomies occur. Here we correctly infer that sample nodes 0 and 4 group together, -as do 1 and 2. However, in the middle inferred tree we were not able to distinguish -whether node 3 was closer to node 1 or 2, so we have three children of node 7 in that -tree. - -Finally, you should note that inference does not always get the right answer. At the -first site in the inferred trees, node 5 is placed as an outgroup, but at this site -it is actually closer to the group consisting of nodes 0 and 4. - -:::{note} -Other than the sample node IDs, it is meaningless to compare node numbers in the -source and inferred tree sequences. -::: - -(sec_tutorial_data_example)= - -## Data example - -Inputting real data for inference is similar in principle to the examples above: -you simply iterate over the sites in your data file, calling {meth}`.SampleData.add_site` -for each variable site. We do not provide methods to read directly from other formats, as -these formats are often too complex for automatic conversion. However, many Python -libraries are available for reading different genomic data formats, and writing a script -to iterate over the sites in such files is usually a fairly simple task. The example -below shows how this can be done for a VCF file, using a freely available VCF reading -library. - -(sec_tutorial_read_vcf)= - -### Reading a VCF - -A common way to store genetic variation data is as a VCF (Variant Call Format) file. -The [CYVCF2 library](https://github.com/brentp/cyvcf2) provides a relatively fast -Python interface for reading VCF files. The following script demonstrates how to infer a -tree sequence from a simple VCF file. In particular the function below named -`add_diploid_sites` illustrates how to iterate over the variants in a `CYVCF2.VCF` -object and add them to a `tsinfer` sample data file. This particular script assumes -that the VCF describes a set of phased, diploid individuals, and has the ancestral state -present in the INFO field of the VCF (otherwise it takes the reference allele as the -ancestral state). For example data, we use a publicly available VCF file of the genetic -variants from chromosome 24 of ten Norwegian and French house sparrows, -*Passer domesticus* (thanks to Mark Ravinet for the data file): - - -```{code-cell} ipython3 -import cyvcf2 -import tsinfer - -def add_diploid_sites(vcf, samples): - """ - Read the sites in the vcf and add them to the samples object. - """ - # You may want to change the following line, e.g. here we allow - # "*" (a spanning deletion) to be a valid allele state - allele_chars = set("ATGCatgc*") - pos = 0 - progressbar = tqdm(total=samples.sequence_length, desc="Read VCF", unit='bp') - for variant in vcf: # Loop over variants, each assumed at a unique site - progressbar.update(variant.POS - pos) - if pos == variant.POS: - print(f"Duplicate entries at position {pos}, ignoring all but the first") - continue - else: - pos = variant.POS - if any([not phased for _, _, phased in variant.genotypes]): - raise ValueError("Unphased genotypes for variant at position", pos) - alleles = [variant.REF.upper()] + [v.upper() for v in variant.ALT] - ancestral = variant.INFO.get("AA", ".") # "." means unknown - # some VCFs (e.g. from 1000G) have many values in the AA field: take the 1st - ancestral = ancestral.split("|")[0].upper() - if ancestral == "." or ancestral == "": - ancestral_allele = MISSING_DATA - # alternatively, you could specify `ancestral = variant.REF.upper()` - else: - ancestral_allele = alleles.index(ancestral) - # Check we have ATCG alleles - for a in alleles: - if len(set(a) - allele_chars) > 0: - print(f"Ignoring site at pos {pos}: allele {a} not in {allele_chars}") - continue - # Map original allele indexes to their indexes in the new alleles list. - genotypes = [g for row in variant.genotypes for g in row[0:2]] - samples.add_site(pos, genotypes, alleles, ancestral_allele=ancestral_allele) - - -def chromosome_length(vcf): - assert len(vcf.seqlens) == 1 - return vcf.seqlens[0] - - -vcf_location = "_static/P_dom_chr24_phased.vcf.gz" -# NB: could also read from an online version by setting vcf_location to -# "https://github.com/tskit-dev/tsinfer/raw/main/docs/_static/P_dom_chr24_phased.vcf.gz" - -vcf = cyvcf2.VCF(vcf_location) -with tsinfer.SampleData( - path="P_dom_chr24_phased.samples", sequence_length=chromosome_length(vcf) -) as samples: - add_diploid_sites(vcf, samples) - -print( - "Sample file created for {} samples ".format(samples.num_samples) - + "({} individuals) ".format(samples.num_individuals) - + "with {} variable sites.".format(samples.num_sites), - flush=True, -) - -# Do the inference -ts = tsinfer.infer(samples) -print( - "Inferred tree sequence: {} trees over {} Mb ({} edges)".format( - ts.num_trees, ts.sequence_length / 1e6, ts.num_edges - ) -) -``` - -On a modern computer, this should only take a few seconds to run. - -:::{note} -This code can also be modified to read huge VCF files in parallel, see -[this issue](https://github.com/tskit-dev/tsinfer/issues/277#issuecomment-652024871). -::: - -#### Adding more information - -As the output indicates, we have created 20 individuals, each with a single chromosome, -rather than (as we probably intended) 10 individuals each with 2 chromosomes. That's -because the script has not called {meth}`.SampleData.add_individual`, so -`tsinfer` has assumed that each input chromosome belongs to a single (haploid) -{ref}`individual `. Although this is perfectly -OK and does not invalidate further analysis, it may be difficult to match information -from the original VCF file with nodes in the tree sequence. Ideally, you should aim to -incorporate all necessary information into the sample data file, from where it will be -carried through into the final inferred tree sequence. The code below demonstrates not -only how to associate each pair of samples with a diploid individual, but also how to -allocate an identifier to these individuals by using the individual's metadata field. -The code also gives a basic illustration of adding extra information about the -{ref}`population` from which each individual has -been sampled. - -```{code-cell} ipython3 -import json - -def add_populations(vcf, samples): - """ - Add tsinfer Population objects and returns a list of IDs corresponding to the VCF samples. - """ - # In this VCF, the first letter of the sample name refers to the population - samples_first_letter = [sample_name[0] for sample_name in vcf.samples] - pop_lookup = {} - pop_lookup["8"] = samples.add_population(metadata={"country": "Norway"}) - pop_lookup["F"] = samples.add_population(metadata={"country": "France"}) - return [pop_lookup[first_letter] for first_letter in samples_first_letter] - - -def add_diploid_individuals(vcf, samples, populations): - for name, population in zip(vcf.samples, populations): - samples.add_individual(ploidy=2, metadata={"name": name}, population=population) - - -# Repeat as previously but add both populations and individuals -vcf_location = "_static/P_dom_chr24_phased.vcf.gz" -# NB: could also read from an online version by setting vcf_location to -# "https://github.com/tskit-dev/tsinfer/raw/main/docs/_static/P_dom_chr24_phased.vcf.gz" - -vcf = cyvcf2.VCF(vcf_location) -with tsinfer.SampleData( - path="P_dom_chr24_phased.samples", sequence_length=chromosome_length(vcf) -) as samples: - populations = add_populations(vcf, samples) - add_diploid_individuals(vcf, samples, populations) - add_diploid_sites(vcf, samples) - -print( - "Sample file created for {} samples ".format(samples.num_samples) - + "({} individuals) ".format(samples.num_individuals) - + "with {} variable sites.".format(samples.num_sites), - flush=True, -) - -# Do the inference -sparrow_ts = tsinfer.infer(samples) -print( - "Inferred tree sequence `{}`: {} trees over {} Mb".format( - "sparrow_ts", sparrow_ts.num_trees, sparrow_ts.sequence_length / 1e6 - ) -) -# Check the metadata -for sample_node_id in sparrow_ts.samples(): - individual_id = sparrow_ts.node(sample_node_id).individual - population_id = sparrow_ts.node(sample_node_id).population - print( - "Node", - sample_node_id, - "labels a chr24 sampled from individual", - json.loads(sparrow_ts.individual(individual_id).metadata), - "in", - json.loads(sparrow_ts.population(population_id).metadata)["country"], - ) -``` - - -### Analysis - -To analyse your inferred tree sequence you can use all the analysis functions built in to -the [tskit](https://tskit.dev/tskit/docs/stable/) library. The -{ref}`tskit tutorial` provides much more detail. Below we just give a -flavour of the possibilities. - -To quickly eyeball small datasets, we can draw the entire tree sequence, or -{meth}`~tskit.Tree.draw` the tree at any particular genomic position. The following -code demonstrates how to use the {meth}`tskit.TreeSequence.at` method to obtain the tree -1Mb from the start of the sequence, and plot it, colouring the tips according to -population: - -```{code-cell} ipython3 -colours = {"Norway": "red", "France": "blue"} -colours_for_node = {} -for n in sparrow_ts.samples(): - population_data = sparrow_ts.population(sparrow_ts.node(n).population) - colours_for_node[n] = colours[json.loads(population_data.metadata)["country"]] - -individual_for_node = {} -for n in sparrow_ts.samples(): - individual_data = sparrow_ts.individual(sparrow_ts.node(n).individual) - individual_for_node[n] = json.loads(individual_data.metadata)["name"] - -tree = sparrow_ts.at(1e6) -tree.draw( - path="tree_at_1Mb.svg", - height=700, - width=1200, - node_labels=individual_for_node, - node_colours=colours_for_node, -) -``` - -This tree seems to suggest that Norwegian and French individuals may not fall into -discrete groups on the tree, but be part of a larger mixing population. Note, however, -that this is only one of thousands of trees, and may not be typical of the genome as a -whole. Additionally, most data sets will have far more samples than this example, so -trees visualized in this way are likely to be huge and difficult to understand. As in -the {ref}`simulation example ` above, one possibility -is to {meth}`~tskit.TreeSequence.simplify` the tree sequence to a limited number of -samples, but it is likely that most studies will -instead rely on various statistical summaries of the trees. Storing genetic data as a -tree sequence makes many of these calculations fast and efficient, and tskit has both a -set of {ref}`commonly used methods` and a framework that -{ref}`generalizes population genetic statistics`. For example, -the allele or site frequency spectrum (SFS) can be calculated using -{meth}`tskit.TreeSequence.allele_frequency_spectrum` and the allelic diversity ("Tajima's -{math}`{\pi}`") using {meth}`tskit.TreeSequence.diversity`, both of which can also be -calculated locally (e.g. {ref}`per tree or in genomic windows `). As -a basic example, here's how to calculate genome-wide {math}`F_{st}` between the Norwegian -and French (sub)populations: - -```{code-cell} ipython3 -samples_listed_by_population = [ - sparrow_ts.samples(population=pop_id) - for pop_id in range(sparrow_ts.num_populations) -] - -print("Fst between populations:", sparrow_ts.Fst(samples_listed_by_population)) -``` - -As noted above, the times of nodes are uncalibrated so we shouldn't perform -calculations that reply on branch lengths. However, some statistics, such as the -genealogical nearest neighbour (GNN) proportions are calculated from the topology -of the trees. using the individual and population metadata to format the results table in -a tidy manner - -```{code-cell} ipython3 -import pandas as pd - -gnn = sparrow_ts.genealogical_nearest_neighbours( - sparrow_ts.samples(), samples_listed_by_population -) - -# Tabulate GNN nicely using a Pandas dataframe with named rows and columns -sample_nodes = [sparrow_ts.node(n) for n in sparrow_ts.samples()] -sample_ids = [n.id for n in sample_nodes] -sample_names = [ - json.loads(sparrow_ts.individual(n.individual).metadata)["name"] - for n in sample_nodes -] -sample_pops = [ - json.loads(sparrow_ts.population(n.population).metadata)["country"] - for n in sample_nodes -] -gnn_table = pd.DataFrame( - data=gnn, - index=[ - pd.Index(sample_ids, name="Sample node"), - pd.Index(sample_names, name="Bird"), - pd.Index(sample_pops, name="Country"), - ], - columns=[json.loads(p.metadata)["country"] for p in sparrow_ts.populations()], -) - -print(gnn_table) -# Summarize GNN for all birds from the same country -print(gnn_table.groupby(level="Country").mean()) -``` - - -From this, it can be seen that the genealogical nearest neighbours of birds in Norway -tend also to be in Norway, and vice versa for birds from France. In other words, there is -a small but noticable degree of population structure in the data. The bird ``8L19766`` -and one of the chromosomes of bird `FR046` seem to buck this trend, and it would -probably be worth checking these data points further (perhaps they are migrant birds), -and verifying if other chromosomes in these individuals show the same pattern. - -Much more can be done with the genomic statistics built into tskit. For further -information, please refer to the {ref}`statistics section` of the -tskit documentation. diff --git a/docs/usage.md b/docs/usage.md new file mode 100644 index 00000000..7250acaa --- /dev/null +++ b/docs/usage.md @@ -0,0 +1,639 @@ +--- +jupytext: + text_representation: + extension: .md + format_name: myst + format_version: 0.12 + jupytext_version: 1.9.1 +kernelspec: + display_name: Python 3 + language: python + name: python3 +--- + +:::{currentmodule} tsinfer +::: + + +(sec_usage)= + +# Usage + +(sec_usage_toy_example)= + +## Toy example + +_Tsinfer_ takes as input a [Zarr](https://zarr.readthedocs.io/) file, with phased variant data encoded in the +[VCF Zarr](https://github.com/sgkit-dev/vcf-zarr-spec/) (.vcz) format. The standard +route to create such a file is by conversion from a VCF file, e.g. using +[vcf2zarr](https://sgkit-dev.github.io/bio2zarr/vcf2zarr/overview.html) as described later in this +document. However, for the moment we'll just use a pre-generated dataset: + + +```{code-cell} ipython3 +import zarr +ds = zarr.load("_static/example_data.vcz") +``` + +This is what the genotypes stored in that datafile look like: + +```{code-cell} +:"tags": ["remove-input"] +import numpy as np +assert all(len(np.unique(a)) == len(a) for a in ds['variant_allele']) +assert any([np.sum(g) == 1 for g in ds['call_genotype']]) # at least one singleton +assert any([np.sum(g) == 0 for g in ds['call_genotype']]) # at least one non-variable + +alleles = ds['variant_allele'].astype(str) +sites = np.arange(ds['call_genotype'].shape[0]) +print(" " * 22, "Site:", " ".join(str(x) for x in range(8)), "\n") +for sample in range(ds['call_genotype'].shape[1]): + for genome in range(ds['call_genotype'].shape[2]): + genotypes = ds['call_genotype'][:,sample, genome] + print( + f"Diploid sample {sample} (genome {genome}):", + " ".join(alleles[sites, genotypes]) + ) +``` + +### VariantData and ancestral alleles + +We wish to infer a genealogy that could have given rise to this data set. To run _tsinfer_ +we wrap the .vcz file in a `tsinfer.VariantData` object. This requires an +*ancestral allele* to be specified for each site; there are +many methods for calculating these: details are outside the scope of this manual, but we +have started a [discussion topic](https://github.com/tskit-dev/tsinfer/discussions/523) +on this issue to provide some recommendations. + +Sometimes VCF files will contain the +ancestral allele in the "AA" info field, in which case it will be encoded in the +`variant_AA` field of the .vcz file. It's also possible to provide a numpy array +of ancestral alleles, of the same length as the number of variants. Ancestral +alleles that are not in the list of alleles for their respective site are treated as unknown +and not used for inference (with a warning given). + +```{code-cell} +import tsinfer + +# For this example take the REF allele (index 0) as ancestral +ancestral_alleles = ds['variant_allele'][:,0].astype(str) +# This is just a numpy array, set the last site to an unknown value, for demo purposes +ancestral_alleles[-1] = "." + +vdata = tsinfer.VariantData("_static/example_data.vcz", ancestral_alleles) +``` + +The `VariantData` object is a lightweight wrapper around the .vcz file. +We'll use it to infer a tree sequence on the basis of the sites that vary between the +different samples. However, note that certain sites are not used by _tsinfer_ for inferring +the genealogy (although they are still encoded in the final tree sequence), These are: + +* Non-variable (fixed) sites, e.g. site 4 above +* Singleton sites, where only one genome has the derived allele e.g. site 5 above +* Sites where the ancestral allele is unknown, e.g. demonstrated by site 7 above +* Multialleleic sites, with more than 2 alleles (but see + [here](https://github.com/tskit-dev/tsinfer/issues/670) for a workaround) + +Additionally, during the inference step, additional sites can be flagged as not for use in +inference, for example if they are deemed unreliable (this is done +via the `exclude_positions` parameter). + +### Masks + +Sites which are not used for inference will still be included in the final tree sequence, with +mutations at those sites being placed onto branches by parsimony. However, it is also possible +to completely exclude sites and samples from the final tree sequence, by specifing a `site_mask` +and/or a `sample_mask` when creating the `VariantData` object. Such sites or samples will be +completely omitted from both inference and the final tree sequence. This can be useful, for +example, to reduce the amount of computation required for an inference. + +### Topology inference + +Once we have stored our data in a `.VariantData` object, we can easily infer +a {ref}`tree sequence` using the Python +API. Note that each sample in the original .vcz file will correspond to an *individual* +in the resulting tree sequence. Since these three individuals are diploid, the resulting +tree sequence will have `ts.num_samples == 6` (i.e. unlike in a .vcz file, a "sample" in +tskit refers to a haploid genome, not a diploid individual). + +```{code-cell} ipython3 +inferred_ts = tsinfer.infer(vdata) +print("Inferred a genetic genealogy for {inferred_ts.num_samples} (haploid) genomes") +``` + +And that's it: we now have a fully functional {class}`tskit.TreeSequence` +object that we can interrogate in the usual ways. For example, we can look +at the {meth}`aligned haplotypes` in the tree sequence: + +```{code-cell} ipython3 +print("Sample\tInferred sequence") +for sample_id, seq in zip( + inferred_ts.samples(), + inferred_ts.alignments(missing_data_character="."), +): + print(sample_id, seq, sep="\t") +``` + +You will notice that the sample sequences generated by this tree sequence are identical +to the input haplotype data: apart from the imputation of +{ref}`missing data`, _tsinfer_ is guaranteed to +losslessly encode any given input data, regardless of the inferred topology. You can +check this programatically if you want: + +```{code-cell} ipython3 +import numpy as np +for v_orig, v_inferred in zip(vdata.variants(), inferred_ts.variants()): + if any( + np.array(v_orig.alleles)[v_orig.genotypes] != + np.array(v_inferred.alleles)[v_inferred.genotypes] + ): + raise ValueError("Genotypes in original dataset and inferred tree seq not equal") +print("** Genotypes in original dataset and inferred tree sequence are the same **") +``` + +We can examine the inferred genetic genealogy, +in the form of {ref}`local trees`. _Tsinfer_ has also +placed mutations on the genealogy to explain the observed genetic variation: + +```{code-cell} ipython3 +mut_labels = { + m.id: "{:g}: {}→{}".format( + s.position, + inferred_ts.mutation(m.parent).derived_state if m.parent >= 0 else s.ancestral_state, + m.derived_state, + ) + for s in inferred_ts.sites() + for m in s.mutations +} +node_labels = {u: u for u in inferred_ts.samples()} + +inferred_ts.draw_svg( + size=(500, 200), node_labels=node_labels, mutation_labels=mut_labels, y_axis=True) +``` + +We have inferred 4 trees aong the genome. Note, however, that there are "polytomies" in the +trees, where some nodes have more than two children (e.g. the root node in the first tree). +This is a common feature of trees inferred by _tsinfer_ and signals that there was not +sufficient information to resolve the tree at this internal node. + + +Each internal (non-sample) node in this inferred tree represents an ancestral sequence, +constructed on the basis of shared, derived alleles at one or more of the sites. By +default, the time of each such node is *not* measured in years or generations, but +is simply the frequency of the shared derived allele(s) on which the ancestral sequence +is based. For this reason, the time units are described as "uncalibrated" in the plot, +and trying to calculate statistics based on branch lengths will raise an error: + +```{code-cell} ipython3 +:tags: ["raises-exception"] +inferred_ts.diversity(mode="branch") +``` + +To add meaningful dates to an inferred tree sequence, allowing meaningful branch length +statistics to be calulated, you must use additional +software such as [tsdate](https://tskit.dev/software/tsdate.html): the _tsinfer_ +algorithm is only intended to infer the genetic relationships between the samples +(i.e. the *topology* of the tree sequence). + + +(sec_usage_simulation_example)= + +## Simulation example + +The previous example showed how we can infer a tree sequence using the Python API for a trivial +toy example. However, for real data we will not prepare our data and infer the tree sequence all +in one go; rather, we will usually split the process into at least two distinct steps. + +The first step in any inference is to prepare your data and create the .vcz file. +For simplicity here we'll use +Python to simulate some data under the coalescent with recombination, using +[msprime](https://msprime.readthedocs.io/en/stable/api.html#msprime.simulate): + +```{code-cell} ipython3 + +import builtins +import sys +import os +import subprocess + +from Bio import bgzf +import numpy as np + +import msprime +import tsinfer + +if getattr(builtins, "__IPYTHON__", False): # if running IPython: e.g. in a notebook + num_diploids, seq_len = 100, 10_000 + name = "notebook-simulation" + python = sys.executable +else: # Take parameters from the command-line + num_diploids, seq_len = int(sys.argv[1]), float(sys.argv[2]) + name = "cli-simulation" + python = "python" + +ts = msprime.sim_ancestry( + num_diploids, + population_size=10**4, + recombination_rate=1e-8, + sequence_length=seq_len, + random_seed=6, +) +ts = msprime.sim_mutations(ts, rate=1e-8, random_seed=7) +ts.dump(name + "-source.trees") +print( + f"Simulated {ts.num_samples} samples over {seq_len/1e6} Mb:", + f"{ts.num_trees} trees and {ts.num_sites} sites" +) + +# Convert to a zarr file: this should be easier once a tskit2zarr utility is made, see +# https://github.com/sgkit-dev/bio2zarr/issues/232 +np.save(f"{name}-AA.npy", [s.ancestral_state for s in ts.sites()]) +vcf_name = f"{name}.vcf.gz" +with bgzf.open(vcf_name, "wt") as f: + ts.write_vcf(f) +subprocess.run(["tabix", vcf_name]) +ret = subprocess.run( + [python, "-m", "bio2zarr", "vcf2zarr", "convert", "--force", vcf_name, f"{name}.vcz"], + stderr = subprocess.DEVNULL if name == "notebook-simulation" else None, +) +assert os.path.exists(f"{name}.vcz") + +if ret.returncode == 0: + print(f"Converted to {name}.vcz") +``` + +Here we first run a simulation then we create a vcf file and convert it to .vcz format. +If we store this code in a file named `simulate-data.py`, we can run it on the +command line, providing parameters to generate much bigger datasets: + + +```{code} bash +% python simulate-data.py 5000 10_000_000 +Simulated 10000 samples over 10.0 Mb: 36204 trees and 38988 sites + Scan: 100%|████████████████████████████████████████████████████████████████████| 1.00/1.00 [00:00<00:00, 2.86files/s] + Explode: 100%|████████████████████████████████████████████████████████████████████| 39.0k/39.0k [01:17<00:00, 501vars/s] + Encode: 100%|███████████████████████████████████████████████████████████████████████| 976M/976M [00:12<00:00, 78.0MB/s] +Finalise: 100%|█████████████████████████████████████████████████████████████████████| 10.0/10.0 [00:00<00:00, 421array/s] +``` + +Here, we simulated a sample of 10 thousand chromosomes, each 10Mb long, under +human-like parameters. That has created 39K segregating sites. The output +above shows the state of the vcf-to-zarr conversion at the end of this process, +indicating that it took about a minute to convert the data into .vcz format +(this only needs doing once) + +Examining the files on the command line, we then see the following:: + +```{code} bash +$ du -sh cli-simulation* +156K cli-simulation-AA.npy +8.8M cli-simulation-source.trees + 25M cli-simulation.vcf.gz +8.0K cli-simulation.vcf.gz.tbi +9.4M cli-simulation.vcz +``` + +The `cli-simulation.vcz` zarr data is quite small, about the same size as the +original `msprime` tree sequence file. Since (like all zarr datastores) + `cli-simulation.vcz` is a directory +we can peer inside to look where everything is stored: + +```{code} bash +$ du -sh cli-simulation.vcz/* +8.7M cli-simulation.vcz/call_genotype + 88K cli-simulation.vcz/call_genotype_mask + 88K cli-simulation.vcz/call_genotype_phased + 12K cli-simulation.vcz/contig_id + 12K cli-simulation.vcz/contig_length + 12K cli-simulation.vcz/filter_id + 28K cli-simulation.vcz/sample_id + 52K cli-simulation.vcz/variant_allele + 24K cli-simulation.vcz/variant_contig + 24K cli-simulation.vcz/variant_filter + 48K cli-simulation.vcz/variant_id + 24K cli-simulation.vcz/variant_id_mask +144K cli-simulation.vcz/variant_position + 24K cli-simulation.vcz/variant_quality +``` + +Most of this output is not particularly interesting here, but we can see that the +`call_genotype` array which holds all of the sample genotypes (and thus the vast bulk of the +actual data) only requires about 8.7MB compared to 25MB for the compressed vcf file. +In practice this means we can keep such files lying around without taking up too much space. + +Once we have our `.vcz` file created, running the inference is straightforward. + +```{code-cell} ipython3 +# Infer & save a ts from the notebook simulation. +ancestral_alleles = np.load(f"{name}-AA.npy") +vdata = tsinfer.VariantData(f"{name}.vcz", ancestral_alleles) +tsinfer.infer(vdata, progress_monitor=True, num_threads=4).dump(name + ".trees") +``` + +Running the `infer` command runs the full inference pipeline in one go (the individual steps +are explained {ref}`here `). We provided two extra arguments to `infer`: +`progress_monitor` gives us the progress bars show above, and `num_threads=4` tells +_tsinfer_ to use four worker threads whenever it can use them. + +Performing this inference on the `cli-simulation` data using Core i5 processor (launched 2009) +with 4GiB of RAM, took about eighteen minutes. The maximum memory usage was about 600MiB. + +The output files look like this: + +```{code} bash +$ ls -lh cli-simulation* +-rw-r--r-- 1 user group 8.2M 14 Oct 13:45 cli-simulation-source.trees +-rw-r--r-- 1 user group 22M 14 Oct 13:45 cli-simulation.samples +-rw-r--r-- 1 user group 8.1M 14 Oct 13:50 cli-simulation.trees +``` + +Therefore our output tree sequence file that we have just inferred in a few minutes is +*even smaller* than the original ``msprime`` simulated tree sequence! +Because the output file is also a {class}`tskit.TreeSequence`, we can use the same API +to work with both. For example, to load up the original and inferred tree sequences +from their corresponding `.trees` files you can simply use the {func}`tskit.load` +function, as shown below. + +However, there are thousands of trees in these tree sequences, each of which has +10 thousand samples: much too much to easily visualise. Therefore, in the code below, we +we subset both tree sequences down to their minimal representations for the first six +sample nodes using {meth}`tskit.TreeSequence.simplify`, while keeping all sites, even if +they do not show genetic variation between the six selected samples. +:::{note} +Using this tiny subset of the overall data allows us to get an informal +feel for the trees that are inferred by _tsinfer_, but this is certainly +not a recommended approach for validating the inference! +::: + +Once we've subsetted the tree sequences down to something that we can comfortably look +at, we can plot the trees in particular regions of interest, for example the first +2kb of genome. Below, for simplicity we use the smaller dataset that was simulated and +inferred within the the notebook, but exactly the same code will work with +the larger version run from the cli simulation. + + +```{code-cell} ipython3 +import tskit + +subset = range(0, 6) # show first 6 samples +limit = (0, 2_000) # show only the trees covering the first 2kb + +prefix = "notebook" # Or use "cli" for the larger example +source = tskit.load(prefix + "-simulation-source.trees") +source_subset = source.simplify(subset, filter_sites=False) +print(f"True tree seq, simplified to {len(subset)} sampled genomes") +source_subset.draw_svg(size=(800, 200), x_lim=limit, time_scale="rank") +``` + +```{code-cell} ipython3 +inferred = tskit.load(prefix + "-simulation.trees") +inferred_subset = inferred.simplify(subset, filter_sites=False) +print(f"Inferred tree seq, simplified to {len(subset)} sampled genomes") +inferred_subset.draw_svg(size=(800, 200), x_lim=limit) +``` + +There are a number of things to note when comparing the plots above. Most +obviously, the first tree in the inferred tree sequence is empty: by default, _tsinfer_ +will not generate a genealogy before the first site in the genome (here, position 655) +or past the last site, as there is, by definition, no data in these regions. + +You can also see that the inferred tree sequence has fewer trees than the original, +simulated one. There are two reasons for this. First, there are some tree changes that +do not involve a change of topology (e.g. the first two trees in the original tree +sequence are topologically identical, as are two trees after that). +Even if we had different mutations on the +different edges in these trees, such tree changes are unlikely to be spotted. +Second, our inference depends on the mutational information that is present. If no +mutations fall on a particular edge in the tree sequence, then we have no way of +inferring that this edge existed. The fact that there are no mutations in the first, +third, and fifth trees shows that tree changes can occur without associated mutations. +As a result, there will be tree transitions that we cannot pick up. In the simulation +that we performed, the mutation rate is equal to the recombination rate, and so we +expect that many recombinations will be invisible to us. + +For similar reasons, there will be many nodes in the tree at which +polytomies occur. Here we correctly infer that sample nodes 0 and 4 group together, +as do 1 and 2. However, in the middle inferred tree we were not able to distinguish +whether node 3 was closer to node 1 or 2, so we have three children of node 7 in that +tree. + +Finally, you should note that inference does not always get the right answer. At the +first site in the inferred trees, node 5 is placed as an outgroup, but at this site +it is actually closer to the group consisting of nodes 0 and 4. + +:::{note} +Other than the sample node IDs, it is meaningless to compare node numbers in the +source and inferred tree sequences. +::: + +(sec_usage_data_example)= + +## Data example + +Inputting real data for inference is similar in principle to the examples above. +All that is required is a .vcz file, which can be created using +[vcf2zarr](https://sgkit-dev.github.io/bio2zarr/vcf2zarr/overview.html) as above + +(sec_usage_read_vcf)= + +### Reading a VCF + +For example data, we use a publicly available VCF file of the genetic +variants from chromosome 24 of ten Norwegian and French house sparrows, +*Passer domesticus* (thanks to Mark Ravinet for the data file): + +```{code-cell} ipython3 +:tags: ["remove-output"] +import zarr + +vcf_location = "_static/P_dom_chr24_phased.vcf.gz" +!python -m bio2zarr vcf2zarr convert --force {vcf_location} sparrows.vcz +``` + +This creates the `sparrows.vcz` datastore, which we open using `tsinfer.VariantData`. +The original VCF had ancestral alleles specified in the `AA` INFO field, so we can +simply provide the string `"variant_AA"` as the ancestral_allele parameter. + +```{code-cell} ipython3 +# Do the inference: this VCF has ancestral alleles in the AA field +vdata = tsinfer.VariantData("sparrows.vcz", ancestral_allele="variant_AA") +ts = tsinfer.infer(vdata) +print( + "Inferred tree sequence: {} trees over {} Mb ({} edges)".format( + ts.num_trees, ts.sequence_length / 1e6, ts.num_edges + ) +) +``` + +On a modern computer, this should only take a few seconds to run. + +### Adding more metadata + +We can add additional data to the zarr file, which will make it through to the tree sequence. +For instance, we might want to mark which population each individual comes from. +This can be done by adding some descriptive metadata for each population, and the assigning +each sample to one of those populations. In our case, the sample sparrow IDs beginning with +"FR" are from France: + +```{code-cell} ipython3 +import json +import numpy as np +import tskit +import zarr + +ds = zarr.load("sparrows.vcz") + +populations = ("Norway", "France") +# save the population data in json format +schema = json.dumps(tskit.MetadataSchema.permissive_json().schema).encode() +zarr.save("sparrows.vcz/populations_metadata_schema", schema) +metadata = [ + json.dumps({"name": pop, "description": "The country from which this sample comes"}).encode() + for pop in populations +] +zarr.save("sparrows.vcz/populations_metadata", metadata) + +# Now assign each diploid sample to a population +num_individuals = ds["sample_id"].shape[0] +individuals_population = np.full(num_individuals, tskit.NULL, dtype=np.int32) +for i, name in enumerate(ds["sample_id"]): + if name.startswith("FR"): + individuals_population[i] = populations.index("France") + else: + individuals_population[i] = populations.index("Norway") +zarr.save("sparrows.vcz/individuals_population", individuals_population) +``` + +Now when we carry out the inference, we get a tree sequence in which the nodes are +correctly assigned to named populations + +```{code-cell} ipython3 +vdata = tsinfer.VariantData("sparrows.vcz", ancestral_allele="variant_AA") +sparrow_ts = tsinfer.infer(vdata) + +for sample_node_id in sparrow_ts.samples(): + individual_id = sparrow_ts.node(sample_node_id).individual + population_id = sparrow_ts.node(sample_node_id).population + print( + "Node", + sample_node_id, + "labels a chr24 sampled from individual", + sparrow_ts.individual(individual_id).metadata["variant_data_sample_id"], + "in", + sparrow_ts.population(population_id).metadata["name"], + ) +``` + +### Analysis + +To analyse your inferred tree sequence you can use all the analysis functions built in to +the [tskit](https://tskit.dev/tskit/docs/stable/) library. The +{ref}`tskit tutorial` provides much more detail. Below we just give a +flavour of the possibilities. + +To quickly eyeball small datasets, we can draw the entire tree sequence, or +{meth}`~tskit.Tree.draw` the tree at any particular genomic position. The following +code demonstrates how to use the {meth}`tskit.TreeSequence.at` method to obtain the tree +1Mb from the start of the sequence, and plot it, colouring the tips according to +population: + +```{code-cell} ipython3 +colours = {"Norway": "red", "France": "blue"} +colours_for_node = {} +for n in sparrow_ts.samples(): + population_data = sparrow_ts.population(sparrow_ts.node(n).population) + colours_for_node[n] = colours[population_data.metadata["name"]] + +individual_for_node = {} +for n in sparrow_ts.samples(): + individual_data = sparrow_ts.individual(sparrow_ts.node(n).individual) + individual_for_node[n] = individual_data.metadata["variant_data_sample_id"] + +tree = sparrow_ts.at(1e6) +tree.draw( + path="tree_at_1Mb.svg", + height=700, + width=1200, + node_labels=individual_for_node, + node_colours=colours_for_node, +) +``` + +This tree seems to suggest that Norwegian and French individuals may not fall into +discrete groups on the tree, but be part of a larger mixing population. Note, however, +that this is only one of thousands of trees, and may not be typical of the genome as a +whole. Additionally, most data sets will have far more samples than this example, so +trees visualized in this way are likely to be huge and difficult to understand. As in +the {ref}`simulation example ` above, one possibility +is to {meth}`~tskit.TreeSequence.simplify` the tree sequence to a limited number of +samples, but it is likely that most studies will +instead rely on various statistical summaries of the trees. Storing genetic data as a +tree sequence makes many of these calculations fast and efficient, and tskit has both a +set of {ref}`commonly used methods` and a framework that +{ref}`generalizes population genetic statistics`. For example, +the allele or site frequency spectrum (SFS) can be calculated using +{meth}`tskit.TreeSequence.allele_frequency_spectrum` and the allelic diversity ("Tajima's +{math}`{\pi}`") using {meth}`tskit.TreeSequence.diversity`, both of which can also be +calculated locally (e.g. {ref}`per tree or in genomic windows `). As +a basic example, here's how to calculate genome-wide {math}`F_{st}` between the Norwegian +and French (sub)populations: + +```{code-cell} ipython3 +samples_listed_by_population = [ + sparrow_ts.samples(population=pop_id) + for pop_id in range(sparrow_ts.num_populations) +] + +print("Fst between populations:", sparrow_ts.Fst(samples_listed_by_population)) +``` + +As noted above, the times of nodes are uncalibrated so we shouldn't perform +calculations that reply on branch lengths. However, some statistics, such as the +genealogical nearest neighbour (GNN) proportions are calculated from the topology +of the trees. Here's an example, using the individual and population metadata to format the results table in +a tidy manner + +```{code-cell} ipython3 +import pandas as pd + +gnn = sparrow_ts.genealogical_nearest_neighbours( + sparrow_ts.samples(), samples_listed_by_population +) + +# Tabulate GNN nicely using a Pandas dataframe with named rows and columns +sample_nodes = [sparrow_ts.node(n) for n in sparrow_ts.samples()] +sample_ids = [n.id for n in sample_nodes] +sample_names = [ + sparrow_ts.individual(n.individual).metadata["variant_data_sample_id"] + for n in sample_nodes +] +sample_pops = [ + sparrow_ts.population(n.population).metadata["name"] + for n in sample_nodes +] +gnn_table = pd.DataFrame( + data=gnn, + index=[ + pd.Index(sample_ids, name="Sample node"), + pd.Index(sample_names, name="Bird"), + pd.Index(sample_pops, name="Country"), + ], + columns=[p.metadata["name"] for p in sparrow_ts.populations()], +) + +print(gnn_table) +# Summarize GNN for all birds from the same country +print(gnn_table.groupby(level="Country").mean()) +``` + + +From this, it can be seen that the genealogical nearest neighbours of birds in Norway +tend also to be in Norway, and vice versa for birds from France. In other words, there is +a small but noticable degree of population structure in the data. The bird ``8L19766`` +and one of the chromosomes of bird `FR046` seem to buck this trend, and it would +probably be worth checking these data points further (perhaps they are migrant birds), +and verifying if other chromosomes in these individuals show the same pattern. + +Much more can be done with the genomic statistics built into tskit. For further +information, please refer to the {ref}`statistics section` of the +tskit documentation. diff --git a/requirements/CI-docs/requirements.txt b/requirements/CI-docs/requirements.txt index 1a8ed794..661a6405 100644 --- a/requirements/CI-docs/requirements.txt +++ b/requirements/CI-docs/requirements.txt @@ -8,4 +8,6 @@ daiquiri==3.2.5.1 msprime==1.3.2 sgkit[vcf]==0.9.0 ipywidgets==8.1.3 +Bio==1.7.1 +bio2zarr==0.1.1 sphinx-book-theme #Unpinned to allow easy updating.