From 09865fbbbc47e40da986f8f8a7914866bce016cc Mon Sep 17 00:00:00 2001 From: Yan Wong Date: Fri, 26 Jul 2024 16:17:40 +0100 Subject: [PATCH] Remove sgkit from the tutorial --- docs/.gitignore | 3 + docs/_static/example_data.vcz/.zattrs | 6 + docs/_static/example_data.vcz/.zgroup | 3 + docs/_static/example_data.vcz/.zmetadata | 243 ++++++++++++++++++ .../example_data.vcz/call_genotype/.zarray | 24 ++ .../example_data.vcz/call_genotype/.zattrs | 9 + .../example_data.vcz/call_genotype/0.0.0 | Bin 0 -> 64 bytes .../call_genotype_mask/.zarray | 24 ++ .../call_genotype_mask/.zattrs | 9 + .../example_data.vcz/call_genotype_mask/0.0.0 | Bin 0 -> 64 bytes .../call_genotype_phased/.zarray | 22 ++ .../call_genotype_phased/.zattrs | 8 + .../example_data.vcz/call_genotype_phased/0.0 | Bin 0 -> 40 bytes .../example_data.vcz/contig_id/.zarray | 20 ++ .../example_data.vcz/contig_id/.zattrs | 6 + docs/_static/example_data.vcz/contig_id/0 | Bin 0 -> 20 bytes .../example_data.vcz/sample_id/.zarray | 20 ++ .../example_data.vcz/sample_id/.zattrs | 6 + docs/_static/example_data.vcz/sample_id/0 | Bin 0 -> 40 bytes .../example_data.vcz/variant_allele/.zarray | 22 ++ .../example_data.vcz/variant_allele/.zattrs | 7 + .../example_data.vcz/variant_allele/0.0 | Bin 0 -> 32 bytes .../example_data.vcz/variant_contig/.zarray | 20 ++ .../example_data.vcz/variant_contig/.zattrs | 6 + .../_static/example_data.vcz/variant_contig/0 | Bin 0 -> 80 bytes .../example_data.vcz/variant_position/.zarray | 20 ++ .../example_data.vcz/variant_position/.zattrs | 6 + .../example_data.vcz/variant_position/0 | Bin 0 -> 80 bytes docs/tutorial.md | 38 +-- 29 files changed, 505 insertions(+), 17 deletions(-) create mode 100644 docs/_static/example_data.vcz/.zattrs create mode 100644 docs/_static/example_data.vcz/.zgroup create mode 100644 docs/_static/example_data.vcz/.zmetadata create mode 100644 docs/_static/example_data.vcz/call_genotype/.zarray create mode 100644 docs/_static/example_data.vcz/call_genotype/.zattrs create mode 100644 docs/_static/example_data.vcz/call_genotype/0.0.0 create mode 100644 docs/_static/example_data.vcz/call_genotype_mask/.zarray create mode 100644 docs/_static/example_data.vcz/call_genotype_mask/.zattrs create mode 100644 docs/_static/example_data.vcz/call_genotype_mask/0.0.0 create mode 100644 docs/_static/example_data.vcz/call_genotype_phased/.zarray create mode 100644 docs/_static/example_data.vcz/call_genotype_phased/.zattrs create mode 100644 docs/_static/example_data.vcz/call_genotype_phased/0.0 create mode 100644 docs/_static/example_data.vcz/contig_id/.zarray create mode 100644 docs/_static/example_data.vcz/contig_id/.zattrs create mode 100644 docs/_static/example_data.vcz/contig_id/0 create mode 100644 docs/_static/example_data.vcz/sample_id/.zarray create mode 100644 docs/_static/example_data.vcz/sample_id/.zattrs create mode 100644 docs/_static/example_data.vcz/sample_id/0 create mode 100644 docs/_static/example_data.vcz/variant_allele/.zarray create mode 100644 docs/_static/example_data.vcz/variant_allele/.zattrs create mode 100644 docs/_static/example_data.vcz/variant_allele/0.0 create mode 100644 docs/_static/example_data.vcz/variant_contig/.zarray create mode 100644 docs/_static/example_data.vcz/variant_contig/.zattrs create mode 100644 docs/_static/example_data.vcz/variant_contig/0 create mode 100644 docs/_static/example_data.vcz/variant_position/.zarray create mode 100644 docs/_static/example_data.vcz/variant_position/.zattrs create mode 100644 docs/_static/example_data.vcz/variant_position/0 diff --git a/docs/.gitignore b/docs/.gitignore index f3c3680f..f4b2a542 100644 --- a/docs/.gitignore +++ b/docs/.gitignore @@ -1,4 +1,7 @@ notebook-simulation.trees notebook-simulation.samples notebook-simulation-source.trees +notebook-simulation.vc* +notebook-simulation-AA.npy P_dom_chr24_phased.samples +sparrows.vcz diff --git a/docs/_static/example_data.vcz/.zattrs b/docs/_static/example_data.vcz/.zattrs new file mode 100644 index 00000000..f778611e --- /dev/null +++ b/docs/_static/example_data.vcz/.zattrs @@ -0,0 +1,6 @@ +{ + "contigs": [ + "0" + ], + "source": "sgkit-0.9.0" +} \ No newline at end of file diff --git a/docs/_static/example_data.vcz/.zgroup b/docs/_static/example_data.vcz/.zgroup new file mode 100644 index 00000000..3b7daf22 --- /dev/null +++ b/docs/_static/example_data.vcz/.zgroup @@ -0,0 +1,3 @@ +{ + "zarr_format": 2 +} \ No newline at end of file diff --git a/docs/_static/example_data.vcz/.zmetadata b/docs/_static/example_data.vcz/.zmetadata new file mode 100644 index 00000000..24b95bd0 --- /dev/null +++ b/docs/_static/example_data.vcz/.zmetadata @@ -0,0 +1,243 @@ +{ + "metadata": { + ".zattrs": { + "contigs": [ + "0" + ], + "source": "sgkit-0.9.0" + }, + ".zgroup": { + "zarr_format": 2 + }, + "call_genotype/.zarray": { + "chunks": [ + 8, + 3, + 2 + ], + "compressor": { + "blocksize": 0, + "clevel": 5, + "cname": "lz4", + "id": "blosc", + "shuffle": 1 + }, + "dtype": "|i1", + "fill_value": null, + "filters": null, + "order": "C", + "shape": [ + 8, + 3, + 2 + ], + "zarr_format": 2 + }, + "call_genotype/.zattrs": { + "_ARRAY_DIMENSIONS": [ + "variants", + "samples", + "ploidy" + ], + "comment": "Call genotype. Encoded as allele values (0 for the reference, 1 for\nthe first allele, 2 for the second allele), -1 to indicate a\nmissing value, or -2 to indicate a non allele in mixed ploidy datasets.", + "mixed_ploidy": false + }, + "call_genotype_mask/.zarray": { + "chunks": [ + 8, + 3, + 2 + ], + "compressor": { + "blocksize": 0, + "clevel": 5, + "cname": "lz4", + "id": "blosc", + "shuffle": 1 + }, + "dtype": "|i1", + "fill_value": null, + "filters": null, + "order": "C", + "shape": [ + 8, + 3, + 2 + ], + "zarr_format": 2 + }, + "call_genotype_mask/.zattrs": { + "_ARRAY_DIMENSIONS": [ + "variants", + "samples", + "ploidy" + ], + "comment": "A flag for each call indicating which values are missing.", + "dtype": "bool" + }, + "call_genotype_phased/.zarray": { + "chunks": [ + 8, + 3 + ], + "compressor": { + "blocksize": 0, + "clevel": 5, + "cname": "lz4", + "id": "blosc", + "shuffle": 1 + }, + "dtype": "|i1", + "fill_value": null, + "filters": null, + "order": "C", + "shape": [ + 8, + 3 + ], + "zarr_format": 2 + }, + "call_genotype_phased/.zattrs": { + "_ARRAY_DIMENSIONS": [ + "variants", + "samples" + ], + "comment": "A flag for each call indicating if it is phased or not. If omitted\nall calls are unphased.", + "dtype": "bool" + }, + "contig_id/.zarray": { + "chunks": [ + 1 + ], + "compressor": { + "blocksize": 0, + "clevel": 5, + "cname": "lz4", + "id": "blosc", + "shuffle": 1 + }, + "dtype": "yMcMNfNb_@XkGl&HX literal 0 HcmV?d00001 diff --git a/docs/_static/example_data.vcz/variant_contig/.zarray b/docs/_static/example_data.vcz/variant_contig/.zarray new file mode 100644 index 00000000..bb6f54dd --- /dev/null +++ b/docs/_static/example_data.vcz/variant_contig/.zarray @@ -0,0 +1,20 @@ +{ + "chunks": [ + 8 + ], + "compressor": { + "blocksize": 0, + "clevel": 5, + "cname": "lz4", + "id": "blosc", + "shuffle": 1 + }, + "dtype": "`N;5-g7AVaMrP-h~JCp_hP!Rzu literal 0 HcmV?d00001 diff --git a/docs/tutorial.md b/docs/tutorial.md index 23101b93..27bbf80a 100644 --- a/docs/tutorial.md +++ b/docs/tutorial.md @@ -27,26 +27,24 @@ _Tsinfer_ takes as input a [Zarr](https://zarr.readthedocs.io/) file, with phase [VCF Zarr](https://github.com/sgkit-dev/vcf-zarr-spec/) (.vcz) format. The standard route to create such a file is by conversion from a VCF file, e.g. using [vcf2zarr](https://sgkit-dev.github.io/bio2zarr/vcf2zarr/overview.html) as described later in this -document. For a quick introduction, however, we will instead create an example file using -[sgkit](https://sgkit-dev.github.io/sgkit/latest/). +document. However, for the moment we'll just use a pre-generated dataset: ```{code-cell} ipython3 -import sgkit -ds = sgkit.simulate_genotype_call_dataset(n_variant=8, n_sample=3, missing_pct=0, phased=True, seed=79) -sgkit.save_dataset(ds, "data.vcz", mode="w") +import zarr +ds = zarr.load("_static/example_data.vcz") ``` -This is what that generated data looks like: +This is what the genotypes stored in that datafile look like: ```{code-cell} :"tags": ["remove-input"] import numpy as np -assert all(len(np.unique(a)) == len(a) for a in ds['variant_allele'].values) -assert any([np.sum(g.values) == 1 for g in ds['call_genotype']]) # at least one singleton -assert any([np.sum(g.values) == 0 for g in ds['call_genotype']]) # at least one non-variable +assert all(len(np.unique(a)) == len(a) for a in ds['variant_allele']) +assert any([np.sum(g) == 1 for g in ds['call_genotype']]) # at least one singleton +assert any([np.sum(g) == 0 for g in ds['call_genotype']]) # at least one non-variable -alleles = ds['variant_allele'].values.astype(str) +alleles = ds['variant_allele'].astype(str) sites = np.arange(ds['call_genotype'].shape[0]) print(" " * 22, "Site:", " ".join(str(x) for x in range(8)), "\n") for sample in range(ds['call_genotype'].shape[1]): @@ -54,16 +52,20 @@ for sample in range(ds['call_genotype'].shape[1]): genotypes = ds['call_genotype'][:,sample, genome] print( f"Diploid sample {sample} (genome {genome}):", - " ".join(alleles[sites, genotypes.values]) + " ".join(alleles[sites, genotypes]) ) ``` +### VariantData and ancestral alleles + We wish to infer a genealogy that could have given rise to this data set. To run _tsinfer_ we wrap the .vcz file in a `tsinfer.VariantData` object. This requires an *ancestral allele* to be specified for each site; there are many methods for calculating there: details are outside the scope of this manual, but we have started a [discussion topic](https://github.com/tskit-dev/tsinfer/discussions/523) -on this issue to provide some recommendations. Sometimes VCF files will contain the +on this issue to provide some recommendations. + +Sometimes VCF files will contain the ancestral allele in the "AA" info field, in which case it will be encoded in the `variant_AA` field of the .vcz file. It's also possible to provide a numpy array of ancestral alleles, of the same length as the number of variants. Ancestral @@ -73,17 +75,17 @@ and not used for inference (with a warning given). ```{code-cell} import tsinfer -# In this example, take the REF allele (index 0) as ancestral -ancestral_alleles = ds['variant_allele'].values[:,0].astype(str) -# set the last site as of unknown ancestral allele +# For this example take the REF allele (index 0) as ancestral +ancestral_alleles = ds['variant_allele'][:,0].astype(str) +# set the last site to an unknown ancestral allele, for this demo ancestral_alleles[-1] = "." -vdata = tsinfer.VariantData("data.vcz", ancestral_alleles) +vdata = tsinfer.VariantData("_static/example_data.vcz", ancestral_alleles) ``` Here we create a new `.VariantData` object for the 3 diploid samples in our dataset. Each diploid sample will correspond to an *individual* in the resulting tree -sequenece, and each of the 6 genomes will correspond to a sample node +sequence, and each of the 6 genomes will correspond to a sample node (hence `ts.num_samples == 6`). Not all sites are used for genealogical inference: this includes non-variable (fixed) @@ -98,6 +100,8 @@ via the `exclude_positions` parameter). Note, however, that even if a site is no for genealogical inference, its genetic variation can still be encoded in the final tree sequence. +### Topology inference + Once we have stored our data in a `.VariantData` object, we can easily infer a {ref}`tree sequence` using the Python API: