diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
index 50f53c18..7dd0bb7b 100644
--- a/.github/workflows/docs.yml
+++ b/.github/workflows/docs.yml
@@ -29,32 +29,22 @@ jobs:
 
       - uses: actions/setup-python@v5
         with:
-          python-version: "3.10"
+          python-version: "3.11"
+          cache: "pip"
 
-      - uses: actions/cache@v4
-        id: venv-cache
-        with:
-          path: venv
-          key: docs-venv-v6-${{ hashFiles('requirements/CI-docs/requirements.txt') }}
-
-      - name: Create venv and install deps (one by one to avoid conflict errors)
-        if: steps.venv-cache.outputs.cache-hit != 'true'
+      - name: Install deps (one by one to avoid conflict errors)
         run: |
-          python -m venv venv
-          . venv/bin/activate
           pip install --upgrade pip wheel
-          pip install -r requirements/CI-docs/requirements.txt          
-          
+          pip install -r requirements/CI-docs/requirements.txt
+          sudo apt-get install -y tabix
 
       - name: Build C module
         if: env.MAKE_TARGET
         run: |
-          . venv/bin/activate
           make $MAKE_TARGET
 
       - name: Build Docs
         run: |
-          . venv/bin/activate
           cd docs && make dist
 
       - name: Trigger docs site rebuild
diff --git a/docs/.gitignore b/docs/.gitignore
index f3c3680f..f4b2a542 100644
--- a/docs/.gitignore
+++ b/docs/.gitignore
@@ -1,4 +1,7 @@
 notebook-simulation.trees
 notebook-simulation.samples
 notebook-simulation-source.trees
+notebook-simulation.vc*
+notebook-simulation-AA.npy
 P_dom_chr24_phased.samples
+sparrows.vcz
diff --git a/docs/_config.yml b/docs/_config.yml
index 75ed2ee2..e2025d9a 100644
--- a/docs/_config.yml
+++ b/docs/_config.yml
@@ -41,7 +41,7 @@ sphinx:
     config:
       html_theme: sphinx_book_theme
       html_theme_options:
-        pygment_dark_style: monokai
+        pygments_dark_style: monokai
       pygments_style: monokai
       myst_enable_extensions:
       - colon_fence
diff --git a/docs/_static/example_data.vcz/.zattrs b/docs/_static/example_data.vcz/.zattrs
new file mode 100644
index 00000000..f778611e
--- /dev/null
+++ b/docs/_static/example_data.vcz/.zattrs
@@ -0,0 +1,6 @@
+{
+    "contigs": [
+        "0"
+    ],
+    "source": "sgkit-0.9.0"
+}
\ No newline at end of file
diff --git a/docs/_static/example_data.vcz/.zgroup b/docs/_static/example_data.vcz/.zgroup
new file mode 100644
index 00000000..3b7daf22
--- /dev/null
+++ b/docs/_static/example_data.vcz/.zgroup
@@ -0,0 +1,3 @@
+{
+    "zarr_format": 2
+}
\ No newline at end of file
diff --git a/docs/_static/example_data.vcz/.zmetadata b/docs/_static/example_data.vcz/.zmetadata
new file mode 100644
index 00000000..24b95bd0
--- /dev/null
+++ b/docs/_static/example_data.vcz/.zmetadata
@@ -0,0 +1,243 @@
+{
+    "metadata": {
+        ".zattrs": {
+            "contigs": [
+                "0"
+            ],
+            "source": "sgkit-0.9.0"
+        },
+        ".zgroup": {
+            "zarr_format": 2
+        },
+        "call_genotype/.zarray": {
+            "chunks": [
+                8,
+                3,
+                2
+            ],
+            "compressor": {
+                "blocksize": 0,
+                "clevel": 5,
+                "cname": "lz4",
+                "id": "blosc",
+                "shuffle": 1
+            },
+            "dtype": "|i1",
+            "fill_value": null,
+            "filters": null,
+            "order": "C",
+            "shape": [
+                8,
+                3,
+                2
+            ],
+            "zarr_format": 2
+        },
+        "call_genotype/.zattrs": {
+            "_ARRAY_DIMENSIONS": [
+                "variants",
+                "samples",
+                "ploidy"
+            ],
+            "comment": "Call genotype. Encoded as allele values (0 for the reference, 1 for\nthe first allele, 2 for the second allele), -1 to indicate a\nmissing value, or -2 to indicate a non allele in mixed ploidy datasets.",
+            "mixed_ploidy": false
+        },
+        "call_genotype_mask/.zarray": {
+            "chunks": [
+                8,
+                3,
+                2
+            ],
+            "compressor": {
+                "blocksize": 0,
+                "clevel": 5,
+                "cname": "lz4",
+                "id": "blosc",
+                "shuffle": 1
+            },
+            "dtype": "|i1",
+            "fill_value": null,
+            "filters": null,
+            "order": "C",
+            "shape": [
+                8,
+                3,
+                2
+            ],
+            "zarr_format": 2
+        },
+        "call_genotype_mask/.zattrs": {
+            "_ARRAY_DIMENSIONS": [
+                "variants",
+                "samples",
+                "ploidy"
+            ],
+            "comment": "A flag for each call indicating which values are missing.",
+            "dtype": "bool"
+        },
+        "call_genotype_phased/.zarray": {
+            "chunks": [
+                8,
+                3
+            ],
+            "compressor": {
+                "blocksize": 0,
+                "clevel": 5,
+                "cname": "lz4",
+                "id": "blosc",
+                "shuffle": 1
+            },
+            "dtype": "|i1",
+            "fill_value": null,
+            "filters": null,
+            "order": "C",
+            "shape": [
+                8,
+                3
+            ],
+            "zarr_format": 2
+        },
+        "call_genotype_phased/.zattrs": {
+            "_ARRAY_DIMENSIONS": [
+                "variants",
+                "samples"
+            ],
+            "comment": "A flag for each call indicating if it is phased or not. If omitted\nall calls are unphased.",
+            "dtype": "bool"
+        },
+        "contig_id/.zarray": {
+            "chunks": [
+                1
+            ],
+            "compressor": {
+                "blocksize": 0,
+                "clevel": 5,
+                "cname": "lz4",
+                "id": "blosc",
+                "shuffle": 1
+            },
+            "dtype": "<U1",
+            "fill_value": null,
+            "filters": null,
+            "order": "C",
+            "shape": [
+                1
+            ],
+            "zarr_format": 2
+        },
+        "contig_id/.zattrs": {
+            "_ARRAY_DIMENSIONS": [
+                "contigs"
+            ],
+            "comment": "Contig identifiers."
+        },
+        "sample_id/.zarray": {
+            "chunks": [
+                3
+            ],
+            "compressor": {
+                "blocksize": 0,
+                "clevel": 5,
+                "cname": "lz4",
+                "id": "blosc",
+                "shuffle": 1
+            },
+            "dtype": "<U2",
+            "fill_value": null,
+            "filters": null,
+            "order": "C",
+            "shape": [
+                3
+            ],
+            "zarr_format": 2
+        },
+        "sample_id/.zattrs": {
+            "_ARRAY_DIMENSIONS": [
+                "samples"
+            ],
+            "comment": "The unique identifier of the sample."
+        },
+        "variant_allele/.zarray": {
+            "chunks": [
+                8,
+                2
+            ],
+            "compressor": {
+                "blocksize": 0,
+                "clevel": 5,
+                "cname": "lz4",
+                "id": "blosc",
+                "shuffle": 1
+            },
+            "dtype": "|S1",
+            "fill_value": null,
+            "filters": null,
+            "order": "C",
+            "shape": [
+                8,
+                2
+            ],
+            "zarr_format": 2
+        },
+        "variant_allele/.zattrs": {
+            "_ARRAY_DIMENSIONS": [
+                "variants",
+                "alleles"
+            ],
+            "comment": "The possible alleles for the variant."
+        },
+        "variant_contig/.zarray": {
+            "chunks": [
+                8
+            ],
+            "compressor": {
+                "blocksize": 0,
+                "clevel": 5,
+                "cname": "lz4",
+                "id": "blosc",
+                "shuffle": 1
+            },
+            "dtype": "<i8",
+            "fill_value": null,
+            "filters": null,
+            "order": "C",
+            "shape": [
+                8
+            ],
+            "zarr_format": 2
+        },
+        "variant_contig/.zattrs": {
+            "_ARRAY_DIMENSIONS": [
+                "variants"
+            ],
+            "comment": "Index corresponding to contig name for each variant. In some less common\nscenarios, this may also be equivalent to the contig names if the data\ngenerating process used contig names that were also integers."
+        },
+        "variant_position/.zarray": {
+            "chunks": [
+                8
+            ],
+            "compressor": {
+                "blocksize": 0,
+                "clevel": 5,
+                "cname": "lz4",
+                "id": "blosc",
+                "shuffle": 1
+            },
+            "dtype": "<i8",
+            "fill_value": null,
+            "filters": null,
+            "order": "C",
+            "shape": [
+                8
+            ],
+            "zarr_format": 2
+        },
+        "variant_position/.zattrs": {
+            "_ARRAY_DIMENSIONS": [
+                "variants"
+            ],
+            "comment": "The reference position of the variant."
+        }
+    },
+    "zarr_consolidated_format": 1
+}
\ No newline at end of file
diff --git a/docs/_static/example_data.vcz/call_genotype/.zarray b/docs/_static/example_data.vcz/call_genotype/.zarray
new file mode 100644
index 00000000..105c8f40
--- /dev/null
+++ b/docs/_static/example_data.vcz/call_genotype/.zarray
@@ -0,0 +1,24 @@
+{
+    "chunks": [
+        8,
+        3,
+        2
+    ],
+    "compressor": {
+        "blocksize": 0,
+        "clevel": 5,
+        "cname": "lz4",
+        "id": "blosc",
+        "shuffle": 1
+    },
+    "dtype": "|i1",
+    "fill_value": null,
+    "filters": null,
+    "order": "C",
+    "shape": [
+        8,
+        3,
+        2
+    ],
+    "zarr_format": 2
+}
\ No newline at end of file
diff --git a/docs/_static/example_data.vcz/call_genotype/.zattrs b/docs/_static/example_data.vcz/call_genotype/.zattrs
new file mode 100644
index 00000000..7b7753c5
--- /dev/null
+++ b/docs/_static/example_data.vcz/call_genotype/.zattrs
@@ -0,0 +1,9 @@
+{
+    "_ARRAY_DIMENSIONS": [
+        "variants",
+        "samples",
+        "ploidy"
+    ],
+    "comment": "Call genotype. Encoded as allele values (0 for the reference, 1 for\nthe first allele, 2 for the second allele), -1 to indicate a\nmissing value, or -2 to indicate a non allele in mixed ploidy datasets.",
+    "mixed_ploidy": false
+}
\ No newline at end of file
diff --git a/docs/_static/example_data.vcz/call_genotype/0.0.0 b/docs/_static/example_data.vcz/call_genotype/0.0.0
new file mode 100644
index 00000000..23f15862
Binary files /dev/null and b/docs/_static/example_data.vcz/call_genotype/0.0.0 differ
diff --git a/docs/_static/example_data.vcz/call_genotype_mask/.zarray b/docs/_static/example_data.vcz/call_genotype_mask/.zarray
new file mode 100644
index 00000000..105c8f40
--- /dev/null
+++ b/docs/_static/example_data.vcz/call_genotype_mask/.zarray
@@ -0,0 +1,24 @@
+{
+    "chunks": [
+        8,
+        3,
+        2
+    ],
+    "compressor": {
+        "blocksize": 0,
+        "clevel": 5,
+        "cname": "lz4",
+        "id": "blosc",
+        "shuffle": 1
+    },
+    "dtype": "|i1",
+    "fill_value": null,
+    "filters": null,
+    "order": "C",
+    "shape": [
+        8,
+        3,
+        2
+    ],
+    "zarr_format": 2
+}
\ No newline at end of file
diff --git a/docs/_static/example_data.vcz/call_genotype_mask/.zattrs b/docs/_static/example_data.vcz/call_genotype_mask/.zattrs
new file mode 100644
index 00000000..6a43b7ab
--- /dev/null
+++ b/docs/_static/example_data.vcz/call_genotype_mask/.zattrs
@@ -0,0 +1,9 @@
+{
+    "_ARRAY_DIMENSIONS": [
+        "variants",
+        "samples",
+        "ploidy"
+    ],
+    "comment": "A flag for each call indicating which values are missing.",
+    "dtype": "bool"
+}
\ No newline at end of file
diff --git a/docs/_static/example_data.vcz/call_genotype_mask/0.0.0 b/docs/_static/example_data.vcz/call_genotype_mask/0.0.0
new file mode 100644
index 00000000..574f3547
Binary files /dev/null and b/docs/_static/example_data.vcz/call_genotype_mask/0.0.0 differ
diff --git a/docs/_static/example_data.vcz/call_genotype_phased/.zarray b/docs/_static/example_data.vcz/call_genotype_phased/.zarray
new file mode 100644
index 00000000..c862e179
--- /dev/null
+++ b/docs/_static/example_data.vcz/call_genotype_phased/.zarray
@@ -0,0 +1,22 @@
+{
+    "chunks": [
+        8,
+        3
+    ],
+    "compressor": {
+        "blocksize": 0,
+        "clevel": 5,
+        "cname": "lz4",
+        "id": "blosc",
+        "shuffle": 1
+    },
+    "dtype": "|i1",
+    "fill_value": null,
+    "filters": null,
+    "order": "C",
+    "shape": [
+        8,
+        3
+    ],
+    "zarr_format": 2
+}
\ No newline at end of file
diff --git a/docs/_static/example_data.vcz/call_genotype_phased/.zattrs b/docs/_static/example_data.vcz/call_genotype_phased/.zattrs
new file mode 100644
index 00000000..7ae8d94f
--- /dev/null
+++ b/docs/_static/example_data.vcz/call_genotype_phased/.zattrs
@@ -0,0 +1,8 @@
+{
+    "_ARRAY_DIMENSIONS": [
+        "variants",
+        "samples"
+    ],
+    "comment": "A flag for each call indicating if it is phased or not. If omitted\nall calls are unphased.",
+    "dtype": "bool"
+}
\ No newline at end of file
diff --git a/docs/_static/example_data.vcz/call_genotype_phased/0.0 b/docs/_static/example_data.vcz/call_genotype_phased/0.0
new file mode 100644
index 00000000..b368df0f
Binary files /dev/null and b/docs/_static/example_data.vcz/call_genotype_phased/0.0 differ
diff --git a/docs/_static/example_data.vcz/contig_id/.zarray b/docs/_static/example_data.vcz/contig_id/.zarray
new file mode 100644
index 00000000..ce072443
--- /dev/null
+++ b/docs/_static/example_data.vcz/contig_id/.zarray
@@ -0,0 +1,20 @@
+{
+    "chunks": [
+        1
+    ],
+    "compressor": {
+        "blocksize": 0,
+        "clevel": 5,
+        "cname": "lz4",
+        "id": "blosc",
+        "shuffle": 1
+    },
+    "dtype": "<U1",
+    "fill_value": null,
+    "filters": null,
+    "order": "C",
+    "shape": [
+        1
+    ],
+    "zarr_format": 2
+}
\ No newline at end of file
diff --git a/docs/_static/example_data.vcz/contig_id/.zattrs b/docs/_static/example_data.vcz/contig_id/.zattrs
new file mode 100644
index 00000000..63b8cf86
--- /dev/null
+++ b/docs/_static/example_data.vcz/contig_id/.zattrs
@@ -0,0 +1,6 @@
+{
+    "_ARRAY_DIMENSIONS": [
+        "contigs"
+    ],
+    "comment": "Contig identifiers."
+}
\ No newline at end of file
diff --git a/docs/_static/example_data.vcz/contig_id/0 b/docs/_static/example_data.vcz/contig_id/0
new file mode 100644
index 00000000..113105d5
Binary files /dev/null and b/docs/_static/example_data.vcz/contig_id/0 differ
diff --git a/docs/_static/example_data.vcz/sample_id/.zarray b/docs/_static/example_data.vcz/sample_id/.zarray
new file mode 100644
index 00000000..c27ef48d
--- /dev/null
+++ b/docs/_static/example_data.vcz/sample_id/.zarray
@@ -0,0 +1,20 @@
+{
+    "chunks": [
+        3
+    ],
+    "compressor": {
+        "blocksize": 0,
+        "clevel": 5,
+        "cname": "lz4",
+        "id": "blosc",
+        "shuffle": 1
+    },
+    "dtype": "<U2",
+    "fill_value": null,
+    "filters": null,
+    "order": "C",
+    "shape": [
+        3
+    ],
+    "zarr_format": 2
+}
\ No newline at end of file
diff --git a/docs/_static/example_data.vcz/sample_id/.zattrs b/docs/_static/example_data.vcz/sample_id/.zattrs
new file mode 100644
index 00000000..801e30ed
--- /dev/null
+++ b/docs/_static/example_data.vcz/sample_id/.zattrs
@@ -0,0 +1,6 @@
+{
+    "_ARRAY_DIMENSIONS": [
+        "samples"
+    ],
+    "comment": "The unique identifier of the sample."
+}
\ No newline at end of file
diff --git a/docs/_static/example_data.vcz/sample_id/0 b/docs/_static/example_data.vcz/sample_id/0
new file mode 100644
index 00000000..7ead0f07
Binary files /dev/null and b/docs/_static/example_data.vcz/sample_id/0 differ
diff --git a/docs/_static/example_data.vcz/variant_allele/.zarray b/docs/_static/example_data.vcz/variant_allele/.zarray
new file mode 100644
index 00000000..0ccff202
--- /dev/null
+++ b/docs/_static/example_data.vcz/variant_allele/.zarray
@@ -0,0 +1,22 @@
+{
+    "chunks": [
+        8,
+        2
+    ],
+    "compressor": {
+        "blocksize": 0,
+        "clevel": 5,
+        "cname": "lz4",
+        "id": "blosc",
+        "shuffle": 1
+    },
+    "dtype": "|S1",
+    "fill_value": null,
+    "filters": null,
+    "order": "C",
+    "shape": [
+        8,
+        2
+    ],
+    "zarr_format": 2
+}
\ No newline at end of file
diff --git a/docs/_static/example_data.vcz/variant_allele/.zattrs b/docs/_static/example_data.vcz/variant_allele/.zattrs
new file mode 100644
index 00000000..2de87677
--- /dev/null
+++ b/docs/_static/example_data.vcz/variant_allele/.zattrs
@@ -0,0 +1,7 @@
+{
+    "_ARRAY_DIMENSIONS": [
+        "variants",
+        "alleles"
+    ],
+    "comment": "The possible alleles for the variant."
+}
\ No newline at end of file
diff --git a/docs/_static/example_data.vcz/variant_allele/0.0 b/docs/_static/example_data.vcz/variant_allele/0.0
new file mode 100644
index 00000000..67ced593
Binary files /dev/null and b/docs/_static/example_data.vcz/variant_allele/0.0 differ
diff --git a/docs/_static/example_data.vcz/variant_contig/.zarray b/docs/_static/example_data.vcz/variant_contig/.zarray
new file mode 100644
index 00000000..bb6f54dd
--- /dev/null
+++ b/docs/_static/example_data.vcz/variant_contig/.zarray
@@ -0,0 +1,20 @@
+{
+    "chunks": [
+        8
+    ],
+    "compressor": {
+        "blocksize": 0,
+        "clevel": 5,
+        "cname": "lz4",
+        "id": "blosc",
+        "shuffle": 1
+    },
+    "dtype": "<i8",
+    "fill_value": null,
+    "filters": null,
+    "order": "C",
+    "shape": [
+        8
+    ],
+    "zarr_format": 2
+}
\ No newline at end of file
diff --git a/docs/_static/example_data.vcz/variant_contig/.zattrs b/docs/_static/example_data.vcz/variant_contig/.zattrs
new file mode 100644
index 00000000..a22ac19d
--- /dev/null
+++ b/docs/_static/example_data.vcz/variant_contig/.zattrs
@@ -0,0 +1,6 @@
+{
+    "_ARRAY_DIMENSIONS": [
+        "variants"
+    ],
+    "comment": "Index corresponding to contig name for each variant. In some less common\nscenarios, this may also be equivalent to the contig names if the data\ngenerating process used contig names that were also integers."
+}
\ No newline at end of file
diff --git a/docs/_static/example_data.vcz/variant_contig/0 b/docs/_static/example_data.vcz/variant_contig/0
new file mode 100644
index 00000000..72c48136
Binary files /dev/null and b/docs/_static/example_data.vcz/variant_contig/0 differ
diff --git a/docs/_static/example_data.vcz/variant_position/.zarray b/docs/_static/example_data.vcz/variant_position/.zarray
new file mode 100644
index 00000000..bb6f54dd
--- /dev/null
+++ b/docs/_static/example_data.vcz/variant_position/.zarray
@@ -0,0 +1,20 @@
+{
+    "chunks": [
+        8
+    ],
+    "compressor": {
+        "blocksize": 0,
+        "clevel": 5,
+        "cname": "lz4",
+        "id": "blosc",
+        "shuffle": 1
+    },
+    "dtype": "<i8",
+    "fill_value": null,
+    "filters": null,
+    "order": "C",
+    "shape": [
+        8
+    ],
+    "zarr_format": 2
+}
\ No newline at end of file
diff --git a/docs/_static/example_data.vcz/variant_position/.zattrs b/docs/_static/example_data.vcz/variant_position/.zattrs
new file mode 100644
index 00000000..a9a8102f
--- /dev/null
+++ b/docs/_static/example_data.vcz/variant_position/.zattrs
@@ -0,0 +1,6 @@
+{
+    "_ARRAY_DIMENSIONS": [
+        "variants"
+    ],
+    "comment": "The reference position of the variant."
+}
\ No newline at end of file
diff --git a/docs/_static/example_data.vcz/variant_position/0 b/docs/_static/example_data.vcz/variant_position/0
new file mode 100644
index 00000000..ce6b6630
Binary files /dev/null and b/docs/_static/example_data.vcz/variant_position/0 differ
diff --git a/docs/_toc.yml b/docs/_toc.yml
index 0f8fe1b7..8f15ad5a 100644
--- a/docs/_toc.yml
+++ b/docs/_toc.yml
@@ -8,9 +8,9 @@ parts:
 - caption: Installation
   chapters:
   - file: installation
-- caption: Tutorial
+- caption: Usage
   chapters:
-  - file: tutorial
+  - file: usage
 - caption: Inference
   chapters:
   - file: inference
diff --git a/docs/inference.md b/docs/inference.md
index 57f63422..e6ebac21 100644
--- a/docs/inference.md
+++ b/docs/inference.md
@@ -141,7 +141,7 @@ still not as efficently as it is possible to analyse an equivalent tree sequence
 Rather than require the user to understand the internal structure of this
 file format, we provide a simple {ref}`Python API <sec_api_file_formats>`
 to allow the user to efficiently construct it from their own data.
-An example of how to use this API is given in the {ref}`sec_tutorial`.
+An example of how to use this API is given in the {ref}`sec_usage` documentation.
 
 We do not provide an automatic means of importing data from VCF (or any
 other format) intentionally, as we believe that this would be extremely difficult to do.
diff --git a/docs/simulation-example.py b/docs/simulation-example.py
index 45ab12aa..b72601c1 100644
--- a/docs/simulation-example.py
+++ b/docs/simulation-example.py
@@ -1,44 +1,42 @@
-import os
+import builtins
+import subprocess
 import sys
 
 import msprime
-import tqdm
-
-sys.path.insert(0, os.path.abspath(".."))
-import tsinfer  # noqa
-
-if True:
-    ts = msprime.simulate(
-        sample_size=10000,
-        Ne=10**4,
-        recombination_rate=1e-8,
-        mutation_rate=1e-8,
-        length=10 * 10**6,
-        random_seed=42,
-    )
-    ts.dump("simulation-source.trees")
-    print("Simulation done:", ts.num_trees, "trees and", ts.num_sites)
-
-    with tsinfer.SampleData(
-        sequence_length=ts.sequence_length,
-        path="simulation.samples",
-        num_flush_threads=2,
-    ) as samples:
-        for var in tqdm.tqdm(ts.variants(), total=ts.num_sites):
-            samples.add_site(var.site.position, var.genotypes, var.alleles)
-
-else:
-    source = msprime.load("simulation-source.trees")
-    inferred = msprime.load("simulation.trees")
-
-    subset = range(0, 6)
-    source_subset = source.simplify(subset)
-    inferred_subset = inferred.simplify(subset)
-
-    tree = source_subset.first()
-    print("True tree: interval=", tree.interval)
-    print(tree.draw(format="unicode"))
-
-    tree = inferred_subset.first()
-    print("Inferred tree: interval=", tree.interval)
-    print(tree.draw(format="unicode"))
+import numpy as np
+from Bio import bgzf
+
+if getattr(builtins, "__IPYTHON__", False):  # if running IPython: e.g. in a notebook
+    num_diploids, seq_len = 100, 10_000
+    name = "notebook-simulation"
+else:  # Take parameters from the command-line
+    num_diploids, seq_len = int(sys.argv[1]), float(sys.argv[2])
+    name = "cli-simulation"
+
+ts = msprime.sim_ancestry(
+    num_diploids,
+    population_size=10**4,
+    recombination_rate=1e-8,
+    sequence_length=seq_len,
+    random_seed=6,
+)
+ts = msprime.sim_mutations(ts, rate=1e-8, random_seed=7)
+ts.dump(name + "-source.trees")
+print(
+    f"Simulated {ts.num_samples} samples over {seq_len/1e6} Mb:",
+    f"{ts.num_trees} trees and {ts.num_sites} sites",
+)
+
+# Convert to a zarr file: this should be easier once a tskit2zarr utility is made, see
+# https://github.com/sgkit-dev/bio2zarr/issues/232
+np.save(f"{name}-AA.npy", [s.ancestral_state for s in ts.sites()])
+vcf_name = f"{name}.vcf.gz"
+with bgzf.open(vcf_name, "wt") as f:
+    ts.write_vcf(f)
+subprocess.run(["tabix", vcf_name])
+ret = subprocess.run(
+    "python -m bio2zarr vcf2zarr convert --force".split() + [vcf_name, name + ".vcz"],
+    stderr=subprocess.DEVNULL if name == "notebook-simulation" else None,
+)
+if ret.returncode == 0:
+    print(f"Converted to {name}.vcz")
diff --git a/docs/tutorial.md b/docs/tutorial.md
deleted file mode 100644
index 50241f35..00000000
--- a/docs/tutorial.md
+++ /dev/null
@@ -1,735 +0,0 @@
----
-jupytext:
-  text_representation:
-    extension: .md
-    format_name: myst
-    format_version: 0.12
-    jupytext_version: 1.9.1
-kernelspec:
-  display_name: Python 3
-  language: python
-  name: python3
----
-
-:::{currentmodule} tsinfer
-:::
-
-
-(sec_tutorial)=
-
-# Tutorial
-
-(sec_tutorial_toy_example)=
-
-## Toy example
-
-Suppose that we have observed the following data::
-
-    sample  alignment
-    0       A..G......C..G.....AT
-    1       T..G......A..C.....AG
-    2       A..G......A..C.....AC
-    3       A..C......C..G.....CT
-    4       A..C......C..G.....CT
-            |         |         |
-            0         10        20
-
-Here we have phased haplotype data for five samples at six variable sites. We wish to
-infer the genealogies that gave rise to this data set. To import the data into `tsinfer`
-we must know the *ancestral state* for each site we wish to use for inference; there are
-many methods for achieving this: details are outside the scope of this manual, but we
-have started a [discussion topic](https://github.com/tskit-dev/tsinfer/discussions/523)
-on this issue to provide some recommendations. Assuming that we
-know the ancestral states for most sites, we can then import our data into a `tsinfer`
-{ref}`Sample data<sec_file_formats_samples>` file using the Python
-{ref}`API<sec_api_file_formats>`:
-
-```{code-cell} ipython3
-import tsinfer
-from tskit import MISSING_DATA
-
-with tsinfer.SampleData(sequence_length=21) as sample_data:
-    sample_data.add_site(0, [0, 1, 0, 0, 0], ["A", "T"], ancestral_allele=0)
-    sample_data.add_site(3, [0, 0, 0, 1, 1], ["G", "C"], ancestral_allele=0)
-    sample_data.add_site(10, [0, 1, 1, 0, 0], ["C", "A"], ancestral_allele=0)
-    sample_data.add_site(13, [0, 1, 1, 0, 0], ["G", "C"], ancestral_allele=MISSING_DATA)
-    sample_data.add_site(19, [0, 0, 0, 1, 1], ["A", "C"], ancestral_allele=0)
-    sample_data.add_site(20, [0, 1, 2, 0, 0], ["T", "G", "C"], ancestral_allele=0)
-```
-
-Here we create a new {class}`.SampleData` object for five samples. We then
-sequentially add the data for each site one-by-one using the
-{meth}`.SampleData.add_site` method. The first argument for `add_site` is the
-position of the site in genome coordinates. This can be any positive value
-(even floating point), but site positions must be unique and sites must be
-added in increasing order of position. Since `tskit` uses a zero-based notation,
-we've added the first site at position 0, and the sequence covers 21 bp. The second
-argument for `add_site` is a list of *genotypes* for the site. Each value in a
-genotypes array `g` is an integer, giving an index into the list provided as the third
-argument to ``add_site``: the list of *alleles* for this site. The fourth argument
-is the index in the allele list which represents the ancestral state.
-Each call to ``add_site`` thus stores a single variable site in the
-original haplotype data above. For example, the ancestral and derived states for
-the site at position 0 are "A" and "T" and the genotypes are 01000: this encodes
-the first column, ATAAA.
-
-Not all sites are used for genealogical inference: this includes non-variable (fixed)
-sites, singletons (in this example, the site at position 0), sites
-with more than 2 alleles (e.g. the site at position 20), and sites where the ancestral
-state is unknown (marked as {data}`tskit.MISSING_DATA`, e.g. the site at position 10).
-Additionally, during the next step, extra sites can be flagged as not for use in
-inferring the genealogy, for example if they are deemed unreliable (this is done
-via the `exclude_positions` parameter). Note, however, that even if a site is not used
-for genealogical inference, its genetic variation will still be encoded in the final
-tree sequence.
-
-Once we have stored our data in a {class}`.SampleData` object, we can easily infer 
-a {ref}`tree sequence<sec_python_api_trees_and_tree_sequences>` using the Python
-API:
-
-```{code-cell} ipython3
-inferred_ts = tsinfer.infer(sample_data)
-```
-
-And that's it: we now have a fully functional {class}`tskit.TreeSequence`
-object that we can interrogate in the usual ways. For example, we can look
-at the {meth}`aligned haplotypes<tskit.TreeSequence.alignments>` in the tree sequence:
-
-```{code-cell} ipython3
-print("Sample\tInferred sequence")
-for sample_id, seq in zip(
-    inferred_ts.samples(),
-    inferred_ts.alignments(missing_data_character="."),
-):
-    print(sample_id, seq, sep="\t")
-```
-
-You will notice that the sample sequences generated by this tree sequence are identical
-to the input haplotype data: apart from the imputation of
-{ref}`missing data<sec_inference_data_requirements>`, `tsinfer` is guaranteed to
-losslessly encode any given input data, regardless of the inferred topology. You can
-check this programatically if you want:
-
-```{code-cell} ipython3
-import numpy as np
-for v_orig, v_inferred in zip(sample_data.variants(), inferred_ts.variants()):
-    if any(
-        np.array(v_orig.alleles)[v_orig.genotypes] !=
-        np.array(v_inferred.alleles)[v_inferred.genotypes]
-    ):
-        raise ValueError("Genotypes in original dataset and inferred tree seq not equal")
-print("** Genotypes in original dataset and inferred tree sequence are the same **")
-```
-
-We can examine the inferred genetic genealogy,
-in the form of {ref}`local trees<tutorials:sec_what_is_local_trees>`. `Tsinfer` has also
-placed mutations on the genealogy to explain the observed genetic variation:
-
-```{code-cell} ipython3
-mut_labels = {
-    m.id: "{:g}: {}→{}".format(
-        s.position,
-        inferred_ts.mutation(m.parent).derived_state if m.parent >= 0 else s.ancestral_state,
-        m.derived_state,
-    )
-    for s in inferred_ts.sites()
-    for m in s.mutations
-}
-node_labels = {u: u for u in inferred_ts.samples()}
-
-inferred_ts.draw_svg(
-    size=(500, 200), node_labels=node_labels, mutation_labels=mut_labels, y_axis=True)
-```
-
-This small dataset is compatible with a single tree, so the inferred tree sequence
-consists of only one tree. Note, however, that it contains a *polytomy* at the root
-(i.e. the root node has more than two children) . This is a
-common feature of trees inferred by `tsinfer` and signals that there was not
-sufficient information to resolve the tree at this internal node.
-
-
-Each internal (non-sample) node in this inferred tree represents an ancestral sequence,
-constructed on the basis of shared, derived alleles at one or more of the sites. By
-default, the time of each such node is *not* measured in years or generations, but
-is simply the frequency of the shared derived allele(s) on which the ancestral sequence
-is based. For this reason, the time units are described as "uncalibrated" in the plot,
-and trying to calculate statistics based on branch lengths will raise an error:
-
-```{code-cell} ipython3
-:tags: ["raises-exception"]
-inferred_ts.diversity(mode="branch")
-```
-
-To add meaningful dates to an inferred tree sequence, allowing meaningful branch length
-statistics to be calulated, you must use additional
-software such as [tsdate](https://tskit.dev/software/tsdate.html): the `tsinfer`
-algorithm is only intended to infer the genetic relationships between the samples
-(i.e. the *topology* of the tree sequence).
-
-
-(sec_tutorial_simulation_example)=
-
-## Simulation example
-
-The previous example showed how we can infer a tree sequence using the Python API for a trivial
-toy example. However, for real data we will not prepare our data and infer the tree sequence all
-in one go; rather, we will usually split the process into at least two distinct steps.
-
-The first step in any inference is to prepare your data and import it into a
-{ref}`sample data<sec_file_formats_samples>` file. For simplicity here we'll use
-Python to simulate some data under the coalescent with recombination, using
-[msprime](https://msprime.readthedocs.io/en/stable/api.html#msprime.simulate):
-
-```{code-cell} ipython3
-
-import builtins
-import sys
-
-import msprime
-import tsinfer
-
-if getattr(builtins, "__IPYTHON__", False):  # if running IPython: e.g. in a notebook
-    from tqdm.notebook import tqdm
-    num_diploids, seq_len = 100, 10_000
-    name = "notebook-simulation"
-else:  # Take parameters from the command-line
-    from tqdm import tqdm
-    num_diploids, seq_len = int(sys.argv[1]), float(sys.argv[2])
-    name = "cli-simulation"
-    
-ts = msprime.sim_ancestry(
-    num_diploids,
-    population_size=10**4,
-    recombination_rate=1e-8,
-    sequence_length=seq_len,
-    random_seed=6,
-)
-ts = msprime.sim_mutations(ts, rate=1e-8, random_seed=7)
-ts.dump(name + "-source.trees")
-print(
-    f"Simulated {ts.num_samples} samples over {seq_len/1e6} Mb:",
-    f"{ts.num_trees} trees and {ts.num_sites} sites"
-)
-
-with tsinfer.SampleData(
-    path=name + ".samples",
-    sequence_length=ts.sequence_length,
-    num_flush_threads=2
-) as sim_sample_data:
-    for var in tqdm(ts.variants(), total=ts.num_sites):
-        sim_sample_data.add_site(var.site.position, var.genotypes, var.alleles)
-print("Stored output in", name + ".samples")
-```
-
-Here we first run a simulation then, as before, we create a {class}`.SampleData`
-instance to store the data we have simulated. When doing this, we use the extra
-parameters `path`, `sequence_length` and `num_flush_threads`. The first provides a
-filename in which to permanently store the information. The second defines the
-overall coordinate space for site positions, so that this value can
-be recovered in the final tree sequence that we output later. Finally,
-`num_flush_threads=2` tells `tsinfer` to use two background threads for
-compressing data and flushing it to disk.
-
-If we store this code in a file named `simulate-data.py`, we can run it on the
-command line, providing parameters to generate much bigger datasets (and showing
-a progress meter using [tqdm](https://github.com/tqdm/tqdm) to keep track of how
-the process of compressing and storing the sample data is progressing):
-
-
-```{code} bash
-% python3 simulate-data.py 5000 10_000_000
-Simulated 10000 samples over 10.0 Mb: 36204 trees and 38988 sites
-100%|██████████████████████████████████████████| 38988/38988 [00:08<00:00, 4617.55it/s]
-```
-
-Here, we simulated a sample of 10 thousand chromosomes, each 10Mb long, under
-human-like parameters. That has created 39K segregating sites. The output
-above shows the state of the progress meter at the end of this process,
-indicating that it took about 8 seconds to import the data for this simulation into
-`tsinfer`'s sample data format.
-
-:::{note}
-If you already have a tree sequence file, and wish to create a sample data file from
-it, you can use the {meth}`tsinfer.SampleData.from_tree_sequence` method. For example,
-the following snippet is equivalent to the data file creation part of the code above:
-
-```{code} python
-tsinfer.SampleData.from_tree_sequence(
-    ts, path="cli-simulation.samples", num_flush_threads=2)
-```
-:::
-
-Examining the files on the command line, we then see the following::
-
-```{code} bash
-$ ls -lh cli-simulation*
--rw-r--r--  1 user  group   8.2M 14 Oct 13:45 cli-simulation-source.trees
--rw-r--r--  1 user  group    22M 14 Oct 13:45 cli-simulation.samples
-```
-
-The `cli-simulation.samples` file is quite small, being less than three times the size of the
-original `msprime` tree sequence file. The {ref}`tsinfer command line interface <sec_cli>`
-provides a useful way to examine files in more detail using the `list` (or `ls`) command:
-
-```{code} bash
-$ tsinfer ls cli-simulation.samples
-path                  = cli-simulation.samples
-file_size             = 21.6 MiB
-format_name           = tsinfer-sample-data
-format_version        = (5, 1)
-finalised             = True
-uuid                  = 422b25be-fe6e-4c25-b302-af2715fd9c93
-num_provenances       = 0
-provenances/timestamp = shape=(0,); dtype=object;
-provenances/record    = shape=(0,); dtype=object;
-sequence_length       = 10000000.0
-metadata_schema       = {'codec': 'json'}
-metadata              = {}
-num_populations       = 0
-num_individuals       = 10000
-num_samples           = 10000
-num_sites             = 38988
-populations/metadata_schema = None
-populations/metadata  = shape=(0,); dtype=object;
-individuals/metadata_schema = None
-individuals/metadata  = shape=(10000,); dtype=object;
-individuals/location  = shape=(10000,); dtype=object;
-individuals/time      = shape=(10000,); dtype=float64;uncompressed size=80.0 kB
-individuals/population = shape=(10000,); dtype=int32;uncompressed size=40.0 kB
-individuals/flags     = shape=(10000,); dtype=uint32;uncompressed size=40.0 kB
-samples/individual    = shape=(10000,); dtype=int32;uncompressed size=40.0 kB
-sites/position        = shape=(38988,); dtype=float64;uncompressed size=311.9 kB
-sites/time            = shape=(38988,); dtype=float64;uncompressed size=311.9 kB
-sites/alleles         = shape=(38988,); dtype=object;
-sites/genotypes       = shape=(38988, 10000); dtype=int8;uncompressed size=389.9 MB
-sites/metadata_schema = None
-sites/metadata        = shape=(38988,); dtype=object;
-```
-
-Most of this output is not particularly interesting here, but we can see that the
-`sites/genotypes` array which holds all of the sample genotypes (and thus the vast bulk of the
-actual data) requires about 390MB uncompressed. The `tsinfer` sample data format is therefore
-achieving a roughly 20X compression in this case. In practice this means we can keep such files
-lying around without taking up too much space.
-
-Once we have our `.samples` file created, running the inference is straightforward.
-We can do so within Python, or use `tsinfer` on
-the command-line, which is useful when inference is expected to take a long time:
-
-```{code-cell} ipython3
-:"tags": ["hide-input"]
-# Infer & save a ts from the notebook simulation. This could also be done 
-# from the command-line using `tsinfer infer notebook-simulation.samples`
-tsinfer.infer(sim_sample_data).dump(name + ".trees")
-```
-
-```{code} bash
-$ tsinfer infer cli-simulation.samples -p -t 4
-ga-add    (1/6)100%|█████████████████████████████| 39.0k/39.0k [00:02, 16.0kit/s]
-ga-gen    (2/6)100%|█████████████████████████████| 26.3k/26.3k [00:17, 1.52kit/s]
-ms-muts   (4/6)100%|█████████████████████████████| 35.1k/35.1k [00:00, 48.2kit/s]
-ma-match  (3/6)100%|████████████████████████████▉| 26.3k/26.3k [07:21, 60.5it/s]
-ms-match  (5/6)100%|█████████████████████████████| 10.0k/10.0k [10:32, 15.8it/s]
-ms-paths  (6/6)100%|█████████████████████████████| 10.0k/10.0k [00:00, 40.8kit/s]
-ms-muts   (7/6)100%|█████████████████████████████| 35.1k/35.1k [00:00, 42.1kit/s]
-ms-xsites (8/6)100%|█████████████████████████████| 3.85k/3.85k [00:05, 688it/s]
-```
-
-Running the `infer` command runs the full inference pipeline in one go (the individual steps
-are explained {ref}`here <sec_inference>`), writing the output, by default, to the tree sequence
-file ``simulation.trees``. We provided two extra arguments to `infer`: the `-p` flag
-(`--progress`) gives us the progress bars show above, and `-t 4` (``--num-threads=4``) tells
-`tsinfer` to use four worker threads whenever it can use them.
-
-This inference was run on a Core i5 processor (launched 2009) with 4GiB of RAM, and took
-about eighteen minutes. The maximum memory usage was about 600MiB.
-
-Looking at our output files, we see::
-
-```{code} bash
-$ ls -lh cli-simulation*
--rw-r--r--  1 user  group   8.2M 14 Oct 13:45 cli-simulation-source.trees
--rw-r--r--  1 user  group    22M 14 Oct 13:45 cli-simulation.samples
--rw-r--r--  1 user  group   8.1M 14 Oct 13:50 cli-simulation.trees
-```
-
-Therefore our output tree sequence file that we have just inferred in a few minutes is
-*even smaller* than the original ``msprime`` simulated tree sequence!
-Because the output file is also a {class}`tskit.TreeSequence`, we can use the same API
-to work with both. For example, to load up the original and inferred tree sequences
-from their corresponding `.trees` files you can simply use the {func}`tskit.load`
-function, as shown below.
-
-However, there are thousands of trees in these tree sequences, each of which has
-10 thousand samples: much too much to easily visualise. Therefore, in the code below, we
-we subset both tree sequences down to their minimal representations for the first six
-sample nodes using {meth}`tskit.TreeSequence.simplify`, while keeping all sites, even if
-they do not show genetic variation between the six selected samples.
-:::{note}
-Using this tiny subset of the overall data allows us to get an informal
-feel for the trees that are inferred by `tsinfer`, but this is certainly
-not a recommended approach for validating the inference!)
-:::
-
-Once we've subsetted the tree sequences down to something that we can comfortably look
-at, we can plot the trees in particular regions of interest, for example the first
-2kb of genome. Below, for simplicity we use the smaller dataset that was simulated and
-inferred within the the notebook, but exactly the same code will work with
-the larger version run from the cli simulation.  
-
-
-```{code-cell} ipython3
-import tskit
-
-subset = range(0, 6)  # show first 6 samples
-limit = (0, 2_000)    # show only the trees covering the first 2kb
-
-prefix = "notebook"  # Or use "cli" for the larger example
-source = tskit.load(prefix + "-simulation-source.trees")
-source_subset = source.simplify(subset, filter_sites=False)
-print(f"True tree seq, simplified to {len(subset)} sampled genomes")
-source_subset.draw_svg(size=(800, 200), x_lim=limit, time_scale="rank")
-```
-
-```{code-cell} ipython3
-inferred = tskit.load(prefix + "-simulation.trees")
-inferred_subset = inferred.simplify(subset, filter_sites=False)
-print(f"Inferred tree seq, simplified to {len(subset)} sampled genomes")
-inferred_subset.draw_svg(size=(800, 200), x_lim=limit)
-```
-
-There are a number of things to note when comparing the plots above. Most
-obviously, the first tree in the inferred tree sequence is empty: by default, `tsinfer`
-will not generate a genealogy before the first site in the genome (here, position 655)
-or past the last site, as there is, by definition, no data in these regions.
-
-You can also see that the inferred tree sequence has fewer trees than the original,
-simulated one. There are two reasons for this. First, there are some tree changes that
-do not involve a change of topology (e.g. the first two trees in the original tree
-sequence are topologically identical, as are two trees after that).
-Even if we had different mutations on the
-different edges in these trees, such tree changes are unlikely to be spotted.
-Second, our inference depends on the mutational information that is present. If no
-mutations fall on a particular edge in the tree sequence, then we have no way of
-inferring that this edge existed. The fact that there are no mutations in the first,
-third, and fifth trees shows that tree changes can occur without associated mutations.
-As a result, there will be tree transitions that we cannot pick up. In the simulation
-that we performed, the mutation rate is equal to the recombination rate, and so we
-expect that many recombinations will be invisible to us.
-
-For similar reasons, there will be many nodes in the tree at which
-polytomies occur. Here we correctly infer that sample nodes 0 and 4 group together,
-as do 1 and 2. However, in the middle inferred tree we were not able to distinguish
-whether node 3 was closer to node 1 or 2, so we have three children of node 7 in that
-tree.
-
-Finally, you should note that inference does not always get the right answer. At the
-first site in the inferred trees, node 5 is placed as an outgroup, but at this site
-it is actually closer to the group consisting of nodes 0 and 4.
-
-:::{note}
-Other than the sample node IDs, it is meaningless to compare node numbers in the
-source and inferred tree sequences.
-:::
-
-(sec_tutorial_data_example)=
-
-## Data example
-
-Inputting real data for inference is similar in principle to the examples above:
-you simply iterate over the sites in your data file, calling {meth}`.SampleData.add_site`
-for each variable site. We do not provide methods to read directly from other formats, as
-these formats are often too complex for automatic conversion. However, many Python
-libraries are available for reading different genomic data formats, and writing a script
-to iterate over the sites in such files is usually a fairly simple task. The example
-below shows how this can be done for a VCF file, using a freely available VCF reading
-library.
-
-(sec_tutorial_read_vcf)=
-
-### Reading a VCF
-
-A common way to store genetic variation data is as a VCF (Variant Call Format) file.
-The [CYVCF2 library](https://github.com/brentp/cyvcf2) provides a relatively fast
-Python interface for reading VCF files. The following script demonstrates how to infer a
-tree sequence from a simple VCF file. In particular the function below named
-`add_diploid_sites` illustrates how to iterate over the variants in a `CYVCF2.VCF`
-object and add them to a `tsinfer` sample data file. This particular script assumes
-that the VCF describes a set of phased, diploid individuals, and has the ancestral state
-present in the INFO field of the VCF (otherwise it takes the reference allele as the
-ancestral state). For example data, we use a publicly available VCF file of the genetic
-variants from chromosome 24 of ten Norwegian and French house sparrows,
-*Passer domesticus* (thanks to Mark Ravinet for the data file):
-
-
-```{code-cell} ipython3
-import cyvcf2
-import tsinfer
-
-def add_diploid_sites(vcf, samples):
-    """
-    Read the sites in the vcf and add them to the samples object.
-    """
-    # You may want to change the following line, e.g. here we allow
-    # "*" (a spanning deletion) to be a valid allele state
-    allele_chars = set("ATGCatgc*")
-    pos = 0
-    progressbar = tqdm(total=samples.sequence_length, desc="Read VCF", unit='bp')
-    for variant in vcf:  # Loop over variants, each assumed at a unique site
-        progressbar.update(variant.POS - pos)
-        if pos == variant.POS:
-            print(f"Duplicate entries at position {pos}, ignoring all but the first")
-            continue
-        else:
-            pos = variant.POS
-        if any([not phased for _, _, phased in variant.genotypes]):
-            raise ValueError("Unphased genotypes for variant at position", pos)
-        alleles = [variant.REF.upper()] + [v.upper() for v in variant.ALT]
-        ancestral = variant.INFO.get("AA", ".")  # "." means unknown
-        # some VCFs (e.g. from 1000G) have many values in the AA field: take the 1st
-        ancestral = ancestral.split("|")[0].upper()
-        if ancestral == "." or ancestral == "":
-            ancestral_allele = MISSING_DATA
-            # alternatively, you could specify `ancestral = variant.REF.upper()`
-        else:
-            ancestral_allele = alleles.index(ancestral)
-        # Check we have ATCG alleles
-        for a in alleles:
-            if len(set(a) - allele_chars) > 0:
-                print(f"Ignoring site at pos {pos}: allele {a} not in {allele_chars}")
-                continue
-        # Map original allele indexes to their indexes in the new alleles list.
-        genotypes = [g for row in variant.genotypes for g in row[0:2]]
-        samples.add_site(pos, genotypes, alleles, ancestral_allele=ancestral_allele)
-
-
-def chromosome_length(vcf):
-    assert len(vcf.seqlens) == 1
-    return vcf.seqlens[0]
-
-
-vcf_location = "_static/P_dom_chr24_phased.vcf.gz"
-# NB: could also read from an online version by setting vcf_location to
-# "https://github.com/tskit-dev/tsinfer/raw/main/docs/_static/P_dom_chr24_phased.vcf.gz"
-
-vcf = cyvcf2.VCF(vcf_location)
-with tsinfer.SampleData(
-    path="P_dom_chr24_phased.samples", sequence_length=chromosome_length(vcf)
-) as samples:
-    add_diploid_sites(vcf, samples)
-
-print(
-    "Sample file created for {} samples ".format(samples.num_samples)
-    + "({} individuals) ".format(samples.num_individuals)
-    + "with {} variable sites.".format(samples.num_sites),
-    flush=True,
-)
-
-# Do the inference
-ts = tsinfer.infer(samples)
-print(
-    "Inferred tree sequence: {} trees over {} Mb ({} edges)".format(
-        ts.num_trees, ts.sequence_length / 1e6, ts.num_edges
-    )
-)
-```
-
-On a modern computer, this should only take a few seconds to run.
-
-:::{note}
-This code can also be modified to read huge VCF files in parallel, see
-[this issue](https://github.com/tskit-dev/tsinfer/issues/277#issuecomment-652024871).
-:::
-
-#### Adding more information
-
-As the output indicates, we have created 20 individuals, each with a single chromosome,
-rather than (as we probably intended) 10 individuals each with 2 chromosomes. That's
-because the script has not called {meth}`.SampleData.add_individual`, so
-`tsinfer` has assumed that each input chromosome belongs to a single (haploid)
-{ref}`individual <sec_inference_data_model_individual>`. Although this is perfectly
-OK and does not invalidate further analysis, it may be difficult to match information
-from the original VCF file with nodes in the tree sequence. Ideally, you should aim to
-incorporate all necessary information into the sample data file, from where it will be
-carried through into the final inferred tree sequence. The code below demonstrates not
-only how to associate each pair of samples with a diploid individual, but also how to
-allocate an identifier to these individuals by using the individual's metadata field.
-The code also gives a basic illustration of adding extra information about the
-{ref}`population<sec_inference_data_model_population>` from which each individual has
-been sampled.
-
-```{code-cell} ipython3
-import json
-
-def add_populations(vcf, samples):
-    """
-    Add tsinfer Population objects and returns a list of IDs corresponding to the VCF samples.
-    """
-    # In this VCF, the first letter of the sample name refers to the population
-    samples_first_letter = [sample_name[0] for sample_name in vcf.samples]
-    pop_lookup = {}
-    pop_lookup["8"] = samples.add_population(metadata={"country": "Norway"})
-    pop_lookup["F"] = samples.add_population(metadata={"country": "France"})
-    return [pop_lookup[first_letter] for first_letter in samples_first_letter]
-
-
-def add_diploid_individuals(vcf, samples, populations):
-    for name, population in zip(vcf.samples, populations):
-        samples.add_individual(ploidy=2, metadata={"name": name}, population=population)
-
-
-# Repeat as previously but add both populations and individuals
-vcf_location = "_static/P_dom_chr24_phased.vcf.gz"
-# NB: could also read from an online version by setting vcf_location to
-# "https://github.com/tskit-dev/tsinfer/raw/main/docs/_static/P_dom_chr24_phased.vcf.gz"
-
-vcf = cyvcf2.VCF(vcf_location)
-with tsinfer.SampleData(
-    path="P_dom_chr24_phased.samples", sequence_length=chromosome_length(vcf)
-) as samples:
-    populations = add_populations(vcf, samples)
-    add_diploid_individuals(vcf, samples, populations)
-    add_diploid_sites(vcf, samples)
-
-print(
-    "Sample file created for {} samples ".format(samples.num_samples)
-    + "({} individuals) ".format(samples.num_individuals)
-    + "with {} variable sites.".format(samples.num_sites),
-    flush=True,
-)
-
-# Do the inference
-sparrow_ts = tsinfer.infer(samples)
-print(
-    "Inferred tree sequence `{}`: {} trees over {} Mb".format(
-        "sparrow_ts", sparrow_ts.num_trees, sparrow_ts.sequence_length / 1e6
-    )
-)
-# Check the metadata
-for sample_node_id in sparrow_ts.samples():
-    individual_id = sparrow_ts.node(sample_node_id).individual
-    population_id = sparrow_ts.node(sample_node_id).population
-    print(
-        "Node",
-        sample_node_id,
-        "labels a chr24 sampled from individual",
-        json.loads(sparrow_ts.individual(individual_id).metadata),
-        "in",
-        json.loads(sparrow_ts.population(population_id).metadata)["country"],
-    )
-```
-
-
-### Analysis
-
-To analyse your inferred tree sequence you can use all the analysis functions built in to
-the [tskit](https://tskit.dev/tskit/docs/stable/) library. The
-{ref}`tskit tutorial<sec_tutorial_stats>` provides much more detail. Below we just give a
-flavour of the possibilities.
-
-To quickly eyeball small datasets, we can draw the entire tree sequence, or
-{meth}`~tskit.Tree.draw` the tree at any particular genomic position. The following
-code demonstrates how to use the {meth}`tskit.TreeSequence.at` method to obtain the tree
-1Mb from the start of the sequence, and plot it, colouring the tips according to
-population:
-
-```{code-cell} ipython3
-colours = {"Norway": "red", "France": "blue"}
-colours_for_node = {}
-for n in sparrow_ts.samples():
-    population_data = sparrow_ts.population(sparrow_ts.node(n).population)
-    colours_for_node[n] = colours[json.loads(population_data.metadata)["country"]]
-
-individual_for_node = {}
-for n in sparrow_ts.samples():
-    individual_data = sparrow_ts.individual(sparrow_ts.node(n).individual)
-    individual_for_node[n] = json.loads(individual_data.metadata)["name"]
-
-tree = sparrow_ts.at(1e6)
-tree.draw(
-    path="tree_at_1Mb.svg",
-    height=700,
-    width=1200,
-    node_labels=individual_for_node,
-    node_colours=colours_for_node,
-)
-```
-
-This tree seems to suggest that Norwegian and French individuals may not fall into
-discrete groups on the tree, but be part of a larger mixing population. Note, however,
-that this is only one of thousands of trees, and may not be typical of the genome as a
-whole. Additionally, most data sets will have far more samples than this example, so
-trees visualized in this way are likely to be huge and difficult to understand. As in
-the {ref}`simulation example <sec_tutorial_simulation_example>` above, one possibility
-is to {meth}`~tskit.TreeSequence.simplify` the tree sequence to a limited number of
-samples, but it is likely that most studies will
-instead rely on various statistical summaries of the trees. Storing genetic data as a
-tree sequence makes many of these calculations fast and efficient, and tskit has both a
-set of {ref}`commonly used methods<sec_stats>` and a framework that
-{ref}`generalizes population genetic statistics<sec_stats_general_api>`. For example,
-the allele or site frequency spectrum (SFS) can be calculated using
-{meth}`tskit.TreeSequence.allele_frequency_spectrum` and the allelic diversity ("Tajima's
-{math}`{\pi}`") using {meth}`tskit.TreeSequence.diversity`, both of which can also be
-calculated locally (e.g. {ref}`per tree or in genomic windows <sec_stats_windows>`). As
-a basic example, here's how to calculate genome-wide {math}`F_{st}` between the Norwegian
-and French (sub)populations:
-
-```{code-cell} ipython3
-samples_listed_by_population = [
-    sparrow_ts.samples(population=pop_id)
-    for pop_id in range(sparrow_ts.num_populations)
-]
-
-print("Fst between populations:", sparrow_ts.Fst(samples_listed_by_population))
-```
-
-As noted above, the times of nodes are uncalibrated so we shouldn't perform
-calculations that reply on branch lengths. However, some statistics, such as the
-genealogical nearest neighbour (GNN) proportions are calculated from the topology
-of the trees.  using the individual and population metadata to format the results table in
-a tidy manner
-
-```{code-cell} ipython3
-import pandas as pd
-
-gnn = sparrow_ts.genealogical_nearest_neighbours(
-    sparrow_ts.samples(), samples_listed_by_population
-)
-
-# Tabulate GNN nicely using a Pandas dataframe with named rows and columns
-sample_nodes = [sparrow_ts.node(n) for n in sparrow_ts.samples()]
-sample_ids = [n.id for n in sample_nodes]
-sample_names = [
-    json.loads(sparrow_ts.individual(n.individual).metadata)["name"]
-    for n in sample_nodes
-]
-sample_pops = [
-    json.loads(sparrow_ts.population(n.population).metadata)["country"]
-    for n in sample_nodes
-]
-gnn_table = pd.DataFrame(
-    data=gnn,
-    index=[
-        pd.Index(sample_ids, name="Sample node"),
-        pd.Index(sample_names, name="Bird"),
-        pd.Index(sample_pops, name="Country"),
-    ],
-    columns=[json.loads(p.metadata)["country"] for p in sparrow_ts.populations()],
-)
-
-print(gnn_table)
-# Summarize GNN for all birds from the same country
-print(gnn_table.groupby(level="Country").mean())
-```
-
-
-From this, it can be seen that the genealogical nearest neighbours of birds in Norway
-tend also to be in Norway, and vice versa for birds from France. In other words, there is
-a small but noticable degree of population structure in the data. The bird ``8L19766``
-and one of the chromosomes of bird `FR046` seem to buck this trend, and it would
-probably be worth checking these data points further (perhaps they are migrant birds),
-and verifying if other chromosomes in these individuals show the same pattern.
-
-Much more can be done with the genomic statistics built into tskit. For further
-information, please refer to the {ref}`statistics section<sec_stats>` of the
-tskit documentation.
diff --git a/docs/usage.md b/docs/usage.md
new file mode 100644
index 00000000..7250acaa
--- /dev/null
+++ b/docs/usage.md
@@ -0,0 +1,639 @@
+---
+jupytext:
+  text_representation:
+    extension: .md
+    format_name: myst
+    format_version: 0.12
+    jupytext_version: 1.9.1
+kernelspec:
+  display_name: Python 3
+  language: python
+  name: python3
+---
+
+:::{currentmodule} tsinfer
+:::
+
+
+(sec_usage)=
+
+# Usage
+
+(sec_usage_toy_example)=
+
+## Toy example
+
+_Tsinfer_ takes as input a [Zarr](https://zarr.readthedocs.io/) file, with phased variant data encoded in the
+[VCF Zarr](https://github.com/sgkit-dev/vcf-zarr-spec/) (.vcz) format. The standard
+route to create such a file is by conversion from a VCF file, e.g. using
+[vcf2zarr](https://sgkit-dev.github.io/bio2zarr/vcf2zarr/overview.html) as described later in this
+document. However, for the moment we'll just use a pre-generated dataset:
+
+
+```{code-cell} ipython3
+import zarr
+ds = zarr.load("_static/example_data.vcz")
+```
+
+This is what the genotypes stored in that datafile look like:
+
+```{code-cell}
+:"tags": ["remove-input"]
+import numpy as np
+assert all(len(np.unique(a)) == len(a) for a in ds['variant_allele']) 
+assert any([np.sum(g) == 1 for g in ds['call_genotype']]) # at least one singleton
+assert any([np.sum(g) == 0 for g in ds['call_genotype']]) # at least one non-variable
+
+alleles = ds['variant_allele'].astype(str)
+sites = np.arange(ds['call_genotype'].shape[0])
+print(" " * 22, "Site:", " ".join(str(x) for x in range(8)), "\n")
+for sample in range(ds['call_genotype'].shape[1]):
+    for genome in range(ds['call_genotype'].shape[2]):
+        genotypes = ds['call_genotype'][:,sample, genome]
+        print(
+            f"Diploid sample {sample} (genome {genome}):",
+            " ".join(alleles[sites, genotypes])
+        )
+```
+
+### VariantData and ancestral alleles
+
+We wish to infer a genealogy that could have given rise to this data set. To run _tsinfer_
+we wrap the .vcz file in a `tsinfer.VariantData` object. This requires an 
+*ancestral allele* to be specified for each site; there are
+many methods for calculating these: details are outside the scope of this manual, but we
+have started a [discussion topic](https://github.com/tskit-dev/tsinfer/discussions/523)
+on this issue to provide some recommendations.
+
+Sometimes VCF files will contain the
+ancestral allele in the "AA" info field, in which case it will be encoded in the
+`variant_AA` field of the .vcz file. It's also possible to provide a numpy array
+of ancestral alleles, of the same length as the number of variants. Ancestral
+alleles that are not in the list of alleles for their respective site are treated as unknown
+and not used for inference (with a warning given).
+
+```{code-cell}
+import tsinfer
+
+# For this example take the REF allele (index 0) as ancestral
+ancestral_alleles = ds['variant_allele'][:,0].astype(str)
+# This is just a numpy array, set the last site to an unknown value, for demo purposes
+ancestral_alleles[-1] = "."
+
+vdata = tsinfer.VariantData("_static/example_data.vcz", ancestral_alleles)
+```
+
+The `VariantData` object is a lightweight wrapper around the .vcz file.
+We'll use it to infer a tree sequence on the basis of the sites that vary between the
+different samples. However, note that certain sites are not used by _tsinfer_ for inferring
+the genealogy (although they are still encoded in the final tree sequence), These are:
+
+* Non-variable (fixed) sites, e.g. site 4 above
+* Singleton sites, where only one genome has the derived allele e.g. site 5 above
+* Sites where the ancestral allele is unknown, e.g. demonstrated by site 7 above
+* Multialleleic sites, with more than 2 alleles (but see
+  [here](https://github.com/tskit-dev/tsinfer/issues/670) for a workaround)
+
+Additionally, during the inference step, additional sites can be flagged as not for use in
+inference, for example if they are deemed unreliable (this is done
+via the `exclude_positions` parameter).
+
+### Masks
+
+Sites which are not used for inference will still be included in the final tree sequence, with
+mutations at those sites being placed onto branches by parsimony. However, it is also possible
+to completely exclude sites and samples from the final tree sequence, by specifing a `site_mask`
+and/or a `sample_mask` when creating the `VariantData` object. Such sites or samples will be
+completely omitted from both inference and the final tree sequence. This can be useful, for
+example, to reduce the amount of computation required for an inference.
+
+### Topology inference
+
+Once we have stored our data in a `.VariantData` object, we can easily infer 
+a {ref}`tree sequence<sec_python_api_trees_and_tree_sequences>` using the Python
+API. Note that each sample in the original .vcz file will correspond to an *individual*
+in the resulting tree sequence. Since these three individuals are diploid, the resulting
+tree sequence will have `ts.num_samples == 6` (i.e. unlike in a .vcz file, a "sample" in
+tskit refers to a haploid genome, not a diploid individual).
+
+```{code-cell} ipython3
+inferred_ts = tsinfer.infer(vdata)
+print("Inferred a genetic genealogy for {inferred_ts.num_samples} (haploid) genomes")
+```
+
+And that's it: we now have a fully functional {class}`tskit.TreeSequence`
+object that we can interrogate in the usual ways. For example, we can look
+at the {meth}`aligned haplotypes<tskit.TreeSequence.alignments>` in the tree sequence:
+
+```{code-cell} ipython3
+print("Sample\tInferred sequence")
+for sample_id, seq in zip(
+    inferred_ts.samples(),
+    inferred_ts.alignments(missing_data_character="."),
+):
+    print(sample_id, seq, sep="\t")
+```
+
+You will notice that the sample sequences generated by this tree sequence are identical
+to the input haplotype data: apart from the imputation of
+{ref}`missing data<sec_inference_data_requirements>`, _tsinfer_ is guaranteed to
+losslessly encode any given input data, regardless of the inferred topology. You can
+check this programatically if you want:
+
+```{code-cell} ipython3
+import numpy as np
+for v_orig, v_inferred in zip(vdata.variants(), inferred_ts.variants()):
+    if any(
+        np.array(v_orig.alleles)[v_orig.genotypes] !=
+        np.array(v_inferred.alleles)[v_inferred.genotypes]
+    ):
+        raise ValueError("Genotypes in original dataset and inferred tree seq not equal")
+print("** Genotypes in original dataset and inferred tree sequence are the same **")
+```
+
+We can examine the inferred genetic genealogy,
+in the form of {ref}`local trees<tutorials:sec_what_is_local_trees>`. _Tsinfer_ has also
+placed mutations on the genealogy to explain the observed genetic variation:
+
+```{code-cell} ipython3
+mut_labels = {
+    m.id: "{:g}: {}→{}".format(
+        s.position,
+        inferred_ts.mutation(m.parent).derived_state if m.parent >= 0 else s.ancestral_state,
+        m.derived_state,
+    )
+    for s in inferred_ts.sites()
+    for m in s.mutations
+}
+node_labels = {u: u for u in inferred_ts.samples()}
+
+inferred_ts.draw_svg(
+    size=(500, 200), node_labels=node_labels, mutation_labels=mut_labels, y_axis=True)
+```
+
+We have inferred 4 trees aong the genome. Note, however, that there are "polytomies" in the
+trees, where some nodes have more than two children (e.g. the root node in the first tree).
+This is a common feature of trees inferred by _tsinfer_ and signals that there was not
+sufficient information to resolve the tree at this internal node.
+
+
+Each internal (non-sample) node in this inferred tree represents an ancestral sequence,
+constructed on the basis of shared, derived alleles at one or more of the sites. By
+default, the time of each such node is *not* measured in years or generations, but
+is simply the frequency of the shared derived allele(s) on which the ancestral sequence
+is based. For this reason, the time units are described as "uncalibrated" in the plot,
+and trying to calculate statistics based on branch lengths will raise an error:
+
+```{code-cell} ipython3
+:tags: ["raises-exception"]
+inferred_ts.diversity(mode="branch")
+```
+
+To add meaningful dates to an inferred tree sequence, allowing meaningful branch length
+statistics to be calulated, you must use additional
+software such as [tsdate](https://tskit.dev/software/tsdate.html): the _tsinfer_
+algorithm is only intended to infer the genetic relationships between the samples
+(i.e. the *topology* of the tree sequence).
+
+
+(sec_usage_simulation_example)=
+
+## Simulation example
+
+The previous example showed how we can infer a tree sequence using the Python API for a trivial
+toy example. However, for real data we will not prepare our data and infer the tree sequence all
+in one go; rather, we will usually split the process into at least two distinct steps.
+
+The first step in any inference is to prepare your data and create the .vcz file.
+For simplicity here we'll use
+Python to simulate some data under the coalescent with recombination, using
+[msprime](https://msprime.readthedocs.io/en/stable/api.html#msprime.simulate):
+
+```{code-cell} ipython3
+
+import builtins
+import sys
+import os
+import subprocess
+
+from Bio import bgzf
+import numpy as np
+
+import msprime
+import tsinfer
+
+if getattr(builtins, "__IPYTHON__", False):  # if running IPython: e.g. in a notebook
+    num_diploids, seq_len = 100, 10_000
+    name = "notebook-simulation"
+    python = sys.executable
+else:  # Take parameters from the command-line
+    num_diploids, seq_len = int(sys.argv[1]), float(sys.argv[2])
+    name = "cli-simulation"
+    python = "python"
+
+ts = msprime.sim_ancestry(
+    num_diploids,
+    population_size=10**4,
+    recombination_rate=1e-8,
+    sequence_length=seq_len,
+    random_seed=6,
+)
+ts = msprime.sim_mutations(ts, rate=1e-8, random_seed=7)
+ts.dump(name + "-source.trees")
+print(
+    f"Simulated {ts.num_samples} samples over {seq_len/1e6} Mb:",
+    f"{ts.num_trees} trees and {ts.num_sites} sites"
+)
+
+# Convert to a zarr file: this should be easier once a tskit2zarr utility is made, see
+# https://github.com/sgkit-dev/bio2zarr/issues/232
+np.save(f"{name}-AA.npy", [s.ancestral_state for s in ts.sites()])
+vcf_name = f"{name}.vcf.gz"
+with bgzf.open(vcf_name, "wt") as f:
+    ts.write_vcf(f)
+subprocess.run(["tabix", vcf_name])
+ret = subprocess.run(
+    [python, "-m", "bio2zarr", "vcf2zarr", "convert", "--force", vcf_name, f"{name}.vcz"],
+    stderr = subprocess.DEVNULL if name == "notebook-simulation" else None,
+)
+assert os.path.exists(f"{name}.vcz")
+
+if ret.returncode == 0:
+    print(f"Converted to {name}.vcz")
+```
+
+Here we first run a simulation then we create a vcf file and convert it to .vcz format.
+If we store this code in a file named `simulate-data.py`, we can run it on the
+command line, providing parameters to generate much bigger datasets:
+
+
+```{code} bash
+% python simulate-data.py 5000 10_000_000
+Simulated 10000 samples over 10.0 Mb: 36204 trees and 38988 sites
+    Scan: 100%|████████████████████████████████████████████████████████████████████| 1.00/1.00 [00:00<00:00, 2.86files/s]
+ Explode: 100%|████████████████████████████████████████████████████████████████████| 39.0k/39.0k [01:17<00:00, 501vars/s]
+  Encode: 100%|███████████████████████████████████████████████████████████████████████| 976M/976M [00:12<00:00, 78.0MB/s]
+Finalise: 100%|█████████████████████████████████████████████████████████████████████| 10.0/10.0 [00:00<00:00, 421array/s]
+```
+
+Here, we simulated a sample of 10 thousand chromosomes, each 10Mb long, under
+human-like parameters. That has created 39K segregating sites. The output
+above shows the state of the vcf-to-zarr conversion at the end of this process,
+indicating that it took about a minute to convert the data into .vcz format
+(this only needs doing once)
+
+Examining the files on the command line, we then see the following::
+
+```{code} bash
+$ du -sh cli-simulation*
+156K	cli-simulation-AA.npy
+8.8M	cli-simulation-source.trees
+ 25M	cli-simulation.vcf.gz
+8.0K	cli-simulation.vcf.gz.tbi
+9.4M	cli-simulation.vcz
+```
+
+The `cli-simulation.vcz` zarr data is quite small, about the same size as the
+original `msprime` tree sequence file. Since (like all zarr datastores)
+ `cli-simulation.vcz` is a directory
+we can peer inside to look where everything is stored:
+
+```{code} bash
+$ du -sh cli-simulation.vcz/*
+8.7M	cli-simulation.vcz/call_genotype
+ 88K	cli-simulation.vcz/call_genotype_mask
+ 88K	cli-simulation.vcz/call_genotype_phased
+ 12K	cli-simulation.vcz/contig_id
+ 12K	cli-simulation.vcz/contig_length
+ 12K	cli-simulation.vcz/filter_id
+ 28K	cli-simulation.vcz/sample_id
+ 52K	cli-simulation.vcz/variant_allele
+ 24K	cli-simulation.vcz/variant_contig
+ 24K	cli-simulation.vcz/variant_filter
+ 48K	cli-simulation.vcz/variant_id
+ 24K	cli-simulation.vcz/variant_id_mask
+144K	cli-simulation.vcz/variant_position
+ 24K	cli-simulation.vcz/variant_quality
+```
+
+Most of this output is not particularly interesting here, but we can see that the
+`call_genotype` array which holds all of the sample genotypes (and thus the vast bulk of the
+actual data) only requires about 8.7MB compared to 25MB for the compressed vcf file.
+In practice this means we can keep such files lying around without taking up too much space.
+
+Once we have our `.vcz` file created, running the inference is straightforward.
+
+```{code-cell} ipython3
+# Infer & save a ts from the notebook simulation.
+ancestral_alleles = np.load(f"{name}-AA.npy")
+vdata = tsinfer.VariantData(f"{name}.vcz", ancestral_alleles)
+tsinfer.infer(vdata, progress_monitor=True, num_threads=4).dump(name + ".trees")
+```
+
+Running the `infer` command runs the full inference pipeline in one go (the individual steps
+are explained {ref}`here <sec_inference>`). We provided two extra arguments to `infer`:
+`progress_monitor` gives us the progress bars show above, and `num_threads=4` tells
+_tsinfer_ to use four worker threads whenever it can use them.
+
+Performing this inference on the `cli-simulation` data using Core i5 processor (launched 2009)
+with 4GiB of RAM, took about eighteen minutes. The maximum memory usage was about 600MiB.
+
+The output files look like this:
+
+```{code} bash
+$ ls -lh cli-simulation*
+-rw-r--r--  1 user  group   8.2M 14 Oct 13:45 cli-simulation-source.trees
+-rw-r--r--  1 user  group    22M 14 Oct 13:45 cli-simulation.samples
+-rw-r--r--  1 user  group   8.1M 14 Oct 13:50 cli-simulation.trees
+```
+
+Therefore our output tree sequence file that we have just inferred in a few minutes is
+*even smaller* than the original ``msprime`` simulated tree sequence!
+Because the output file is also a {class}`tskit.TreeSequence`, we can use the same API
+to work with both. For example, to load up the original and inferred tree sequences
+from their corresponding `.trees` files you can simply use the {func}`tskit.load`
+function, as shown below.
+
+However, there are thousands of trees in these tree sequences, each of which has
+10 thousand samples: much too much to easily visualise. Therefore, in the code below, we
+we subset both tree sequences down to their minimal representations for the first six
+sample nodes using {meth}`tskit.TreeSequence.simplify`, while keeping all sites, even if
+they do not show genetic variation between the six selected samples.
+:::{note}
+Using this tiny subset of the overall data allows us to get an informal
+feel for the trees that are inferred by _tsinfer_, but this is certainly
+not a recommended approach for validating the inference!
+:::
+
+Once we've subsetted the tree sequences down to something that we can comfortably look
+at, we can plot the trees in particular regions of interest, for example the first
+2kb of genome. Below, for simplicity we use the smaller dataset that was simulated and
+inferred within the the notebook, but exactly the same code will work with
+the larger version run from the cli simulation.  
+
+
+```{code-cell} ipython3
+import tskit
+
+subset = range(0, 6)  # show first 6 samples
+limit = (0, 2_000)    # show only the trees covering the first 2kb
+
+prefix = "notebook"  # Or use "cli" for the larger example
+source = tskit.load(prefix + "-simulation-source.trees")
+source_subset = source.simplify(subset, filter_sites=False)
+print(f"True tree seq, simplified to {len(subset)} sampled genomes")
+source_subset.draw_svg(size=(800, 200), x_lim=limit, time_scale="rank")
+```
+
+```{code-cell} ipython3
+inferred = tskit.load(prefix + "-simulation.trees")
+inferred_subset = inferred.simplify(subset, filter_sites=False)
+print(f"Inferred tree seq, simplified to {len(subset)} sampled genomes")
+inferred_subset.draw_svg(size=(800, 200), x_lim=limit)
+```
+
+There are a number of things to note when comparing the plots above. Most
+obviously, the first tree in the inferred tree sequence is empty: by default, _tsinfer_
+will not generate a genealogy before the first site in the genome (here, position 655)
+or past the last site, as there is, by definition, no data in these regions.
+
+You can also see that the inferred tree sequence has fewer trees than the original,
+simulated one. There are two reasons for this. First, there are some tree changes that
+do not involve a change of topology (e.g. the first two trees in the original tree
+sequence are topologically identical, as are two trees after that).
+Even if we had different mutations on the
+different edges in these trees, such tree changes are unlikely to be spotted.
+Second, our inference depends on the mutational information that is present. If no
+mutations fall on a particular edge in the tree sequence, then we have no way of
+inferring that this edge existed. The fact that there are no mutations in the first,
+third, and fifth trees shows that tree changes can occur without associated mutations.
+As a result, there will be tree transitions that we cannot pick up. In the simulation
+that we performed, the mutation rate is equal to the recombination rate, and so we
+expect that many recombinations will be invisible to us.
+
+For similar reasons, there will be many nodes in the tree at which
+polytomies occur. Here we correctly infer that sample nodes 0 and 4 group together,
+as do 1 and 2. However, in the middle inferred tree we were not able to distinguish
+whether node 3 was closer to node 1 or 2, so we have three children of node 7 in that
+tree.
+
+Finally, you should note that inference does not always get the right answer. At the
+first site in the inferred trees, node 5 is placed as an outgroup, but at this site
+it is actually closer to the group consisting of nodes 0 and 4.
+
+:::{note}
+Other than the sample node IDs, it is meaningless to compare node numbers in the
+source and inferred tree sequences.
+:::
+
+(sec_usage_data_example)=
+
+## Data example
+
+Inputting real data for inference is similar in principle to the examples above.
+All that is required is a .vcz file, which can be created using
+[vcf2zarr](https://sgkit-dev.github.io/bio2zarr/vcf2zarr/overview.html) as above
+
+(sec_usage_read_vcf)=
+
+### Reading a VCF
+
+For example data, we use a publicly available VCF file of the genetic
+variants from chromosome 24 of ten Norwegian and French house sparrows,
+*Passer domesticus* (thanks to Mark Ravinet for the data file):
+
+```{code-cell} ipython3
+:tags: ["remove-output"]
+import zarr
+
+vcf_location = "_static/P_dom_chr24_phased.vcf.gz"
+!python -m bio2zarr vcf2zarr convert --force {vcf_location} sparrows.vcz
+```
+
+This creates the `sparrows.vcz` datastore, which we open using `tsinfer.VariantData`.
+The original VCF had ancestral alleles specified in the `AA` INFO field, so we can
+simply provide the string `"variant_AA"` as the ancestral_allele parameter.
+
+```{code-cell} ipython3
+# Do the inference: this VCF has ancestral alleles in the AA field
+vdata = tsinfer.VariantData("sparrows.vcz", ancestral_allele="variant_AA")
+ts = tsinfer.infer(vdata)
+print(
+    "Inferred tree sequence: {} trees over {} Mb ({} edges)".format(
+        ts.num_trees, ts.sequence_length / 1e6, ts.num_edges
+    )
+)
+```
+
+On a modern computer, this should only take a few seconds to run.
+
+### Adding more metadata
+
+We can add additional data to the zarr file, which will make it through to the tree sequence.
+For instance, we might want to mark which population each individual comes from.
+This can be done by adding some descriptive metadata for each population, and the assigning
+each sample to one of those populations. In our case, the sample sparrow IDs beginning with
+"FR" are from France:
+
+```{code-cell} ipython3
+import json
+import numpy as np
+import tskit
+import zarr
+
+ds = zarr.load("sparrows.vcz")
+
+populations = ("Norway", "France")
+# save the population data in json format
+schema = json.dumps(tskit.MetadataSchema.permissive_json().schema).encode()
+zarr.save("sparrows.vcz/populations_metadata_schema", schema)
+metadata = [
+    json.dumps({"name": pop, "description": "The country from which this sample comes"}).encode()
+    for pop in populations
+]
+zarr.save("sparrows.vcz/populations_metadata", metadata)
+
+# Now assign each diploid sample to a population
+num_individuals = ds["sample_id"].shape[0]
+individuals_population = np.full(num_individuals, tskit.NULL, dtype=np.int32)
+for i, name in enumerate(ds["sample_id"]):
+    if name.startswith("FR"):
+        individuals_population[i] = populations.index("France")
+    else:
+        individuals_population[i] = populations.index("Norway")
+zarr.save("sparrows.vcz/individuals_population", individuals_population)
+```
+
+Now when we carry out the inference, we get a tree sequence in which the nodes are
+correctly assigned to named populations
+
+```{code-cell} ipython3
+vdata = tsinfer.VariantData("sparrows.vcz", ancestral_allele="variant_AA")
+sparrow_ts = tsinfer.infer(vdata)
+
+for sample_node_id in sparrow_ts.samples():
+    individual_id = sparrow_ts.node(sample_node_id).individual
+    population_id = sparrow_ts.node(sample_node_id).population
+    print(
+        "Node",
+        sample_node_id,
+        "labels a chr24 sampled from individual",
+        sparrow_ts.individual(individual_id).metadata["variant_data_sample_id"],
+        "in",
+        sparrow_ts.population(population_id).metadata["name"],
+    )
+```
+
+### Analysis
+
+To analyse your inferred tree sequence you can use all the analysis functions built in to
+the [tskit](https://tskit.dev/tskit/docs/stable/) library. The
+{ref}`tskit tutorial<sec_tutorial_stats>` provides much more detail. Below we just give a
+flavour of the possibilities.
+
+To quickly eyeball small datasets, we can draw the entire tree sequence, or
+{meth}`~tskit.Tree.draw` the tree at any particular genomic position. The following
+code demonstrates how to use the {meth}`tskit.TreeSequence.at` method to obtain the tree
+1Mb from the start of the sequence, and plot it, colouring the tips according to
+population:
+
+```{code-cell} ipython3
+colours = {"Norway": "red", "France": "blue"}
+colours_for_node = {}
+for n in sparrow_ts.samples():
+    population_data = sparrow_ts.population(sparrow_ts.node(n).population)
+    colours_for_node[n] = colours[population_data.metadata["name"]]
+
+individual_for_node = {}
+for n in sparrow_ts.samples():
+    individual_data = sparrow_ts.individual(sparrow_ts.node(n).individual)
+    individual_for_node[n] = individual_data.metadata["variant_data_sample_id"]
+
+tree = sparrow_ts.at(1e6)
+tree.draw(
+    path="tree_at_1Mb.svg",
+    height=700,
+    width=1200,
+    node_labels=individual_for_node,
+    node_colours=colours_for_node,
+)
+```
+
+This tree seems to suggest that Norwegian and French individuals may not fall into
+discrete groups on the tree, but be part of a larger mixing population. Note, however,
+that this is only one of thousands of trees, and may not be typical of the genome as a
+whole. Additionally, most data sets will have far more samples than this example, so
+trees visualized in this way are likely to be huge and difficult to understand. As in
+the {ref}`simulation example <sec_usage_simulation_example>` above, one possibility
+is to {meth}`~tskit.TreeSequence.simplify` the tree sequence to a limited number of
+samples, but it is likely that most studies will
+instead rely on various statistical summaries of the trees. Storing genetic data as a
+tree sequence makes many of these calculations fast and efficient, and tskit has both a
+set of {ref}`commonly used methods<sec_stats>` and a framework that
+{ref}`generalizes population genetic statistics<sec_stats_general_api>`. For example,
+the allele or site frequency spectrum (SFS) can be calculated using
+{meth}`tskit.TreeSequence.allele_frequency_spectrum` and the allelic diversity ("Tajima's
+{math}`{\pi}`") using {meth}`tskit.TreeSequence.diversity`, both of which can also be
+calculated locally (e.g. {ref}`per tree or in genomic windows <sec_stats_windows>`). As
+a basic example, here's how to calculate genome-wide {math}`F_{st}` between the Norwegian
+and French (sub)populations:
+
+```{code-cell} ipython3
+samples_listed_by_population = [
+    sparrow_ts.samples(population=pop_id)
+    for pop_id in range(sparrow_ts.num_populations)
+]
+
+print("Fst between populations:", sparrow_ts.Fst(samples_listed_by_population))
+```
+
+As noted above, the times of nodes are uncalibrated so we shouldn't perform
+calculations that reply on branch lengths. However, some statistics, such as the
+genealogical nearest neighbour (GNN) proportions are calculated from the topology
+of the trees. Here's an example, using the individual and population metadata to format the results table in
+a tidy manner
+
+```{code-cell} ipython3
+import pandas as pd
+
+gnn = sparrow_ts.genealogical_nearest_neighbours(
+    sparrow_ts.samples(), samples_listed_by_population
+)
+
+# Tabulate GNN nicely using a Pandas dataframe with named rows and columns
+sample_nodes = [sparrow_ts.node(n) for n in sparrow_ts.samples()]
+sample_ids = [n.id for n in sample_nodes]
+sample_names = [
+    sparrow_ts.individual(n.individual).metadata["variant_data_sample_id"]
+    for n in sample_nodes
+]
+sample_pops = [
+    sparrow_ts.population(n.population).metadata["name"]
+    for n in sample_nodes
+]
+gnn_table = pd.DataFrame(
+    data=gnn,
+    index=[
+        pd.Index(sample_ids, name="Sample node"),
+        pd.Index(sample_names, name="Bird"),
+        pd.Index(sample_pops, name="Country"),
+    ],
+    columns=[p.metadata["name"] for p in sparrow_ts.populations()],
+)
+
+print(gnn_table)
+# Summarize GNN for all birds from the same country
+print(gnn_table.groupby(level="Country").mean())
+```
+
+
+From this, it can be seen that the genealogical nearest neighbours of birds in Norway
+tend also to be in Norway, and vice versa for birds from France. In other words, there is
+a small but noticable degree of population structure in the data. The bird ``8L19766``
+and one of the chromosomes of bird `FR046` seem to buck this trend, and it would
+probably be worth checking these data points further (perhaps they are migrant birds),
+and verifying if other chromosomes in these individuals show the same pattern.
+
+Much more can be done with the genomic statistics built into tskit. For further
+information, please refer to the {ref}`statistics section<sec_stats>` of the
+tskit documentation.
diff --git a/requirements/CI-docs/requirements.txt b/requirements/CI-docs/requirements.txt
index 1a8ed794..661a6405 100644
--- a/requirements/CI-docs/requirements.txt
+++ b/requirements/CI-docs/requirements.txt
@@ -8,4 +8,6 @@ daiquiri==3.2.5.1
 msprime==1.3.2
 sgkit[vcf]==0.9.0
 ipywidgets==8.1.3
+Bio==1.7.1
+bio2zarr==0.1.1
 sphinx-book-theme #Unpinned to allow easy updating.