From 09865fbbbc47e40da986f8f8a7914866bce016cc Mon Sep 17 00:00:00 2001
From: Yan Wong <yan.wong@bdi.ox.ac.uk>
Date: Fri, 26 Jul 2024 16:17:40 +0100
Subject: [PATCH] Remove sgkit from the tutorial

---
 docs/.gitignore                               |   3 +
 docs/_static/example_data.vcz/.zattrs         |   6 +
 docs/_static/example_data.vcz/.zgroup         |   3 +
 docs/_static/example_data.vcz/.zmetadata      | 243 ++++++++++++++++++
 .../example_data.vcz/call_genotype/.zarray    |  24 ++
 .../example_data.vcz/call_genotype/.zattrs    |   9 +
 .../example_data.vcz/call_genotype/0.0.0      | Bin 0 -> 64 bytes
 .../call_genotype_mask/.zarray                |  24 ++
 .../call_genotype_mask/.zattrs                |   9 +
 .../example_data.vcz/call_genotype_mask/0.0.0 | Bin 0 -> 64 bytes
 .../call_genotype_phased/.zarray              |  22 ++
 .../call_genotype_phased/.zattrs              |   8 +
 .../example_data.vcz/call_genotype_phased/0.0 | Bin 0 -> 40 bytes
 .../example_data.vcz/contig_id/.zarray        |  20 ++
 .../example_data.vcz/contig_id/.zattrs        |   6 +
 docs/_static/example_data.vcz/contig_id/0     | Bin 0 -> 20 bytes
 .../example_data.vcz/sample_id/.zarray        |  20 ++
 .../example_data.vcz/sample_id/.zattrs        |   6 +
 docs/_static/example_data.vcz/sample_id/0     | Bin 0 -> 40 bytes
 .../example_data.vcz/variant_allele/.zarray   |  22 ++
 .../example_data.vcz/variant_allele/.zattrs   |   7 +
 .../example_data.vcz/variant_allele/0.0       | Bin 0 -> 32 bytes
 .../example_data.vcz/variant_contig/.zarray   |  20 ++
 .../example_data.vcz/variant_contig/.zattrs   |   6 +
 .../_static/example_data.vcz/variant_contig/0 | Bin 0 -> 80 bytes
 .../example_data.vcz/variant_position/.zarray |  20 ++
 .../example_data.vcz/variant_position/.zattrs |   6 +
 .../example_data.vcz/variant_position/0       | Bin 0 -> 80 bytes
 docs/tutorial.md                              |  38 +--
 29 files changed, 505 insertions(+), 17 deletions(-)
 create mode 100644 docs/_static/example_data.vcz/.zattrs
 create mode 100644 docs/_static/example_data.vcz/.zgroup
 create mode 100644 docs/_static/example_data.vcz/.zmetadata
 create mode 100644 docs/_static/example_data.vcz/call_genotype/.zarray
 create mode 100644 docs/_static/example_data.vcz/call_genotype/.zattrs
 create mode 100644 docs/_static/example_data.vcz/call_genotype/0.0.0
 create mode 100644 docs/_static/example_data.vcz/call_genotype_mask/.zarray
 create mode 100644 docs/_static/example_data.vcz/call_genotype_mask/.zattrs
 create mode 100644 docs/_static/example_data.vcz/call_genotype_mask/0.0.0
 create mode 100644 docs/_static/example_data.vcz/call_genotype_phased/.zarray
 create mode 100644 docs/_static/example_data.vcz/call_genotype_phased/.zattrs
 create mode 100644 docs/_static/example_data.vcz/call_genotype_phased/0.0
 create mode 100644 docs/_static/example_data.vcz/contig_id/.zarray
 create mode 100644 docs/_static/example_data.vcz/contig_id/.zattrs
 create mode 100644 docs/_static/example_data.vcz/contig_id/0
 create mode 100644 docs/_static/example_data.vcz/sample_id/.zarray
 create mode 100644 docs/_static/example_data.vcz/sample_id/.zattrs
 create mode 100644 docs/_static/example_data.vcz/sample_id/0
 create mode 100644 docs/_static/example_data.vcz/variant_allele/.zarray
 create mode 100644 docs/_static/example_data.vcz/variant_allele/.zattrs
 create mode 100644 docs/_static/example_data.vcz/variant_allele/0.0
 create mode 100644 docs/_static/example_data.vcz/variant_contig/.zarray
 create mode 100644 docs/_static/example_data.vcz/variant_contig/.zattrs
 create mode 100644 docs/_static/example_data.vcz/variant_contig/0
 create mode 100644 docs/_static/example_data.vcz/variant_position/.zarray
 create mode 100644 docs/_static/example_data.vcz/variant_position/.zattrs
 create mode 100644 docs/_static/example_data.vcz/variant_position/0

diff --git a/docs/.gitignore b/docs/.gitignore
index f3c3680f..f4b2a542 100644
--- a/docs/.gitignore
+++ b/docs/.gitignore
@@ -1,4 +1,7 @@
 notebook-simulation.trees
 notebook-simulation.samples
 notebook-simulation-source.trees
+notebook-simulation.vc*
+notebook-simulation-AA.npy
 P_dom_chr24_phased.samples
+sparrows.vcz
diff --git a/docs/_static/example_data.vcz/.zattrs b/docs/_static/example_data.vcz/.zattrs
new file mode 100644
index 00000000..f778611e
--- /dev/null
+++ b/docs/_static/example_data.vcz/.zattrs
@@ -0,0 +1,6 @@
+{
+    "contigs": [
+        "0"
+    ],
+    "source": "sgkit-0.9.0"
+}
\ No newline at end of file
diff --git a/docs/_static/example_data.vcz/.zgroup b/docs/_static/example_data.vcz/.zgroup
new file mode 100644
index 00000000..3b7daf22
--- /dev/null
+++ b/docs/_static/example_data.vcz/.zgroup
@@ -0,0 +1,3 @@
+{
+    "zarr_format": 2
+}
\ No newline at end of file
diff --git a/docs/_static/example_data.vcz/.zmetadata b/docs/_static/example_data.vcz/.zmetadata
new file mode 100644
index 00000000..24b95bd0
--- /dev/null
+++ b/docs/_static/example_data.vcz/.zmetadata
@@ -0,0 +1,243 @@
+{
+    "metadata": {
+        ".zattrs": {
+            "contigs": [
+                "0"
+            ],
+            "source": "sgkit-0.9.0"
+        },
+        ".zgroup": {
+            "zarr_format": 2
+        },
+        "call_genotype/.zarray": {
+            "chunks": [
+                8,
+                3,
+                2
+            ],
+            "compressor": {
+                "blocksize": 0,
+                "clevel": 5,
+                "cname": "lz4",
+                "id": "blosc",
+                "shuffle": 1
+            },
+            "dtype": "|i1",
+            "fill_value": null,
+            "filters": null,
+            "order": "C",
+            "shape": [
+                8,
+                3,
+                2
+            ],
+            "zarr_format": 2
+        },
+        "call_genotype/.zattrs": {
+            "_ARRAY_DIMENSIONS": [
+                "variants",
+                "samples",
+                "ploidy"
+            ],
+            "comment": "Call genotype. Encoded as allele values (0 for the reference, 1 for\nthe first allele, 2 for the second allele), -1 to indicate a\nmissing value, or -2 to indicate a non allele in mixed ploidy datasets.",
+            "mixed_ploidy": false
+        },
+        "call_genotype_mask/.zarray": {
+            "chunks": [
+                8,
+                3,
+                2
+            ],
+            "compressor": {
+                "blocksize": 0,
+                "clevel": 5,
+                "cname": "lz4",
+                "id": "blosc",
+                "shuffle": 1
+            },
+            "dtype": "|i1",
+            "fill_value": null,
+            "filters": null,
+            "order": "C",
+            "shape": [
+                8,
+                3,
+                2
+            ],
+            "zarr_format": 2
+        },
+        "call_genotype_mask/.zattrs": {
+            "_ARRAY_DIMENSIONS": [
+                "variants",
+                "samples",
+                "ploidy"
+            ],
+            "comment": "A flag for each call indicating which values are missing.",
+            "dtype": "bool"
+        },
+        "call_genotype_phased/.zarray": {
+            "chunks": [
+                8,
+                3
+            ],
+            "compressor": {
+                "blocksize": 0,
+                "clevel": 5,
+                "cname": "lz4",
+                "id": "blosc",
+                "shuffle": 1
+            },
+            "dtype": "|i1",
+            "fill_value": null,
+            "filters": null,
+            "order": "C",
+            "shape": [
+                8,
+                3
+            ],
+            "zarr_format": 2
+        },
+        "call_genotype_phased/.zattrs": {
+            "_ARRAY_DIMENSIONS": [
+                "variants",
+                "samples"
+            ],
+            "comment": "A flag for each call indicating if it is phased or not. If omitted\nall calls are unphased.",
+            "dtype": "bool"
+        },
+        "contig_id/.zarray": {
+            "chunks": [
+                1
+            ],
+            "compressor": {
+                "blocksize": 0,
+                "clevel": 5,
+                "cname": "lz4",
+                "id": "blosc",
+                "shuffle": 1
+            },
+            "dtype": "<U1",
+            "fill_value": null,
+            "filters": null,
+            "order": "C",
+            "shape": [
+                1
+            ],
+            "zarr_format": 2
+        },
+        "contig_id/.zattrs": {
+            "_ARRAY_DIMENSIONS": [
+                "contigs"
+            ],
+            "comment": "Contig identifiers."
+        },
+        "sample_id/.zarray": {
+            "chunks": [
+                3
+            ],
+            "compressor": {
+                "blocksize": 0,
+                "clevel": 5,
+                "cname": "lz4",
+                "id": "blosc",
+                "shuffle": 1
+            },
+            "dtype": "<U2",
+            "fill_value": null,
+            "filters": null,
+            "order": "C",
+            "shape": [
+                3
+            ],
+            "zarr_format": 2
+        },
+        "sample_id/.zattrs": {
+            "_ARRAY_DIMENSIONS": [
+                "samples"
+            ],
+            "comment": "The unique identifier of the sample."
+        },
+        "variant_allele/.zarray": {
+            "chunks": [
+                8,
+                2
+            ],
+            "compressor": {
+                "blocksize": 0,
+                "clevel": 5,
+                "cname": "lz4",
+                "id": "blosc",
+                "shuffle": 1
+            },
+            "dtype": "|S1",
+            "fill_value": null,
+            "filters": null,
+            "order": "C",
+            "shape": [
+                8,
+                2
+            ],
+            "zarr_format": 2
+        },
+        "variant_allele/.zattrs": {
+            "_ARRAY_DIMENSIONS": [
+                "variants",
+                "alleles"
+            ],
+            "comment": "The possible alleles for the variant."
+        },
+        "variant_contig/.zarray": {
+            "chunks": [
+                8
+            ],
+            "compressor": {
+                "blocksize": 0,
+                "clevel": 5,
+                "cname": "lz4",
+                "id": "blosc",
+                "shuffle": 1
+            },
+            "dtype": "<i8",
+            "fill_value": null,
+            "filters": null,
+            "order": "C",
+            "shape": [
+                8
+            ],
+            "zarr_format": 2
+        },
+        "variant_contig/.zattrs": {
+            "_ARRAY_DIMENSIONS": [
+                "variants"
+            ],
+            "comment": "Index corresponding to contig name for each variant. In some less common\nscenarios, this may also be equivalent to the contig names if the data\ngenerating process used contig names that were also integers."
+        },
+        "variant_position/.zarray": {
+            "chunks": [
+                8
+            ],
+            "compressor": {
+                "blocksize": 0,
+                "clevel": 5,
+                "cname": "lz4",
+                "id": "blosc",
+                "shuffle": 1
+            },
+            "dtype": "<i8",
+            "fill_value": null,
+            "filters": null,
+            "order": "C",
+            "shape": [
+                8
+            ],
+            "zarr_format": 2
+        },
+        "variant_position/.zattrs": {
+            "_ARRAY_DIMENSIONS": [
+                "variants"
+            ],
+            "comment": "The reference position of the variant."
+        }
+    },
+    "zarr_consolidated_format": 1
+}
\ No newline at end of file
diff --git a/docs/_static/example_data.vcz/call_genotype/.zarray b/docs/_static/example_data.vcz/call_genotype/.zarray
new file mode 100644
index 00000000..105c8f40
--- /dev/null
+++ b/docs/_static/example_data.vcz/call_genotype/.zarray
@@ -0,0 +1,24 @@
+{
+    "chunks": [
+        8,
+        3,
+        2
+    ],
+    "compressor": {
+        "blocksize": 0,
+        "clevel": 5,
+        "cname": "lz4",
+        "id": "blosc",
+        "shuffle": 1
+    },
+    "dtype": "|i1",
+    "fill_value": null,
+    "filters": null,
+    "order": "C",
+    "shape": [
+        8,
+        3,
+        2
+    ],
+    "zarr_format": 2
+}
\ No newline at end of file
diff --git a/docs/_static/example_data.vcz/call_genotype/.zattrs b/docs/_static/example_data.vcz/call_genotype/.zattrs
new file mode 100644
index 00000000..7b7753c5
--- /dev/null
+++ b/docs/_static/example_data.vcz/call_genotype/.zattrs
@@ -0,0 +1,9 @@
+{
+    "_ARRAY_DIMENSIONS": [
+        "variants",
+        "samples",
+        "ploidy"
+    ],
+    "comment": "Call genotype. Encoded as allele values (0 for the reference, 1 for\nthe first allele, 2 for the second allele), -1 to indicate a\nmissing value, or -2 to indicate a non allele in mixed ploidy datasets.",
+    "mixed_ploidy": false
+}
\ No newline at end of file
diff --git a/docs/_static/example_data.vcz/call_genotype/0.0.0 b/docs/_static/example_data.vcz/call_genotype/0.0.0
new file mode 100644
index 0000000000000000000000000000000000000000..23f15862001156da78f92d5bf6fab49d72130ee9
GIT binary patch
literal 64
ncmZQ#G-fnlU|;}Y2Owr-WMp6j(qI;l4WywMDh5^oC4fQzGI0Ry

literal 0
HcmV?d00001

diff --git a/docs/_static/example_data.vcz/call_genotype_mask/.zarray b/docs/_static/example_data.vcz/call_genotype_mask/.zarray
new file mode 100644
index 00000000..105c8f40
--- /dev/null
+++ b/docs/_static/example_data.vcz/call_genotype_mask/.zarray
@@ -0,0 +1,24 @@
+{
+    "chunks": [
+        8,
+        3,
+        2
+    ],
+    "compressor": {
+        "blocksize": 0,
+        "clevel": 5,
+        "cname": "lz4",
+        "id": "blosc",
+        "shuffle": 1
+    },
+    "dtype": "|i1",
+    "fill_value": null,
+    "filters": null,
+    "order": "C",
+    "shape": [
+        8,
+        3,
+        2
+    ],
+    "zarr_format": 2
+}
\ No newline at end of file
diff --git a/docs/_static/example_data.vcz/call_genotype_mask/.zattrs b/docs/_static/example_data.vcz/call_genotype_mask/.zattrs
new file mode 100644
index 00000000..6a43b7ab
--- /dev/null
+++ b/docs/_static/example_data.vcz/call_genotype_mask/.zattrs
@@ -0,0 +1,9 @@
+{
+    "_ARRAY_DIMENSIONS": [
+        "variants",
+        "samples",
+        "ploidy"
+    ],
+    "comment": "A flag for each call indicating which values are missing.",
+    "dtype": "bool"
+}
\ No newline at end of file
diff --git a/docs/_static/example_data.vcz/call_genotype_mask/0.0.0 b/docs/_static/example_data.vcz/call_genotype_mask/0.0.0
new file mode 100644
index 0000000000000000000000000000000000000000..574f35477129d8a454163d31079d3ad83bae1fed
GIT binary patch
literal 64
VcmZQ#G-fnlU|;}Y2OuUD001yn0N4Nk

literal 0
HcmV?d00001

diff --git a/docs/_static/example_data.vcz/call_genotype_phased/.zarray b/docs/_static/example_data.vcz/call_genotype_phased/.zarray
new file mode 100644
index 00000000..c862e179
--- /dev/null
+++ b/docs/_static/example_data.vcz/call_genotype_phased/.zarray
@@ -0,0 +1,22 @@
+{
+    "chunks": [
+        8,
+        3
+    ],
+    "compressor": {
+        "blocksize": 0,
+        "clevel": 5,
+        "cname": "lz4",
+        "id": "blosc",
+        "shuffle": 1
+    },
+    "dtype": "|i1",
+    "fill_value": null,
+    "filters": null,
+    "order": "C",
+    "shape": [
+        8,
+        3
+    ],
+    "zarr_format": 2
+}
\ No newline at end of file
diff --git a/docs/_static/example_data.vcz/call_genotype_phased/.zattrs b/docs/_static/example_data.vcz/call_genotype_phased/.zattrs
new file mode 100644
index 00000000..7ae8d94f
--- /dev/null
+++ b/docs/_static/example_data.vcz/call_genotype_phased/.zattrs
@@ -0,0 +1,8 @@
+{
+    "_ARRAY_DIMENSIONS": [
+        "variants",
+        "samples"
+    ],
+    "comment": "A flag for each call indicating if it is phased or not. If omitted\nall calls are unphased.",
+    "dtype": "bool"
+}
\ No newline at end of file
diff --git a/docs/_static/example_data.vcz/call_genotype_phased/0.0 b/docs/_static/example_data.vcz/call_genotype_phased/0.0
new file mode 100644
index 0000000000000000000000000000000000000000..b368df0f5594eadfe8178ee8e655be05a0db0bf0
GIT binary patch
literal 40
VcmZQ#G-i}wU|;}Y4IpO30RR+20H^=}

literal 0
HcmV?d00001

diff --git a/docs/_static/example_data.vcz/contig_id/.zarray b/docs/_static/example_data.vcz/contig_id/.zarray
new file mode 100644
index 00000000..ce072443
--- /dev/null
+++ b/docs/_static/example_data.vcz/contig_id/.zarray
@@ -0,0 +1,20 @@
+{
+    "chunks": [
+        1
+    ],
+    "compressor": {
+        "blocksize": 0,
+        "clevel": 5,
+        "cname": "lz4",
+        "id": "blosc",
+        "shuffle": 1
+    },
+    "dtype": "<U1",
+    "fill_value": null,
+    "filters": null,
+    "order": "C",
+    "shape": [
+        1
+    ],
+    "zarr_format": 2
+}
\ No newline at end of file
diff --git a/docs/_static/example_data.vcz/contig_id/.zattrs b/docs/_static/example_data.vcz/contig_id/.zattrs
new file mode 100644
index 00000000..63b8cf86
--- /dev/null
+++ b/docs/_static/example_data.vcz/contig_id/.zattrs
@@ -0,0 +1,6 @@
+{
+    "_ARRAY_DIMENSIONS": [
+        "contigs"
+    ],
+    "comment": "Contig identifiers."
+}
\ No newline at end of file
diff --git a/docs/_static/example_data.vcz/contig_id/0 b/docs/_static/example_data.vcz/contig_id/0
new file mode 100644
index 0000000000000000000000000000000000000000..113105d5dbd5e8e0105d6cac25faca56be3dc137
GIT binary patch
literal 20
VcmZQ#G-hF8U|;}Y5g;}IVgLpC0EYko

literal 0
HcmV?d00001

diff --git a/docs/_static/example_data.vcz/sample_id/.zarray b/docs/_static/example_data.vcz/sample_id/.zarray
new file mode 100644
index 00000000..c27ef48d
--- /dev/null
+++ b/docs/_static/example_data.vcz/sample_id/.zarray
@@ -0,0 +1,20 @@
+{
+    "chunks": [
+        3
+    ],
+    "compressor": {
+        "blocksize": 0,
+        "clevel": 5,
+        "cname": "lz4",
+        "id": "blosc",
+        "shuffle": 1
+    },
+    "dtype": "<U2",
+    "fill_value": null,
+    "filters": null,
+    "order": "C",
+    "shape": [
+        3
+    ],
+    "zarr_format": 2
+}
\ No newline at end of file
diff --git a/docs/_static/example_data.vcz/sample_id/.zattrs b/docs/_static/example_data.vcz/sample_id/.zattrs
new file mode 100644
index 00000000..801e30ed
--- /dev/null
+++ b/docs/_static/example_data.vcz/sample_id/.zattrs
@@ -0,0 +1,6 @@
+{
+    "_ARRAY_DIMENSIONS": [
+        "samples"
+    ],
+    "comment": "The unique identifier of the sample."
+}
\ No newline at end of file
diff --git a/docs/_static/example_data.vcz/sample_id/0 b/docs/_static/example_data.vcz/sample_id/0
new file mode 100644
index 0000000000000000000000000000000000000000..7ead0f074664cbea13c2b63f1d8c6be1435953ee
GIT binary patch
literal 40
dcmZQ#H0F?CU|;}Y4ImB%Vgo2`2&IjHGyp1n0wVwb

literal 0
HcmV?d00001

diff --git a/docs/_static/example_data.vcz/variant_allele/.zarray b/docs/_static/example_data.vcz/variant_allele/.zarray
new file mode 100644
index 00000000..0ccff202
--- /dev/null
+++ b/docs/_static/example_data.vcz/variant_allele/.zarray
@@ -0,0 +1,22 @@
+{
+    "chunks": [
+        8,
+        2
+    ],
+    "compressor": {
+        "blocksize": 0,
+        "clevel": 5,
+        "cname": "lz4",
+        "id": "blosc",
+        "shuffle": 1
+    },
+    "dtype": "|S1",
+    "fill_value": null,
+    "filters": null,
+    "order": "C",
+    "shape": [
+        8,
+        2
+    ],
+    "zarr_format": 2
+}
\ No newline at end of file
diff --git a/docs/_static/example_data.vcz/variant_allele/.zattrs b/docs/_static/example_data.vcz/variant_allele/.zattrs
new file mode 100644
index 00000000..2de87677
--- /dev/null
+++ b/docs/_static/example_data.vcz/variant_allele/.zattrs
@@ -0,0 +1,7 @@
+{
+    "_ARRAY_DIMENSIONS": [
+        "variants",
+        "alleles"
+    ],
+    "comment": "The possible alleles for the variant."
+}
\ No newline at end of file
diff --git a/docs/_static/example_data.vcz/variant_allele/0.0 b/docs/_static/example_data.vcz/variant_allele/0.0
new file mode 100644
index 0000000000000000000000000000000000000000..67ced5934ef3f1662a50d2f92571118a8e60fab1
GIT binary patch
literal 32
jcmZQ#G-ecFU|;}Y1t4}0aSw5J4{>yMcMNfNb_@XkGl&HX

literal 0
HcmV?d00001

diff --git a/docs/_static/example_data.vcz/variant_contig/.zarray b/docs/_static/example_data.vcz/variant_contig/.zarray
new file mode 100644
index 00000000..bb6f54dd
--- /dev/null
+++ b/docs/_static/example_data.vcz/variant_contig/.zarray
@@ -0,0 +1,20 @@
+{
+    "chunks": [
+        8
+    ],
+    "compressor": {
+        "blocksize": 0,
+        "clevel": 5,
+        "cname": "lz4",
+        "id": "blosc",
+        "shuffle": 1
+    },
+    "dtype": "<i8",
+    "fill_value": null,
+    "filters": null,
+    "order": "C",
+    "shape": [
+        8
+    ],
+    "zarr_format": 2
+}
\ No newline at end of file
diff --git a/docs/_static/example_data.vcz/variant_contig/.zattrs b/docs/_static/example_data.vcz/variant_contig/.zattrs
new file mode 100644
index 00000000..a22ac19d
--- /dev/null
+++ b/docs/_static/example_data.vcz/variant_contig/.zattrs
@@ -0,0 +1,6 @@
+{
+    "_ARRAY_DIMENSIONS": [
+        "variants"
+    ],
+    "comment": "Index corresponding to contig name for each variant. In some less common\nscenarios, this may also be equivalent to the contig names if the data\ngenerating process used contig names that were also integers."
+}
\ No newline at end of file
diff --git a/docs/_static/example_data.vcz/variant_contig/0 b/docs/_static/example_data.vcz/variant_contig/0
new file mode 100644
index 0000000000000000000000000000000000000000..72c48136bff6a256718b29f241d505b3b85f73f4
GIT binary patch
literal 80
VcmZQ#H0E$%U|;~@03fCm002#K0S^EG

literal 0
HcmV?d00001

diff --git a/docs/_static/example_data.vcz/variant_position/.zarray b/docs/_static/example_data.vcz/variant_position/.zarray
new file mode 100644
index 00000000..bb6f54dd
--- /dev/null
+++ b/docs/_static/example_data.vcz/variant_position/.zarray
@@ -0,0 +1,20 @@
+{
+    "chunks": [
+        8
+    ],
+    "compressor": {
+        "blocksize": 0,
+        "clevel": 5,
+        "cname": "lz4",
+        "id": "blosc",
+        "shuffle": 1
+    },
+    "dtype": "<i8",
+    "fill_value": null,
+    "filters": null,
+    "order": "C",
+    "shape": [
+        8
+    ],
+    "zarr_format": 2
+}
\ No newline at end of file
diff --git a/docs/_static/example_data.vcz/variant_position/.zattrs b/docs/_static/example_data.vcz/variant_position/.zattrs
new file mode 100644
index 00000000..a9a8102f
--- /dev/null
+++ b/docs/_static/example_data.vcz/variant_position/.zattrs
@@ -0,0 +1,6 @@
+{
+    "_ARRAY_DIMENSIONS": [
+        "variants"
+    ],
+    "comment": "The reference position of the variant."
+}
\ No newline at end of file
diff --git a/docs/_static/example_data.vcz/variant_position/0 b/docs/_static/example_data.vcz/variant_position/0
new file mode 100644
index 0000000000000000000000000000000000000000..ce6b66302645e6f0c1a4ebd6934f1fd9b0b3a7cf
GIT binary patch
literal 80
ncmZQ#H0E$%U|;~@03e0|Mks>`N;5-g7AVaMrP-h~JCp_hP!Rzu

literal 0
HcmV?d00001

diff --git a/docs/tutorial.md b/docs/tutorial.md
index 23101b93..27bbf80a 100644
--- a/docs/tutorial.md
+++ b/docs/tutorial.md
@@ -27,26 +27,24 @@ _Tsinfer_ takes as input a [Zarr](https://zarr.readthedocs.io/) file, with phase
 [VCF Zarr](https://github.com/sgkit-dev/vcf-zarr-spec/) (.vcz) format. The standard
 route to create such a file is by conversion from a VCF file, e.g. using
 [vcf2zarr](https://sgkit-dev.github.io/bio2zarr/vcf2zarr/overview.html) as described later in this
-document. For a quick introduction, however, we will instead create an example file using
-[sgkit](https://sgkit-dev.github.io/sgkit/latest/).
+document. However, for the moment we'll just use a pre-generated dataset:
 
 
 ```{code-cell} ipython3
-import sgkit
-ds = sgkit.simulate_genotype_call_dataset(n_variant=8, n_sample=3, missing_pct=0, phased=True, seed=79)
-sgkit.save_dataset(ds, "data.vcz", mode="w")
+import zarr
+ds = zarr.load("_static/example_data.vcz")
 ```
 
-This is what that generated data looks like:
+This is what the genotypes stored in that data file look like:
 
 ```{code-cell}
 :"tags": ["remove-input"]
 import numpy as np
-assert all(len(np.unique(a)) == len(a) for a in ds['variant_allele'].values) 
-assert any([np.sum(g.values) == 1 for g in ds['call_genotype']]) # at least one singleton
-assert any([np.sum(g.values) == 0 for g in ds['call_genotype']]) # at least one non-variable
+assert all(len(np.unique(a)) == len(a) for a in ds['variant_allele']) 
+assert any([np.sum(g) == 1 for g in ds['call_genotype']]) # at least one singleton
+assert any([np.sum(g) == 0 for g in ds['call_genotype']]) # at least one non-variable
 
-alleles = ds['variant_allele'].values.astype(str)
+alleles = ds['variant_allele'].astype(str)
 sites = np.arange(ds['call_genotype'].shape[0])
 print(" " * 22, "Site:", " ".join(str(x) for x in range(8)), "\n")
 for sample in range(ds['call_genotype'].shape[1]):
@@ -54,16 +52,20 @@ for sample in range(ds['call_genotype'].shape[1]):
         genotypes = ds['call_genotype'][:,sample, genome]
         print(
             f"Diploid sample {sample} (genome {genome}):",
-            " ".join(alleles[sites, genotypes.values])
+            " ".join(alleles[sites, genotypes])
         )
 ```
 
+### VariantData and ancestral alleles
+
 We wish to infer a genealogy that could have given rise to this data set. To run _tsinfer_
 we wrap the .vcz file in a `tsinfer.VariantData` object. This requires an 
 *ancestral allele* to be specified for each site; there are
 many methods for calculating there: details are outside the scope of this manual, but we
 have started a [discussion topic](https://github.com/tskit-dev/tsinfer/discussions/523)
-on this issue to provide some recommendations. Sometimes VCF files will contain the
+on this issue to provide some recommendations.
+
+Sometimes VCF files will contain the
 ancestral allele in the "AA" info field, in which case it will be encoded in the
 `variant_AA` field of the .vcz file. It's also possible to provide a numpy array
 of ancestral alleles, of the same length as the number of variants. Ancestral
@@ -73,17 +75,17 @@ and not used for inference (with a warning given).
 ```{code-cell}
 import tsinfer
 
-# In this example, take the REF allele (index 0) as ancestral
-ancestral_alleles = ds['variant_allele'].values[:,0].astype(str)
-# set the last site as of unknown ancestral allele
+# For this example take the REF allele (index 0) as ancestral
+ancestral_alleles = ds['variant_allele'][:,0].astype(str)
+# Mark the ancestral allele of the last site as unknown, for this demo
 ancestral_alleles[-1] = "."
 
-vdata = tsinfer.VariantData("data.vcz", ancestral_alleles)
+vdata = tsinfer.VariantData("_static/example_data.vcz", ancestral_alleles)
 ```
 
 Here we create a new `.VariantData` object for the 3 diploid samples in our
 dataset. Each diploid sample will correspond to an *individual* in the resulting tree
-sequenece, and each of the 6 genomes will correspond to a sample node
+sequence, and each of the 6 genomes will correspond to a sample node
 (hence `ts.num_samples == 6`). 
 
 Not all sites are used for genealogical inference: this includes non-variable (fixed)
@@ -98,6 +100,8 @@ via the `exclude_positions` parameter). Note, however, that even if a site is no
 for genealogical inference, its genetic variation can still be encoded in the final
 tree sequence.
 
+### Topology inference
+
 Once we have stored our data in a `.VariantData` object, we can easily infer 
 a {ref}`tree sequence<sec_python_api_trees_and_tree_sequences>` using the Python
 API: