From 1342e5da39f34c2df7e5810b0d74a378dcbfbec4 Mon Sep 17 00:00:00 2001 From: Yan Wong Date: Fri, 26 Jul 2024 19:21:37 +0100 Subject: [PATCH] Install tabix via apt-get, and use the notebook python for install --- .github/workflows/docs.yml | 2 + docs/simulation-example.py | 80 +++++++++++++++++++------------------- docs/tutorial.md | 14 +++++-- 3 files changed, 51 insertions(+), 45 deletions(-) diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index ffdcb52e..f0207876 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -56,6 +56,8 @@ jobs: - name: Build Docs run: | . venv/bin/activate + eval "$(conda shell.bash hook)" + conda activate cd docs && make dist - name: Trigger docs site rebuild diff --git a/docs/simulation-example.py b/docs/simulation-example.py index 45ab12aa..b72601c1 100644 --- a/docs/simulation-example.py +++ b/docs/simulation-example.py @@ -1,44 +1,42 @@ -import os +import builtins +import subprocess import sys import msprime -import tqdm - -sys.path.insert(0, os.path.abspath("..")) -import tsinfer # noqa - -if True: - ts = msprime.simulate( - sample_size=10000, - Ne=10**4, - recombination_rate=1e-8, - mutation_rate=1e-8, - length=10 * 10**6, - random_seed=42, - ) - ts.dump("simulation-source.trees") - print("Simulation done:", ts.num_trees, "trees and", ts.num_sites) - - with tsinfer.SampleData( - sequence_length=ts.sequence_length, - path="simulation.samples", - num_flush_threads=2, - ) as samples: - for var in tqdm.tqdm(ts.variants(), total=ts.num_sites): - samples.add_site(var.site.position, var.genotypes, var.alleles) - -else: - source = msprime.load("simulation-source.trees") - inferred = msprime.load("simulation.trees") - - subset = range(0, 6) - source_subset = source.simplify(subset) - inferred_subset = inferred.simplify(subset) - - tree = source_subset.first() - print("True tree: interval=", tree.interval) - print(tree.draw(format="unicode")) - - tree = inferred_subset.first() - print("Inferred tree: interval=", tree.interval) - print(tree.draw(format="unicode")) +import numpy as np +from Bio import bgzf + +if getattr(builtins, "__IPYTHON__", False): # if running IPython: e.g. in a notebook + num_diploids, seq_len = 100, 10_000 + name = "notebook-simulation" +else: # Take parameters from the command-line + num_diploids, seq_len = int(sys.argv[1]), float(sys.argv[2]) + name = "cli-simulation" + +ts = msprime.sim_ancestry( + num_diploids, + population_size=10**4, + recombination_rate=1e-8, + sequence_length=seq_len, + random_seed=6, +) +ts = msprime.sim_mutations(ts, rate=1e-8, random_seed=7) +ts.dump(name + "-source.trees") +print( + f"Simulated {ts.num_samples} samples over {seq_len/1e6} Mb:", + f"{ts.num_trees} trees and {ts.num_sites} sites", +) + +# Convert to a zarr file: this should be easier once a tskit2zarr utility is made, see +# https://github.com/sgkit-dev/bio2zarr/issues/232 +np.save(f"{name}-AA.npy", [s.ancestral_state for s in ts.sites()]) +vcf_name = f"{name}.vcf.gz" +with bgzf.open(vcf_name, "wt") as f: + ts.write_vcf(f) +subprocess.run(["tabix", vcf_name]) +ret = subprocess.run( + "python -m bio2zarr vcf2zarr convert --force".split() + [vcf_name, name + ".vcz"], + stderr=subprocess.DEVNULL if name == "notebook-simulation" else None, +) +if ret.returncode == 0: + print(f"Converted to {name}.vcz") diff --git a/docs/tutorial.md b/docs/tutorial.md index 113a041c..d2802472 100644 --- a/docs/tutorial.md +++ b/docs/tutorial.md @@ -203,8 +203,10 @@ Python to simulate some data under the coalescent with recombination, using import builtins import sys -from Bio import bgzf +import os import subprocess + +from Bio import bgzf import numpy as np import msprime @@ -213,10 +215,12 @@ import tsinfer if getattr(builtins, "__IPYTHON__", False): # if running IPython: e.g. in a notebook num_diploids, seq_len = 100, 10_000 name = "notebook-simulation" + python = sys.executable else: # Take parameters from the command-line num_diploids, seq_len = int(sys.argv[1]), float(sys.argv[2]) name = "cli-simulation" - + python = "python" + ts = msprime.sim_ancestry( num_diploids, population_size=10**4, @@ -239,9 +243,11 @@ with bgzf.open(vcf_name, "wt") as f: ts.write_vcf(f) subprocess.run(["tabix", vcf_name]) ret = subprocess.run( - "python -m bio2zarr vcf2zarr convert --force".split() + [vcf_name, name+".vcz"], - stderr = subprocess.DEVNULL if name == "notebook-simulation" else None + [python, "-m", "bio2zarr", "vcf2zarr", "convert", "--force", vcf_name, f"{name}.vcz"], + stderr = subprocess.DEVNULL if name == "notebook-simulation" else None, ) +assert os.path.exists(f"{name}.vcz") + if ret.returncode == 0: print(f"Converted to {name}.vcz") ```