Skip to content

Commit

Permalink
Add iterative python implementation
Browse files Browse the repository at this point in the history
  • Loading branch information
benjeffery committed Oct 21, 2022
1 parent 76e7af0 commit b8c9809
Show file tree
Hide file tree
Showing 4 changed files with 77 additions and 3 deletions.
4 changes: 2 additions & 2 deletions tree_performance/benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@
import sys
import os
import pathlib
import benchmarks.numba
import benchmarks.numba_vectorised
import benchmarks.numba_recursive
import benchmarks.numba_recursive_vectorised
import benchmarks.pythran
import pandas as pd
import numpy as np
Expand Down
74 changes: 74 additions & 0 deletions tree_performance/benchmarks/numba_iterative.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
import tskit
import util
import functools
import numba
import numpy as np
import msprime

VECTORISED = False

@numba.njit()
def _iterative_hartigan_parsimony(genotypes, samples, left_child, parent, postorder, preorder, root):
num_nodes = len(postorder)
num_alleles = np.max(genotypes) + 1
optimal_set = np.zeros((num_nodes, num_alleles), dtype=np.int8)
# Simple version assuming non missing data and one root
for allele, u in zip(genotypes, samples):
optimal_set[u, allele] = 1

allele_count = np.zeros((num_nodes, num_alleles), dtype=np.int32)
for node_j in postorder:
if left_child[node_j] != tskit.NULL:
max_allele_count = 0
for allele_k in range(num_alleles):
if allele_count[node_j, allele_k] > max_allele_count:
max_allele_count = allele_count[node_j, allele_k]
for allele_k in range(num_alleles):
if allele_count[node_j, allele_k] == max_allele_count:
optimal_set[node_j, allele_k] = 1
for allele_k in range(num_alleles):
allele_count[parent[node_j], allele_k] += optimal_set[node_j, allele_k]

anc_index, max_val = -1, -1
for i in range(len(optimal_set[root])):
if optimal_set[root, i] > max_val:
anc_index, max_val = i, optimal_set[root, i]

state = np.zeros((num_nodes), dtype=np.int32)
state[:] = anc_index
mutations = 0
for node_j in preorder:
state[node_j] = state[parent[node_j]]
if optimal_set[node_j, state[node_j]] == 0:
maxval, argmax = -1, -1
for k in range(num_alleles):
if optimal_set[node_j, k] > maxval:
maxval = optimal_set[node_j, k]
argmax = k
state[node_j] = argmax
mutations += 1
return mutations

def iterative_hartigan_parsimony(tree, genotypes, alleles):
samples = tree.tree_sequence.samples()
left_child = tree.left_child_array
parent = tree.parent_array
postorder = tree.postorder()
preorder = tree.preorder()
return _iterative_hartigan_parsimony(genotypes, samples, left_child, parent, postorder, preorder, tree.root)

def run(ts_path, max_sites):
ts = tskit.load(ts_path)
assert ts.num_trees == 1
tree = ts.first()
return util.benchmark_python(
ts,
functools.partial(iterative_hartigan_parsimony, tree),
"py_numba_iterative",
max_sites=max_sites,
)

def warmup():
ts = msprime.sim_ancestry(100, sequence_length=100000, random_seed=43)
genotypes = np.zeros(ts.num_samples, dtype=np.int8)
iterative_hartigan_parsimony(ts.first(), genotypes, ["0"])
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ def run(ts_path, max_sites):
return util.benchmark_python(
ts,
functools.partial(numba_hartigan_parsimony, tree),
"py_numba",
"py_numba_recursive",
max_sites=max_sites,
)

Expand Down

0 comments on commit b8c9809

Please sign in to comment.