From 78284a1313069e6397b3d1434f562aaf864baca1 Mon Sep 17 00:00:00 2001 From: Joel Hellewell Date: Wed, 1 May 2024 16:39:22 +0100 Subject: [PATCH] Renaming and tidying gen_list into mutation.rs --- src/gen_list.rs | 181 --------------------------------------------- src/graveyard.rs | 89 ++++++++++++++++++++++ src/hillclimb.rs | 4 +- src/lib.rs | 7 +- src/likelihoods.rs | 108 ++++++++++++++++----------- src/mutation.rs | 84 +++++++++++++++++++++ src/node.rs | 23 +++--- src/tests.rs | 2 +- src/tree.rs | 18 ++++- 9 files changed, 271 insertions(+), 245 deletions(-) delete mode 100644 src/gen_list.rs create mode 100644 src/mutation.rs diff --git a/src/gen_list.rs b/src/gen_list.rs deleted file mode 100644 index 3d45987..0000000 --- a/src/gen_list.rs +++ /dev/null @@ -1,181 +0,0 @@ -use crate::Tree; -use needletail::*; - -#[derive(Debug, Copy, Clone)] -pub struct Mutation(pub f64, pub f64, pub f64, pub f64); - -const NEGINF: f64 = -f64::INFINITY; - // (A, C, G, T) -const AMUT: Mutation = Mutation(0.0, NEGINF, NEGINF, NEGINF); -const CMUT: Mutation = Mutation(NEGINF, 0.0, NEGINF, NEGINF); -const GMUT: Mutation = Mutation(NEGINF, NEGINF, 0.0, NEGINF); -const TMUT: Mutation = Mutation(NEGINF, NEGINF, NEGINF, 0.0); -const YMUT: Mutation = Mutation(NEGINF, 0.0, NEGINF, 0.0); -const WMUT: Mutation = Mutation(0.0, NEGINF, NEGINF, 0.0); -const RMUT: Mutation = Mutation(0.0, NEGINF, 0.0, NEGINF); -const KMUT: Mutation = Mutation(NEGINF, NEGINF, 0.0, 0.0); -const SMUT: Mutation = Mutation(NEGINF, 0.0, 0.0, NEGINF); -const MMUT: Mutation = Mutation(0.0, 0.0, NEGINF, NEGINF); -const BMUT: Mutation = Mutation(NEGINF, 0.0, 0.0, 0.0); -const DMUT: Mutation = Mutation(0.0, NEGINF, 0.0, 0.0); -const VMUT: Mutation = Mutation(0.0, 0.0, 0.0, NEGINF); - -pub fn char_to_mutation(e: &char) -> Mutation { - match e { - 'A' => AMUT, - 'C' => CMUT, - 'G' => GMUT, - 'T' => TMUT, - 'Y' => YMUT, - 'W' => WMUT, - 'R' => RMUT, - 'K' => KMUT, - 'S' => SMUT, - 'M' => MMUT, - 'B' => BMUT, - 'D' => DMUT, - 'V' => VMUT, - '-' => { - // This way of coding gives same answer as other tree programs - Mutation(0.0, 0.0, 0.0, 0.0)}, - _ => panic!("Unrecognised character: {}", e), - } -} - -// Combines two vectors of Mutations into a single vector -// pub fn combine_lists( -// seq1: Option<&Vec>, -// seq2: Option<&Vec>, -// branchlengths: (f64, f64), -// rate_matrix: &na::Matrix4, -// ) -> Vec { -// let mut out: Vec = Vec::new(); - -// // Probability matrices -// let p1 = na::Matrix::exp(&(rate_matrix * branchlengths.0)); -// let p2 = na::Matrix::exp(&(rate_matrix * branchlengths.1)); - -// let mut s1 = seq1.unwrap().iter(); -// let mut s2 = seq2.unwrap().iter(); - -// let mut mut1 = s1.next(); -// let mut mut2 = s2.next(); - -// while mut1.is_some() | mut2.is_some() { -// if mut1.is_none() { -// // First iterator empty, push second -// out.push(mut2.unwrap().child_log_likelihood(&p2)); -// mut2 = s2.next(); -// } else if mut2.is_none() { -// // Second iterator empty, push first -// out.push(mut1.unwrap().child_log_likelihood(&p1)); -// mut1 = s1.next(); -// } else { -// // println!("mut1 = {:?} mut2 = {:?}", mut1.unwrap(), mut2.unwrap()); -// // Neither iterator empty, compare indices of mutations and push highest -// // or combine likelihood if mutations at same location -// match mut1.unwrap().0.cmp(&mut2.unwrap().0) { -// Ordering::Equal => { -// // println!("mut1 == mut2 so pushing {:?}", mut1.unwrap()); -// out.push( -// mut1.unwrap() -// .child_log_likelihood(&p1) -// .sum(mut2.unwrap().child_log_likelihood(&p2)), -// ); -// mut1 = s1.next(); -// mut2 = s2.next(); -// } -// Ordering::Greater => { -// // println!("mut1 > mut2 so pushing {:?}", mut2.unwrap()); -// out.push(mut2.unwrap().child_log_likelihood(&p2)); -// mut2 = s2.next(); -// } -// Ordering::Less => { -// // println!("mut2 > mut1 so pushing {:?}", mut1.unwrap()); -// out.push(mut1.unwrap().child_log_likelihood(&p1)); -// mut1 = s1.next(); -// } -// } -// } -// } -// out -// } - -pub fn combine_lists( - seq1: &[Mutation], - seq2: &[Mutation], - branchlengths: (f64, f64), - rate_matrix: &na::Matrix4, -) -> Vec { - - // Probability matrices - let p1 = na::Matrix::exp(&(rate_matrix * branchlengths.0)); - let p2 = na::Matrix::exp(&(rate_matrix * branchlengths.1)); - - let out: Vec = seq1.iter() - .zip(seq2.iter()) - .map(|(b1, b2)| b1.child_log_likelihood(&p1) - .sum(b2.child_log_likelihood(&p2))) - .collect(); - - out -} - -impl Tree { - pub fn add_genetic_data(&mut self, filename: &str) { - let mut reader = parse_fastx_file(filename).expect("Error parsing file"); - - // Add genetic data - while let Some(rec) = reader.next() { - let newrec: Vec = rec.unwrap().seq().iter().map(|l| *l as char).collect(); - self.mutation_lists.push(create_list(&newrec)); - } - - // Add empty lists for internal nodes - let leafn = self.mutation_lists.len() - 1; - for _ in 0..leafn { - self.mutation_lists.push(Vec::new()); - } - } -} - -// Takes a reference sequence and another sequence in SequenceRecord<'_> format -// Returns a vector of Mutations for how the latter sequence differs from the reference -pub fn create_list(seq: &[char]) -> Vec { - let mut out: Vec = Vec::new(); - - for base in seq.iter() { - out.push(char_to_mutation(base)); - } - - out -} - -// pub fn create_dummy_genetic_data(n_leaves: usize, n_mutations: usize, sequence_length: usize) -> Vec> { -// let mut output: Vec> = Vec::new(); -// let mut rng = rand::thread_rng(); - -// for i in 0..n_leaves { -// let mut temp: Vec = Vec::new(); -// for j in 0..n_mutations { -// let mut mutation = Mutation(rng.gen_range(1..sequence_length), 0.0, 0.0, 0.0, 0.0); -// match rng.gen_range(1..=4) { -// 1 => {mutation.1 = 1.0}, -// 2 => {mutation.2 = 1.0}, -// 3 => {mutation.3 = 1.0}, -// 4 => {mutation.4 = 1.0}, -// _ => {}, -// } -// temp.push(mutation); -// } -// temp.sort_by(|a, b| a.0.cmp(&b.0)); -// temp.dedup_by(|a, b| a.0.eq(&b.0)); -// output.push(temp); -// } - -// for _ in 0..(n_leaves + 1) { -// output.push(Vec::new()); -// } - -// output -// } \ No newline at end of file diff --git a/src/graveyard.rs b/src/graveyard.rs index 2dac599..28c0147 100644 --- a/src/graveyard.rs +++ b/src/graveyard.rs @@ -147,3 +147,92 @@ // pub fn get_tips(&self) -> Vec<&Node> { // self.nodes.iter().filter(|n| n.tip).collect() // } + + +// pub fn create_dummy_genetic_data(n_leaves: usize, n_mutations: usize, sequence_length: usize) -> Vec> { +// let mut output: Vec> = Vec::new(); +// let mut rng = rand::thread_rng(); + +// for i in 0..n_leaves { +// let mut temp: Vec = Vec::new(); +// for j in 0..n_mutations { +// let mut mutation = Mutation(rng.gen_range(1..sequence_length), 0.0, 0.0, 0.0, 0.0); +// match rng.gen_range(1..=4) { +// 1 => {mutation.1 = 1.0}, +// 2 => {mutation.2 = 1.0}, +// 3 => {mutation.3 = 1.0}, +// 4 => {mutation.4 = 1.0}, +// _ => {}, +// } +// temp.push(mutation); +// } +// temp.sort_by(|a, b| a.0.cmp(&b.0)); +// temp.dedup_by(|a, b| a.0.eq(&b.0)); +// output.push(temp); +// } + +// for _ in 0..(n_leaves + 1) { +// output.push(Vec::new()); +// } + +// output +// } + +// Combines two vectors of Mutations into a single vector +// pub fn combine_lists( +// seq1: Option<&Vec>, +// seq2: Option<&Vec>, +// branchlengths: (f64, f64), +// rate_matrix: &na::Matrix4, +// ) -> Vec { +// let mut out: Vec = Vec::new(); + +// // Probability matrices +// let p1 = na::Matrix::exp(&(rate_matrix * branchlengths.0)); +// let p2 = na::Matrix::exp(&(rate_matrix * branchlengths.1)); + +// let mut s1 = seq1.unwrap().iter(); +// let mut s2 = seq2.unwrap().iter(); + +// let mut mut1 = s1.next(); +// let mut mut2 = s2.next(); + +// while mut1.is_some() | mut2.is_some() { +// if mut1.is_none() { +// // First iterator empty, push second +// out.push(mut2.unwrap().child_log_likelihood(&p2)); +// mut2 = s2.next(); +// } else if mut2.is_none() { +// // Second iterator empty, push first +// out.push(mut1.unwrap().child_log_likelihood(&p1)); +// mut1 = s1.next(); +// } else { +// // println!("mut1 = {:?} mut2 = {:?}", mut1.unwrap(), mut2.unwrap()); +// // Neither iterator empty, compare indices of mutations and push highest +// // or combine likelihood if mutations at same location +// match mut1.unwrap().0.cmp(&mut2.unwrap().0) { +// Ordering::Equal => { +// // println!("mut1 == mut2 so pushing {:?}", mut1.unwrap()); +// out.push( +// mut1.unwrap() +// .child_log_likelihood(&p1) +// .sum(mut2.unwrap().child_log_likelihood(&p2)), +// ); +// mut1 = s1.next(); +// mut2 = s2.next(); +// } +// Ordering::Greater => { +// // println!("mut1 > mut2 so pushing {:?}", mut2.unwrap()); +// out.push(mut2.unwrap().child_log_likelihood(&p2)); +// mut2 = s2.next(); +// } +// Ordering::Less => { +// // println!("mut2 > mut1 so pushing {:?}", mut1.unwrap()); +// out.push(mut1.unwrap().child_log_likelihood(&p1)); +// mut1 = s1.next(); +// } +// } +// } +// } +// out +// } \ No newline at end of file diff --git a/src/hillclimb.rs b/src/hillclimb.rs index a573860..1de1e22 100644 --- a/src/hillclimb.rs +++ b/src/hillclimb.rs @@ -45,7 +45,7 @@ impl Tree { println!("Optimisation step {} out of {}", k, iterations); // println!("Old vector {:?}", self.tree_vec); - println!("Tree log likelihood: {}", self.get_tree_likelihood()); + println!("Current best likelihood: {}", best_likelihood); candidate_vec = hill_peturb(&self.tree_vec, self.tree_vec.len()); working_tree.update(&candidate_vec); @@ -53,7 +53,7 @@ impl Tree { let new_likelihood = working_tree.get_tree_likelihood(); // println!("New vector {:?}", candidate_vec); - println!("New likelihood {}", new_likelihood); + println!("Candidate likelihood {}", new_likelihood); if new_likelihood > best_likelihood { println!("Climbing hill!"); diff --git a/src/lib.rs b/src/lib.rs index 224e93c..bbfedbd 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,4 +1,4 @@ -mod gen_list; +mod mutation; mod import; mod likelihoods; mod node; @@ -9,12 +9,9 @@ mod hillclimb; mod tree_iterators; mod tree_to_newick; -use crate::gen_list::*; use crate::build_tree::*; use crate::tree::Tree; extern crate nalgebra as na; - - pub mod cli; use crate::cli::*; @@ -46,7 +43,7 @@ pub fn main() { println!("{:?}", tr.tree_vec); if !args.no_optimise { - // tr.hillclimb(&q, 100); + tr.hillclimb(&q, 25); } // let end = Instant::now(); diff --git a/src/likelihoods.rs b/src/likelihoods.rs index acdae11..b5f9384 100644 --- a/src/likelihoods.rs +++ b/src/likelihoods.rs @@ -1,10 +1,70 @@ -use crate::combine_lists; -use crate::Mutation; +use crate::mutation::{Mutation, to_mutation}; use crate::Tree; +// Default base frequencies +const BF_DEFAULT: [f64; 4] = [0.25, 0.25, 0.25, 0.25]; + +// Calculates the likelihood at a base given the bases at each child and probability matrices +pub fn base_likelihood(mut1: &Mutation, mut2: &Mutation, p1: &na::Matrix4, p2: &na::Matrix4) -> Mutation { + let v1: Vec = mut1.to_vector(); + let v2: Vec = mut2.to_vector(); + let mut x1: Vec = Vec::new(); + let mut x2: Vec = Vec::new(); + + for i in 0..=3 { + x1.push(logse(p1.row(i).iter().zip(&v1).map(|(a, b)| a.ln() + b).collect())); + x2.push(logse(p2.row(i).iter().zip(&v2).map(|(a, b)| a.ln() + b).collect())); + }; + + to_mutation(x1).sum(to_mutation(x2)) +} + +// Calculates the likelihood for a Node by calculating the likelihood at each base +pub fn calculate_likelihood( + seq1: &[Mutation], + seq2: &[Mutation], + branchlengths: (f64, f64), + rate_matrix: &na::Matrix4, +) -> Vec { + + // Probability matrices + let p1 = na::Matrix::exp(&(rate_matrix * branchlengths.0)); + let p2 = na::Matrix::exp(&(rate_matrix * branchlengths.1)); + + let out: Vec = seq1.iter() + .zip(seq2.iter()) + .map(|(b1, b2)| base_likelihood(b1, b2, &p1, &p2)) + .collect(); + + out +} + +// LogSumExp function +pub fn logse(x: Vec) -> f64 { + let xstar = x.iter().max_by(|x, y| x.total_cmp(y)).unwrap(); + xstar + x.iter().fold(0.0,|acc, el| acc + f64::exp(el - xstar)).ln() +} + +// LogSumExp function that includes base frequency values for final likelihood calculation +pub fn base_freq_logse(muta: &Mutation, bf: [f64; 4]) -> f64 { + (f64::exp(muta.0) * bf[0] + f64::exp(muta.1) * bf[1] + f64::exp(muta.2) * bf[2] + f64::exp(muta.3) * bf[3]).ln() +} impl Tree { + + // Updates the genetic likelihood at a given node + pub fn update_node_likelihood(&mut self, index: usize, rate_matrix: &na::Matrix4) { + if let (Some(ch1), Some(ch2)) = self.get_node(index).unwrap().children { + let branchlengths = (self.get_branchlength(ch1), self.get_branchlength(ch2)); + + let seq1 = self.mutation_lists.get(ch1).unwrap(); + let seq2 = self.mutation_lists.get(ch2).unwrap(); + + self.mutation_lists[index] = calculate_likelihood(seq1, seq2, branchlengths, rate_matrix); + } + } + // Goes through all nodes that have changed and updates genetic likelihood - // Used after update_tree() + // Used after tree.update() pub fn update_likelihood(&mut self, rate_matrix: &na::Matrix4) { if self.changes.is_empty() { @@ -60,18 +120,6 @@ impl Tree { } } - // Updates the genetic likelihood at a given node - pub fn update_node_likelihood(&mut self, index: usize, rate_matrix: &na::Matrix4) { - if let (Some(ch1), Some(ch2)) = self.get_node(index).unwrap().children { - let branchlengths = (self.get_branchlength(ch1), self.get_branchlength(ch2)); - - let seq1 = self.mutation_lists.get(ch1).unwrap(); - let seq2 = self.mutation_lists.get(ch2).unwrap(); - - self.mutation_lists[index] = combine_lists(seq1, seq2, branchlengths, rate_matrix); - } - } - // Fetches likelihood value for a tree pub fn get_tree_likelihood(&self) -> f64 { self.mutation_lists @@ -79,35 +127,9 @@ impl Tree { .unwrap() .iter() .fold(0.0, |acc, muta| { - acc + (f64::exp(muta.0) * 0.25 + f64::exp(muta.1) * 0.25 + f64::exp(muta.2) * 0.25 + f64::exp(muta.3) * 0.25).ln() + // acc + (f64::exp(muta.0) * 0.25 + f64::exp(muta.1) * 0.25 + f64::exp(muta.2) * 0.25 + f64::exp(muta.3) * 0.25).ln() + acc + base_freq_logse(muta, BF_DEFAULT) }) } -} - -impl Mutation { - - pub fn sum(self, r: Mutation) -> Mutation { - Mutation( - self.0 + r.0, - self.1 + r.1, - self.2 + r.2, - self.3 + r.3, - ) - } - - pub fn child_log_likelihood(self, prob_matrix: &na::Matrix4) -> Self { - let lnx = vec![self.0, self.1, self.2, self.3]; - let mut x: Vec = Vec::new(); - - for i in 0..=3 { - x.push(logse(prob_matrix.row(i).iter().zip(&lnx).map(|(a, b)| a.ln() + b).collect())); - } - Mutation(*x.first().unwrap(), *x.get(1).unwrap(), *x.get(2).unwrap(), *x.get(3).unwrap()) - } -} - -pub fn logse(x: Vec) -> f64 { - let xstar = x.iter().max_by(|x, y| x.total_cmp(y)).unwrap(); - xstar + x.iter().fold(0.0,|acc, el| acc + f64::exp(el - xstar)).ln() } \ No newline at end of file diff --git a/src/mutation.rs b/src/mutation.rs new file mode 100644 index 0000000..2e2d4b6 --- /dev/null +++ b/src/mutation.rs @@ -0,0 +1,84 @@ + +#[derive(Debug, Copy, Clone)] +pub struct Mutation(pub f64, pub f64, pub f64, pub f64); + +const NEGINF: f64 = -f64::INFINITY; + // (A, C, G, T) +const AMUT: Mutation = Mutation(0.0, NEGINF, NEGINF, NEGINF); +const CMUT: Mutation = Mutation(NEGINF, 0.0, NEGINF, NEGINF); +const GMUT: Mutation = Mutation(NEGINF, NEGINF, 0.0, NEGINF); +const TMUT: Mutation = Mutation(NEGINF, NEGINF, NEGINF, 0.0); +const YMUT: Mutation = Mutation(NEGINF, 0.0, NEGINF, 0.0); +const WMUT: Mutation = Mutation(0.0, NEGINF, NEGINF, 0.0); +const RMUT: Mutation = Mutation(0.0, NEGINF, 0.0, NEGINF); +const KMUT: Mutation = Mutation(NEGINF, NEGINF, 0.0, 0.0); +const SMUT: Mutation = Mutation(NEGINF, 0.0, 0.0, NEGINF); +const MMUT: Mutation = Mutation(0.0, 0.0, NEGINF, NEGINF); +const BMUT: Mutation = Mutation(NEGINF, 0.0, 0.0, 0.0); +const DMUT: Mutation = Mutation(0.0, NEGINF, 0.0, 0.0); +const VMUT: Mutation = Mutation(0.0, 0.0, 0.0, NEGINF); + +// Takes a reference sequence and another sequence in SequenceRecord<'_> format +// Returns a vector of Mutations for how the latter sequence differs from the reference +pub fn create_list(seq: &[char]) -> Vec { + let mut out: Vec = Vec::new(); + + for base in seq.iter() { + out.push(char_to_mutation(base)); + } + + out +} + +// Turns characters into Mutations +pub fn char_to_mutation(e: &char) -> Mutation { + match e { + 'A' => AMUT, + 'C' => CMUT, + 'G' => GMUT, + 'T' => TMUT, + 'Y' => YMUT, + 'W' => WMUT, + 'R' => RMUT, + 'K' => KMUT, + 'S' => SMUT, + 'M' => MMUT, + 'B' => BMUT, + 'D' => DMUT, + 'V' => VMUT, + '-' => { + // This way of coding gives same answer as other tree programs + Mutation(0.0, 0.0, 0.0, 0.0)}, + _ => panic!("Unrecognised character: {}", e), + } +} + +// Converts a vector of correct length to Mutation +pub fn to_mutation(x1: Vec) -> Mutation { + if x1.len().ne(&4) { + panic!("Length of vector too long to cast to Mutation"); + } else { + Mutation(*x1.first().unwrap(), *x1.get(1).unwrap(), *x1.get(2).unwrap(), *x1.get(3).unwrap()) + } +} + +impl Mutation { + + // Converts a Mutation to a vector + pub fn to_vector(self) -> Vec { + vec![self.0, self.1, self.2, self.3] + } + + // Adds two Mutations together + pub fn sum(self, r: Mutation) -> Mutation { + Mutation( + self.0 + r.0, + self.1 + r.1, + self.2 + r.2, + self.3 + r.3, + ) + } + +} + + diff --git a/src/node.rs b/src/node.rs index db835c2..f153d1d 100644 --- a/src/node.rs +++ b/src/node.rs @@ -1,4 +1,3 @@ -use crate::gen_list::Mutation; use std::fmt; #[derive(Debug, Clone)] @@ -43,17 +42,17 @@ impl Node { } // Remove a child from a Node - pub fn remove_child(&mut self, child: usize) { - self.children = match self.children { - (Some(a), Some(b)) if a == child => (Some(b), None), - (Some(a), Some(b)) if b == child => (Some(a), None), - (Some(a), None) if a == child => (None, None), - (None, Some(_b)) => { - panic!("Trying to remove child from parent with only a right child!") - } - _ => panic!("Trying to remove child that does not exist in parent"), - }; - } + // pub fn remove_child(&mut self, child: usize) { + // self.children = match self.children { + // (Some(a), Some(b)) if a == child => (Some(b), None), + // (Some(a), Some(b)) if b == child => (Some(a), None), + // (Some(a), None) if a == child => (None, None), + // (None, Some(_b)) => { + // panic!("Trying to remove child from parent with only a right child!") + // } + // _ => panic!("Trying to remove child that does not exist in parent"), + // }; + // } } impl fmt::Display for Node { diff --git a/src/tests.rs b/src/tests.rs index dfe9b8c..ef76806 100644 --- a/src/tests.rs +++ b/src/tests.rs @@ -1,6 +1,6 @@ #[cfg(test)] mod tests { - use crate::gen_list::Mutation; + use crate::mutation::Mutation; use crate::build_tree::phylo2vec_lin; use crate::build_tree::vector_to_tree; use crate::tree::Tree; diff --git a/src/tree.rs b/src/tree.rs index 2bef656..6068d59 100644 --- a/src/tree.rs +++ b/src/tree.rs @@ -1,7 +1,8 @@ -use crate::gen_list::Mutation; +use crate::mutation::{Mutation, create_list}; use crate::node::Node; use std::collections::HashMap; use crate::vector_to_tree; +use needletail::*; #[derive(Debug)] pub struct Tree { @@ -67,6 +68,21 @@ impl Tree { } + pub fn add_genetic_data(&mut self, filename: &str) { + let mut reader = parse_fastx_file(filename).expect("Error parsing file"); + + // Add genetic data + while let Some(rec) = reader.next() { + let newrec: Vec = rec.unwrap().seq().iter().map(|l| *l as char).collect(); + self.mutation_lists.push(create_list(&newrec)); + } + + // Add empty lists for internal nodes + let leafn = self.mutation_lists.len() - 1; + for _ in 0..leafn { + self.mutation_lists.push(Vec::new()); + } + } // Get a specified node pub fn get_node(&self, index: usize) -> Option<&Node> {