diff --git a/packages_rs/nextclade/src/analyze/divergence.rs b/packages_rs/nextclade/src/analyze/divergence.rs index bf389863d..6662e410c 100644 --- a/packages_rs/nextclade/src/analyze/divergence.rs +++ b/packages_rs/nextclade/src/analyze/divergence.rs @@ -3,20 +3,57 @@ use crate::coord::range::NucRefGlobalRange; use crate::tree::params::TreeBuilderParams; use crate::tree::tree::DivergenceUnits; -/// Calculate number of nuc muts, only considering ACGT characters -pub fn count_nuc_muts(nuc_muts: &[NucSub]) -> usize { - nuc_muts +pub struct NucMutsCounted<'a> { + muts: Vec<&'a NucSub>, + masked_muts: Vec<&'a NucSub>, + other_muts: Vec<&'a NucSub>, + n_muts: usize, + n_masked_muts: usize, + n_other_muts: usize, +} + +pub fn count_nuc_muts<'a>(nuc_muts: &'a [NucSub], masked_ranges: &[NucRefGlobalRange]) -> NucMutsCounted<'a> { + // Split away non_acgt mutations + let (nuc_muts, other_muts): (Vec<_>, Vec<_>) = nuc_muts .iter() - .filter(|m| m.ref_nuc.is_acgt() && m.qry_nuc.is_acgt()) - .count() + .partition(|m| m.ref_nuc.is_acgt() && m.qry_nuc.is_acgt()); + + // Split away masked mutations + let (masked_muts, muts): (Vec<_>, Vec<_>) = nuc_muts + .iter() + .partition(|m| masked_ranges.iter().any(|range| range.contains(m.pos))); + + let n_muts = muts.len(); + let n_masked_muts = masked_muts.len(); + let n_other_muts = other_muts.len(); + + NucMutsCounted { + muts, + masked_muts, + other_muts, + n_muts, + n_masked_muts, + n_other_muts, + } } pub fn calculate_branch_length( - private_mutations: &[NucSub], + nuc_muts: &[NucSub], + masked_ranges: &[NucRefGlobalRange], divergence_units: DivergenceUnits, ref_seq_len: usize, ) -> f64 { - let mut this_div = count_nuc_muts(private_mutations) as f64; + let NucMutsCounted { + n_muts, + n_masked_muts, + n_other_muts, + .. + } = count_nuc_muts(nuc_muts, masked_ranges); + + let mut this_div = n_muts as f64; + if n_muts == 0 && n_masked_muts + n_other_muts > 0 { + this_div += 0.06; + } // If divergence is measured per site, divide by the length of reference sequence. // The unit of measurement is deduced from what's already is used in the reference tree nodes. @@ -29,14 +66,23 @@ pub fn calculate_branch_length( /// Calculate nuc mut score pub fn score_nuc_muts(nuc_muts: &[NucSub], masked_ranges: &[NucRefGlobalRange], params: &TreeBuilderParams) -> f64 { - // Only consider ACGT characters - let nuc_muts = nuc_muts.iter().filter(|m| m.ref_nuc.is_acgt() && m.qry_nuc.is_acgt()); - - // Split away masked mutations - let (masked_muts, muts): (Vec<_>, Vec<_>) = - nuc_muts.partition(|m| masked_ranges.iter().any(|range| range.contains(m.pos))); + let NucMutsCounted { + n_muts, + n_masked_muts, + n_other_muts, + .. + } = count_nuc_muts(nuc_muts, masked_ranges); + let mut score = n_muts as f64; + // modify the score by sub-integer amounts for masked and other mutations. this effectively means + // scoring is first by n_muts, then by masked_muts, then by other_muts + if n_masked_muts > 0 { + // independent of their number, masked nucleotides increase the score by 0.5 + score += 0.5; + } + if n_other_muts > 0 { + // other mutations (mostly to and from gap) by 0.1 + score += 0.1; + } - let n_muts = muts.len() as f64; - let n_masked_muts = masked_muts.len() as f64; - n_muts + n_masked_muts * params.masked_muts_weight + score } diff --git a/packages_rs/nextclade/src/analyze/find_private_nuc_mutations.rs b/packages_rs/nextclade/src/analyze/find_private_nuc_mutations.rs index 260b95748..ffcfa8b5d 100644 --- a/packages_rs/nextclade/src/analyze/find_private_nuc_mutations.rs +++ b/packages_rs/nextclade/src/analyze/find_private_nuc_mutations.rs @@ -1,7 +1,6 @@ use crate::alphabet::letter::Letter; use crate::alphabet::nuc::Nuc; use crate::analyze::aa_sub::AaSub; -use crate::analyze::divergence::count_nuc_muts; use crate::analyze::is_sequenced::{is_nuc_non_acgtn, is_nuc_sequenced}; use crate::analyze::letter_ranges::NucRange; use crate::analyze::nuc_del::{NucDel, NucDelRange}; diff --git a/packages_rs/nextclade/src/run/nextclade_run_one.rs b/packages_rs/nextclade/src/run/nextclade_run_one.rs index de7eb4d03..fc6931fe3 100644 --- a/packages_rs/nextclade/src/run/nextclade_run_one.rs +++ b/packages_rs/nextclade/src/run/nextclade_run_one.rs @@ -294,9 +294,11 @@ pub fn nextclade_run_one( gene_map, ); let parent_div = nearest_node.node_attrs.div.unwrap_or(0.0); + let masked_ranges = graph.data.meta.placement_mask_ranges(); let divergence = parent_div + calculate_branch_length( &private_nuc_mutations.private_substitutions, + masked_ranges, graph.data.tmp.divergence_units, ref_seq.len(), ); diff --git a/packages_rs/nextclade/src/tree/tree_builder.rs b/packages_rs/nextclade/src/tree/tree_builder.rs index 69dcdbba5..4b638ad04 100644 --- a/packages_rs/nextclade/src/tree/tree_builder.rs +++ b/packages_rs/nextclade/src/tree/tree_builder.rs @@ -1,6 +1,6 @@ use crate::analyze::aa_del::AaDel; use crate::analyze::aa_sub::AaSub; -use crate::analyze::divergence::{calculate_branch_length, count_nuc_muts, score_nuc_muts}; +use crate::analyze::divergence::{calculate_branch_length, score_nuc_muts}; use crate::analyze::find_private_nuc_mutations::BranchMutations; use crate::analyze::nuc_del::NucDel; use crate::analyze::nuc_sub::NucSub; @@ -293,6 +293,7 @@ pub fn knit_into_graph( ref_seq_len: usize, params: &TreeBuilderParams, ) -> Result<(), Report> { + let masked_ranges = graph.data.meta.placement_mask_ranges().to_owned(); let divergence_units = graph.data.tmp.divergence_units; // the target node will be the sister of the new node defined by "private mutations" and the "result" @@ -334,11 +335,31 @@ pub fn knit_into_graph( muts_new_node, } }; - // if the node is a leaf or if there are shared mutations, need to split the branch above and insert aux node + // if the node is a leaf or if there are non-shared mutations, need to split the branch above and insert aux node if target_node.is_leaf() || !muts_target_node.nuc_muts.is_empty() { // determine divergence of new internal node by subtracting shared reversions from target_node - let divergence_middle_node = - target_node_div - calculate_branch_length(&muts_target_node.nuc_muts, divergence_units, ref_seq_len); + let divergence_middle_node = if target_node.is_root() { + target_node_div + - calculate_branch_length( + &muts_target_node.nuc_muts, + &masked_ranges, + divergence_units, + ref_seq_len, + ) + } else { + let parent_node = graph.parent_of(target_node).unwrap(); + let parent_node_auspice = parent_node.payload(); + let parent_node_div = &parent_node_auspice.node_attrs.div.unwrap_or(0.0); + target_node_div.min( + parent_node_div + + calculate_branch_length( + &muts_common_branch.nuc_muts, + &masked_ranges, + divergence_units, + ref_seq_len, + ), + ) + }; // generate new internal node // add private mutations, divergence, name and branch attrs to new internal node @@ -381,7 +402,8 @@ pub fn knit_into_graph( new_internal_node_key, &muts_new_node, result, - divergence_middle_node + calculate_branch_length(&muts_new_node.nuc_muts, divergence_units, ref_seq_len), + divergence_middle_node + + calculate_branch_length(&muts_new_node.nuc_muts, &masked_ranges, divergence_units, ref_seq_len), )?; } else { //can simply attach node @@ -390,7 +412,7 @@ pub fn knit_into_graph( target_key, private_mutations, result, - target_node_div + calculate_branch_length(&muts_new_node.nuc_muts, divergence_units, ref_seq_len), + target_node_div + calculate_branch_length(&muts_new_node.nuc_muts, &masked_ranges, divergence_units, ref_seq_len), )?; } Ok(())