Skip to content

Commit

Permalink
Merge pull request #1256 from nextstrain/fix/branchlength
Browse files Browse the repository at this point in the history
  • Loading branch information
ivan-aksamentov authored Oct 3, 2023
2 parents aaac7ce + 15307c4 commit f5199fb
Show file tree
Hide file tree
Showing 4 changed files with 92 additions and 23 deletions.
78 changes: 62 additions & 16 deletions packages_rs/nextclade/src/analyze/divergence.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,20 +3,57 @@ use crate::coord::range::NucRefGlobalRange;
use crate::tree::params::TreeBuilderParams;
use crate::tree::tree::DivergenceUnits;

/// Calculate number of nuc muts, only considering ACGT characters
pub fn count_nuc_muts(nuc_muts: &[NucSub]) -> usize {
nuc_muts
pub struct NucMutsCounted<'a> {
muts: Vec<&'a NucSub>,
masked_muts: Vec<&'a NucSub>,
other_muts: Vec<&'a NucSub>,
n_muts: usize,
n_masked_muts: usize,
n_other_muts: usize,
}

pub fn count_nuc_muts<'a>(nuc_muts: &'a [NucSub], masked_ranges: &[NucRefGlobalRange]) -> NucMutsCounted<'a> {
// Split away non_acgt mutations
let (nuc_muts, other_muts): (Vec<_>, Vec<_>) = nuc_muts
.iter()
.filter(|m| m.ref_nuc.is_acgt() && m.qry_nuc.is_acgt())
.count()
.partition(|m| m.ref_nuc.is_acgt() && m.qry_nuc.is_acgt());

// Split away masked mutations
let (masked_muts, muts): (Vec<_>, Vec<_>) = nuc_muts
.iter()
.partition(|m| masked_ranges.iter().any(|range| range.contains(m.pos)));

let n_muts = muts.len();
let n_masked_muts = masked_muts.len();
let n_other_muts = other_muts.len();

NucMutsCounted {
muts,
masked_muts,
other_muts,
n_muts,
n_masked_muts,
n_other_muts,
}
}

pub fn calculate_branch_length(
private_mutations: &[NucSub],
nuc_muts: &[NucSub],
masked_ranges: &[NucRefGlobalRange],
divergence_units: DivergenceUnits,
ref_seq_len: usize,
) -> f64 {
let mut this_div = count_nuc_muts(private_mutations) as f64;
let NucMutsCounted {
n_muts,
n_masked_muts,
n_other_muts,
..
} = count_nuc_muts(nuc_muts, masked_ranges);

let mut this_div = n_muts as f64;
if n_muts == 0 && n_masked_muts + n_other_muts > 0 {
this_div += 0.06;
}

// If divergence is measured per site, divide by the length of reference sequence.
// The unit of measurement is deduced from what's already is used in the reference tree nodes.
Expand All @@ -29,14 +66,23 @@ pub fn calculate_branch_length(

/// Calculate nuc mut score
pub fn score_nuc_muts(nuc_muts: &[NucSub], masked_ranges: &[NucRefGlobalRange], params: &TreeBuilderParams) -> f64 {
// Only consider ACGT characters
let nuc_muts = nuc_muts.iter().filter(|m| m.ref_nuc.is_acgt() && m.qry_nuc.is_acgt());

// Split away masked mutations
let (masked_muts, muts): (Vec<_>, Vec<_>) =
nuc_muts.partition(|m| masked_ranges.iter().any(|range| range.contains(m.pos)));
let NucMutsCounted {
n_muts,
n_masked_muts,
n_other_muts,
..
} = count_nuc_muts(nuc_muts, masked_ranges);
let mut score = n_muts as f64;
// modify the score by sub-integer amounts for masked and other mutations. this effectively means
// scoring is first by n_muts, then by masked_muts, then by other_muts
if n_masked_muts > 0 {
// independent of their number, masked nucleotides increase the score by 0.5
score += 0.5;
}
if n_other_muts > 0 {
// other mutations (mostly to and from gap) by 0.1
score += 0.1;
}

let n_muts = muts.len() as f64;
let n_masked_muts = masked_muts.len() as f64;
n_muts + n_masked_muts * params.masked_muts_weight
score
}
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
use crate::alphabet::letter::Letter;
use crate::alphabet::nuc::Nuc;
use crate::analyze::aa_sub::AaSub;
use crate::analyze::divergence::count_nuc_muts;
use crate::analyze::is_sequenced::{is_nuc_non_acgtn, is_nuc_sequenced};
use crate::analyze::letter_ranges::NucRange;
use crate::analyze::nuc_del::{NucDel, NucDelRange};
Expand Down
2 changes: 2 additions & 0 deletions packages_rs/nextclade/src/run/nextclade_run_one.rs
Original file line number Diff line number Diff line change
Expand Up @@ -294,9 +294,11 @@ pub fn nextclade_run_one(
gene_map,
);
let parent_div = nearest_node.node_attrs.div.unwrap_or(0.0);
let masked_ranges = graph.data.meta.placement_mask_ranges();
let divergence = parent_div
+ calculate_branch_length(
&private_nuc_mutations.private_substitutions,
masked_ranges,
graph.data.tmp.divergence_units,
ref_seq.len(),
);
Expand Down
34 changes: 28 additions & 6 deletions packages_rs/nextclade/src/tree/tree_builder.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
use crate::analyze::aa_del::AaDel;
use crate::analyze::aa_sub::AaSub;
use crate::analyze::divergence::{calculate_branch_length, count_nuc_muts, score_nuc_muts};
use crate::analyze::divergence::{calculate_branch_length, score_nuc_muts};
use crate::analyze::find_private_nuc_mutations::BranchMutations;
use crate::analyze::nuc_del::NucDel;
use crate::analyze::nuc_sub::NucSub;
Expand Down Expand Up @@ -293,6 +293,7 @@ pub fn knit_into_graph(
ref_seq_len: usize,
params: &TreeBuilderParams,
) -> Result<(), Report> {
let masked_ranges = graph.data.meta.placement_mask_ranges().to_owned();
let divergence_units = graph.data.tmp.divergence_units;

// the target node will be the sister of the new node defined by "private mutations" and the "result"
Expand Down Expand Up @@ -334,11 +335,31 @@ pub fn knit_into_graph(
muts_new_node,
}
};
// if the node is a leaf or if there are shared mutations, need to split the branch above and insert aux node
// if the node is a leaf or if there are non-shared mutations, need to split the branch above and insert aux node
if target_node.is_leaf() || !muts_target_node.nuc_muts.is_empty() {
// determine divergence of new internal node by subtracting shared reversions from target_node
let divergence_middle_node =
target_node_div - calculate_branch_length(&muts_target_node.nuc_muts, divergence_units, ref_seq_len);
let divergence_middle_node = if target_node.is_root() {
target_node_div
- calculate_branch_length(
&muts_target_node.nuc_muts,
&masked_ranges,
divergence_units,
ref_seq_len,
)
} else {
let parent_node = graph.parent_of(target_node).unwrap();
let parent_node_auspice = parent_node.payload();
let parent_node_div = &parent_node_auspice.node_attrs.div.unwrap_or(0.0);
target_node_div.min(
parent_node_div
+ calculate_branch_length(
&muts_common_branch.nuc_muts,
&masked_ranges,
divergence_units,
ref_seq_len,
),
)
};

// generate new internal node
// add private mutations, divergence, name and branch attrs to new internal node
Expand Down Expand Up @@ -381,7 +402,8 @@ pub fn knit_into_graph(
new_internal_node_key,
&muts_new_node,
result,
divergence_middle_node + calculate_branch_length(&muts_new_node.nuc_muts, divergence_units, ref_seq_len),
divergence_middle_node
+ calculate_branch_length(&muts_new_node.nuc_muts, &masked_ranges, divergence_units, ref_seq_len),
)?;
} else {
//can simply attach node
Expand All @@ -390,7 +412,7 @@ pub fn knit_into_graph(
target_key,
private_mutations,
result,
target_node_div + calculate_branch_length(&muts_new_node.nuc_muts, divergence_units, ref_seq_len),
target_node_div + calculate_branch_length(&muts_new_node.nuc_muts, &masked_ranges, divergence_units, ref_seq_len),
)?;
}
Ok(())
Expand Down

1 comment on commit f5199fb

@vercel
Copy link

@vercel vercel bot commented on f5199fb Oct 3, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Successfully deployed to the following URLs:

nextclade – ./

nextclade.vercel.app
nextclade-nextstrain.vercel.app
nextclade-git-master-nextstrain.vercel.app

Please sign in to comment.