diff --git a/packages_rs/nextclade/src/align/align.rs b/packages_rs/nextclade/src/align/align.rs index e1dd929c7..41a6753c3 100644 --- a/packages_rs/nextclade/src/align/align.rs +++ b/packages_rs/nextclade/src/align/align.rs @@ -46,7 +46,7 @@ pub fn align_nuc( ); } - if ref_len + qry_len < (10 * params.seed_length) { + if ref_len + qry_len < (20 * params.kmer_length) { // for very short sequences, use full square let stripes = full_matrix(ref_len, qry_len); trace!("When processing sequence #{index} '{seq_name}': In nucleotide alignment: Band construction: short sequences, using full matrix"); diff --git a/packages_rs/nextclade/src/align/params.rs b/packages_rs/nextclade/src/align/params.rs index f08cba7d6..00d7069ec 100644 --- a/packages_rs/nextclade/src/align/params.rs +++ b/packages_rs/nextclade/src/align/params.rs @@ -1,6 +1,10 @@ +use crate::{make_error, o}; use clap::{Parser, ValueEnum}; +use eyre::Report; +use itertools::Itertools; use optfield::optfield; use serde::{Deserialize, Serialize}; +use std::collections::BTreeMap; #[derive(ValueEnum, Copy, Clone, Debug, Deserialize, Serialize, schemars::JsonSchema)] #[serde(rename_all = "kebab-case")] @@ -52,30 +56,6 @@ pub struct AlignPairwiseParams { #[clap(long)] pub max_band_area: usize, - /// Maximum length of insertions or deletions allowed to proceed with alignment. Alignments with long indels are slow to compute and require substantial memory in the current implementation. Alignment of sequences with indels longer that this value, will not be attempted and a warning will be emitted. - #[clap(long)] - pub max_indel: usize, - - /// k-mer length to determine approximate alignments between query and reference and determine the bandwidth of the banded alignment. - #[clap(long)] - pub seed_length: usize, - - /// Maximum number of mismatching nucleotides allowed for a seed to be considered a match. - #[clap(long)] - pub mismatches_allowed: usize, - - /// Minimum number of seeds to search for during nucleotide alignment. Relevant for short sequences. In long sequences, the number of seeds is determined by `--seed-spacing`. - #[clap(long)] - pub min_seeds: i32, - - /// Minimum seed mathing rate (a ratio of seed matches to total number of attempted seeds) - #[clap(long)] - pub min_match_rate: f64, - - /// Spacing between seeds during nucleotide alignment. - #[clap(long)] - pub seed_spacing: i32, - /// Retry seed matching step with a reverse complement if the first attempt failed #[clap(long)] #[clap(num_args=0..=1, default_missing_value = "true")] @@ -107,15 +87,15 @@ pub struct AlignPairwiseParams { #[clap(long, value_enum)] pub gap_alignment_side: GapAlignmentSide, - /// Length of exactly matching kmers used in the seed alignment of the query to the reference. + /// Length of exactly matching k-mers used in the seed alignment of the query to the reference. #[clap(long)] pub kmer_length: usize, - /// Interval of successive kmers on the query sequence. Should be small compared to the query length. + /// Interval of successive k-mers on the query sequence. Should be small compared to the query length. #[clap(long)] pub kmer_distance: usize, - /// Exactly matching kmers are extended to the left and right until more + /// Exactly matching k-mers are extended to the left and right until more /// than `allowed_mismatches` are observed in a sliding window (`window_size`). #[clap(long)] pub allowed_mismatches: usize, @@ -124,7 +104,7 @@ pub struct AlignPairwiseParams { #[clap(long)] pub window_size: usize, - /// Minimum length of extended kmers + /// Minimum length of extended k-mers #[clap(long)] pub min_match_length: usize, @@ -136,6 +116,31 @@ pub struct AlignPairwiseParams { /// Number of times Nextclade will retry alignment with more relaxed results if alignment band boundaries are hit #[clap(long)] pub max_alignment_attempts: usize, + + // The following args are deprecated and are kept for backwards compatibility (to emit errors if they are set) + /// REMOVED + #[clap(long, hide_long_help = true, hide_short_help = true)] + pub max_indel: Option, + + /// REMOVED + #[clap(long, hide_long_help = true, hide_short_help = true)] + pub seed_length: Option, + + /// REMOVED + #[clap(long, hide_long_help = true, hide_short_help = true)] + pub mismatches_allowed: Option, + + /// REMOVED + #[clap(long, hide_long_help = true, hide_short_help = true)] + pub min_seeds: Option, + + /// REMOVED + #[clap(long, hide_long_help = true, hide_short_help = true)] + pub min_match_rate: Option, + + /// REMOVED + #[clap(long, hide_long_help = true, hide_short_help = true)] + pub seed_spacing: Option, } impl Default for AlignPairwiseParams { @@ -149,12 +154,6 @@ impl Default for AlignPairwiseParams { penalty_mismatch: 1, score_match: 3, max_band_area: 500_000_000, // requires around 500Mb for paths, 2GB for the scores - max_indel: 400, // obsolete - seed_length: 21, // obsolete - min_seeds: 10, // obsolete - min_match_rate: 0.3, // obsolete - seed_spacing: 100, // obsolete - mismatches_allowed: 3, // obsolete retry_reverse_complement: false, no_translate_past_stop: false, left_terminal_gaps_free: true, @@ -164,11 +163,49 @@ impl Default for AlignPairwiseParams { terminal_bandwidth: 50, min_seed_cover: 0.33, kmer_length: 10, // Should not be much larger than 1/divergence of amino acids - kmer_distance: 50, // Distance between successive kmers + kmer_distance: 50, // Distance between successive k-mers min_match_length: 40, // Experimentally determined, to keep off-target matches reasonably low allowed_mismatches: 8, // Ns count as mismatches window_size: 30, max_alignment_attempts: 3, + + // The following args are deprecated and are kept for backwards compatibility (to emit errors if they are set) + max_indel: None, + seed_length: None, + min_seeds: None, + min_match_rate: None, + seed_spacing: None, + mismatches_allowed: None, + } + } +} + +impl AlignPairwiseParams { + pub fn validate(&self) -> Result<(), Report> { + #[rustfmt::skip] + let deprecated = BTreeMap::from([ + (o!("--min-seeds (minSeeds)"), &self.min_seeds), + (o!("--seed-length (seedLength)"), &self.seed_length), + (o!("--seed-spacing (seedSpacing)"), &self.seed_spacing), + (o!("--min-match-rate (minMatchRate)"), &self.min_match_rate), + (o!("--mismatches-allowed (mismatchesAllowed)"), &self.mismatches_allowed), + (o!("--max-indel (maxIndel)"), &self.max_indel), + ]); + + if deprecated.values().any(|val| val.is_some()) { + let keys = deprecated.keys().map(|key| format!(" {key}")).join("\n"); + + return make_error!( + r#"The following alignment parameters (CLI arguments and config file fields) were removed in Nextclade 3.0.0 due to changes in the alignment algorithm: + +{keys} + +Please remove them from your dataset and/or from command-line invocation. + +Refer to user documentation for more details."# + ); } + + Ok(()) } } diff --git a/packages_rs/nextclade/src/align/seed_alignment.rs b/packages_rs/nextclade/src/align/seed_alignment.rs index 5c8ee9e61..1b4ebfdc4 100644 --- a/packages_rs/nextclade/src/align/seed_alignment.rs +++ b/packages_rs/nextclade/src/align/seed_alignment.rs @@ -45,97 +45,6 @@ pub struct SeedMatch { pub score: usize, } -/// Determine seed matches between query and reference sequence. will only attempt to -/// match k-mers without ambiguous characters. Search is performed via a left-to-right -/// search starting and the previous valid seed and extending at most to the maximally -/// allowed insertion/deletion (shift) distance. -pub fn get_seed_matches>( - qry_seq: &[L], - ref_seq: &[L], - params: &AlignPairwiseParams, -) -> (Vec, i32) { - let mut seed_matches = Vec::::new(); - - // get list of valid k-mer start positions - let map_to_good_positions = get_map_to_good_positions(qry_seq, params.seed_length); - let n_good_positions = map_to_good_positions.len(); - if n_good_positions < params.seed_length { - return (seed_matches, 0); - } - - // use 1/seed_spacing for long sequences, min_seeds otherwise - let n_seeds = if ref_seq.len() > (params.min_seeds * params.seed_spacing) as usize { - (ref_seq.len() as f32 / params.seed_spacing as f32) as i32 - } else { - params.min_seeds - }; - - // distance of first seed from the end of the sequence (third of seed spacing) - let margin = (ref_seq.len() as f32 / (n_seeds * 3) as f32).round() as i32; - - // Generate kmers equally spaced on the query - let effective_margin = (margin as f32).min(n_good_positions as f32 / 4.0); - let seed_cover = n_good_positions as f32 - 2.0 * effective_margin; - let kmer_spacing = (seed_cover - 1.0) / ((n_seeds - 1) as f32); - - // loop over seeds and find matches, store in seed_matches - let mut start_pos = 0; // start position of ref search - let mut end_pos = ref_seq.len(); // end position of ref search - let qry_pos = 0; - - for ni in 0..n_seeds { - // pick index of of seed in map - let good_position_index = (effective_margin + (kmer_spacing * ni as f32)).round() as usize; - // get new query kmer-position - let qry_pos_old = qry_pos; - let qry_pos = map_to_good_positions[good_position_index]; - // increment upper bound for search in reference - end_pos += qry_pos - qry_pos_old; - - //extract seed and find match - let seed = &qry_seq[qry_pos..(qry_pos + params.seed_length)]; - let tmp_match = seed_match(seed, ref_seq, start_pos, end_pos, params.mismatches_allowed); - - // Only use seeds with at most allowed_mismatches - if tmp_match.score >= params.seed_length - params.mismatches_allowed { - // if this isn't the first match, check that reference position of current match after previous - // if previous seed matched AFTER current seed, remove previous seed - if let Some(prev_match) = seed_matches.last() { - if tmp_match.ref_pos > prev_match.ref_pos { - start_pos = prev_match.ref_pos; - } else { - // warn!("Crossed over seed removed. {:?}", prev_match); - seed_matches.pop(); - } - } - - let seed_match = SeedMatch { - qry_pos, - ref_pos: tmp_match.ref_pos, - score: tmp_match.score, - }; - - // check that current seed matches AFTER previous seed (-2 if already triggered above) - // and add current seed to list of matches - if seed_matches - .last() - .map_or(true, |prev_match| prev_match.ref_pos < tmp_match.ref_pos) - { - //ensure seed positions increase strictly monotonically - seed_matches.push(seed_match); - // } else { - // warn!( - // "Seed not used because of identical ref_pos with previous seed: {:?}", - // seed_match - // ); - } - // increment the "reference search end-pos" as the current reference + maximally allowed indel - end_pos = tmp_match.ref_pos + params.max_indel; - } - } - (seed_matches, n_seeds) -} - fn abs_shift(seed1: &SeedMatch2, seed2: &SeedMatch2) -> isize { abs(seed2.offset - seed1.offset) } diff --git a/packages_rs/nextclade/src/run/nextclade_wasm.rs b/packages_rs/nextclade/src/run/nextclade_wasm.rs index d26947fb9..3e91b28ec 100644 --- a/packages_rs/nextclade/src/run/nextclade_wasm.rs +++ b/packages_rs/nextclade/src/run/nextclade_wasm.rs @@ -157,7 +157,7 @@ impl Nextclade { virus_properties, } = inputs; - let params = NextcladeInputParams::from_optional(params, &virus_properties); + let params = NextcladeInputParams::from_optional(params, &virus_properties)?; let ref_seq = to_nuc_seq(&ref_record.seq).wrap_err("When converting reference sequence")?; let seed_index = CodonSpacedIndex::from_sequence(&ref_seq); diff --git a/packages_rs/nextclade/src/run/params.rs b/packages_rs/nextclade/src/run/params.rs index 2a9738e20..afafc100b 100644 --- a/packages_rs/nextclade/src/run/params.rs +++ b/packages_rs/nextclade/src/run/params.rs @@ -2,8 +2,12 @@ use crate::align::params::{AlignPairwiseParams, AlignPairwiseParamsOptional}; use crate::analyze::virus_properties::VirusProperties; use crate::run::params_general::{NextcladeGeneralParams, NextcladeGeneralParamsOptional}; use crate::tree::params::{TreeBuilderParams, TreeBuilderParamsOptional}; +use crate::{make_error, o}; use clap::Parser; +use eyre::Report; +use itertools::Itertools; use serde::{Deserialize, Serialize}; +use std::collections::BTreeMap; #[derive(Parser, Debug, Default, Clone, Serialize, Deserialize, schemars::JsonSchema)] #[serde(rename_all = "camelCase")] @@ -27,9 +31,11 @@ pub struct NextcladeInputParams { } impl NextcladeInputParams { - pub fn from_optional(params: &NextcladeInputParamsOptional, virus_properties: &VirusProperties) -> Self { + pub fn from_optional( + params: &NextcladeInputParamsOptional, + virus_properties: &VirusProperties, + ) -> Result { // FIXME: this code is repetitive and error prone - let general = { // Start with defaults let mut general_params = NextcladeGeneralParams::default(); @@ -58,6 +64,8 @@ impl NextcladeInputParams { alignment_params }; + alignment.validate()?; + let tree_builder = { // Start with defaults let mut tree_builder_params = TreeBuilderParams::default(); @@ -72,10 +80,10 @@ impl NextcladeInputParams { tree_builder_params }; - Self { + Ok(Self { general, tree_builder, alignment, - } + }) } } diff --git a/packages_rs/nextclade/src/translate/translate_genes.rs b/packages_rs/nextclade/src/translate/translate_genes.rs index d8081a08d..3ef06ba1e 100644 --- a/packages_rs/nextclade/src/translate/translate_genes.rs +++ b/packages_rs/nextclade/src/translate/translate_genes.rs @@ -250,7 +250,7 @@ pub fn translate_cds( // Set to false for internal genes left_terminal_gaps_free: first(&qry_cds_seq)?.is_gap(), right_terminal_gaps_free: last(&qry_cds_seq)?.is_gap(), - ..*params + ..params.clone() }; // Make sure subsequent gap stripping does not introduce frame shift