Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: deprecate old alignment params #1270

Merged
merged 2 commits into from
Oct 12, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion packages_rs/nextclade/src/align/align.rs
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ pub fn align_nuc(
);
}

if ref_len + qry_len < (10 * params.seed_length) {
if ref_len + qry_len < (20 * params.kmer_length) {
// for very short sequences, use full square
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

According to Richard's recommendation, for short sequence check let's use kmer_length instead of removed seed_length and adjust the coefficient to roughly match the old value.

let stripes = full_matrix(ref_len, qry_len);
trace!("When processing sequence #{index} '{seq_name}': In nucleotide alignment: Band construction: short sequences, using full matrix");
Expand Down
107 changes: 72 additions & 35 deletions packages_rs/nextclade/src/align/params.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
use crate::{make_error, o};
use clap::{Parser, ValueEnum};
use eyre::Report;
use itertools::Itertools;
use optfield::optfield;
use serde::{Deserialize, Serialize};
use std::collections::BTreeMap;

#[derive(ValueEnum, Copy, Clone, Debug, Deserialize, Serialize, schemars::JsonSchema)]
#[serde(rename_all = "kebab-case")]
Expand Down Expand Up @@ -52,30 +56,6 @@ pub struct AlignPairwiseParams {
#[clap(long)]
pub max_band_area: usize,

/// Maximum length of insertions or deletions allowed to proceed with alignment. Alignments with long indels are slow to compute and require substantial memory in the current implementation. Alignment of sequences with indels longer that this value, will not be attempted and a warning will be emitted.
#[clap(long)]
pub max_indel: usize,

/// k-mer length to determine approximate alignments between query and reference and determine the bandwidth of the banded alignment.
#[clap(long)]
pub seed_length: usize,

/// Maximum number of mismatching nucleotides allowed for a seed to be considered a match.
#[clap(long)]
pub mismatches_allowed: usize,

/// Minimum number of seeds to search for during nucleotide alignment. Relevant for short sequences. In long sequences, the number of seeds is determined by `--seed-spacing`.
#[clap(long)]
pub min_seeds: i32,

/// Minimum seed mathing rate (a ratio of seed matches to total number of attempted seeds)
#[clap(long)]
pub min_match_rate: f64,

/// Spacing between seeds during nucleotide alignment.
#[clap(long)]
pub seed_spacing: i32,

/// Retry seed matching step with a reverse complement if the first attempt failed
#[clap(long)]
#[clap(num_args=0..=1, default_missing_value = "true")]
Expand Down Expand Up @@ -107,15 +87,15 @@ pub struct AlignPairwiseParams {
#[clap(long, value_enum)]
pub gap_alignment_side: GapAlignmentSide,

/// Length of exactly matching kmers used in the seed alignment of the query to the reference.
/// Length of exactly matching k-mers used in the seed alignment of the query to the reference.
#[clap(long)]
pub kmer_length: usize,

/// Interval of successive kmers on the query sequence. Should be small compared to the query length.
/// Interval of successive k-mers on the query sequence. Should be small compared to the query length.
#[clap(long)]
pub kmer_distance: usize,

/// Exactly matching kmers are extended to the left and right until more
/// Exactly matching k-mers are extended to the left and right until more
/// than `allowed_mismatches` are observed in a sliding window (`window_size`).
#[clap(long)]
pub allowed_mismatches: usize,
Expand All @@ -124,7 +104,7 @@ pub struct AlignPairwiseParams {
#[clap(long)]
pub window_size: usize,

/// Minimum length of extended kmers
/// Minimum length of extended k-mers
#[clap(long)]
pub min_match_length: usize,

Expand All @@ -136,6 +116,31 @@ pub struct AlignPairwiseParams {
/// Number of times Nextclade will retry alignment with more relaxed results if alignment band boundaries are hit
#[clap(long)]
pub max_alignment_attempts: usize,

// The following args are deprecated and are kept for backwards compatibility (to emit errors if they are set)
/// REMOVED
#[clap(long, hide_long_help = true, hide_short_help = true)]
pub max_indel: Option<f64>,

/// REMOVED
#[clap(long, hide_long_help = true, hide_short_help = true)]
pub seed_length: Option<f64>,

/// REMOVED
#[clap(long, hide_long_help = true, hide_short_help = true)]
pub mismatches_allowed: Option<f64>,

/// REMOVED
#[clap(long, hide_long_help = true, hide_short_help = true)]
pub min_seeds: Option<f64>,

/// REMOVED
#[clap(long, hide_long_help = true, hide_short_help = true)]
pub min_match_rate: Option<f64>,

/// REMOVED
#[clap(long, hide_long_help = true, hide_short_help = true)]
pub seed_spacing: Option<f64>,
}

impl Default for AlignPairwiseParams {
Expand All @@ -149,12 +154,6 @@ impl Default for AlignPairwiseParams {
penalty_mismatch: 1,
score_match: 3,
max_band_area: 500_000_000, // requires around 500Mb for paths, 2GB for the scores
max_indel: 400, // obsolete
seed_length: 21, // obsolete
min_seeds: 10, // obsolete
min_match_rate: 0.3, // obsolete
seed_spacing: 100, // obsolete
mismatches_allowed: 3, // obsolete
retry_reverse_complement: false,
no_translate_past_stop: false,
left_terminal_gaps_free: true,
Expand All @@ -164,11 +163,49 @@ impl Default for AlignPairwiseParams {
terminal_bandwidth: 50,
min_seed_cover: 0.33,
kmer_length: 10, // Should not be much larger than 1/divergence of amino acids
kmer_distance: 50, // Distance between successive kmers
kmer_distance: 50, // Distance between successive k-mers
min_match_length: 40, // Experimentally determined, to keep off-target matches reasonably low
allowed_mismatches: 8, // Ns count as mismatches
window_size: 30,
max_alignment_attempts: 3,

// The following args are deprecated and are kept for backwards compatibility (to emit errors if they are set)
max_indel: None,
seed_length: None,
min_seeds: None,
min_match_rate: None,
seed_spacing: None,
mismatches_allowed: None,
}
}
}

impl AlignPairwiseParams {
pub fn validate(&self) -> Result<(), Report> {
#[rustfmt::skip]
let deprecated = BTreeMap::from([
(o!("--min-seeds (minSeeds)"), &self.min_seeds),
(o!("--seed-length (seedLength)"), &self.seed_length),
(o!("--seed-spacing (seedSpacing)"), &self.seed_spacing),
(o!("--min-match-rate (minMatchRate)"), &self.min_match_rate),
(o!("--mismatches-allowed (mismatchesAllowed)"), &self.mismatches_allowed),
(o!("--max-indel (maxIndel)"), &self.max_indel),
Comment on lines +187 to +192
Copy link
Member Author

@ivan-aksamentov ivan-aksamentov Oct 11, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These are the params being deprecated.

And the error message is constructed just below this fragment.

]);

if deprecated.values().any(|val| val.is_some()) {
let keys = deprecated.keys().map(|key| format!(" {key}")).join("\n");

return make_error!(
r#"The following alignment parameters (CLI arguments and config file fields) were removed in Nextclade 3.0.0 due to changes in the alignment algorithm:

{keys}

Please remove them from your dataset and/or from command-line invocation.

Refer to user documentation for more details."#
);
}

Ok(())
}
}
91 changes: 0 additions & 91 deletions packages_rs/nextclade/src/align/seed_alignment.rs
Original file line number Diff line number Diff line change
Expand Up @@ -45,97 +45,6 @@ pub struct SeedMatch {
pub score: usize,
}

/// Determine seed matches between query and reference sequence. will only attempt to
/// match k-mers without ambiguous characters. Search is performed via a left-to-right
/// search starting and the previous valid seed and extending at most to the maximally
/// allowed insertion/deletion (shift) distance.
pub fn get_seed_matches<L: Letter<L>>(
qry_seq: &[L],
ref_seq: &[L],
params: &AlignPairwiseParams,
) -> (Vec<SeedMatch>, i32) {
let mut seed_matches = Vec::<SeedMatch>::new();

// get list of valid k-mer start positions
let map_to_good_positions = get_map_to_good_positions(qry_seq, params.seed_length);
let n_good_positions = map_to_good_positions.len();
if n_good_positions < params.seed_length {
return (seed_matches, 0);
}

// use 1/seed_spacing for long sequences, min_seeds otherwise
let n_seeds = if ref_seq.len() > (params.min_seeds * params.seed_spacing) as usize {
(ref_seq.len() as f32 / params.seed_spacing as f32) as i32
} else {
params.min_seeds
};

// distance of first seed from the end of the sequence (third of seed spacing)
let margin = (ref_seq.len() as f32 / (n_seeds * 3) as f32).round() as i32;

// Generate kmers equally spaced on the query
let effective_margin = (margin as f32).min(n_good_positions as f32 / 4.0);
let seed_cover = n_good_positions as f32 - 2.0 * effective_margin;
let kmer_spacing = (seed_cover - 1.0) / ((n_seeds - 1) as f32);

// loop over seeds and find matches, store in seed_matches
let mut start_pos = 0; // start position of ref search
let mut end_pos = ref_seq.len(); // end position of ref search
let qry_pos = 0;

for ni in 0..n_seeds {
// pick index of of seed in map
let good_position_index = (effective_margin + (kmer_spacing * ni as f32)).round() as usize;
// get new query kmer-position
let qry_pos_old = qry_pos;
let qry_pos = map_to_good_positions[good_position_index];
// increment upper bound for search in reference
end_pos += qry_pos - qry_pos_old;

//extract seed and find match
let seed = &qry_seq[qry_pos..(qry_pos + params.seed_length)];
let tmp_match = seed_match(seed, ref_seq, start_pos, end_pos, params.mismatches_allowed);

// Only use seeds with at most allowed_mismatches
if tmp_match.score >= params.seed_length - params.mismatches_allowed {
// if this isn't the first match, check that reference position of current match after previous
// if previous seed matched AFTER current seed, remove previous seed
if let Some(prev_match) = seed_matches.last() {
if tmp_match.ref_pos > prev_match.ref_pos {
start_pos = prev_match.ref_pos;
} else {
// warn!("Crossed over seed removed. {:?}", prev_match);
seed_matches.pop();
}
}

let seed_match = SeedMatch {
qry_pos,
ref_pos: tmp_match.ref_pos,
score: tmp_match.score,
};

// check that current seed matches AFTER previous seed (-2 if already triggered above)
// and add current seed to list of matches
if seed_matches
.last()
.map_or(true, |prev_match| prev_match.ref_pos < tmp_match.ref_pos)
{
//ensure seed positions increase strictly monotonically
seed_matches.push(seed_match);
// } else {
// warn!(
// "Seed not used because of identical ref_pos with previous seed: {:?}",
// seed_match
// );
}
// increment the "reference search end-pos" as the current reference + maximally allowed indel
end_pos = tmp_match.ref_pos + params.max_indel;
}
}
(seed_matches, n_seeds)
}

fn abs_shift(seed1: &SeedMatch2, seed2: &SeedMatch2) -> isize {
abs(seed2.offset - seed1.offset)
}
Expand Down
2 changes: 1 addition & 1 deletion packages_rs/nextclade/src/run/nextclade_wasm.rs
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,7 @@ impl Nextclade {
virus_properties,
} = inputs;

let params = NextcladeInputParams::from_optional(params, &virus_properties);
let params = NextcladeInputParams::from_optional(params, &virus_properties)?;
let ref_seq = to_nuc_seq(&ref_record.seq).wrap_err("When converting reference sequence")?;
let seed_index = CodonSpacedIndex::from_sequence(&ref_seq);

Expand Down
16 changes: 12 additions & 4 deletions packages_rs/nextclade/src/run/params.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,12 @@ use crate::align::params::{AlignPairwiseParams, AlignPairwiseParamsOptional};
use crate::analyze::virus_properties::VirusProperties;
use crate::run::params_general::{NextcladeGeneralParams, NextcladeGeneralParamsOptional};
use crate::tree::params::{TreeBuilderParams, TreeBuilderParamsOptional};
use crate::{make_error, o};
use clap::Parser;
use eyre::Report;
use itertools::Itertools;
use serde::{Deserialize, Serialize};
use std::collections::BTreeMap;

#[derive(Parser, Debug, Default, Clone, Serialize, Deserialize, schemars::JsonSchema)]
#[serde(rename_all = "camelCase")]
Expand All @@ -27,9 +31,11 @@ pub struct NextcladeInputParams {
}

impl NextcladeInputParams {
pub fn from_optional(params: &NextcladeInputParamsOptional, virus_properties: &VirusProperties) -> Self {
pub fn from_optional(
params: &NextcladeInputParamsOptional,
virus_properties: &VirusProperties,
) -> Result<Self, Report> {
// FIXME: this code is repetitive and error prone

let general = {
// Start with defaults
let mut general_params = NextcladeGeneralParams::default();
Expand Down Expand Up @@ -58,6 +64,8 @@ impl NextcladeInputParams {
alignment_params
};

alignment.validate()?;

let tree_builder = {
// Start with defaults
let mut tree_builder_params = TreeBuilderParams::default();
Expand All @@ -72,10 +80,10 @@ impl NextcladeInputParams {
tree_builder_params
};

Self {
Ok(Self {
general,
tree_builder,
alignment,
}
})
}
}
2 changes: 1 addition & 1 deletion packages_rs/nextclade/src/translate/translate_genes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -250,7 +250,7 @@ pub fn translate_cds(
// Set to false for internal genes
left_terminal_gaps_free: first(&qry_cds_seq)?.is_gap(),
right_terminal_gaps_free: last(&qry_cds_seq)?.is_gap(),
..*params
..params.clone()
};

// Make sure subsequent gap stripping does not introduce frame shift
Expand Down