From 56af4d4a7444bfc15bfb3d6cd0202f4fb3076e90 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sun, 26 Nov 2023 09:55:44 -0500 Subject: [PATCH] cli: add simple flag suggestions We look for similar flag names via Jaccard index on ngrams. In my experience this tends to work better than Levenshtein or other edit distance based metrics. Principally because it allows for out-of-order suggestions. For example, --case-smart will result in a suggestion for --smart-case, even though the edit distance between them is pretty big. This is something Clap did for us. I initially thought it wasn't necessary to add this back in, but I realized it wouldn't be much work and might actually be helpful to folks. --- crates/core/flags/parse.rs | 88 +++++++++++++++++++++++++++++++++++++- 1 file changed, 86 insertions(+), 2 deletions(-) diff --git a/crates/core/flags/parse.rs b/crates/core/flags/parse.rs index 8151110da..f1f2dcc16 100644 --- a/crates/core/flags/parse.rs +++ b/crates/core/flags/parse.rs @@ -2,7 +2,7 @@ Parses command line arguments into a structured and typed representation. */ -use std::ffi::OsString; +use std::{borrow::Cow, collections::BTreeSet, ffi::OsString}; use anyhow::Context; @@ -263,7 +263,11 @@ impl Parser { anyhow::bail!("unrecognized flag -{name}") } FlagLookup::UnrecognizedLong(name) => { - anyhow::bail!("unrecognized flag --{name}") + let mut msg = format!("unrecognized flag --{name}"); + if let Some(suggest_msg) = suggest(&name) { + msg = format!("{msg}\n\n{suggest_msg}"); + } + anyhow::bail!("{msg}") } }; let value = if matches!(mat.kind, FlagInfoKind::Negated) { @@ -390,3 +394,83 @@ impl FlagMap { self.map.get(name).copied() } } + +/// Possibly return a message suggesting flags similar in the name to the one +/// given. +/// +/// The one given should be a flag given by the user (without the leading +/// dashes) that was unrecognized. This attempts to find existing flags that +/// are similar to the one given. +fn suggest(unrecognized: &str) -> Option { + let similars = find_similar_names(unrecognized); + if similars.is_empty() { + return None; + } + let list = similars + .into_iter() + .map(|name| format!("--{name}")) + .collect::>() + .join(", "); + Some(format!("similar flags that are available: {list}")) +} + +/// Return a sequence of names similar to the unrecognized name given. +fn find_similar_names(unrecognized: &str) -> Vec<&'static str> { + // The jaccard similarity threshold at which we consider two flag names + // similar enough that it's worth suggesting it to the end user. + // + // This value was determined by some ad hoc experimentation. It might need + // further tweaking. + const THRESHOLD: f64 = 0.4; + + let mut similar = vec![]; + let bow_given = ngrams(unrecognized); + for &flag in FLAGS.iter() { + let name = flag.name_long(); + let bow = ngrams(name); + if jaccard_index(&bow_given, &bow) >= THRESHOLD { + similar.push(name); + } + if let Some(name) = flag.name_negated() { + let bow = ngrams(name); + if jaccard_index(&bow_given, &bow) >= THRESHOLD { + similar.push(name); + } + } + for name in flag.aliases() { + let bow = ngrams(name); + if jaccard_index(&bow_given, &bow) >= THRESHOLD { + similar.push(name); + } + } + } + similar +} + +/// A "bag of words" is a set of ngrams. +type BagOfWords<'a> = BTreeSet>; + +/// Returns the jaccard index (a measure of similarity) between sets of ngrams. +fn jaccard_index(ngrams1: &BagOfWords<'_>, ngrams2: &BagOfWords<'_>) -> f64 { + let union = u32::try_from(ngrams1.union(ngrams2).count()) + .expect("fewer than u32::MAX flags"); + let intersection = u32::try_from(ngrams1.intersection(ngrams2).count()) + .expect("fewer than u32::MAX flags"); + f64::from(intersection) / f64::from(union) +} + +/// Returns all 3-grams in the slice given. +/// +/// If the slice doesn't contain a 3-gram, then one is artificially created by +/// padding it out with a character that will never appear in a flag name. +fn ngrams(flag_name: &str) -> BagOfWords<'_> { + // We only allow ASCII flag names, so we can just use bytes. + let slice = flag_name.as_bytes(); + let seq: Vec> = match slice.len() { + 0 => vec![Cow::Owned(b"!!!".to_vec())], + 1 => vec![Cow::Owned(vec![slice[0], b'!', b'!'])], + 2 => vec![Cow::Owned(vec![slice[0], slice[1], b'!'])], + _ => slice.windows(3).map(Cow::Borrowed).collect(), + }; + BTreeSet::from_iter(seq) +}