diff --git a/Cargo.toml b/Cargo.toml index 2919015..cced806 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -7,7 +7,7 @@ authors = ["Edd Barrett ", "Laurence Tratt " readme = "README.md" license = "Apache-2.0/MIT" categories = ["development-tools"] -edition = "2018" +edition = "2021" [dependencies] regex = "1.8" diff --git a/README.md b/README.md index 1aff1af..1ce964b 100644 --- a/README.md +++ b/README.md @@ -1,21 +1,115 @@ # fm -`fm` is a simple non-backtracking fuzzy text matcher useful for matching -multi-line patterns and text. At its most basic the wildcard operator `...` -default) can be used in the following ways: - - * If a line consists solely of `...` it means "match zero or more lines of text". - * If a line starts with `...`, the search is not anchored to the start of the line. - * If a line ends with `...`, the search is not anchored to the end of the line. - -Note that `...` can appear both at the start and end of a line and if a line -consists of `......` (i.e. starts and ends with the wildcard with nothing -inbetween), it will match exactly one line. If the wildcard operator appears in -any other locations, it is matched literally. Wildcard matching does not -backtrack, so if a line consists solely of `...` then the next matching line -anchors the remainder of the search. - -The following examples show `fm` in action using its defaults: +`fm` is a simple limited backtracking fuzzy text matcher useful for matching +multi-line *patterns* and *literal* text. Wildcard operators can be used to +match parts of a line and to skip multiple lines of text. For example this +*pattern*: + +```text +...A +... +D... +``` + +will successfully match against literals such as: + +```text +xyzA +B +C +Dxyz +``` + + +## Intraline matching + +The intraline wildcard operator `...` can appear at the start and/or end of a +line. `...X...` matches any literal line that contains "X"; `...X` matches any +literal line that ends with "X"; and `X...` matches any literal line that +starts with "X". `......` matches exactly one literal line (i.e. the contents +of the literal line are irrelevant but this will not match against the end +of the literal text). + + +## Interline matching + +There are two interline wildcard operators that match zero or more literal +lines until a match for the next *item* is found, at which point the search is +*anchored* (i.e. backtracking will not occur before the anchor). An item is +either: + + * A single pattern line. + * A group of pattern lines. A group is the sequence of pattern lines between + two interline wildcard operators or, if no wildcard operator is found, the + end of the pattern. + +The interline wildcards are: + + * The *prefix match* wildcard `...` matches until it finds a match for the + line immediately after the interline operator ("the prefix"), at which + point the search is anchored. This wildcard does not backtrack. + * The *group match* wildcard `..~` matches until it finds a match for the + next group, at which point the search is anchored. This wildcard + backtracks, though never further than one group. + +Interline wildcards cannot directly follow each other i.e. `...\n...?` is an +invalid pattern. Interline wildcards can appear at the beginning or end of +a pattern: at the end of a pattern, both interline wildcards have identical +semantics to each other. + +Consider this pattern: + +```text +A +... +B +C +... +``` + +This will match successfully against the literal: + +```text +A +D +B +C +E +``` + +but fail to match against the literal: + +```text +A +D +B +B +C +E +``` + +because the `...` matches against the first "B", which anchors the search, then +immediately fails to match against the second "B". + +In contrast the pattern: + +```text +A +..~ +B +C +... +``` + +does match the literal because `..~` backtracks on the second "B". + +There are two reasons why you should default to using `...` rather than `..~`. +Most obviously `...` does not backtrack and has linear performance. Less +obviously `...` is a more rigorous test, since it cannot skip prefix matches +(i.e. the next line after the `...` in the pattern) in the literal. + + +## API ```rust use fm::FMatcher; diff --git a/src/lib.rs b/src/lib.rs index 2ac9e92..c9f8229 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,52 +1,6 @@ +#![doc = include_str!("../README.md")] #![allow(clippy::upper_case_acronyms)] -//! `fm` is a simple non-backtracking fuzzy text matcher useful for matching multi-line patterns -//! and text. At its most basic the wildcard operator `...` can be used in the following ways: -//! -//! * If a line consists solely of `...` it means "match zero or more lines of text". -//! * If a line starts with `...`, the search is not anchored to the start of the line. -//! * If a line ends with `...`, the search is not anchored to the end of the line. -//! -//! Note that `...` can appear both at the start and end of a line and if a line consists of -//! `......` (i.e. starts and ends with the wildcard with nothing inbetween), it will match exactly -//! one line. If the wildcard operator appears in any other locations, it is matched literally. -//! Wildcard matching does not backtrack, so if a line consists solely of `...` then the next -//! matching line anchors the remainder of the search. -//! -//! The following examples show `fm` in action using its defaults: -//! -//! ```rust -//! use fm::FMatcher; -//! -//! assert!(FMatcher::new("a").unwrap().matches("a").is_ok()); -//! assert!(FMatcher::new(" a ").unwrap().matches("a").is_ok()); -//! assert!(FMatcher::new("a").unwrap().matches("b").is_err()); -//! assert!(FMatcher::new("a\n...\nb").unwrap().matches("a\na\nb").is_ok()); -//! assert!(FMatcher::new("a\n...\nb").unwrap().matches("a\na\nb\nb").is_err()); -//! ``` -//! -//! When a match fails, the matcher returns an error indicating the line number at which the match -//! failed. The error can be formatted for human comprehension using the provided `Display` -//! implementation. -//! -//! If you want to use non-default options, you will first need to use `FMBuilder` before having -//! access to an `FMatcher`. For example, to use "name matching" (where you specify that the same -//! chunk of text must appear at multiple points in the text, but without specifying exactly what -//! the chunk must contain) you can set options as follows: -//! -//! ```rust -//! use {fm::FMBuilder, regex::Regex}; -//! -//! let ptn_re = Regex::new(r"\$.+?\b").unwrap(); -//! let text_re = Regex::new(r".+?\b").unwrap(); -//! let matcher = FMBuilder::new("$1 $1") -//! .unwrap() -//! .name_matcher(ptn_re, text_re) -//! .build() -//! .unwrap(); -//! assert!(matcher.matches("a a").is_ok()); -//! assert!(matcher.matches("a b").is_err()); -//! ``` use std::{ collections::hash_map::{Entry, HashMap}, default::Default, @@ -58,7 +12,9 @@ use std::{ use regex::Regex; const ERROR_CONTEXT: usize = 3; -const WILDCARD: &str = "..."; +const LINE_ANCHOR_WILDCARD: &str = "..."; +const GROUP_ANCHOR_WILDCARD: &str = "..~"; +const INTRALINE_WILDCARD: &str = "..."; const ERROR_MARKER: &str = ">>"; #[derive(Debug)] @@ -86,7 +42,7 @@ impl Default for FMOptions { /// How should an [FMatchError] format itself? Where: /// -/// * `Input` means the raw text passed to fmt. +/// * `Input` means the literal text passed to fmt. /// * `Summary` is the subset of pattern and text where an error was detected. /// /// For example a summary may look as follows (where `...` means "text above/below was elided"): @@ -103,7 +59,7 @@ impl Default for FMOptions { /// |8 /// ... /// -/// Text (error at line 5): +/// Literal text (error at line 5): /// ... /// |2 /// |3 @@ -229,12 +185,15 @@ impl<'a> FMBuilder<'a> { } fn validate(&self) -> Result<(), Box> { - let lines = self.ptn.lines().collect::>(); + let lines = self.ptn.lines().map(|x| x.trim()).collect::>(); + for i in 0..lines.len() { - if i < lines.len() - 1 && lines[i].trim() == WILDCARD && lines[i + 1].trim() == WILDCARD + if i < lines.len() - 1 + && (lines[i] == LINE_ANCHOR_WILDCARD || lines[i] == GROUP_ANCHOR_WILDCARD) + && (lines[i + 1] == LINE_ANCHOR_WILDCARD || lines[i + 1] == GROUP_ANCHOR_WILDCARD) { return Err(Box::::from(format!( - "Can't have two consecutive wildcards lines at lines {} and {}.", + "Can't have two consecutive interline wildcards lines at lines {} and {}.", i + 1, i + 2 ))); @@ -263,8 +222,7 @@ pub struct FMatcher<'a> { } impl<'a> FMatcher<'a> { - /// A convenience method that automatically builds a pattern for you using `FMBuilder`'s - /// default options. + /// A convenience method that automatically builds a pattern using `FMBuilder` defaults. pub fn new(ptn: &'a str) -> Result> { FMBuilder::new(ptn)?.build() } @@ -281,7 +239,7 @@ impl<'a> FMatcher<'a> { loop { match (ptnl, textl) { (Some(x), Some(y)) => { - if x.trim() == WILDCARD { + if x.trim() == LINE_ANCHOR_WILDCARD { ptnl = ptn_lines.next(); ptn_lines_off += 1; match ptnl { @@ -297,6 +255,85 @@ impl<'a> FMatcher<'a> { } None => return Ok(()), } + } else if x.trim() == GROUP_ANCHOR_WILDCARD { + let ptn_lines_off_orig = ptn_lines_off; + let text_lines_off_orig = text_lines_off; + ptnl = ptn_lines.next(); + // If the interline wildcard is the last part of the pattern, then we + // implicitly match any remaining text: i.e. we're done! + if ptnl.is_none() { + return Ok(()); + } + // We now have to perform (bounded) backtracking + ptn_lines_off += 1; + let mut ptn_lines_sub = ptn_lines.clone(); + let mut ptnl_sub = ptnl; + let mut ptn_lines_off_sub = ptn_lines_off; + let mut text_lines_sub = text_lines.clone(); + let mut text_lines_off_sub = text_lines_off; + let mut textl_sub = textl; + let mut names_sub = names.clone(); + loop { + match (ptnl_sub, textl_sub) { + (None, None) => return Ok(()), + (Some(x), _) + if x.trim() == GROUP_ANCHOR_WILDCARD + || x.trim() == LINE_ANCHOR_WILDCARD => + { + // We've matched everything successfully + ptn_lines = ptn_lines_sub; + ptnl = ptnl_sub; + ptn_lines_off = ptn_lines_off_sub; + text_lines = text_lines_sub; + textl = textl_sub; + text_lines_off = text_lines_off_sub; + names = names_sub; + break; + } + (None, Some(_)) => { + match self.skip_blank_lines(&mut text_lines_sub, textl_sub) { + (Some(_), _) => { + return Err(FMatchError { + output_formatter: self.options.output_formatter, + ptn: self.ptn.to_owned(), + text: text.to_owned(), + ptn_line_off: ptn_lines_off_orig, + text_line_off: text_lines_off_orig, + }) + } + (None, _) => return Ok(()), + } + } + (Some(_), None) => { + return Err(FMatchError { + output_formatter: self.options.output_formatter, + ptn: self.ptn.to_owned(), + text: text.to_owned(), + ptn_line_off: ptn_lines_off_orig, + text_line_off: text_lines_off_orig, + }); + } + (Some(x), Some(y)) => { + if self.match_line(&mut names_sub, x, y) { + ptnl_sub = ptn_lines_sub.next(); + ptn_lines_off_sub += 1; + textl_sub = text_lines_sub.next(); + text_lines_off_sub += 1; + } else { + // We failed to match, so we need to reset the + // pattern, but advance the text. + ptn_lines_sub = ptn_lines.clone(); + ptnl_sub = ptnl; + ptn_lines_off_sub += 1; + textl_sub = text_lines.next(); + text_lines_off += 1; + text_lines_sub = text_lines.clone(); + text_lines_off_sub = text_lines_off; + names_sub = names.clone(); + } + } + } + } } else if self.match_line(&mut names, x, y) { ptnl = ptn_lines.next(); ptn_lines_off += 1; @@ -314,20 +351,16 @@ impl<'a> FMatcher<'a> { } (None, None) => return Ok(()), (Some(x), None) => { - if x.trim() == WILDCARD { - for ptnl in ptn_lines { - ptn_lines_off += 1; - if !self.match_line(&mut names, ptnl, "") { - return Err(FMatchError { - output_formatter: self.options.output_formatter, - ptn: self.ptn.to_owned(), - text: text.to_owned(), - ptn_line_off: ptn_lines_off, - text_line_off: text_lines_off, - }); - } + if let LINE_ANCHOR_WILDCARD | GROUP_ANCHOR_WILDCARD = x.trim() { + ptnl = ptn_lines.next(); + ptn_lines_off += 1; + // If the interline wildcard is the last line in the pattern, we're done. + // If it isn't, then the pattern hasn't matched: rather than explicitly + // throw an error in this clause, we let the next iteration of the outer + // while loop catch this case. + if ptnl.is_none() { + return Ok(()); } - return Ok(()); } else { match self.skip_blank_lines(&mut ptn_lines, Some(x)) { (Some(_), skipped) => { @@ -413,15 +446,15 @@ impl<'a> FMatcher<'a> { text = text.trim_end(); } - let sww = ptn.starts_with(WILDCARD); - let eww = ptn.ends_with(WILDCARD); + let sww = ptn.starts_with(INTRALINE_WILDCARD); + let eww = ptn.ends_with(INTRALINE_WILDCARD); if sww && eww { - text.contains(&ptn[WILDCARD.len()..ptn.len() - WILDCARD.len()]) + text.contains(&ptn[INTRALINE_WILDCARD.len()..ptn.len() - INTRALINE_WILDCARD.len()]) } else if sww { - text.ends_with(&ptn[WILDCARD.len()..]) + text.ends_with(&ptn[INTRALINE_WILDCARD.len()..]) } else if self.options.name_matchers.is_empty() { if eww { - text.starts_with(&ptn[..ptn.len() - WILDCARD.len()]) + text.starts_with(&ptn[..ptn.len() - INTRALINE_WILDCARD.len()]) } else { ptn == text } @@ -480,7 +513,9 @@ impl<'a> FMatcher<'a> { break; } } - if (eww && text.starts_with(&ptn[..ptn.len() - WILDCARD.len()])) || ptn == text { + if (eww && text.starts_with(&ptn[..ptn.len() - INTRALINE_WILDCARD.len()])) + || ptn == text + { names.extend(new_names); true } else { @@ -541,7 +576,7 @@ fn fmt_raw(f: &mut fmt::Formatter, text: &str) -> fmt::Result { let lhs = &format!("\n{}|", " ".repeat(err_mk_chars)); writeln!( f, - "Raw text:{}{}", + "Literal text:{}{}", lhs, text.split("\n").collect::>().join(lhs) ) @@ -663,6 +698,37 @@ mod tests { assert!(!helper("a\n", "a\n\nb")); } + #[test] + fn groupings() { + fn helper(ptn: &str, text: &str) -> bool { + FMatcher::new(ptn).unwrap().matches(text).is_ok() + } + assert!(helper("a\n..~\nc\n", "a\nb\nc")); + assert!(helper("a\n..~\nc\n", "a\nc")); + assert!(helper("a\n..~\nc\nd\n", "a\nc\nd")); + assert!(helper("a\n..~\nc\nd\n", "a\nb\nc\nd")); + assert!(!helper("a\n..~\nc\nd\ne", "a\nb\nc\nd")); + assert!(!helper("a\n..~\nc\nd\n", "a\nb\nc\ne")); + assert!(!helper("a\n..~\nc\nd\n", "a\nc\ne\nc\ne")); + assert!(!helper("..~\nc", "")); + assert!(!helper("..~\nc", "c\nd")); + assert!(helper("a\n..~\nc\nd\n", "a\nc\ne\nc\nd")); + assert!(helper("a\n..~\nc\nd\ne", "a\nc\ne\nc\nd\ne")); + assert!(helper("a\n..~\nc\n..~", "a\nb\nc")); + assert!(helper("a\n..~\nc\n...", "a\nb\nc")); + assert!(helper("a\n..~\nc\n..~\nd", "a\nb\nc\nd")); + assert!(!helper("a\n..~\nc\n..~\nd", "a\nb\nc\ne")); + assert!(helper("a\n..~\nc\n...\nd", "a\nb\nc\nd")); + assert!(helper("a\n..~\nc\n..~\nd", "a\nb\nc\ne\nf\nd")); + assert!(helper("a\n..~\nc\nd\n..~\nd", "a\nb\nc\nd\nd")); + assert!(!helper("a\n..~\nc\nd\n..~\nd", "a\nb\nc\nd")); + assert!(!helper("a\n..~\nc\nd\n..~\nd", "a\nb\nc\nd\nd\nd")); + assert!(helper("..~\na\n..~\nb", "a\nb")); + assert!(!helper("..~\na\n..~\nb", "a")); + assert!(!helper("..~\na\n..~\nb", "a\nb\nc")); + assert!(helper("..~\na\n..~", "a\nb")); + } + #[test] fn dont_ignore_surrounding_blank_lines() { fn helper(ptn: &str, text: &str) -> bool { @@ -815,6 +881,13 @@ mod tests { assert!(!helper("$1 &1\n$1 &1...", "a b\na a c d")); assert!(!helper("$1 &1\n$1 &1 c...\n$1", "a b\na b c")); assert!(!helper("$1 &1\n$1 &1 c...\n$1", "a b\na b c\na\nb")); + + assert!(helper("..~\n$1, $1\n..~", "a, a")); + assert!(helper("..~\n$1\n$1\n..~", "a\na")); + assert!(helper("..~\n$1\n$1\n..~", "a\nb\nb")); + assert!(helper("..~\n$1\n$1\n..~", "a\nb\na\na")); + assert!(helper("..~\n$1\n$1\n..~", "a\nb\na\nc\nc")); + assert!(!helper("..~\n$1\n$1\n..~", "a\nb\na\nb")); } #[test] @@ -856,6 +929,10 @@ mod tests { assert_eq!(helper("...\nb\nc\nd\n", "a\nb\nc\n0\ne"), (4, 4)); assert_eq!(helper("...\nc\nd\n", "a\nb\nc\n0\ne"), (3, 4)); assert_eq!(helper("...\nd\n", "a\nb\nc\n0\ne"), (2, 5)); + + assert_eq!(helper("a\n..~\nc\nd\ne", "a\nb\nc\nd"), (2, 2)); + assert_eq!(helper("a\n..~\nc\nd", "a\nb\nc\ne"), (2, 2)); + assert_eq!(helper("a\n..~\nc\nd", "a\nc\ne\nc\ne"), (2, 2)); } #[test] @@ -891,31 +968,31 @@ mod tests { match FMatcher::new("...\n...") { Err(e) if e.to_string() - == "Can't have two consecutive wildcards lines at lines 1 and 2." => + == "Can't have two consecutive interline wildcards lines at lines 1 and 2." => { () } - _ => panic!(), + x => panic!("{x:?}"), } - match FMatcher::new("...\n...\n...") { + match FMatcher::new("..~\n...") { Err(e) if e.to_string() - == "Can't have two consecutive wildcards lines at lines 1 and 2." => + == "Can't have two consecutive interline wildcards lines at lines 1 and 2." => { () } - _ => panic!(), + x => panic!("{x:?}"), } - match FMatcher::new("a\nb\n...\n...") { + match FMatcher::new("a\nb\n...\n..~") { Err(e) if e.to_string() - == "Can't have two consecutive wildcards lines at lines 3 and 4." => + == "Can't have two consecutive interline wildcards lines at lines 3 and 4." => { () } - _ => panic!(), + x => panic!("{x:?}"), } } @@ -1039,7 +1116,7 @@ Text (error at line 5): assert_eq!( helper(OutputFormatter::InputThenSummary, &ptn, &text), - "Raw text: + "Literal text: |1 |2 |3 @@ -1076,7 +1153,7 @@ Text (error at line 5): assert_eq!( helper(OutputFormatter::InputOnly, &ptn, &text), - "Raw text: + "Literal text: |1 |2 |3