From c6fa53e60b6d1d4a5c9f42884df41ead47cbcdd1 Mon Sep 17 00:00:00 2001 From: Laurence Tratt Date: Thu, 16 May 2024 08:11:09 +0100 Subject: [PATCH 1/7] Split apart the two different uses of the wildcard operator. Wildcards within, and between, lines behave very differently, so split them into two different constants. Right now this doesn't make any difference (the constants are the same!), but it makes it clearer in the code which is which at different points. --- src/lib.rs | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 2ac9e92..1d92edc 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -58,7 +58,8 @@ use std::{ use regex::Regex; const ERROR_CONTEXT: usize = 3; -const WILDCARD: &str = "..."; +const INTERLINE_WILDCARD: &str = "..."; +const INTRALINE_WILDCARD: &str = "..."; const ERROR_MARKER: &str = ">>"; #[derive(Debug)] @@ -231,7 +232,9 @@ impl<'a> FMBuilder<'a> { fn validate(&self) -> Result<(), Box> { let lines = self.ptn.lines().collect::>(); for i in 0..lines.len() { - if i < lines.len() - 1 && lines[i].trim() == WILDCARD && lines[i + 1].trim() == WILDCARD + if i < lines.len() - 1 + && lines[i].trim() == INTERLINE_WILDCARD + && lines[i + 1].trim() == INTERLINE_WILDCARD { return Err(Box::::from(format!( "Can't have two consecutive wildcards lines at lines {} and {}.", @@ -281,7 +284,7 @@ impl<'a> FMatcher<'a> { loop { match (ptnl, textl) { (Some(x), Some(y)) => { - if x.trim() == WILDCARD { + if x.trim() == INTERLINE_WILDCARD { ptnl = ptn_lines.next(); ptn_lines_off += 1; match ptnl { @@ -314,7 +317,7 @@ impl<'a> FMatcher<'a> { } (None, None) => return Ok(()), (Some(x), None) => { - if x.trim() == WILDCARD { + if x.trim() == INTERLINE_WILDCARD { for ptnl in ptn_lines { ptn_lines_off += 1; if !self.match_line(&mut names, ptnl, "") { @@ -413,15 +416,15 @@ impl<'a> FMatcher<'a> { text = text.trim_end(); } - let sww = ptn.starts_with(WILDCARD); - let eww = ptn.ends_with(WILDCARD); + let sww = 
ptn.starts_with(INTRALINE_WILDCARD); + let eww = ptn.ends_with(INTRALINE_WILDCARD); if sww && eww { - text.contains(&ptn[WILDCARD.len()..ptn.len() - WILDCARD.len()]) + text.contains(&ptn[INTRALINE_WILDCARD.len()..ptn.len() - INTRALINE_WILDCARD.len()]) } else if sww { - text.ends_with(&ptn[WILDCARD.len()..]) + text.ends_with(&ptn[INTRALINE_WILDCARD.len()..]) } else if self.options.name_matchers.is_empty() { if eww { - text.starts_with(&ptn[..ptn.len() - WILDCARD.len()]) + text.starts_with(&ptn[..ptn.len() - INTRALINE_WILDCARD.len()]) } else { ptn == text } @@ -480,7 +483,9 @@ impl<'a> FMatcher<'a> { break; } } - if (eww && text.starts_with(&ptn[..ptn.len() - WILDCARD.len()])) || ptn == text { + if (eww && text.starts_with(&ptn[..ptn.len() - INTRALINE_WILDCARD.len()])) + || ptn == text + { names.extend(new_names); true } else { From a7ad0593961a97a27f2f3b326c222d502d903287 Mon Sep 17 00:00:00 2001 From: Laurence Tratt Date: Thu, 16 May 2024 08:14:04 +0100 Subject: [PATCH 2/7] Shorten description. --- src/lib.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 1d92edc..030d361 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -266,8 +266,7 @@ pub struct FMatcher<'a> { } impl<'a> FMatcher<'a> { - /// A convenience method that automatically builds a pattern for you using `FMBuilder`'s - /// default options. + /// A convenience method that automatically builds a pattern using `FMBuilder` defaults. pub fn new(ptn: &'a str) -> Result> { FMBuilder::new(ptn)?.build() } From edf54809d430ceee07c3e69d69da0f709761e42a Mon Sep 17 00:00:00 2001 From: Laurence Tratt Date: Thu, 16 May 2024 08:24:06 +0100 Subject: [PATCH 3/7] Change the default interline wildcard syntax to "..?". This is so, when we shortly add new syntax, we can warn users of the change. 
--- Cargo.toml | 2 +- README.md | 14 +++---- src/lib.rs | 119 ++++++++++++++++++++++++++++++++--------------------- 3 files changed, 79 insertions(+), 56 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 2919015..cced806 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -7,7 +7,7 @@ authors = ["Edd Barrett ", "Laurence Tratt " readme = "README.md" license = "Apache-2.0/MIT" categories = ["development-tools"] -edition = "2018" +edition = "2021" [dependencies] regex = "1.8" diff --git a/README.md b/README.md index 1aff1af..a500a27 100644 --- a/README.md +++ b/README.md @@ -1,18 +1,18 @@ # fm `fm` is a simple non-backtracking fuzzy text matcher useful for matching -multi-line patterns and text. At its most basic the wildcard operator `...` -default) can be used in the following ways: +multi-line patterns and text. At its most basic, wildcard operators can be used +in the following ways: - * If a line consists solely of `...` it means "match zero or more lines of text". + * If a line consists solely of `..?` it means "match zero or more lines of text". * If a line starts with `...`, the search is not anchored to the start of the line. * If a line ends with `...`, the search is not anchored to the end of the line. Note that `...` can appear both at the start and end of a line and if a line consists of `......` (i.e. starts and ends with the wildcard with nothing inbetween), it will match exactly one line. If the wildcard operator appears in -any other locations, it is matched literally. Wildcard matching does not -backtrack, so if a line consists solely of `...` then the next matching line +any other locations, it is matched literally. Wildcard matching does not +backtrack, so if a line consists solely of `..?` then the next matching line anchors the remainder of the search. 
The following examples show `fm` in action using its defaults: @@ -23,8 +23,8 @@ use fm::FMatcher; assert!(FMatcher::new("a").unwrap().matches("a").is_ok()); assert!(FMatcher::new(" a ").unwrap().matches("a").is_ok()); assert!(FMatcher::new("a").unwrap().matches("b").is_err()); -assert!(FMatcher::new("a\n...\nb").unwrap().matches("a\na\nb").is_ok()); -assert!(FMatcher::new("a\n...\nb").unwrap().matches("a\na\nb\nb").is_err()); +assert!(FMatcher::new("a\n..?\nb").unwrap().matches("a\na\nb").is_ok()); +assert!(FMatcher::new("a\n..?\nb").unwrap().matches("a\na\nb\nb").is_err()); ``` When a match fails, the matcher returns an error indicating the line number at diff --git a/src/lib.rs b/src/lib.rs index 030d361..580ce8c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,16 +1,16 @@ #![allow(clippy::upper_case_acronyms)] //! `fm` is a simple non-backtracking fuzzy text matcher useful for matching multi-line patterns -//! and text. At its most basic the wildcard operator `...` can be used in the following ways: +//! and text. At its most basic, wildcard operators can be used in the following ways: //! -//! * If a line consists solely of `...` it means "match zero or more lines of text". +//! * If a line consists solely of `..?` it means "match zero or more lines of text". //! * If a line starts with `...`, the search is not anchored to the start of the line. //! * If a line ends with `...`, the search is not anchored to the end of the line. //! //! Note that `...` can appear both at the start and end of a line and if a line consists of //! `......` (i.e. starts and ends with the wildcard with nothing inbetween), it will match exactly //! one line. If the wildcard operator appears in any other locations, it is matched literally. -//! Wildcard matching does not backtrack, so if a line consists solely of `...` then the next +//! Wildcard matching does not backtrack, so if a line consists solely of `..?` then the next //! matching line anchors the remainder of the search. //! 
//! The following examples show `fm` in action using its defaults: @@ -21,8 +21,8 @@ //! assert!(FMatcher::new("a").unwrap().matches("a").is_ok()); //! assert!(FMatcher::new(" a ").unwrap().matches("a").is_ok()); //! assert!(FMatcher::new("a").unwrap().matches("b").is_err()); -//! assert!(FMatcher::new("a\n...\nb").unwrap().matches("a\na\nb").is_ok()); -//! assert!(FMatcher::new("a\n...\nb").unwrap().matches("a\na\nb\nb").is_err()); +//! assert!(FMatcher::new("a\n..?\nb").unwrap().matches("a\na\nb").is_ok()); +//! assert!(FMatcher::new("a\n..?\nb").unwrap().matches("a\na\nb\nb").is_err()); //! ``` //! //! When a match fails, the matcher returns an error indicating the line number at which the match @@ -58,7 +58,7 @@ use std::{ use regex::Regex; const ERROR_CONTEXT: usize = 3; -const INTERLINE_WILDCARD: &str = "..."; +const LINE_ANCHOR_WILDCARD: &str = "..?"; const INTRALINE_WILDCARD: &str = "..."; const ERROR_MARKER: &str = ">>"; @@ -231,10 +231,20 @@ impl<'a> FMBuilder<'a> { fn validate(&self) -> Result<(), Box> { let lines = self.ptn.lines().collect::>(); + + for (i, l) in lines.iter().enumerate() { + if l.trim() == "..." { + return Err(Box::::from(format!( + "'...' interline syntax on line {} is deprecated: use '..?' 
instead", + i + 1 + ))); + } + } + for i in 0..lines.len() { if i < lines.len() - 1 - && lines[i].trim() == INTERLINE_WILDCARD - && lines[i + 1].trim() == INTERLINE_WILDCARD + && lines[i].trim() == LINE_ANCHOR_WILDCARD + && lines[i + 1].trim() == LINE_ANCHOR_WILDCARD { return Err(Box::::from(format!( "Can't have two consecutive wildcards lines at lines {} and {}.", @@ -283,7 +293,7 @@ impl<'a> FMatcher<'a> { loop { match (ptnl, textl) { (Some(x), Some(y)) => { - if x.trim() == INTERLINE_WILDCARD { + if x.trim() == LINE_ANCHOR_WILDCARD { ptnl = ptn_lines.next(); ptn_lines_off += 1; match ptnl { @@ -316,7 +326,7 @@ impl<'a> FMatcher<'a> { } (None, None) => return Ok(()), (Some(x), None) => { - if x.trim() == INTERLINE_WILDCARD { + if x.trim() == LINE_ANCHOR_WILDCARD { for ptnl in ptn_lines { ptn_lines_off += 1; if !self.match_line(&mut names, ptnl, "") { @@ -634,19 +644,19 @@ mod tests { assert!(helper("", "\n")); assert!(helper("a", "a")); assert!(!helper("a", "ab")); - assert!(helper("...", "")); - assert!(helper("...", "a")); - assert!(helper("...", "a\nb")); - assert!(helper("...\na", "a")); - assert!(helper("...\na\n...", "a")); - assert!(helper("a\n...", "a")); - assert!(helper("a\n...\nd", "a\nd")); - assert!(helper("a\n...\nd", "a\nb\nc\nd")); - assert!(!helper("a\n...\nd", "a\nb\nc")); - assert!(helper("a\n...\nc\n...\ne", "a\nb\nc\nd\ne")); - assert!(helper("a\n...\n...b", "a\nb")); - assert!(helper("a\n...\nb...", "a\nb")); - assert!(helper("a\n...\nb...", "a\nbc")); + assert!(helper("..?", "")); + assert!(helper("..?", "a")); + assert!(helper("..?", "a\nb")); + assert!(helper("..?\na", "a")); + assert!(helper("..?\na\n..?", "a")); + assert!(helper("a\n..?", "a")); + assert!(helper("a\n..?\nd", "a\nd")); + assert!(helper("a\n..?\nd", "a\nb\nc\nd")); + assert!(!helper("a\n..?\nd", "a\nb\nc")); + assert!(helper("a\n..?\nc\n..?\ne", "a\nb\nc\nd\ne")); + assert!(helper("a\n..?\n...b", "a\nb")); + assert!(helper("a\n..?\nb...", "a\nb")); + 
assert!(helper("a\n..?\nb...", "a\nbc")); assert!(helper("a\nb...", "a\nbc")); assert!(!helper("a\nb...", "a\nb\nc")); assert!(helper("a\n...b...", "a\nb")); @@ -656,7 +666,7 @@ mod tests { assert!(!helper("a\n...b...", "a\nxb\nc")); assert!(!helper("a", "a\nb")); assert!(!helper("a\nb", "a")); - assert!(!helper("a\n...\nb", "a")); + assert!(!helper("a\n..?\nb", "a")); assert!(helper("a\n", "a\n")); assert!(helper("a\n", "a")); assert!(helper("a", "a\n")); @@ -707,22 +717,22 @@ mod tests { assert!(helper("", "")); assert!(helper("a", "a")); assert!(!helper("a", "ab")); - assert!(helper("...", "")); - assert!(helper("...", "a")); + assert!(helper("..?", "")); + assert!(helper("..?", "a")); assert!(helper("......", "a")); assert!(!helper("......", "")); - assert!(helper("...", "a\nb")); + assert!(helper("..?", "a\nb")); assert!(!helper("......", "a\nb")); - assert!(helper("...\na", "a")); - assert!(helper("...\na\n...", "a")); - assert!(helper("a\n...", "a")); - assert!(helper("a\n...\nd", "a\nd")); - assert!(helper("a\n...\nd", "a\nb\nc\nd")); - assert!(!helper("a\n...\nd", "a\nb\nc")); - assert!(helper("a\n...\nc\n...\ne", "a\nb\nc\nd\ne")); - assert!(helper("a\n...\n...b", "a\nb")); - assert!(helper("a\n...\nb...", "a\nb")); - assert!(helper("a\n...\nb...", "a\nbc")); + assert!(helper("..?\na", "a")); + assert!(helper("..?\na\n..?", "a")); + assert!(helper("a\n..?", "a")); + assert!(helper("a\n..?\nd", "a\nd")); + assert!(helper("a\n..?\nd", "a\nb\nc\nd")); + assert!(!helper("a\n..?\nd", "a\nb\nc")); + assert!(helper("a\n..?\nc\n..?\ne", "a\nb\nc\nd\ne")); + assert!(helper("a\n..?\n...b", "a\nb")); + assert!(helper("a\n..?\nb...", "a\nb")); + assert!(helper("a\n..?\nb...", "a\nbc")); assert!(helper("a\nb...", "a\nbc")); assert!(!helper("a\nb...", "a\nb\nc")); assert!(helper("a\n...b...", "a\nb")); @@ -741,7 +751,7 @@ mod tests { assert!(helper("$1, $1, a", "a, a, a")); assert!(!helper("$1, $1, a", "a, a, b")); assert!(!helper("$1, $1, a", "a, b, a")); - 
assert!(helper("$1 $2\n...\n$3 $2", "a X\nb Y\nc X")); + assert!(helper("$1 $2\n..?\n$3 $2", "a X\nb Y\nc X")); assert!(!helper("ab$a", "a")); assert!(helper("$1\n$1...", "a\na b c")); assert!(!helper("$1\n$1...", "a\nb b c")); @@ -778,7 +788,7 @@ mod tests { assert!(helper("$1, $1, a", "a, a, a")); assert!(!helper("$1, $1, a", "a, a, b")); assert!(!helper("$1, $1, a", "a, b, a")); - assert!(helper("$1 $2\n...\n$3 $2", "a X\nb Y\nc X")); + assert!(helper("$1 $2\n..?\n$3 $2", "a X\nb Y\nc X")); assert!(!helper("ab$a", "a")); assert!(helper("$1\n$1...", "a\na b c")); assert!(!helper("$1\n$1...", "a\nb b c")); @@ -798,7 +808,7 @@ mod tests { assert!(helper("&1, &1, a", "a, a, a")); assert!(!helper("&1, &1, a", "a, a, b")); assert!(!helper("&1, &1, a", "a, b, a")); - assert!(helper("&1 &2\n...\n&3 &2", "a X\nb Y\nc X")); + assert!(helper("&1 &2\n..?\n&3 &2", "a X\nb Y\nc X")); assert!(!helper("ab&a", "a")); assert!(helper("&1\n&1...", "a\na b c")); assert!(!helper("&1\n&1...", "a\nb b c")); @@ -813,7 +823,7 @@ mod tests { assert!(helper("$1 &1 $1", "a b a")); assert!(helper("$1 &1 &1", "a b b")); assert!(!helper("$1 &1 &1", "a b a")); - assert!(helper("$1 &2\n...\n$3 &2", "a X\nb Y\nc X")); + assert!(helper("$1 &2\n..?\n$3 &2", "a X\nb Y\nc X")); assert!(helper("$1 &1\n$1 &1...", "a b\na b c d")); assert!(helper("$1 &1\n$1 &1...", "a b\na b")); assert!(!helper("$1 &1\n$1 &1...", "a b\na a c d")); @@ -836,7 +846,7 @@ mod tests { (err.ptn_line_off(), err.text_line_off()) }; - assert_eq!(helper("a\n...\nd", "a\nb\nc"), (3, 3)); + assert_eq!(helper("a\n..?\nd", "a\nb\nc"), (3, 3)); assert_eq!(helper("a\nb...", "a\nb\nc"), (3, 3)); assert_eq!(helper("a\n...b...", "a\nxb\nc"), (3, 3)); @@ -857,9 +867,9 @@ mod tests { assert_eq!(helper("$1\n$1\na", "a\na\nb"), (3, 3)); assert_eq!(helper("$1\n$1\na", "a\nb\na"), (2, 2)); - assert_eq!(helper("...\nb\nc\nd\n", "a\nb\nc\n0\ne"), (4, 4)); - assert_eq!(helper("...\nc\nd\n", "a\nb\nc\n0\ne"), (3, 4)); - assert_eq!(helper("...\nd\n", 
"a\nb\nc\n0\ne"), (2, 5)); + assert_eq!(helper("..?\nb\nc\nd\n", "a\nb\nc\n0\ne"), (4, 4)); + assert_eq!(helper("..?\nc\nd\n", "a\nb\nc\n0\ne"), (3, 4)); + assert_eq!(helper("..?\nd\n", "a\nb\nc\n0\ne"), (2, 5)); } #[test] @@ -892,7 +902,7 @@ mod tests { #[test] fn consecutive_wildcards_disallowed() { - match FMatcher::new("...\n...") { + match FMatcher::new("..?\n..?") { Err(e) if e.to_string() == "Can't have two consecutive wildcards lines at lines 1 and 2." => @@ -902,7 +912,7 @@ mod tests { _ => panic!(), } - match FMatcher::new("...\n...\n...") { + match FMatcher::new("..?\n..?\n..?") { Err(e) if e.to_string() == "Can't have two consecutive wildcards lines at lines 1 and 2." => @@ -912,7 +922,7 @@ mod tests { _ => panic!(), } - match FMatcher::new("a\nb\n...\n...") { + match FMatcher::new("a\nb\n..?\n..?") { Err(e) if e.to_string() == "Can't have two consecutive wildcards lines at lines 3 and 4." => @@ -923,6 +933,19 @@ mod tests { } } + #[test] + fn syntax_deprecation() { + match FMatcher::new("...") { + Err(e) + if e.to_string() + == "'...' interline syntax on line 1 is deprecated: use '..?' instead" => + { + () + } + x => panic!("{x:?}"), + } + } + #[test] fn wildcards_and_names() { let ptn_re = Regex::new("\\$.+?\\b").unwrap(); From 7944723ce7901a5a329897319201ff40b6e5b6e8 Mon Sep 17 00:00:00 2001 From: Laurence Tratt Date: Thu, 16 May 2024 08:58:14 +0100 Subject: [PATCH 4/7] Use `README.md` as the crate doc string. This means only having to edit one file instead of keeping two nearly-identical things in sync. It also means that the doc strings in the README are tested. --- src/lib.rs | 49 ++----------------------------------------------- 1 file changed, 2 insertions(+), 47 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 580ce8c..63a072a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,52 +1,7 @@ +#![doc = include_str!("../README.md")] + #![allow(clippy::upper_case_acronyms)] -//! 
`fm` is a simple non-backtracking fuzzy text matcher useful for matching multi-line patterns -//! and text. At its most basic, wildcard operators can be used in the following ways: -//! -//! * If a line consists solely of `..?` it means "match zero or more lines of text". -//! * If a line starts with `...`, the search is not anchored to the start of the line. -//! * If a line ends with `...`, the search is not anchored to the end of the line. -//! -//! Note that `...` can appear both at the start and end of a line and if a line consists of -//! `......` (i.e. starts and ends with the wildcard with nothing inbetween), it will match exactly -//! one line. If the wildcard operator appears in any other locations, it is matched literally. -//! Wildcard matching does not backtrack, so if a line consists solely of `..?` then the next -//! matching line anchors the remainder of the search. -//! -//! The following examples show `fm` in action using its defaults: -//! -//! ```rust -//! use fm::FMatcher; -//! -//! assert!(FMatcher::new("a").unwrap().matches("a").is_ok()); -//! assert!(FMatcher::new(" a ").unwrap().matches("a").is_ok()); -//! assert!(FMatcher::new("a").unwrap().matches("b").is_err()); -//! assert!(FMatcher::new("a\n..?\nb").unwrap().matches("a\na\nb").is_ok()); -//! assert!(FMatcher::new("a\n..?\nb").unwrap().matches("a\na\nb\nb").is_err()); -//! ``` -//! -//! When a match fails, the matcher returns an error indicating the line number at which the match -//! failed. The error can be formatted for human comprehension using the provided `Display` -//! implementation. -//! -//! If you want to use non-default options, you will first need to use `FMBuilder` before having -//! access to an `FMatcher`. For example, to use "name matching" (where you specify that the same -//! chunk of text must appear at multiple points in the text, but without specifying exactly what -//! the chunk must contain) you can set options as follows: -//! -//! ```rust -//! 
use {fm::FMBuilder, regex::Regex}; -//! -//! let ptn_re = Regex::new(r"\$.+?\b").unwrap(); -//! let text_re = Regex::new(r".+?\b").unwrap(); -//! let matcher = FMBuilder::new("$1 $1") -//! .unwrap() -//! .name_matcher(ptn_re, text_re) -//! .build() -//! .unwrap(); -//! assert!(matcher.matches("a a").is_ok()); -//! assert!(matcher.matches("a b").is_err()); -//! ``` use std::{ collections::hash_map::{Entry, HashMap}, default::Default, From 4e835504c05b30cc2a34d5eb7f69d89266592b28 Mon Sep 17 00:00:00 2001 From: Laurence Tratt Date: Thu, 16 May 2024 14:16:57 +0100 Subject: [PATCH 5/7] Introduce the `..~` interline wildcard. This allows "group matching", which also implies backtracking. Consider this pattern: ```text A ..? B C ..? ``` This will match successfully against the literal: ```text A D B C E ``` but fail to match against the literal: ```text A D B B C E ``` because the `..?` matched against the first "B", anchored the search, then immediately failed to match against the second "B". In contrast the pattern: ```text A ..~ B C ..? ``` will, through backtracking, successfully match the literal. ```text A ..? B C ..~ D E ``` There are two reasons why you should default to using `..?` rather than `..~`. Most obviously `..?` does not backtrack and has linear performance. Less obviously `..?` prevents literals from matching when they contain multiple similar sequences. Informally, `..?` makes for more rigorous testing: `..?` can be thought of as "the next thing that matches must look like X" whereas `..~` says "skip things that are almost like X until you find something that is definitely X". Consider this pattern: ```text A ..? B C ..? ``` This will match successfully against the literal: ```text A D B C E ``` but fail to match against the literal: ```text A D B B C E ``` because the `..?` matched against the first "B", anchored the search, then immediately failed to match against the second "B". In contrast the pattern: ```text A ..~ B C ..? 
``` will, through backtracking, successfully match the literal. ```text A ..? B C ..~ D E ``` There are two reasons why you should default to using `..?` rather than `..~`. Most obviously `..?` does not backtrack and has linear performance. Less obviously `..?` prevents literals from matching when they contain multiple similar sequences. Informally, `..?` makes for more rigorous testing: `..?` can be thought of as "the next thing that matches must look like X" whereas `..~` says "skip things that are almost like X until you find something that is definitely X". --- README.md | 122 +++++++++++++++++++++++++++++++----- src/lib.rs | 181 +++++++++++++++++++++++++++++++++++++++++++---------- 2 files changed, 255 insertions(+), 48 deletions(-) diff --git a/README.md b/README.md index a500a27..60a0b6c 100644 --- a/README.md +++ b/README.md @@ -1,21 +1,111 @@ # fm -`fm` is a simple non-backtracking fuzzy text matcher useful for matching -multi-line patterns and text. At its most basic, wildcard operators can be used -in the following ways: - - * If a line consists solely of `..?` it means "match zero or more lines of text". - * If a line starts with `...`, the search is not anchored to the start of the line. - * If a line ends with `...`, the search is not anchored to the end of the line. - -Note that `...` can appear both at the start and end of a line and if a line -consists of `......` (i.e. starts and ends with the wildcard with nothing -inbetween), it will match exactly one line. If the wildcard operator appears in -any other locations, it is matched literally. Wildcard matching does not -backtrack, so if a line consists solely of `..?` then the next matching line -anchors the remainder of the search. - -The following examples show `fm` in action using its defaults: +`fm` is a simple limited backtracking fuzzy text matcher useful for matching +multi-line *patterns* and *literal* text. 
Wildcard operators can be used to +match parts of a line and to skip multiple lines of text. For example this +*pattern*: + +```text +...A +..? +D... +``` + +will successfully match against literals such as: + +```text +xyzA +B +C +Dxyz +``` + + +## Intraline matching + +The intraline wildcard operator `...` can appear at the start and/or end of a +line. `...X...` matches any literal line that contains "X"; `...X` matches any +literal line that ends with "X"; and `X...` matches any literal line that +starts with "X". `......` matches exactly one literal line (i.e. the contents +of the literal line are irrelevant but this will not match against the end +of the literal text). + +## Interline matching + +There are two interline wildcard operators that determine when multiple literal +lines are matched. Both match zero or more literal lines until a match for the +next *item* is found, at which point the search is *anchored* (i.e. +backtracking will not occur before the anchor). An item is either: + + * A single pattern line. + * A group of pattern lines. A group is the sequence of pattern lines between + two interline wildcard operators or, if no wildcard operator is found, the + end of the pattern. + +The interline wildcards are: + + * `..?` matches until it finds a match for the line immediately after the + interline operator, at which point the search is anchored. + + * `..~` matches until it finds a match for the next group, at which point the + search is anchored. + +Consider this pattern: + +```text +A +..? +B +C +..? +``` + +This will match successfully against the literal: + +```text +A +D +B +C +E +``` + +but fail to match against the literal: + +```text +A +D +B +B +C +E +``` + +because the `..?` matched against the first "B", anchored the search, then +immediately failed to match against the second "B". + +In contrast the pattern: + +```text +A +..~ +B +C +..? +``` + +will, through backtracing, successfully match the literal. 
+ +There are two reasons why you should default to using `..?` rather than `..~`. +Most obviously `..?` does not backtrack and has linear performance. Less +obviously `..?` prevents literals from matching when they contain multiple +similar sequences. Informally, `..?` makes for more rigorous testing: `..?` can +be thought of as "the next thing that matches must look like X" whereas `..~` +says "skip things that are almost like X until you find something that is +definitely X". + + +## API ```rust use fm::FMatcher; diff --git a/src/lib.rs b/src/lib.rs index 63a072a..7c5cedd 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,5 +1,4 @@ #![doc = include_str!("../README.md")] - #![allow(clippy::upper_case_acronyms)] use std::{ @@ -14,6 +13,7 @@ use regex::Regex; const ERROR_CONTEXT: usize = 3; const LINE_ANCHOR_WILDCARD: &str = "..?"; +const GROUP_ANCHOR_WILDCARD: &str = "..~"; const INTRALINE_WILDCARD: &str = "..."; const ERROR_MARKER: &str = ">>"; @@ -42,7 +42,7 @@ impl Default for FMOptions { /// How should an [FMatchError] format itself? Where: /// -/// * `Input` means the raw text passed to fmt. +/// * `Input` means the literal text passed to fmt. /// * `Summary` is the subset of pattern and text where an error was detected. /// /// For example a summary may look as follows (where `...` means "text above/below was elided"): @@ -59,7 +59,7 @@ impl Default for FMOptions { /// |8 /// ... /// -/// Text (error at line 5): +/// Literal text (error at line 5): /// ... /// |2 /// |3 @@ -185,10 +185,10 @@ impl<'a> FMBuilder<'a> { } fn validate(&self) -> Result<(), Box> { - let lines = self.ptn.lines().collect::>(); + let lines = self.ptn.lines().map(|x| x.trim()).collect::>(); for (i, l) in lines.iter().enumerate() { - if l.trim() == "..." { + if *l == "..." { return Err(Box::::from(format!( "'...' interline syntax on line {} is deprecated: use '..?' 
instead", i + 1 @@ -198,11 +198,11 @@ impl<'a> FMBuilder<'a> { for i in 0..lines.len() { if i < lines.len() - 1 - && lines[i].trim() == LINE_ANCHOR_WILDCARD - && lines[i + 1].trim() == LINE_ANCHOR_WILDCARD + && (lines[i] == LINE_ANCHOR_WILDCARD || lines[i] == GROUP_ANCHOR_WILDCARD) + && (lines[i + 1] == LINE_ANCHOR_WILDCARD || lines[i + 1] == GROUP_ANCHOR_WILDCARD) { return Err(Box::::from(format!( - "Can't have two consecutive wildcards lines at lines {} and {}.", + "Can't have two consecutive interline wildcards lines at lines {} and {}.", i + 1, i + 2 ))); @@ -264,6 +264,85 @@ impl<'a> FMatcher<'a> { } None => return Ok(()), } + } else if x.trim() == GROUP_ANCHOR_WILDCARD { + let ptn_lines_off_orig = ptn_lines_off; + let text_lines_off_orig = text_lines_off; + ptnl = ptn_lines.next(); + // If the interline wildcard is the last part of the pattern, then we + // implicitly match any remaining text: i.e. we're done! + if ptnl.is_none() { + return Ok(()); + } + // We now have to perform (bounded) backtracking + ptn_lines_off += 1; + let mut ptn_lines_sub = ptn_lines.clone(); + let mut ptnl_sub = ptnl; + let mut ptn_lines_off_sub = ptn_lines_off; + let mut text_lines_sub = text_lines.clone(); + let mut text_lines_off_sub = text_lines_off; + let mut textl_sub = textl; + let mut names_sub = names.clone(); + loop { + match (ptnl_sub, textl_sub) { + (None, None) => return Ok(()), + (Some(x), _) + if x.trim() == GROUP_ANCHOR_WILDCARD + || x.trim() == LINE_ANCHOR_WILDCARD => + { + // We've matched everything successfully + ptn_lines = ptn_lines_sub; + ptnl = ptnl_sub; + ptn_lines_off = ptn_lines_off_sub; + text_lines = text_lines_sub; + textl = textl_sub; + text_lines_off = text_lines_off_sub; + names = names_sub; + break; + } + (None, Some(_)) => { + match self.skip_blank_lines(&mut text_lines_sub, textl_sub) { + (Some(_), _) => { + return Err(FMatchError { + output_formatter: self.options.output_formatter, + ptn: self.ptn.to_owned(), + text: text.to_owned(), + 
ptn_line_off: ptn_lines_off_orig, + text_line_off: text_lines_off_orig, + }) + } + (None, _) => return Ok(()), + } + } + (Some(_), None) => { + return Err(FMatchError { + output_formatter: self.options.output_formatter, + ptn: self.ptn.to_owned(), + text: text.to_owned(), + ptn_line_off: ptn_lines_off_orig, + text_line_off: text_lines_off_orig, + }); + } + (Some(x), Some(y)) => { + if self.match_line(&mut names_sub, x, y) { + ptnl_sub = ptn_lines_sub.next(); + ptn_lines_off_sub += 1; + textl_sub = text_lines_sub.next(); + text_lines_off_sub += 1; + } else { + // We failed to match, so we need to reset the + // pattern, but advance the text. + ptn_lines_sub = ptn_lines.clone(); + ptnl_sub = ptnl; + ptn_lines_off_sub += 1; + textl_sub = text_lines.next(); + text_lines_off += 1; + text_lines_sub = text_lines.clone(); + text_lines_off_sub = text_lines_off; + names_sub = names.clone(); + } + } + } + } } else if self.match_line(&mut names, x, y) { ptnl = ptn_lines.next(); ptn_lines_off += 1; @@ -281,20 +360,16 @@ impl<'a> FMatcher<'a> { } (None, None) => return Ok(()), (Some(x), None) => { - if x.trim() == LINE_ANCHOR_WILDCARD { - for ptnl in ptn_lines { - ptn_lines_off += 1; - if !self.match_line(&mut names, ptnl, "") { - return Err(FMatchError { - output_formatter: self.options.output_formatter, - ptn: self.ptn.to_owned(), - text: text.to_owned(), - ptn_line_off: ptn_lines_off, - text_line_off: text_lines_off, - }); - } + if let LINE_ANCHOR_WILDCARD | GROUP_ANCHOR_WILDCARD = x.trim() { + ptnl = ptn_lines.next(); + ptn_lines_off += 1; + // If the interline wildcard is the last line in the pattern, we're done. + // If it isn't, then the pattern hasn't matched: rather than explicitly + // throw an error in this clause, we let the next iteration of the outer + // while loop catch this case. 
+ if ptnl.is_none() { + return Ok(()); } - return Ok(()); } else { match self.skip_blank_lines(&mut ptn_lines, Some(x)) { (Some(_), skipped) => { @@ -510,7 +585,7 @@ fn fmt_raw(f: &mut fmt::Formatter, text: &str) -> fmt::Result { let lhs = &format!("\n{}|", " ".repeat(err_mk_chars)); writeln!( f, - "Raw text:{}{}", + "Literal text:{}{}", lhs, text.split("\n").collect::>().join(lhs) ) @@ -632,6 +707,37 @@ mod tests { assert!(!helper("a\n", "a\n\nb")); } + #[test] + fn groupings() { + fn helper(ptn: &str, text: &str) -> bool { + FMatcher::new(ptn).unwrap().matches(text).is_ok() + } + assert!(helper("a\n..~\nc\n", "a\nb\nc")); + assert!(helper("a\n..~\nc\n", "a\nc")); + assert!(helper("a\n..~\nc\nd\n", "a\nc\nd")); + assert!(helper("a\n..~\nc\nd\n", "a\nb\nc\nd")); + assert!(!helper("a\n..~\nc\nd\ne", "a\nb\nc\nd")); + assert!(!helper("a\n..~\nc\nd\n", "a\nb\nc\ne")); + assert!(!helper("a\n..~\nc\nd\n", "a\nc\ne\nc\ne")); + assert!(!helper("..~\nc", "")); + assert!(!helper("..~\nc", "c\nd")); + assert!(helper("a\n..~\nc\nd\n", "a\nc\ne\nc\nd")); + assert!(helper("a\n..~\nc\nd\ne", "a\nc\ne\nc\nd\ne")); + assert!(helper("a\n..~\nc\n..~", "a\nb\nc")); + assert!(helper("a\n..~\nc\n..?", "a\nb\nc")); + assert!(helper("a\n..~\nc\n..~\nd", "a\nb\nc\nd")); + assert!(!helper("a\n..~\nc\n..~\nd", "a\nb\nc\ne")); + assert!(helper("a\n..~\nc\n..?\nd", "a\nb\nc\nd")); + assert!(helper("a\n..~\nc\n..~\nd", "a\nb\nc\ne\nf\nd")); + assert!(helper("a\n..~\nc\nd\n..~\nd", "a\nb\nc\nd\nd")); + assert!(!helper("a\n..~\nc\nd\n..~\nd", "a\nb\nc\nd")); + assert!(!helper("a\n..~\nc\nd\n..~\nd", "a\nb\nc\nd\nd\nd")); + assert!(helper("..~\na\n..~\nb", "a\nb")); + assert!(!helper("..~\na\n..~\nb", "a")); + assert!(!helper("..~\na\n..~\nb", "a\nb\nc")); + assert!(helper("..~\na\n..~", "a\nb")); + } + #[test] fn dont_ignore_surrounding_blank_lines() { fn helper(ptn: &str, text: &str) -> bool { @@ -784,6 +890,13 @@ mod tests { assert!(!helper("$1 &1\n$1 &1...", "a b\na a c d")); 
assert!(!helper("$1 &1\n$1 &1 c...\n$1", "a b\na b c")); assert!(!helper("$1 &1\n$1 &1 c...\n$1", "a b\na b c\na\nb")); + + assert!(helper("..~\n$1, $1\n..~", "a, a")); + assert!(helper("..~\n$1\n$1\n..~", "a\na")); + assert!(helper("..~\n$1\n$1\n..~", "a\nb\nb")); + assert!(helper("..~\n$1\n$1\n..~", "a\nb\na\na")); + assert!(helper("..~\n$1\n$1\n..~", "a\nb\na\nc\nc")); + assert!(!helper("..~\n$1\n$1\n..~", "a\nb\na\nb")); } #[test] @@ -825,6 +938,10 @@ mod tests { assert_eq!(helper("..?\nb\nc\nd\n", "a\nb\nc\n0\ne"), (4, 4)); assert_eq!(helper("..?\nc\nd\n", "a\nb\nc\n0\ne"), (3, 4)); assert_eq!(helper("..?\nd\n", "a\nb\nc\n0\ne"), (2, 5)); + + assert_eq!(helper("a\n..~\nc\nd\ne", "a\nb\nc\nd"), (2, 2)); + assert_eq!(helper("a\n..~\nc\nd", "a\nb\nc\ne"), (2, 2)); + assert_eq!(helper("a\n..~\nc\nd", "a\nc\ne\nc\ne"), (2, 2)); } #[test] @@ -860,31 +977,31 @@ mod tests { match FMatcher::new("..?\n..?") { Err(e) if e.to_string() - == "Can't have two consecutive wildcards lines at lines 1 and 2." => + == "Can't have two consecutive interline wildcards lines at lines 1 and 2." => { () } - _ => panic!(), + x => panic!("{x:?}"), } - match FMatcher::new("..?\n..?\n..?") { + match FMatcher::new("..~\n..?") { Err(e) if e.to_string() - == "Can't have two consecutive wildcards lines at lines 1 and 2." => + == "Can't have two consecutive interline wildcards lines at lines 1 and 2." => { () } - _ => panic!(), + x => panic!("{x:?}"), } - match FMatcher::new("a\nb\n..?\n..?") { + match FMatcher::new("a\nb\n..?\n..~") { Err(e) if e.to_string() - == "Can't have two consecutive wildcards lines at lines 3 and 4." => + == "Can't have two consecutive interline wildcards lines at lines 3 and 4." 
=> { () } - _ => panic!(), + x => panic!("{x:?}"), } } @@ -1021,7 +1138,7 @@ Text (error at line 5): assert_eq!( helper(OutputFormatter::InputThenSummary, &ptn, &text), - "Raw text: + "Literal text: |1 |2 |3 @@ -1058,7 +1175,7 @@ Text (error at line 5): assert_eq!( helper(OutputFormatter::InputOnly, &ptn, &text), - "Raw text: + "Literal text: |1 |2 |3 From b71e3df9d0b17fb8c90ee73007f54a95006fb769 Mon Sep 17 00:00:00 2001 From: Laurence Tratt Date: Fri, 17 May 2024 09:03:02 +0100 Subject: [PATCH 6/7] Make `...` the "default" interline syntax again. --- README.md | 24 ++++++------- src/lib.rs | 104 +++++++++++++++++++++-------------------------------- 2 files changed, 53 insertions(+), 75 deletions(-) diff --git a/README.md b/README.md index 60a0b6c..d3e5456 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ match parts of a line and to skip multiple lines of text. For example this ```text ...A -..? +... D... ``` @@ -44,7 +44,7 @@ backtracking will not occur before the anchor). An item is either: The interline wildcards are: - * `..?` matches until it finds a match for the line immediately after the + * `...` matches until it finds a match for the line immediately after the interline operator, at which point the search is anchored. * `..~` matches until it finds a match for the next group, at which point the @@ -54,10 +54,10 @@ Consider this pattern: ```text A -..? +... B C -..? +... ``` This will match successfully against the literal: @@ -81,7 +81,7 @@ C E ``` -because the `..?` matched against the first "B", anchored the search, then +because the `...` matched against the first "B", anchored the search, then immediately failed to match against the second "B". In contrast the pattern: @@ -91,15 +91,15 @@ A ..~ B C -..? +... ``` will, through backtracing, successfully match the literal. -There are two reasons why you should default to using `..?` rather than `..~`. -Most obviously `..?` does not backtrack and has linear performance. 
Less -obviously `..?` prevents literals from matching when they contain multiple -similar sequences. Informally, `..?` makes for more rigorous testing: `..?` can +There are two reasons why you should default to using `...` rather than `..~`. +Most obviously `...` does not backtrack and has linear performance. Less +obviously `...` prevents literals from matching when they contain multiple +similar sequences. Informally, `...` makes for more rigorous testing: `...` can be thought of as "the next thing that matches must look like X" whereas `..~` says "skip things that are almost like X until you find something that is definitely X". @@ -113,8 +113,8 @@ use fm::FMatcher; assert!(FMatcher::new("a").unwrap().matches("a").is_ok()); assert!(FMatcher::new(" a ").unwrap().matches("a").is_ok()); assert!(FMatcher::new("a").unwrap().matches("b").is_err()); -assert!(FMatcher::new("a\n..?\nb").unwrap().matches("a\na\nb").is_ok()); -assert!(FMatcher::new("a\n..?\nb").unwrap().matches("a\na\nb\nb").is_err()); +assert!(FMatcher::new("a\n...\nb").unwrap().matches("a\na\nb").is_ok()); +assert!(FMatcher::new("a\n...\nb").unwrap().matches("a\na\nb\nb").is_err()); ``` When a match fails, the matcher returns an error indicating the line number at diff --git a/src/lib.rs b/src/lib.rs index 7c5cedd..c9f8229 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -12,7 +12,7 @@ use std::{ use regex::Regex; const ERROR_CONTEXT: usize = 3; -const LINE_ANCHOR_WILDCARD: &str = "..?"; +const LINE_ANCHOR_WILDCARD: &str = "..."; const GROUP_ANCHOR_WILDCARD: &str = "..~"; const INTRALINE_WILDCARD: &str = "..."; const ERROR_MARKER: &str = ">>"; @@ -187,15 +187,6 @@ impl<'a> FMBuilder<'a> { fn validate(&self) -> Result<(), Box> { let lines = self.ptn.lines().map(|x| x.trim()).collect::>(); - for (i, l) in lines.iter().enumerate() { - if *l == "..." { - return Err(Box::::from(format!( - "'...' interline syntax on line {} is deprecated: use '..?' 
instead", - i + 1 - ))); - } - } - for i in 0..lines.len() { if i < lines.len() - 1 && (lines[i] == LINE_ANCHOR_WILDCARD || lines[i] == GROUP_ANCHOR_WILDCARD) @@ -674,19 +665,19 @@ mod tests { assert!(helper("", "\n")); assert!(helper("a", "a")); assert!(!helper("a", "ab")); - assert!(helper("..?", "")); - assert!(helper("..?", "a")); - assert!(helper("..?", "a\nb")); - assert!(helper("..?\na", "a")); - assert!(helper("..?\na\n..?", "a")); - assert!(helper("a\n..?", "a")); - assert!(helper("a\n..?\nd", "a\nd")); - assert!(helper("a\n..?\nd", "a\nb\nc\nd")); - assert!(!helper("a\n..?\nd", "a\nb\nc")); - assert!(helper("a\n..?\nc\n..?\ne", "a\nb\nc\nd\ne")); - assert!(helper("a\n..?\n...b", "a\nb")); - assert!(helper("a\n..?\nb...", "a\nb")); - assert!(helper("a\n..?\nb...", "a\nbc")); + assert!(helper("...", "")); + assert!(helper("...", "a")); + assert!(helper("...", "a\nb")); + assert!(helper("...\na", "a")); + assert!(helper("...\na\n...", "a")); + assert!(helper("a\n...", "a")); + assert!(helper("a\n...\nd", "a\nd")); + assert!(helper("a\n...\nd", "a\nb\nc\nd")); + assert!(!helper("a\n...\nd", "a\nb\nc")); + assert!(helper("a\n...\nc\n...\ne", "a\nb\nc\nd\ne")); + assert!(helper("a\n...\n...b", "a\nb")); + assert!(helper("a\n...\nb...", "a\nb")); + assert!(helper("a\n...\nb...", "a\nbc")); assert!(helper("a\nb...", "a\nbc")); assert!(!helper("a\nb...", "a\nb\nc")); assert!(helper("a\n...b...", "a\nb")); @@ -696,7 +687,7 @@ mod tests { assert!(!helper("a\n...b...", "a\nxb\nc")); assert!(!helper("a", "a\nb")); assert!(!helper("a\nb", "a")); - assert!(!helper("a\n..?\nb", "a")); + assert!(!helper("a\n...\nb", "a")); assert!(helper("a\n", "a\n")); assert!(helper("a\n", "a")); assert!(helper("a", "a\n")); @@ -724,10 +715,10 @@ mod tests { assert!(helper("a\n..~\nc\nd\n", "a\nc\ne\nc\nd")); assert!(helper("a\n..~\nc\nd\ne", "a\nc\ne\nc\nd\ne")); assert!(helper("a\n..~\nc\n..~", "a\nb\nc")); - assert!(helper("a\n..~\nc\n..?", "a\nb\nc")); + 
assert!(helper("a\n..~\nc\n...", "a\nb\nc")); assert!(helper("a\n..~\nc\n..~\nd", "a\nb\nc\nd")); assert!(!helper("a\n..~\nc\n..~\nd", "a\nb\nc\ne")); - assert!(helper("a\n..~\nc\n..?\nd", "a\nb\nc\nd")); + assert!(helper("a\n..~\nc\n...\nd", "a\nb\nc\nd")); assert!(helper("a\n..~\nc\n..~\nd", "a\nb\nc\ne\nf\nd")); assert!(helper("a\n..~\nc\nd\n..~\nd", "a\nb\nc\nd\nd")); assert!(!helper("a\n..~\nc\nd\n..~\nd", "a\nb\nc\nd")); @@ -778,22 +769,22 @@ mod tests { assert!(helper("", "")); assert!(helper("a", "a")); assert!(!helper("a", "ab")); - assert!(helper("..?", "")); - assert!(helper("..?", "a")); + assert!(helper("...", "")); + assert!(helper("...", "a")); assert!(helper("......", "a")); assert!(!helper("......", "")); - assert!(helper("..?", "a\nb")); + assert!(helper("...", "a\nb")); assert!(!helper("......", "a\nb")); - assert!(helper("..?\na", "a")); - assert!(helper("..?\na\n..?", "a")); - assert!(helper("a\n..?", "a")); - assert!(helper("a\n..?\nd", "a\nd")); - assert!(helper("a\n..?\nd", "a\nb\nc\nd")); - assert!(!helper("a\n..?\nd", "a\nb\nc")); - assert!(helper("a\n..?\nc\n..?\ne", "a\nb\nc\nd\ne")); - assert!(helper("a\n..?\n...b", "a\nb")); - assert!(helper("a\n..?\nb...", "a\nb")); - assert!(helper("a\n..?\nb...", "a\nbc")); + assert!(helper("...\na", "a")); + assert!(helper("...\na\n...", "a")); + assert!(helper("a\n...", "a")); + assert!(helper("a\n...\nd", "a\nd")); + assert!(helper("a\n...\nd", "a\nb\nc\nd")); + assert!(!helper("a\n...\nd", "a\nb\nc")); + assert!(helper("a\n...\nc\n...\ne", "a\nb\nc\nd\ne")); + assert!(helper("a\n...\n...b", "a\nb")); + assert!(helper("a\n...\nb...", "a\nb")); + assert!(helper("a\n...\nb...", "a\nbc")); assert!(helper("a\nb...", "a\nbc")); assert!(!helper("a\nb...", "a\nb\nc")); assert!(helper("a\n...b...", "a\nb")); @@ -812,7 +803,7 @@ mod tests { assert!(helper("$1, $1, a", "a, a, a")); assert!(!helper("$1, $1, a", "a, a, b")); assert!(!helper("$1, $1, a", "a, b, a")); - assert!(helper("$1 $2\n..?\n$3 $2", "a 
X\nb Y\nc X")); + assert!(helper("$1 $2\n...\n$3 $2", "a X\nb Y\nc X")); assert!(!helper("ab$a", "a")); assert!(helper("$1\n$1...", "a\na b c")); assert!(!helper("$1\n$1...", "a\nb b c")); @@ -849,7 +840,7 @@ mod tests { assert!(helper("$1, $1, a", "a, a, a")); assert!(!helper("$1, $1, a", "a, a, b")); assert!(!helper("$1, $1, a", "a, b, a")); - assert!(helper("$1 $2\n..?\n$3 $2", "a X\nb Y\nc X")); + assert!(helper("$1 $2\n...\n$3 $2", "a X\nb Y\nc X")); assert!(!helper("ab$a", "a")); assert!(helper("$1\n$1...", "a\na b c")); assert!(!helper("$1\n$1...", "a\nb b c")); @@ -869,7 +860,7 @@ mod tests { assert!(helper("&1, &1, a", "a, a, a")); assert!(!helper("&1, &1, a", "a, a, b")); assert!(!helper("&1, &1, a", "a, b, a")); - assert!(helper("&1 &2\n..?\n&3 &2", "a X\nb Y\nc X")); + assert!(helper("&1 &2\n...\n&3 &2", "a X\nb Y\nc X")); assert!(!helper("ab&a", "a")); assert!(helper("&1\n&1...", "a\na b c")); assert!(!helper("&1\n&1...", "a\nb b c")); @@ -884,7 +875,7 @@ mod tests { assert!(helper("$1 &1 $1", "a b a")); assert!(helper("$1 &1 &1", "a b b")); assert!(!helper("$1 &1 &1", "a b a")); - assert!(helper("$1 &2\n..?\n$3 &2", "a X\nb Y\nc X")); + assert!(helper("$1 &2\n...\n$3 &2", "a X\nb Y\nc X")); assert!(helper("$1 &1\n$1 &1...", "a b\na b c d")); assert!(helper("$1 &1\n$1 &1...", "a b\na b")); assert!(!helper("$1 &1\n$1 &1...", "a b\na a c d")); @@ -914,7 +905,7 @@ mod tests { (err.ptn_line_off(), err.text_line_off()) }; - assert_eq!(helper("a\n..?\nd", "a\nb\nc"), (3, 3)); + assert_eq!(helper("a\n...\nd", "a\nb\nc"), (3, 3)); assert_eq!(helper("a\nb...", "a\nb\nc"), (3, 3)); assert_eq!(helper("a\n...b...", "a\nxb\nc"), (3, 3)); @@ -935,9 +926,9 @@ mod tests { assert_eq!(helper("$1\n$1\na", "a\na\nb"), (3, 3)); assert_eq!(helper("$1\n$1\na", "a\nb\na"), (2, 2)); - assert_eq!(helper("..?\nb\nc\nd\n", "a\nb\nc\n0\ne"), (4, 4)); - assert_eq!(helper("..?\nc\nd\n", "a\nb\nc\n0\ne"), (3, 4)); - assert_eq!(helper("..?\nd\n", "a\nb\nc\n0\ne"), (2, 5)); + 
assert_eq!(helper("...\nb\nc\nd\n", "a\nb\nc\n0\ne"), (4, 4)); + assert_eq!(helper("...\nc\nd\n", "a\nb\nc\n0\ne"), (3, 4)); + assert_eq!(helper("...\nd\n", "a\nb\nc\n0\ne"), (2, 5)); assert_eq!(helper("a\n..~\nc\nd\ne", "a\nb\nc\nd"), (2, 2)); assert_eq!(helper("a\n..~\nc\nd", "a\nb\nc\ne"), (2, 2)); @@ -974,7 +965,7 @@ mod tests { #[test] fn consecutive_wildcards_disallowed() { - match FMatcher::new("..?\n..?") { + match FMatcher::new("...\n...") { Err(e) if e.to_string() == "Can't have two consecutive interline wildcards lines at lines 1 and 2." => @@ -984,7 +975,7 @@ mod tests { x => panic!("{x:?}"), } - match FMatcher::new("..~\n..?") { + match FMatcher::new("..~\n...") { Err(e) if e.to_string() == "Can't have two consecutive interline wildcards lines at lines 1 and 2." => @@ -994,7 +985,7 @@ mod tests { x => panic!("{x:?}"), } - match FMatcher::new("a\nb\n..?\n..~") { + match FMatcher::new("a\nb\n...\n..~") { Err(e) if e.to_string() == "Can't have two consecutive interline wildcards lines at lines 3 and 4." => @@ -1005,19 +996,6 @@ mod tests { } } - #[test] - fn syntax_deprecation() { - match FMatcher::new("...") { - Err(e) - if e.to_string() - == "'...' interline syntax on line 1 is deprecated: use '..?' instead" => - { - () - } - x => panic!("{x:?}"), - } - } - #[test] fn wildcards_and_names() { let ptn_re = Regex::new("\\$.+?\\b").unwrap(); From 0dc88e35df2c48355209cbeff3de5d5508165646 Mon Sep 17 00:00:00 2001 From: Laurence Tratt Date: Fri, 17 May 2024 09:14:18 +0100 Subject: [PATCH 7/7] Clarify the documentation. --- README.md | 36 ++++++++++++++++++++---------------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index d3e5456..1ce964b 100644 --- a/README.md +++ b/README.md @@ -30,12 +30,13 @@ starts with "X". `......` matches exactly one literal line (i.e. the contents of the literal line are irrelevant but this will not match against the end of the literal text). 
+ ## Interline matching -There are two interline wildcard operators that determine when multiple literal -lines are matched. Both match zero or more literal lines until a match for the -next *item* is found, at which point the search is *anchored* (i.e. -backtracking will not occur before the anchor). An item is either: +There are two interline wildcard operators that match zero or more literal +lines until a match for the next *item* is found, at which point the search is +*anchored* (i.e. backtracking will not occur before the anchor). An item is +either: * A single pattern line. * A group of pattern lines. A group is the sequence of pattern lines between @@ -44,11 +45,17 @@ backtracking will not occur before the anchor). An item is either: The interline wildcards are: - * `...` matches until it finds a match for the line immediately after the - interline operator, at which point the search is anchored. + * The *prefix match* wildcard `...` matches until it finds a match for the + line immediately after the interline operator ("the prefix"), at which + point the search is anchored. This wildcard does not backtrack. + * The *group match* wildcard `..~` matches until it finds a match for the + next group, at which point the search is anchored. This wildcard + backtracks, though never further than one group. - * `..~` matches until it finds a match for the next group, at which point the - search is anchored. +Interline wildcards cannot directly follow each other i.e. `...\n...` is an +invalid pattern. Interline wildcards can appear at the beginning or end of +a pattern: at the end of a pattern, both interline wildcards have identical +semantics to each other. Consider this pattern: @@ -81,8 +88,8 @@ C E ``` -because the `...` matched against the first "B", anchored the search, then -immediately failed to match against the second "B". +because the `...` matches against the first "B", which anchors the search, then +immediately fails to match against the second "B".
In contrast the pattern: @@ -94,15 +101,12 @@ C ... ``` -will, through backtracing, successfully match the literal. +does match the literal because `..~` backtracks on the second "B". There are two reasons why you should default to using `...` rather than `..~`. Most obviously `...` does not backtrack and has linear performance. Less -obviously `...` prevents literals from matching when they contain multiple -similar sequences. Informally, `...` makes for more rigorous testing: `...` can -be thought of as "the next thing that matches must look like X" whereas `..~` -says "skip things that are almost like X until you find something that is -definitely X". +obviously `...` is a more rigorous test, since it cannot skip prefix matches +(i.e. the next line after the `...` in the pattern) in the literal. ## API