Skip to content

Commit

Permalink
cli: error when searching for NUL
Browse files Browse the repository at this point in the history
Basically, unless the -a/--text flag is given, it is generally always an
error to search for an explicit NUL byte because the binary detection
will prevent it from matching.

Fixes #1838
  • Loading branch information
BurntSushi committed Nov 25, 2023
1 parent 17c6f26 commit 6304568
Show file tree
Hide file tree
Showing 10 changed files with 162 additions and 6 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ Feature enhancements:
Add new `--stop-on-nonmatch` flag.
* [FEATURE #1814](https://github.com/BurntSushi/ripgrep/issues/1814):
Flags are now categorized in `-h/--help` output and ripgrep's man page.
* [FEATURE #1838](https://github.com/BurntSushi/ripgrep/issues/1838):
An error is shown when searching for NUL bytes with binary detection enabled.
* [FEATURE #2195](https://github.com/BurntSushi/ripgrep/issues/2195):
When `extra-verbose` mode is enabled in zsh, show extra file type info.
* [FEATURE #2298](https://github.com/BurntSushi/ripgrep/issues/2298):
Expand Down
28 changes: 27 additions & 1 deletion crates/core/flags/hiargs.rs
Original file line number Diff line number Diff line change
Expand Up @@ -499,9 +499,14 @@ impl HiArgs {
if let Some(limit) = self.dfa_size_limit {
builder.dfa_size_limit(limit);
}
if !self.binary.is_none() {
builder.ban_byte(Some(b'\x00'));
}
let m = match builder.build_many(&self.patterns.patterns) {
Ok(m) => m,
Err(err) => anyhow::bail!(suggest_multiline(err.to_string())),
Err(err) => {
anyhow::bail!(suggest_text(suggest_multiline(err.to_string())))
}
};
Ok(PatternMatcher::RustRegex(m))
}
Expand Down Expand Up @@ -1144,6 +1149,13 @@ impl BinaryDetection {
};
BinaryDetection { explicit, implicit }
}

/// Returns true when both implicit and explicit binary detection is
/// disabled.
pub(crate) fn is_none(&self) -> bool {
let none = grep::searcher::BinaryDetection::none();
self.explicit == none && self.implicit == none
}
}

/// Builds the file type matcher from low level arguments.
Expand Down Expand Up @@ -1428,3 +1440,17 @@ When multiline mode is enabled, new line characters can be matched.",
msg
}
}

/// Possibly suggest the `-a/--text` flag.
fn suggest_text(msg: String) -> String {
if msg.contains("pattern contains \"\\0\"") {
format!(
"{msg}
Consider enabling text mode with the --text flag (or -a for short). Otherwise,
binary detection is enabled and matching a NUL byte is impossible.",
)
} else {
msg
}
}
83 changes: 83 additions & 0 deletions crates/regex/src/ban.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
use regex_syntax::hir::{
self, ClassBytesRange, ClassUnicodeRange, Hir, HirKind,
};

use crate::error::{Error, ErrorKind};

/// Returns an error when a sub-expression in `expr` must match `byte`.
pub(crate) fn check(expr: &Hir, byte: u8) -> Result<(), Error> {
assert!(byte.is_ascii(), "ban byte must be ASCII");
let ch = char::from(byte);
let invalid = || Err(Error::new(ErrorKind::Banned(byte)));
match expr.kind() {
HirKind::Empty => {}
HirKind::Literal(hir::Literal(ref lit)) => {
if lit.iter().find(|&&b| b == byte).is_some() {
return invalid();
}
}
HirKind::Class(hir::Class::Unicode(ref cls)) => {
if cls.ranges().iter().map(|r| r.len()).sum::<usize>() == 1 {
let contains =
|r: &&ClassUnicodeRange| r.start() <= ch && ch <= r.end();
if cls.ranges().iter().find(contains).is_some() {
return invalid();
}
}
}
HirKind::Class(hir::Class::Bytes(ref cls)) => {
if cls.ranges().iter().map(|r| r.len()).sum::<usize>() == 1 {
let contains = |r: &&ClassBytesRange| {
r.start() <= byte && byte <= r.end()
};
if cls.ranges().iter().find(contains).is_some() {
return invalid();
}
}
}
HirKind::Look(_) => {}
HirKind::Repetition(ref x) => check(&x.sub, byte)?,
HirKind::Capture(ref x) => check(&x.sub, byte)?,
HirKind::Concat(ref xs) => {
for x in xs.iter() {
check(x, byte)?;
}
}
HirKind::Alternation(ref xs) => {
for x in xs.iter() {
check(x, byte)?;
}
}
};
Ok(())
}

#[cfg(test)]
mod tests {
use regex_syntax::Parser;

/// Returns true when the given pattern is detected to contain the given
/// banned byte.
fn check(pattern: &str, byte: u8) -> bool {
let hir = Parser::new().parse(pattern).unwrap();
super::check(&hir, byte).is_err()
}

#[test]
fn various() {
assert!(check(r"\x00", 0));
assert!(check(r"a\x00", 0));
assert!(check(r"\x00b", 0));
assert!(check(r"a\x00b", 0));
assert!(check(r"\x00|ab", 0));
assert!(check(r"ab|\x00", 0));
assert!(check(r"\x00?", 0));
assert!(check(r"(\x00)", 0));

assert!(check(r"[\x00]", 0));
assert!(check(r"[^[^\x00]]", 0));

assert!(!check(r"[^\x00]", 0));
assert!(!check(r"[\x00a]", 0));
}
}
7 changes: 6 additions & 1 deletion crates/regex/src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ use {
};

use crate::{
ast::AstAnalysis, error::Error, non_matching::non_matching_bytes,
ast::AstAnalysis, ban, error::Error, non_matching::non_matching_bytes,
strip::strip_from_match,
};

Expand All @@ -35,6 +35,7 @@ pub(crate) struct Config {
pub(crate) dfa_size_limit: usize,
pub(crate) nest_limit: u32,
pub(crate) line_terminator: Option<LineTerminator>,
pub(crate) ban: Option<u8>,
pub(crate) crlf: bool,
pub(crate) word: bool,
pub(crate) fixed_strings: bool,
Expand All @@ -58,6 +59,7 @@ impl Default for Config {
dfa_size_limit: 1000 * (1 << 20),
nest_limit: 250,
line_terminator: None,
ban: None,
crlf: false,
word: false,
fixed_strings: false,
Expand Down Expand Up @@ -205,6 +207,9 @@ impl ConfiguredHIR {
.build()
.translate(&pattern, &ast)
.map_err(Error::generic)?;
if let Some(byte) = config.ban {
ban::check(&hir, byte)?;
}
// We don't need to do this for the fixed-strings case above
// because is_fixed_strings will return false if any pattern
// contains a line terminator. Therefore, we don't need to strip
Expand Down
13 changes: 11 additions & 2 deletions crates/regex/src/error.rs
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,8 @@ pub enum ErrorKind {
///
/// The invalid byte is included in this error.
InvalidLineTerminator(u8),
/// Occurs when a banned byte was found in a pattern.
Banned(u8),
}

impl std::error::Error for Error {}
Expand All @@ -76,8 +78,15 @@ impl std::fmt::Display for Error {
ErrorKind::InvalidLineTerminator(byte) => {
write!(
f,
"line terminators must be ASCII, but {} is not",
[byte].as_bstr()
"line terminators must be ASCII, but {byte:?} is not",
byte = [byte].as_bstr(),
)
}
ErrorKind::Banned(byte) => {
write!(
f,
"pattern contains {byte:?} but it is impossible to match",
byte = [byte].as_bstr(),
)
}
}
Expand Down
1 change: 1 addition & 0 deletions crates/regex/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ pub use crate::{
};

mod ast;
mod ban;
mod config;
mod error;
mod literal;
Expand Down
16 changes: 16 additions & 0 deletions crates/regex/src/matcher.rs
Original file line number Diff line number Diff line change
Expand Up @@ -286,6 +286,22 @@ impl RegexMatcherBuilder {
self
}

/// Ban a byte from occurring in a regular expression pattern.
///
/// If this byte is found in the regex pattern, then an error will be
/// returned at construction time.
///
/// This is useful when binary detection is enabled. Callers will likely
/// want to ban the same byte that is used to detect binary data, i.e.,
/// the NUL byte. The reason for this is that when binary detection is
/// enabled, it's impossible to match a NUL byte because binary detection
/// will either quit when one is found, or will convert NUL bytes to line
/// terminators to avoid exorbitant heap usage.
pub fn ban_byte(&mut self, byte: Option<u8>) -> &mut RegexMatcherBuilder {
self.config.ban = byte;
self
}

/// Set the line terminator to `\r\n` and enable CRLF matching for `$` in
/// regex patterns.
///
Expand Down
2 changes: 1 addition & 1 deletion crates/searcher/src/line_buffer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ pub(crate) fn alloc_error(limit: usize) -> io::Error {
/// is that binary data often indicates data that is undesirable to search
/// using textual patterns. Of course, there are many cases in which this isn't
/// true, which is why binary detection is disabled by default.
#[derive(Clone, Copy, Debug)]
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub(crate) enum BinaryDetection {
/// No binary detection is performed. Data reported by the line buffer may
/// contain arbitrary bytes.
Expand Down
2 changes: 1 addition & 1 deletion crates/searcher/src/searcher/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ type Range = Match;
/// heap, then binary detection is only guaranteed to be applied to the
/// parts corresponding to a match. When `Quit` is enabled, then the first
/// few KB of the data are searched for binary data.
#[derive(Clone, Debug, Default)]
#[derive(Clone, Debug, Default, Eq, PartialEq)]
pub struct BinaryDetection(line_buffer::BinaryDetection);

impl BinaryDetection {
Expand Down
14 changes: 14 additions & 0 deletions tests/regression.rs
Original file line number Diff line number Diff line change
Expand Up @@ -977,6 +977,20 @@ rgtest!(r1765, |dir: Dir, mut cmd: TestCommand| {
assert!(!cmd.stdout().is_empty());
});

// See: https://github.com/BurntSushi/ripgrep/issues/1838
rgtest!(r1838_nul_error_with_binary_detection, |dir: Dir, _: TestCommand| {
// We don't support this error reporting with PCRE2 since we can't parse
// the pattern (easily) to give a good error message.
if dir.is_pcre2() {
return;
}
dir.create("test", "foo\n");

dir.command().args(&[r"foo\x00?"]).assert_err();
eqnice!("test:foo\n", dir.command().args(&["-a", r"foo\x00?"]).stdout());
});

// See: https://github.com/BurntSushi/ripgrep/issues/1866
rgtest!(r1866, |dir: Dir, mut cmd: TestCommand| {
dir.create("test", "foobar\nfoobar\nfoo quux");
cmd.args(&[
Expand Down

0 comments on commit 6304568

Please sign in to comment.