From 305d1a4488f23df0385bbf8e3082d6510f392922 Mon Sep 17 00:00:00 2001
From: Viktor Kleen
Date: Mon, 4 Sep 2023 15:11:32 +0000
Subject: [PATCH] Normalize line endings in string literals during parsing (#1562)

* Remove unused `strip_indent_doc`

* Replace `\r\n` by `\n` in the string literal parser

* `mk_strchunk_literal` -> `StrChunk::normalized_literal`

* Move string normalizing into the lexer

* Remove leftover debug print
---
 Cargo.toml                      |  1 +
 core/src/parser/grammar.lalrpop |  8 +++----
 core/src/parser/lexer.rs        | 41 ++++++++++++++++++++++++---------
 core/src/parser/tests.rs        | 28 +++++++++++++---------
 core/src/parser/utils.rs        | 18 ++------------
 core/src/pretty.rs              | 18 +++++++++++----
 6 files changed, 68 insertions(+), 46 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index 8fc24b81d1..24fab02a8e 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -8,6 +8,7 @@ members = [
   "wasm-repl",
   "pyckel",
 ]
+resolver = "2"
 
 [workspace.package]
 version = "1.1.1"
diff --git a/core/src/parser/grammar.lalrpop b/core/src/parser/grammar.lalrpop
index c5d487c574..bdaff9cb08 100644
--- a/core/src/parser/grammar.lalrpop
+++ b/core/src/parser/grammar.lalrpop
@@ -583,7 +583,7 @@ ChunkLiteral : String =
     <parts: ChunkLiteralPart+> => {
         parts.into_iter().fold(String::new(), |mut acc, part| {
             match part {
-                ChunkLiteralPart::Str(s) => acc.push_str(s),
+                ChunkLiteralPart::Str(s) => acc.push_str(&s),
                 ChunkLiteralPart::Char(c) => acc.push(c),
             };
 
@@ -623,7 +623,7 @@ EnumTag: LocIdent = {
     => <>.into(),
 };
 
-ChunkLiteralPart: ChunkLiteralPart<'input> = {
+ChunkLiteralPart: ChunkLiteralPart = {
     "str literal" => ChunkLiteralPart::Str(<>),
     "multstr literal" => ChunkLiteralPart::Str(<>),
     "str esc char" => ChunkLiteralPart::Char(<>),
@@ -946,9 +946,9 @@ enum Token<'input> {
         "identifier" => Token::Normal(NormalToken::Identifier(<&'input str>)),
 
-        "str literal" => Token::Str(StringToken::Literal(<&'input str>)),
+        "str literal" => Token::Str(StringToken::Literal(<String>)),
         "str esc char" => Token::Str(StringToken::EscapedChar(<char>)),
-        "multstr literal" => Token::MultiStr(MultiStringToken::Literal(<&'input str>)),
+        "multstr literal" => Token::MultiStr(MultiStringToken::Literal(<String>)),
         "num literal" => Token::Normal(NormalToken::NumLiteral(<Number>)),
 
         "raw enum tag" => Token::Normal(NormalToken::RawEnumTag(<&'input str>)),
diff --git a/core/src/parser/lexer.rs b/core/src/parser/lexer.rs
index c9e1e2e6c3..490e437917 100644
--- a/core/src/parser/lexer.rs
+++ b/core/src/parser/lexer.rs
@@ -56,10 +56,12 @@ fn symbolic_string_prefix_and_length<'input>(
 /// The tokens in normal mode.
 #[derive(Logos, Debug, PartialEq, Clone)]
 pub enum NormalToken<'input> {
-    #[regex("[ \r\t\n]+", logos::skip)]
+    #[regex("((\r\n)+|[ \t\n]+)", logos::skip)]
     // multiline strings cannot be used as enum tags, so we explicitly
     // disallow that pattern.
     #[regex("'m(%)+\"")]
+    // We forbid lone carriage returns for sanity
+    #[regex("\r[^\n]")]
     #[error]
     Error,
 
@@ -389,13 +391,15 @@ pub struct SymbolicStringStart<'input> {
 /// The tokens in string mode.
 #[derive(Logos, Debug, PartialEq, Eq, Clone)]
 pub enum StringToken<'input> {
+    // We forbid lone carriage returns for sanity
+    #[regex("\r[^\n]")]
     #[error]
     Error,
 
-    #[regex("[^\"%\\\\]+")]
+    #[regex("[^\"%\\\\]+", |lex| normalize_line_endings(lex.slice()))]
     // Has lower matching priority than `Interpolation` according to Logos' rules.
-    #[token("%")]
-    Literal(&'input str),
+    #[token("%", |lex| String::from(lex.slice()))]
+    Literal(String),
 
     #[token("\"")]
     DoubleQuote,
@@ -411,16 +415,18 @@
 /// The tokens in multiline string mode.
 #[derive(Logos, Debug, PartialEq, Eq, Clone)]
 pub enum MultiStringToken<'input> {
+    // We forbid lone carriage returns for sanity
+    #[regex("\r[^\n]")]
     #[error]
     Error,
 
-    #[regex("[^\"%]+")]
+    #[regex("[^\"%]+", |lex| normalize_line_endings(lex.slice()))]
     // A token that starts as a multiline end delimiter or an interpolation sequence but is not
     // one. These ones should have lowest matching priority according to Logos' rules, and
     // CandidateEnd and CandidateInterpolation should be matched first.
-    #[token("\"")]
-    #[regex("%+")]
-    Literal(&'input str),
+    #[token("\"", |lex| String::from(lex.slice()))]
+    #[regex("%+", |lex| String::from(lex.slice()))]
+    Literal(String),
 
     /// A candidate end. A multiline string starting delimiter `MultiStringStart` can have a
     /// variable number of `%` character, so the lexer matches candidate end delimiter, compare the
@@ -692,7 +698,7 @@ impl<'input> Lexer<'input> {
         };
 
         self.bufferize(next_token, next_span);
-        let token = Token::MultiStr(MultiStringToken::Literal(&s[0..split_at]));
+        let token = Token::MultiStr(MultiStringToken::Literal(s[0..split_at].to_owned()));
         let span = Range {
             start: span.start,
             end: span.start + split_at,
@@ -850,7 +856,7 @@ impl<'input> Lexer<'input> {
             // `Literal` one
             MultiStringToken::CandidateInterpolation(s)
             | MultiStringToken::QuotesCandidateInterpolation(s) => {
-                Token::MultiStr(MultiStringToken::Literal(s))
+                Token::MultiStr(MultiStringToken::Literal(s.to_owned()))
             }
             // Strictly speaking, a candidate end delimiter with more than the required count of
             // `%` should be split between multistring end token, plus a variable number of `%`
@@ -876,7 +882,9 @@ impl<'input> Lexer<'input> {
             }
             // Otherwise, it is just part of the string, so we transform the token into a
             // `Literal` one
-            MultiStringToken::CandidateEnd(s) => Token::MultiStr(MultiStringToken::Literal(s)),
+            MultiStringToken::CandidateEnd(s) => {
+                Token::MultiStr(MultiStringToken::Literal(s.to_owned()))
+            }
             // Early report errors for now. This could change in the future
             MultiStringToken::Error => {
                 return Some(Err(ParseError::Lexical(LexicalError::Generic(span))))
@@ -978,3 +986,14 @@ fn escape_ascii(code: &str) -> Option<char> {
         Some(code as char)
     }
 }
+
+/// Normalize the line endings in `s` to only `\n` and, in debug mode, check
+/// for lone `\r` without an accompanying `\n`.
+pub fn normalize_line_endings(s: impl AsRef<str>) -> String {
+    let normalized = s.as_ref().replace("\r\n", "\n");
+    debug_assert!(
+        normalized.find('\r').is_none(),
+        "The lexer throws an error when it finds a lone carriage return"
+    );
+    normalized
+}
diff --git a/core/src/parser/tests.rs b/core/src/parser/tests.rs
index b7980fb1dd..77b93d0027 100644
--- a/core/src/parser/tests.rs
+++ b/core/src/parser/tests.rs
@@ -347,10 +347,10 @@ fn string_lexing() {
         r#""Good" "strings""#,
         vec![
             Token::Normal(NormalToken::DoubleQuote),
-            Token::Str(StringToken::Literal("Good")),
+            Token::Str(StringToken::Literal("Good".to_owned())),
             Token::Normal(NormalToken::DoubleQuote),
             Token::Normal(NormalToken::DoubleQuote),
-            Token::Str(StringToken::Literal("strings")),
+            Token::Str(StringToken::Literal("strings".to_owned())),
             Token::Normal(NormalToken::DoubleQuote),
         ],
     ),
@@ -359,9 +359,9 @@ fn string_lexing() {
         r#""Good\nEscape\t\"""#,
         vec![
            Token::Normal(NormalToken::DoubleQuote),
-            Token::Str(StringToken::Literal("Good")),
+            Token::Str(StringToken::Literal("Good".to_owned())),
             Token::Str(StringToken::EscapedChar('\n')),
-            Token::Str(StringToken::Literal("Escape")),
+            Token::Str(StringToken::Literal("Escape".to_owned())),
             Token::Str(StringToken::EscapedChar('\t')),
             Token::Str(StringToken::EscapedChar('\"')),
             Token::Normal(NormalToken::DoubleQuote),
@@ -372,11 +372,11 @@ fn string_lexing() {
         r#""1 + %{ 1 } + 2""#,
         vec![
             Token::Normal(NormalToken::DoubleQuote),
-            Token::Str(StringToken::Literal("1 + ")),
+            Token::Str(StringToken::Literal("1 + ".to_owned())),
             Token::Str(StringToken::Interpolation),
             Token::Normal(NormalToken::NumLiteral(Number::from(1))),
             Token::Normal(NormalToken::RBrace),
-            Token::Str(StringToken::Literal(" + 2")),
+            Token::Str(StringToken::Literal(" + 2".to_owned())),
             Token::Normal(NormalToken::DoubleQuote),
         ],
     ),
@@ -385,7 +385,7 @@ fn string_lexing() {
         r#""1 + %{ "%{ 1 }" } + 2""#,
         vec![
             Token::Normal(NormalToken::DoubleQuote),
-            Token::Str(StringToken::Literal("1 + ")),
+            Token::Str(StringToken::Literal("1 + ".to_owned())),
             Token::Str(StringToken::Interpolation),
             Token::Normal(NormalToken::DoubleQuote),
             Token::Str(StringToken::Interpolation),
             Token::Normal(NormalToken::NumLiteral(Number::from(1))),
             Token::Normal(NormalToken::RBrace),
             Token::Normal(NormalToken::DoubleQuote),
             Token::Normal(NormalToken::RBrace),
-            Token::Str(StringToken::Literal(" + 2")),
+            Token::Str(StringToken::Literal(" + 2".to_owned())),
             Token::Normal(NormalToken::DoubleQuote),
         ],
     ),
@@ -402,7 +402,7 @@ fn string_lexing() {
         r#"m%%""%"%%"#,
         vec![
             Token::Normal(NormalToken::MultiStringStart(4)),
-            Token::MultiStr(MultiStringToken::Literal("\"%")),
+            Token::MultiStr(MultiStringToken::Literal("\"%".to_owned())),
             Token::MultiStr(MultiStringToken::End),
         ],
     ),
@@ -425,11 +425,11 @@ fn string_lexing() {
                 prefix: "foo",
                 length: 3,
             })),
-            Token::MultiStr(MultiStringToken::Literal("text ")),
+            Token::MultiStr(MultiStringToken::Literal("text ".to_owned())),
             Token::MultiStr(MultiStringToken::Interpolation),
             Token::Normal(NormalToken::NumLiteral(Number::from(1))),
             Token::Normal(NormalToken::RBrace),
-            Token::MultiStr(MultiStringToken::Literal(" etc.")),
+            Token::MultiStr(MultiStringToken::Literal(" etc.".to_owned())),
             Token::MultiStr(MultiStringToken::End),
         ],
     ),
@@ -469,6 +469,12 @@ fn str_escape() {
     );
 }
 
+#[test]
+fn carriage_returns() {
+    assert_eq!(parse_without_pos("\"\\r\""), mk_single_chunk("\r"),);
+    assert_matches!(parse("foo\rbar"), Err(ParseError::UnexpectedToken(..)))
+}
+
 #[test]
 fn ascii_escape() {
     assert_matches!(
diff --git a/core/src/parser/utils.rs b/core/src/parser/utils.rs
index 48341e58b4..445a01d0c8 100644
--- a/core/src/parser/utils.rs
+++ b/core/src/parser/utils.rs
@@ -96,8 +96,8 @@ pub type FieldPath = Vec<FieldPathElem>;
 /// Because of the way the lexer handles escaping and interpolation, a contiguous static string
 /// `"Some \\ \%{escaped} string"` will be lexed as a sequence of such atoms.
 #[derive(Clone, Debug, Eq, PartialEq)]
-pub enum ChunkLiteralPart<'input> {
-    Str(&'input str),
+pub enum ChunkLiteralPart {
+    Str(String),
     Char(char),
 }
 
@@ -839,20 +839,6 @@ pub fn strip_indent(mut chunks: Vec<StrChunk<RichTerm>>) -> Vec<StrChunk<RichTe
     chunks
 }
 
-/// Strip the indentation of a documentation string. Wrap it as a literal
-/// string chunk, apply [`strip_indent`], and extract the result back.
-pub fn strip_indent_doc(doc: String) -> String {
-    let chunk = vec![StrChunk::Literal(doc)];
-    strip_indent(chunk)
-        .into_iter()
-        .map(|chunk| match chunk {
-            StrChunk::Literal(s) => s,
-            _ => panic!("expected literal string after indentation of documentation"),
-        })
-        .next()
-        .expect("expected non-empty chunks after indentation of documentation")
-}
-
 #[cfg(test)]
 mod tests {
     use crate::typ::TypeF;
diff --git a/core/src/pretty.rs b/core/src/pretty.rs
index 217f5880f6..7cfad8a58b 100644
--- a/core/src/pretty.rs
+++ b/core/src/pretty.rs
@@ -55,6 +55,7 @@ fn escape(s: &str) -> String {
         .replace("%{", "\\%{")
         .replace('\"', "\\\"")
         .replace('\n', "\\n")
+        .replace('\r', "\\r")
 }
 
 static QUOTING_REGEX: Lazy<Regex> = Lazy::new(|| Regex::new("^_?[a-zA-Z][_a-zA-Z0-9-]*$").unwrap());
@@ -116,6 +117,15 @@ fn contains_newline<T>(chunks: &[StrChunk<T>]) -> bool {
     })
 }
 
+/// Does a sequence of `StrChunk`s contain a carriage return? Lone carriage
+/// returns are forbidden in Nickel's surface syntax.
+fn contains_carriage_return<T>(chunks: &[StrChunk<T>]) -> bool {
+    chunks.iter().any(|chunk| match chunk {
+        StrChunk::Literal(str) => str.contains('\r'),
+        StrChunk::Expr(_, _) => false,
+    })
+}
+
 pub fn fmt_pretty<'a, T>(value: &T, f: &mut fmt::Formatter) -> fmt::Result
 where
     T: Pretty<'a, pretty::BoxAllocator, ()> + Clone,
@@ -143,7 +153,9 @@ where
     fn chunks(
         &'a self,
         chunks: &[StrChunk<RichTerm>],
         string_style: StringRenderStyle,
     ) -> DocBuilder<'a, Self, A> {
-        let multiline = string_style == StringRenderStyle::Multiline && contains_newline(chunks);
+        let multiline = string_style == StringRenderStyle::Multiline
+            && contains_newline(chunks)
+            && !contains_carriage_return(chunks);
 
         let nb_perc = if multiline {
             chunks
@@ -195,9 +207,7 @@ where
                             // about whether a trailing newline appears at
                             // the end of the last line.
                             s.split_inclusive('\n').map(|line| {
-                                if let Some(s) = line.strip_suffix("\r\n") {
-                                    self.text(s.to_owned()).append(self.hardline())
-                                } else if let Some(s) = line.strip_suffix('\n') {
+                                if let Some(s) = line.strip_suffix('\n') {
                                     self.text(s.to_owned()).append(self.hardline())
                                 } else {
                                     self.text(line.to_owned())
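
To see the normalization end to end, the snippet below drives the new helper directly. It is a sketch, not part of the patch: it assumes `normalize_line_endings` is reachable as `nickel_lang_core::parser::lexer::normalize_line_endings` (the exact re-export path may differ), and the expectations simply mirror the new `carriage_returns` test and the `\r[^\n]` error patterns above.

    // Sketch only -- the import path below is an assumption, not part of the patch.
    use nickel_lang_core::parser::lexer::normalize_line_endings;

    fn main() {
        // CRLF sequences collapse to plain LF, so Windows files lex to the
        // same string literals as Unix ones.
        assert_eq!(normalize_line_endings("line one\r\nline two"), "line one\nline two");
        // Strings that already use LF pass through unchanged.
        assert_eq!(normalize_line_endings("line one\nline two"), "line one\nline two");
        // Lone carriage returns never reach the helper: the lexer's `\r[^\n]`
        // patterns turn them into parse errors first.
    }

Normalizing inside the lexer callbacks is what lets every later stage assume `\n`-only strings, which is why the `\r\n` special case in `core/src/pretty.rs` can be dropped.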