Skip to content

Commit

Permalink
Normalize line endings in string literals during parsing (#1562)
Browse files Browse the repository at this point in the history
* Remove unused `strip_indent_doc`

* Replace `\r\n` by `\n` in the string literal parser

* `mk_strchunk_literal` -> `StrChunk::normalized_literal`

* Move string normalizing into the lexer

* Remove leftover debug print
  • Loading branch information
vkleen authored Sep 4, 2023
1 parent 80e9abe commit 305d1a4
Show file tree
Hide file tree
Showing 6 changed files with 68 additions and 46 deletions.
1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ members = [
"wasm-repl",
"pyckel",
]
resolver = "2"

[workspace.package]
version = "1.1.1"
Expand Down
8 changes: 4 additions & 4 deletions core/src/parser/grammar.lalrpop
Original file line number Diff line number Diff line change
Expand Up @@ -583,7 +583,7 @@ ChunkLiteral : String =
<parts: ChunkLiteralPart+> => {
parts.into_iter().fold(String::new(), |mut acc, part| {
match part {
ChunkLiteralPart::Str(s) => acc.push_str(s),
ChunkLiteralPart::Str(s) => acc.push_str(&s),
ChunkLiteralPart::Char(c) => acc.push(c),
};

Expand Down Expand Up @@ -623,7 +623,7 @@ EnumTag: LocIdent = {
<StringEnumTag> => <>.into(),
};

ChunkLiteralPart: ChunkLiteralPart<'input> = {
ChunkLiteralPart: ChunkLiteralPart = {
"str literal" => ChunkLiteralPart::Str(<>),
"multstr literal" => ChunkLiteralPart::Str(<>),
"str esc char" => ChunkLiteralPart::Char(<>),
Expand Down Expand Up @@ -946,9 +946,9 @@ extern {

enum Token<'input> {
"identifier" => Token::Normal(NormalToken::Identifier(<&'input str>)),
"str literal" => Token::Str(StringToken::Literal(<&'input str>)),
"str literal" => Token::Str(StringToken::Literal(<String>)),
"str esc char" => Token::Str(StringToken::EscapedChar(<char>)),
"multstr literal" => Token::MultiStr(MultiStringToken::Literal(<&'input str>)),
"multstr literal" => Token::MultiStr(MultiStringToken::Literal(<String>)),
"num literal" => Token::Normal(NormalToken::NumLiteral(<Number>)),

"raw enum tag" => Token::Normal(NormalToken::RawEnumTag(<&'input str>)),
Expand Down
41 changes: 30 additions & 11 deletions core/src/parser/lexer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -56,10 +56,12 @@ fn symbolic_string_prefix_and_length<'input>(
/// The tokens in normal mode.
#[derive(Logos, Debug, PartialEq, Clone)]
pub enum NormalToken<'input> {
#[regex("[ \r\t\n]+", logos::skip)]
#[regex("((\r\n)+|[ \t\n]+)", logos::skip)]
// multiline strings cannot be used as enum tags, so we explicitly
// disallow that pattern.
#[regex("'m(%)+\"")]
// We forbid lone carriage returns for sanity
#[regex("\r[^\n]")]
#[error]
Error,

Expand Down Expand Up @@ -389,13 +391,15 @@ pub struct SymbolicStringStart<'input> {
/// The tokens in string mode.
#[derive(Logos, Debug, PartialEq, Eq, Clone)]
pub enum StringToken<'input> {
// We forbid lone carriage returns for sanity
#[regex("\r[^\n]")]
#[error]
Error,

#[regex("[^\"%\\\\]+")]
#[regex("[^\"%\\\\]+", |lex| normalize_line_endings(lex.slice()))]
// Has lower matching priority than `Interpolation` according to Logos' rules.
#[token("%")]
Literal(&'input str),
#[token("%", |lex| String::from(lex.slice()))]
Literal(String),

#[token("\"")]
DoubleQuote,
Expand All @@ -411,16 +415,18 @@ pub enum StringToken<'input> {
/// The tokens in multiline string mode.
#[derive(Logos, Debug, PartialEq, Eq, Clone)]
pub enum MultiStringToken<'input> {
// We forbid lone carriage returns for sanity
#[regex("\r[^\n]")]
#[error]
Error,

#[regex("[^\"%]+")]
#[regex("[^\"%]+", |lex| normalize_line_endings(lex.slice()))]
// A token that starts as a multiline end delimiter or an interpolation sequence but is not
// one. These ones should have lowest matching priority according to Logos' rules, and
// CandidateEnd and CandidateInterpolation should be matched first.
#[token("\"")]
#[regex("%+")]
Literal(&'input str),
#[token("\"", |lex| String::from(lex.slice()))]
#[regex("%+", |lex| String::from(lex.slice()))]
Literal(String),

/// A candidate end. A multiline string starting delimiter `MultiStringStart` can have a
/// variable number of `%` character, so the lexer matches candidate end delimiter, compare the
Expand Down Expand Up @@ -692,7 +698,7 @@ impl<'input> Lexer<'input> {
};
self.bufferize(next_token, next_span);

let token = Token::MultiStr(MultiStringToken::Literal(&s[0..split_at]));
let token = Token::MultiStr(MultiStringToken::Literal(s[0..split_at].to_owned()));
let span = Range {
start: span.start,
end: span.start + split_at,
Expand Down Expand Up @@ -850,7 +856,7 @@ impl<'input> Lexer<'input> {
// `Literal` one
MultiStringToken::CandidateInterpolation(s)
| MultiStringToken::QuotesCandidateInterpolation(s) => {
Token::MultiStr(MultiStringToken::Literal(s))
Token::MultiStr(MultiStringToken::Literal(s.to_owned()))
}
// Strictly speaking, a candidate end delimiter with more than the required count of
// `%` should be split between multistring end token, plus a variable number of `%`
Expand All @@ -876,7 +882,9 @@ impl<'input> Lexer<'input> {
}
// Otherwise, it is just part of the string, so we transform the token into a
// `Literal` one
MultiStringToken::CandidateEnd(s) => Token::MultiStr(MultiStringToken::Literal(s)),
MultiStringToken::CandidateEnd(s) => {
Token::MultiStr(MultiStringToken::Literal(s.to_owned()))
}
// Early report errors for now. This could change in the future
MultiStringToken::Error => {
return Some(Err(ParseError::Lexical(LexicalError::Generic(span))))
Expand Down Expand Up @@ -978,3 +986,14 @@ fn escape_ascii(code: &str) -> Option<char> {
Some(code as char)
}
}

/// Normalize the line endings in `s` to only `\n` and, in debug mode, check
/// for lone `\r` without an accompanying `\n`.
///
/// Every `\r\n` pair is replaced by a single `\n`. Lone carriage returns are
/// never produced by the lexer (it rejects them as errors), which is what the
/// debug assertion documents.
pub fn normalize_line_endings(s: impl AsRef<str>) -> String {
    let normalized = s.as_ref().replace("\r\n", "\n");
    // `!contains` is the idiomatic form of `find(..).is_none()`
    // (clippy `search_is_some`).
    debug_assert!(
        !normalized.contains('\r'),
        "The lexer throws an error when it finds a lone carriage return"
    );
    normalized
}
28 changes: 17 additions & 11 deletions core/src/parser/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -347,10 +347,10 @@ fn string_lexing() {
r#""Good" "strings""#,
vec![
Token::Normal(NormalToken::DoubleQuote),
Token::Str(StringToken::Literal("Good")),
Token::Str(StringToken::Literal("Good".to_owned())),
Token::Normal(NormalToken::DoubleQuote),
Token::Normal(NormalToken::DoubleQuote),
Token::Str(StringToken::Literal("strings")),
Token::Str(StringToken::Literal("strings".to_owned())),
Token::Normal(NormalToken::DoubleQuote),
],
),
Expand All @@ -359,9 +359,9 @@ fn string_lexing() {
r#""Good\nEscape\t\"""#,
vec![
Token::Normal(NormalToken::DoubleQuote),
Token::Str(StringToken::Literal("Good")),
Token::Str(StringToken::Literal("Good".to_owned())),
Token::Str(StringToken::EscapedChar('\n')),
Token::Str(StringToken::Literal("Escape")),
Token::Str(StringToken::Literal("Escape".to_owned())),
Token::Str(StringToken::EscapedChar('\t')),
Token::Str(StringToken::EscapedChar('\"')),
Token::Normal(NormalToken::DoubleQuote),
Expand All @@ -372,11 +372,11 @@ fn string_lexing() {
r#""1 + %{ 1 } + 2""#,
vec![
Token::Normal(NormalToken::DoubleQuote),
Token::Str(StringToken::Literal("1 + ")),
Token::Str(StringToken::Literal("1 + ".to_owned())),
Token::Str(StringToken::Interpolation),
Token::Normal(NormalToken::NumLiteral(Number::from(1))),
Token::Normal(NormalToken::RBrace),
Token::Str(StringToken::Literal(" + 2")),
Token::Str(StringToken::Literal(" + 2".to_owned())),
Token::Normal(NormalToken::DoubleQuote),
],
),
Expand All @@ -385,15 +385,15 @@ fn string_lexing() {
r#""1 + %{ "%{ 1 }" } + 2""#,
vec![
Token::Normal(NormalToken::DoubleQuote),
Token::Str(StringToken::Literal("1 + ")),
Token::Str(StringToken::Literal("1 + ".to_owned())),
Token::Str(StringToken::Interpolation),
Token::Normal(NormalToken::DoubleQuote),
Token::Str(StringToken::Interpolation),
Token::Normal(NormalToken::NumLiteral(Number::from(1))),
Token::Normal(NormalToken::RBrace),
Token::Normal(NormalToken::DoubleQuote),
Token::Normal(NormalToken::RBrace),
Token::Str(StringToken::Literal(" + 2")),
Token::Str(StringToken::Literal(" + 2".to_owned())),
Token::Normal(NormalToken::DoubleQuote),
],
),
Expand All @@ -402,7 +402,7 @@ fn string_lexing() {
r#"m%%""%"%%"#,
vec![
Token::Normal(NormalToken::MultiStringStart(4)),
Token::MultiStr(MultiStringToken::Literal("\"%")),
Token::MultiStr(MultiStringToken::Literal("\"%".to_owned())),
Token::MultiStr(MultiStringToken::End),
],
),
Expand All @@ -425,11 +425,11 @@ fn string_lexing() {
prefix: "foo",
length: 3,
})),
Token::MultiStr(MultiStringToken::Literal("text ")),
Token::MultiStr(MultiStringToken::Literal("text ".to_owned())),
Token::MultiStr(MultiStringToken::Interpolation),
Token::Normal(NormalToken::NumLiteral(Number::from(1))),
Token::Normal(NormalToken::RBrace),
Token::MultiStr(MultiStringToken::Literal(" etc.")),
Token::MultiStr(MultiStringToken::Literal(" etc.".to_owned())),
Token::MultiStr(MultiStringToken::End),
],
),
Expand Down Expand Up @@ -469,6 +469,12 @@ fn str_escape() {
);
}

// An escaped carriage return (`\r` written as the escape sequence in a string
// literal) must parse into a literal `\r` character, while a lone raw carriage
// return in the source text is rejected with a parse error.
#[test]
fn carriage_returns() {
    assert_eq!(parse_without_pos("\"\\r\""), mk_single_chunk("\r"),);
    assert_matches!(parse("foo\rbar"), Err(ParseError::UnexpectedToken(..)))
}

#[test]
fn ascii_escape() {
assert_matches!(
Expand Down
18 changes: 2 additions & 16 deletions core/src/parser/utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -96,8 +96,8 @@ pub type FieldPath = Vec<FieldPathElem>;
/// Because of the way the lexer handles escaping and interpolation, a contiguous static string
/// `"Some \\ \%{escaped} string"` will be lexed as a sequence of such atoms.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum ChunkLiteralPart<'input> {
Str(&'input str),
pub enum ChunkLiteralPart {
Str(String),
Char(char),
}

Expand Down Expand Up @@ -839,20 +839,6 @@ pub fn strip_indent(mut chunks: Vec<StrChunk<RichTerm>>) -> Vec<StrChunk<RichTer
chunks
}

/// Strip the indentation of a doc metadata. Wrap it as a literal string chunk and call
/// [`strip_indent`].
pub fn strip_indent_doc(doc: String) -> String {
    let stripped = strip_indent(vec![StrChunk::Literal(doc)]);
    // A single literal chunk goes in, so a single literal chunk must come
    // back out; anything else is an internal invariant violation.
    match stripped.into_iter().next() {
        Some(StrChunk::Literal(s)) => s,
        Some(_) => panic!("expected literal string after indentation of documentation"),
        None => panic!("expected non-empty chunks after indentation of documentation"),
    }
}

#[cfg(test)]
mod tests {
use crate::typ::TypeF;
Expand Down
18 changes: 14 additions & 4 deletions core/src/pretty.rs
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ fn escape(s: &str) -> String {
.replace("%{", "\\%{")
.replace('\"', "\\\"")
.replace('\n', "\\n")
.replace('\r', "\\r")
}

static QUOTING_REGEX: Lazy<Regex> = Lazy::new(|| Regex::new("^_?[a-zA-Z][_a-zA-Z0-9-]*$").unwrap());
Expand Down Expand Up @@ -116,6 +117,15 @@ fn contains_newline<T>(chunks: &[StrChunk<T>]) -> bool {
})
}

/// Does a sequence of `StrChunk`s contain a carriage return? Lone carriage
/// returns are forbidden in Nickel's surface syntax.
fn contains_carriage_return<T>(chunks: &[StrChunk<T>]) -> bool {
    chunks.iter().any(|chunk| {
        // Only literal chunks carry raw text that could hold a `\r`;
        // interpolated expressions are rendered separately.
        if let StrChunk::Literal(text) = chunk {
            text.contains('\r')
        } else {
            false
        }
    })
}

pub fn fmt_pretty<'a, T>(value: &T, f: &mut fmt::Formatter) -> fmt::Result
where
T: Pretty<'a, pretty::BoxAllocator, ()> + Clone,
Expand Down Expand Up @@ -143,7 +153,9 @@ where
chunks: &[StrChunk<RichTerm>],
string_style: StringRenderStyle,
) -> DocBuilder<'a, Self, A> {
let multiline = string_style == StringRenderStyle::Multiline && contains_newline(chunks);
let multiline = string_style == StringRenderStyle::Multiline
&& contains_newline(chunks)
&& !contains_carriage_return(chunks);

let nb_perc = if multiline {
chunks
Expand Down Expand Up @@ -195,9 +207,7 @@ where
// about whether a trailing newline appears at
// the end of the last line.
s.split_inclusive('\n').map(|line| {
if let Some(s) = line.strip_suffix("\r\n") {
self.text(s.to_owned()).append(self.hardline())
} else if let Some(s) = line.strip_suffix('\n') {
if let Some(s) = line.strip_suffix('\n') {
self.text(s.to_owned()).append(self.hardline())
} else {
self.text(line.to_owned())
Expand Down

0 comments on commit 305d1a4

Please sign in to comment.