Skip to content

Commit

Permalink
feat: make frontend normalize line endings to LF
Browse files Browse the repository at this point in the history
To reduce differences between Windows and other platforms, the frontend now normalizes all CRLF line endings to LF.

Effects:
- Lake's hashes already use normalized line endings. This change makes the hashes be faithful to what Lean sees.
- Docstrings are affected by line endings. In particular, this fixes `#guard_msgs` failing multilines tests for Windows users using CRLF.
- Now strings don't have different lengths depending on the platform. The following theorem is true for LF and false for CRLF files.
```lean
example : "
".length = 1 := rfl
```
  • Loading branch information
kmill committed Apr 14, 2024
1 parent 1c20b53 commit b064c56
Show file tree
Hide file tree
Showing 12 changed files with 86 additions and 85 deletions.
28 changes: 28 additions & 0 deletions src/Init/Data/String/Extra.lean
Original file line number Diff line number Diff line change
Expand Up @@ -98,4 +98,32 @@ def removeLeadingSpaces (s : String) : String :=
let n := findLeadingSpacesSize s
if n == 0 then s else removeNumLeadingSpaces n s

/--
Replaces each `\r\n` with `\n` to normalize line endings,
but does not validate that there are no isolated `\r` characters.
It is an optimized version of `String.replace text "\r\n" "\n"`.
-/
def crlfToLf (text : String) : String :=
go "" 0 0
where
go (acc : String) (accStop pos : String.Pos) : String :=
if h : text.atEnd pos then
-- note: if accStop = 0 then acc is empty
if accStop = 0 then text else acc ++ text.extract accStop pos
else
let c := text.get' pos h
let pos' := text.next' pos h
if h' : ¬ text.atEnd pos' ∧ c == '\r' ∧ text.get pos' == '\n' then
let acc := acc ++ text.extract accStop pos
have : text.utf8ByteSize - (text.next' (text.next' pos h) h'.1).byteIdx < text.utf8ByteSize - pos.byteIdx := by
have k := Nat.gt_of_not_le <| mt decide_eq_true h
exact Nat.sub_lt_sub_left k (Nat.lt_trans (String.lt_next text pos) (String.lt_next _ _))
go acc pos' (text.next' pos' h'.1)
else
have : text.utf8ByteSize - (text.next' pos h).byteIdx < text.utf8ByteSize - pos.byteIdx := by
have k := Nat.gt_of_not_le <| mt decide_eq_true h
exact Nat.sub_lt_sub_left k (String.lt_next _ _)
go acc accStop pos'
termination_by text.utf8ByteSize - pos.byteIdx

end String
2 changes: 1 addition & 1 deletion src/Lean/Data/Position.lean
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ def ofPosition (text : FileMap) (pos : Position) : String.Pos :=

/--
Returns the position of the start of (1-based) line `line`.
This gives the stame result as `map.ofPosition ⟨line, 0⟩`, but is more efficient.
This gives the same result as `map.ofPosition ⟨line, 0⟩`, but is more efficient.
-/
def lineStart (map : FileMap) (line : Nat) : String.Pos :=
if h : line - 1 < map.positions.size then
Expand Down
17 changes: 7 additions & 10 deletions src/Lean/Parser/Basic.lean
Original file line number Diff line number Diff line change
Expand Up @@ -558,6 +558,8 @@ partial def whitespace : ParserFn := fun c s =>
let curr := input.get' i h
if curr == '\t' then
s.mkUnexpectedError (pushMissing := false) "tabs are not allowed; please configure your editor to expand them"
else if curr == '\r' then
s.mkUnexpectedError (pushMissing := false) "isolated carriage returns are not allowed"
else if curr.isWhitespace then whitespace c (s.next' input i h)
else if curr == '-' then
let i := input.next' i h
Expand Down Expand Up @@ -621,9 +623,8 @@ def hexDigitFn : ParserFn := fun c s =>
/--
Parses the whitespace after the `\` when there is a string gap.
Raises an error if the whitespace does not contain exactly one newline character.
Processes `\r\n` as a newline.
-/
partial def stringGapFn (seenNewline afterCR : Bool) : ParserFn := fun c s =>
partial def stringGapFn (seenNewline : Bool) : ParserFn := fun c s =>
let i := s.pos
if h : c.input.atEnd i then s -- let strLitFnAux handle the EOI error if !seenNewline
else
Expand All @@ -633,13 +634,9 @@ partial def stringGapFn (seenNewline afterCR : Bool) : ParserFn := fun c s =>
-- Having more than one newline in a string gap is visually confusing
s.mkUnexpectedError "unexpected additional newline in string gap"
else
stringGapFn true false c (s.next' c.input i h)
else if curr == '\r' then
stringGapFn seenNewline true c (s.next' c.input i h)
else if afterCR then
s.mkUnexpectedError "expecting newline after carriage return"
stringGapFn true c (s.next' c.input i h)
else if curr.isWhitespace then
stringGapFn seenNewline false c (s.next' c.input i h)
stringGapFn seenNewline c (s.next' c.input i h)
else if seenNewline then
s
else
Expand All @@ -663,8 +660,8 @@ def quotedCharCoreFn (isQuotable : Char → Bool) (inString : Bool) : ParserFn :
andthenFn hexDigitFn hexDigitFn c (s.next' input i h)
else if curr == 'u' then
andthenFn hexDigitFn (andthenFn hexDigitFn (andthenFn hexDigitFn hexDigitFn)) c (s.next' input i h)
else if inString && (curr == '\n' || curr == '\r') then
stringGapFn false false c s
else if inString && curr == '\n' then
stringGapFn false c s
else
s.mkUnexpectedError "invalid escape sequence"

Expand Down
11 changes: 6 additions & 5 deletions src/Lean/Parser/Extension.lean
Original file line number Diff line number Diff line change
Expand Up @@ -438,11 +438,12 @@ def getSyntaxNodeKinds (env : Environment) : List SyntaxNodeKind :=
def getTokenTable (env : Environment) : TokenTable :=
(parserExtension.getState env).tokens

def mkInputContext (input : String) (fileName : String) : InputContext := {
input := input,
fileName := fileName,
fileMap := input.toFileMap
}
-- Note: `crlfToLf` preserves logical line and column numbers for each character.
def mkInputContext (input : String) (fileName : String) (normalizeLineEndings := true) : InputContext :=
let input' := if normalizeLineEndings then input.crlfToLf else input
{ input := input',
fileName := fileName,
fileMap := input'.toFileMap }

def mkParserState (input : String) : ParserState :=
{ cache := initCacheForInput input }
Expand Down
9 changes: 3 additions & 6 deletions src/Lean/Server/FileWorker.lean
Original file line number Diff line number Diff line change
Expand Up @@ -676,12 +676,9 @@ def initAndRunWorker (i o e : FS.Stream) (opts : Options) : IO UInt32 := do
let initParams ← i.readLspRequestAs "initialize" InitializeParams
let ⟨_, param⟩ ← i.readLspNotificationAs "textDocument/didOpen" LeanDidOpenTextDocumentParams
let doc := param.textDocument
/- NOTE(WN): `toFileMap` marks line beginnings as immediately following
"\n", which should be enough to handle both LF and CRLF correctly.
This is because LSP always refers to characters by (line, column),
so if we get the line number correct it shouldn't matter that there
is a CR there. -/
let meta : DocumentMeta := ⟨doc.uri, doc.version, doc.text.toFileMap, param.dependencyBuildMode?.getD .always⟩
/- Note (kmill): LSP always refers to characters by (line, column),
so converting CRLF to LF preserves line and column numbers. -/
let meta : DocumentMeta := ⟨doc.uri, doc.version, doc.text.crlfToLf.toFileMap, param.dependencyBuildMode?.getD .always⟩
let e := e.withPrefix s!"[{param.textDocument.uri}] "
let _ ← IO.setStderr e
let (ctx, st) ← try
Expand Down
4 changes: 2 additions & 2 deletions src/Lean/Server/Utils.lean
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ def replaceLspRange (text : FileMap) (r : Lsp.Range) (newText : String) : FileMa
let «end» := text.lspPosToUtf8Pos r.«end»
let pre := text.source.extract 0 start
let post := text.source.extract «end» text.source.endPos
(pre ++ newText ++ post).toFileMap
(pre ++ newText.crlfToLf ++ post).toFileMap

open IO

Expand All @@ -125,7 +125,7 @@ def applyDocumentChange (oldText : FileMap) : (change : Lsp.TextDocumentContentC
| TextDocumentContentChangeEvent.rangeChange (range : Range) (newText : String) =>
replaceLspRange oldText range newText
| TextDocumentContentChangeEvent.fullChange (newText : String) =>
newText.toFileMap
newText.crlfToLf.toFileMap

/-- Returns the document contents with all changes applied. -/
def foldDocumentChanges (changes : Array Lsp.TextDocumentContentChangeEvent) (oldText : FileMap) : FileMap :=
Expand Down
9 changes: 3 additions & 6 deletions src/Lean/Server/Watchdog.lean
Original file line number Diff line number Diff line change
Expand Up @@ -728,12 +728,9 @@ end RequestHandling
section NotificationHandling
def handleDidOpen (p : LeanDidOpenTextDocumentParams) : ServerM Unit :=
let doc := p.textDocument
/- NOTE(WN): `toFileMap` marks line beginnings as immediately following
"\n", which should be enough to handle both LF and CRLF correctly.
This is because LSP always refers to characters by (line, column),
so if we get the line number correct it shouldn't matter that there
is a CR there. -/
startFileWorker ⟨doc.uri, doc.version, doc.text.toFileMap, p.dependencyBuildMode?.getD .always⟩
/- Note (kmill): LSP always refers to characters by (line, column),
so converting CRLF to LF preserves line and column numbers. -/
startFileWorker ⟨doc.uri, doc.version, doc.text.crlfToLf.toFileMap, p.dependencyBuildMode?.getD .always⟩

def handleDidChange (p : DidChangeTextDocumentParams) : ServerM Unit := do
let doc := p.textDocument
Expand Down
3 changes: 1 addition & 2 deletions src/lake/Lake/Build/Trace.lean
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@ Copyright (c) 2021 Mac Malone. All rights reserved.
Released under Apache 2.0 license as described in the file LICENSE.
Authors: Mac Malone
-/
import Lake.Util.Newline

open System
namespace Lake
Expand Down Expand Up @@ -131,7 +130,7 @@ instance : ComputeHash FilePath IO := ⟨computeFileHash⟩

def computeTextFileHash (file : FilePath) : IO Hash := do
let text ← IO.FS.readFile file
let text := crlf2lf text
let text := text.crlfToLf
return Hash.ofString text

/--
Expand Down
24 changes: 0 additions & 24 deletions src/lake/Lake/Util/Newline.lean

This file was deleted.

2 changes: 1 addition & 1 deletion src/lake/tests/toml/Test.lean
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ nonrec def loadToml (tomlFile : FilePath) : BaseIO TomlOutcome := do
match (← IO.FS.readBinFile tomlFile |>.toBaseIO) with
| .ok bytes =>
if let some input := String.fromUTF8? bytes then
pure (crlf2lf input)
pure input.crlfToLf
else
return .fail <| MessageLog.empty.add
{fileName, pos := ⟨1,0⟩, data := m!"file contains invalid characters"}
Expand Down
42 changes: 34 additions & 8 deletions tests/lean/string_gaps.lean → tests/lean/run/string_gaps.lean
Original file line number Diff line number Diff line change
Expand Up @@ -10,24 +10,32 @@ String gaps are described in RFC #2838
/-!
A string gap with no trailing space.
-/
/-- info: "ab" -/
#guard_msgs in
#eval "a\
b"

/-!
A string gap with trailing space before the `b`, which is consumed.
-/
/-- info: "ab" -/
#guard_msgs in
#eval "a\
b"

/-!
A string gap with space before the gap, which is not consumed.
-/
/-- info: "a b" -/
#guard_msgs in
#eval "a \
b"

/-!
Multiple string gaps in a row.
-/
/-- info: "a b" -/
#guard_msgs in
#eval "a \
\
\
Expand All @@ -36,39 +44,57 @@ Multiple string gaps in a row.
/-!
Two tests from the RFC.
-/
/-- info: "this is a string" -/
#guard_msgs in
#eval "this is \
a string"
/-- info: "this is a string" -/
#guard_msgs in
#eval "this is \
a string"

/-!
Two examples of how spaces are accounted for in string gaps. `\x20` is a way to force a leading space.
-/
/-- info: "there are three spaces between the brackets < >" -/
#guard_msgs in
#eval "there are three spaces between the brackets < \
>"
/-- info: "there are three spaces between the brackets < >" -/
#guard_msgs in
#eval "there are three spaces between the brackets <\
\x20 >"

/-!
Using `\n` to terminate a string gap, which is a technique suggested by Mario for using string gaps to write
multiline literals in an indented context.
-/
/-- info: "this is\n a string with two space indent" -/
#guard_msgs in
#eval "this is\
\n a string with two space indent"

/-!
Similar tests but for interpolated strings.
-/
/-- info: "ab" -/
#guard_msgs in
#eval s!"a\
b"
/-- info: "ab" -/
#guard_msgs in
#eval s!"a\
b"
/-- info: "ab" -/
#guard_msgs in
#eval s!"a\
b"

/-!
The `{` terminates the string gap.
-/
/-- info: "ab" -/
#guard_msgs in
#eval s!"a\
{"b"}\
"
Expand All @@ -82,22 +108,18 @@ open Lean
/-!
Standard string gap, with LF
-/
/-- info: "ab" -/
#guard_msgs in
#eval show MetaM String from do
let stx ← ofExcept <| Parser.runParserCategory (← getEnv) `term "\"a\\\n b\""
let some s := stx.isStrLit? | failure
return s

/-!
Standard string gap, with CRLF
-/
#eval show MetaM String from do
let stx ← ofExcept <| Parser.runParserCategory (← getEnv) `term "\"a\\\r\n b\""
let some s := stx.isStrLit? | failure
return s

/-!
Isolated CR, which is an error
-/
/-- error: <input>:1:3: invalid escape sequence -/
#guard_msgs (error, drop info) in
#eval show MetaM String from do
let stx ← ofExcept <| Parser.runParserCategory (← getEnv) `term "\"a\\\r b\""
let some s := stx.isStrLit? | failure
Expand All @@ -106,6 +128,8 @@ Isolated CR, which is an error
/-!
Not a string gap since there's no end-of-line.
-/
/-- error: <input>:1:3: invalid escape sequence -/
#guard_msgs (error, drop info) in
#eval show MetaM String from do
let stx ← ofExcept <| Parser.runParserCategory (← getEnv) `term "\"a\\ b\""
let some s := stx.isStrLit? | failure
Expand Down Expand Up @@ -134,6 +158,8 @@ elab "d!" s:str : term => do
let some s := String.dedent s | Lean.Elab.throwIllFormedSyntax
pure $ Lean.mkStrLit s

/-- info: "this is line 1\n line 2, indented\nline 3" -/
#guard_msgs in
#eval d!"this is \
line 1
| line 2, indented
Expand Down
20 changes: 0 additions & 20 deletions tests/lean/string_gaps.lean.expected.out

This file was deleted.

0 comments on commit b064c56

Please sign in to comment.