feat: make frontend normalize line endings to LF

To reduce differences between Windows and other platforms, the frontend now normalizes all CRLF line endings to LF. Effects: - Lake's hashes already use normalized line endings. This change makes the hashes be faithful to what Lean sees. - Docstrings are affected by line endings. In particular, this fixes `#guard_msgs` failing multilines tests for Windows users using CRLF. - Now strings don't have different lengths depending on the platform. The following theorem is true for LF and false for CRLF files. ```lean example : " ".length = 1 := rfl ```
leanprover · Apr 14, 2024 · b064c56 · b064c56
1 parent 1c20b53
commit b064c56
Show file tree

Hide file tree

Showing 12 changed files with 86 additions and 85 deletions.
diff --git a/src/Init/Data/String/Extra.lean b/src/Init/Data/String/Extra.lean
@@ -98,4 +98,32 @@ def removeLeadingSpaces (s : String) : String :=
   let n := findLeadingSpacesSize s
   if n == 0 then s else removeNumLeadingSpaces n s
 
+/--
+Replaces each `\r\n` with `\n` to normalize line endings,
+but does not validate that there are no isolated `\r` characters.
+It is an optimized version of `String.replace text "\r\n" "\n"`.
+-/
+def crlfToLf (text : String) : String :=
+  go "" 0 0
+where
+  go (acc : String) (accStop pos : String.Pos) : String :=
+    if h : text.atEnd pos then
+      -- note: if accStop = 0 then acc is empty
+      if accStop = 0 then text else acc ++ text.extract accStop pos
+    else
+      let c := text.get' pos h
+      let pos' := text.next' pos h
+      if h' : ¬ text.atEnd pos' ∧ c == '\r' ∧ text.get pos' == '\n' then
+        let acc := acc ++ text.extract accStop pos
+        have : text.utf8ByteSize - (text.next' (text.next' pos h) h'.1).byteIdx < text.utf8ByteSize - pos.byteIdx := by
+          have k := Nat.gt_of_not_le <| mt decide_eq_true h
+          exact Nat.sub_lt_sub_left k (Nat.lt_trans (String.lt_next text pos) (String.lt_next _ _))
+        go acc pos' (text.next' pos' h'.1)
+      else
+        have : text.utf8ByteSize - (text.next' pos h).byteIdx < text.utf8ByteSize - pos.byteIdx := by
+          have k := Nat.gt_of_not_le <| mt decide_eq_true h
+          exact Nat.sub_lt_sub_left k (String.lt_next _ _)
+        go acc accStop pos'
+  termination_by text.utf8ByteSize - pos.byteIdx
+
 end String
diff --git a/src/Lean/Data/Position.lean b/src/Lean/Data/Position.lean
@@ -105,7 +105,7 @@ def ofPosition (text : FileMap) (pos : Position) : String.Pos :=
 
 /--
 Returns the position of the start of (1-based) line `line`.
-This gives the stame result as `map.ofPosition ⟨line, 0⟩`, but is more efficient.
+This gives the same result as `map.ofPosition ⟨line, 0⟩`, but is more efficient.
 -/
 def lineStart (map : FileMap) (line : Nat) : String.Pos :=
   if h : line - 1 < map.positions.size then

diff --git a/src/Lean/Parser/Basic.lean b/src/Lean/Parser/Basic.lean
@@ -558,6 +558,8 @@ partial def whitespace : ParserFn := fun c s =>
     let curr := input.get' i h
     if curr == '\t' then
       s.mkUnexpectedError (pushMissing := false) "tabs are not allowed; please configure your editor to expand them"
+    else if curr == '\r' then
+      s.mkUnexpectedError (pushMissing := false) "isolated carriage returns are not allowed"
     else if curr.isWhitespace then whitespace c (s.next' input i h)
     else if curr == '-' then
       let i    := input.next' i h
@@ -621,9 +623,8 @@ def hexDigitFn : ParserFn := fun c s =>
 /--
 Parses the whitespace after the `\` when there is a string gap.
 Raises an error if the whitespace does not contain exactly one newline character.
-Processes `\r\n` as a newline.
 -/
-partial def stringGapFn (seenNewline afterCR : Bool) : ParserFn := fun c s =>
+partial def stringGapFn (seenNewline : Bool) : ParserFn := fun c s =>
   let i := s.pos
   if h : c.input.atEnd i then s -- let strLitFnAux handle the EOI error if !seenNewline
   else
@@ -633,13 +634,9 @@ partial def stringGapFn (seenNewline afterCR : Bool) : ParserFn := fun c s =>
         -- Having more than one newline in a string gap is visually confusing
         s.mkUnexpectedError "unexpected additional newline in string gap"
       else
-        stringGapFn true false c (s.next' c.input i h)
-    else if curr == '\r' then
-      stringGapFn seenNewline true c (s.next' c.input i h)
-    else if afterCR then
-      s.mkUnexpectedError "expecting newline after carriage return"
+        stringGapFn true c (s.next' c.input i h)
     else if curr.isWhitespace then
-      stringGapFn seenNewline false c (s.next' c.input i h)
+      stringGapFn seenNewline c (s.next' c.input i h)
     else if seenNewline then
       s
     else
@@ -663,8 +660,8 @@ def quotedCharCoreFn (isQuotable : Char → Bool) (inString : Bool) : ParserFn :
       andthenFn hexDigitFn hexDigitFn c (s.next' input i h)
     else if curr == 'u' then
       andthenFn hexDigitFn (andthenFn hexDigitFn (andthenFn hexDigitFn hexDigitFn)) c (s.next' input i h)
-    else if inString && (curr == '\n' || curr == '\r') then
-      stringGapFn false false c s
+    else if inString && curr == '\n' then
+      stringGapFn false c s
     else
       s.mkUnexpectedError "invalid escape sequence"
 

diff --git a/src/Lean/Parser/Extension.lean b/src/Lean/Parser/Extension.lean
@@ -438,11 +438,12 @@ def getSyntaxNodeKinds (env : Environment) : List SyntaxNodeKind :=
 def getTokenTable (env : Environment) : TokenTable :=
   (parserExtension.getState env).tokens
 
-def mkInputContext (input : String) (fileName : String) : InputContext := {
-  input    := input,
-  fileName := fileName,
-  fileMap  := input.toFileMap
-}
+-- Note: `crlfToLf` preserves logical line and column numbers for each character.
+def mkInputContext (input : String) (fileName : String) (normalizeLineEndings := true) : InputContext :=
+  let input' := if normalizeLineEndings then input.crlfToLf else input
+  { input    := input',
+    fileName := fileName,
+    fileMap  := input'.toFileMap }
 
 def mkParserState (input : String) : ParserState :=
   { cache := initCacheForInput input }

diff --git a/src/Lean/Server/FileWorker.lean b/src/Lean/Server/FileWorker.lean
@@ -676,12 +676,9 @@ def initAndRunWorker (i o e : FS.Stream) (opts : Options) : IO UInt32 := do
   let initParams ← i.readLspRequestAs "initialize" InitializeParams
   let ⟨_, param⟩ ← i.readLspNotificationAs "textDocument/didOpen" LeanDidOpenTextDocumentParams
   let doc := param.textDocument
-  /- NOTE(WN): `toFileMap` marks line beginnings as immediately following
-    "\n", which should be enough to handle both LF and CRLF correctly.
-    This is because LSP always refers to characters by (line, column),
-    so if we get the line number correct it shouldn't matter that there
-    is a CR there. -/
-  let meta : DocumentMeta := ⟨doc.uri, doc.version, doc.text.toFileMap, param.dependencyBuildMode?.getD .always⟩
+  /- Note (kmill): LSP always refers to characters by (line, column),
+     so converting CRLF to LF preserves line and column numbers. -/
+  let meta : DocumentMeta := ⟨doc.uri, doc.version, doc.text.crlfToLf.toFileMap, param.dependencyBuildMode?.getD .always⟩
   let e := e.withPrefix s!"[{param.textDocument.uri}] "
   let _ ← IO.setStderr e
   let (ctx, st) ← try

diff --git a/src/Lean/Server/Utils.lean b/src/Lean/Server/Utils.lean
@@ -98,7 +98,7 @@ def replaceLspRange (text : FileMap) (r : Lsp.Range) (newText : String) : FileMa
   let «end» := text.lspPosToUtf8Pos r.«end»
   let pre := text.source.extract 0 start
   let post := text.source.extract «end» text.source.endPos
-  (pre ++ newText ++ post).toFileMap
+  (pre ++ newText.crlfToLf ++ post).toFileMap
 
 open IO
 
@@ -125,7 +125,7 @@ def applyDocumentChange (oldText : FileMap) : (change : Lsp.TextDocumentContentC
   | TextDocumentContentChangeEvent.rangeChange (range : Range) (newText : String) =>
     replaceLspRange oldText range newText
   | TextDocumentContentChangeEvent.fullChange (newText : String) =>
-    newText.toFileMap
+    newText.crlfToLf.toFileMap
 
 /-- Returns the document contents with all changes applied. -/
 def foldDocumentChanges (changes : Array Lsp.TextDocumentContentChangeEvent) (oldText : FileMap) : FileMap :=

diff --git a/src/Lean/Server/Watchdog.lean b/src/Lean/Server/Watchdog.lean
@@ -728,12 +728,9 @@ end RequestHandling
 section NotificationHandling
   def handleDidOpen (p : LeanDidOpenTextDocumentParams) : ServerM Unit :=
     let doc := p.textDocument
-    /- NOTE(WN): `toFileMap` marks line beginnings as immediately following
-       "\n", which should be enough to handle both LF and CRLF correctly.
-       This is because LSP always refers to characters by (line, column),
-       so if we get the line number correct it shouldn't matter that there
-       is a CR there. -/
-    startFileWorker ⟨doc.uri, doc.version, doc.text.toFileMap, p.dependencyBuildMode?.getD .always⟩
+    /- Note (kmill): LSP always refers to characters by (line, column),
+      so converting CRLF to LF preserves line and column numbers. -/
+    startFileWorker ⟨doc.uri, doc.version, doc.text.crlfToLf.toFileMap, p.dependencyBuildMode?.getD .always⟩
 
   def handleDidChange (p : DidChangeTextDocumentParams) : ServerM Unit := do
     let doc := p.textDocument

diff --git a/src/lake/Lake/Build/Trace.lean b/src/lake/Lake/Build/Trace.lean
@@ -3,7 +3,6 @@ Copyright (c) 2021 Mac Malone. All rights reserved.
 Released under Apache 2.0 license as described in the file LICENSE.
 Authors: Mac Malone
 -/
-import Lake.Util.Newline
 
 open System
 namespace Lake
@@ -131,7 +130,7 @@ instance : ComputeHash FilePath IO := ⟨computeFileHash⟩
 
 def computeTextFileHash (file : FilePath) : IO Hash := do
   let text ← IO.FS.readFile file
-  let text := crlf2lf text
+  let text := text.crlfToLf
   return Hash.ofString text
 
 /--

diff --git a/src/lake/Lake/Util/Newline.lean b/src/lake/Lake/Util/Newline.lean
diff --git a/src/lake/tests/toml/Test.lean b/src/lake/tests/toml/Test.lean
@@ -56,7 +56,7 @@ nonrec def loadToml (tomlFile : FilePath) : BaseIO TomlOutcome := do
     match (← IO.FS.readBinFile tomlFile |>.toBaseIO) with
     | .ok bytes =>
       if let some input := String.fromUTF8? bytes then
-        pure (crlf2lf input)
+        pure input.crlfToLf
       else
         return .fail <| MessageLog.empty.add
           {fileName, pos := ⟨1,0⟩, data := m!"file contains invalid characters"}

diff --git a/tests/lean/string_gaps.lean → tests/lean/run/string_gaps.lean b/tests/lean/string_gaps.lean → tests/lean/run/string_gaps.lean
@@ -10,24 +10,32 @@ String gaps are described in RFC #2838
 /-!
 A string gap with no trailing space.
 -/
+/-- info: "ab" -/
+#guard_msgs in
 #eval "a\
 b"
 
 /-!
 A string gap with trailing space before the `b`, which is consumed.
 -/
+/-- info: "ab" -/
+#guard_msgs in
 #eval "a\
        b"
 
 /-!
 A string gap with space before the gap, which is not consumed.
 -/
+/-- info: "a b" -/
+#guard_msgs in
 #eval "a \
        b"
 
 /-!
 Multiple string gaps in a row.
 -/
+/-- info: "a b" -/
+#guard_msgs in
 #eval "a \
          \
          \
@@ -36,39 +44,57 @@ Multiple string gaps in a row.
 /-!
 Two tests from the RFC.
 -/
+/-- info: "this is a string" -/
+#guard_msgs in
 #eval "this is \
        a string"
+/-- info: "this is a string" -/
+#guard_msgs in
 #eval "this is \
         a string"
 
 /-!
 Two examples of how spaces are accounted for in string gaps. `\x20` is a way to force a leading space.
 -/
+/-- info: "there are three spaces between the brackets <   >" -/
+#guard_msgs in
 #eval "there are three spaces between the brackets <   \
                          >"
+/-- info: "there are three spaces between the brackets <   >" -/
+#guard_msgs in
 #eval "there are three spaces between the brackets <\
          \x20  >"
 
 /-!
 Using `\n` to terminate a string gap, which is a technique suggested by Mario for using string gaps to write
 multiline literals in an indented context.
 -/
+/-- info: "this is\n  a string with two space indent" -/
+#guard_msgs in
 #eval "this is\
        \n  a string with two space indent"
 
 /-!
 Similar tests but for interpolated strings.
 -/
+/-- info: "ab" -/
+#guard_msgs in
 #eval s!"a\
 b"
+/-- info: "ab" -/
+#guard_msgs in
 #eval s!"a\
          b"
+/-- info: "ab" -/
+#guard_msgs in
 #eval s!"a\
          b"
 
 /-!
 The `{` terminates the string gap.
 -/
+/-- info: "ab" -/
+#guard_msgs in
 #eval s!"a\
          {"b"}\
         "
@@ -82,22 +108,18 @@ open Lean
 /-!
 Standard string gap, with LF
 -/
+/-- info: "ab" -/
+#guard_msgs in
 #eval show MetaM String from do
   let stx ← ofExcept <| Parser.runParserCategory (← getEnv) `term "\"a\\\n b\""
   let some s := stx.isStrLit? | failure
   return s
 
-/-!
-Standard string gap, with CRLF
--/
-#eval show MetaM String from do
-  let stx ← ofExcept <| Parser.runParserCategory (← getEnv) `term "\"a\\\r\n b\""
-  let some s := stx.isStrLit? | failure
-  return s
-
 /-!
 Isolated CR, which is an error
 -/
+/-- error: <input>:1:3: invalid escape sequence -/
+#guard_msgs (error, drop info) in
 #eval show MetaM String from do
   let stx ← ofExcept <| Parser.runParserCategory (← getEnv) `term "\"a\\\r b\""
   let some s := stx.isStrLit? | failure
@@ -106,6 +128,8 @@ Isolated CR, which is an error
 /-!
 Not a string gap since there's no end-of-line.
 -/
+/-- error: <input>:1:3: invalid escape sequence -/
+#guard_msgs (error, drop info) in
 #eval show MetaM String from do
   let stx ← ofExcept <| Parser.runParserCategory (← getEnv) `term "\"a\\ b\""
   let some s := stx.isStrLit? | failure
@@ -134,6 +158,8 @@ elab "d!" s:str : term => do
   let some s := String.dedent s | Lean.Elab.throwIllFormedSyntax
   pure $ Lean.mkStrLit s
 
+/-- info: "this is line 1\n  line 2, indented\nline 3" -/
+#guard_msgs in
 #eval d!"this is \
             line 1
         |  line 2, indented

diff --git a/tests/lean/string_gaps.lean.expected.out b/tests/lean/string_gaps.lean.expected.out