leanprover · kmill · May 20, 2024 · Apr 13, 2024 · May 7, 2024 · May 7, 2024
@@ -198,4 +198,35 @@ def removeLeadingSpaces (s : String) : String :=
   let n := findLeadingSpacesSize s
   if n == 0 then s else removeNumLeadingSpaces n s
 
+/--
+Replaces each `\r\n` with `\n` to normalize line endings,
+but does not validate that there are no isolated `\r` characters.
+It is an optimized version of `String.replace text "\r\n" "\n"`.
+-/
+def crlfToLf (text : String) : String :=
+  go "" 0 0
+where
+  go (acc : String) (accStop pos : String.Pos) : String :=
+    if h : text.atEnd pos then
+      -- note: if accStop = 0 then acc is empty
+      if accStop = 0 then text else acc ++ text.extract accStop pos
+    else
+      let c := text.get' pos h
+      let pos' := text.next' pos h
+      if h' : ¬ text.atEnd pos' ∧ c == '\r' ∧ text.get pos' == '\n' then
+        let acc := acc ++ text.extract accStop pos
+        go acc pos' (text.next' pos' h'.1)
+      else
+        go acc accStop pos'
+  termination_by text.utf8ByteSize - pos.byteIdx
+  decreasing_by
+    decreasing_with
+      show text.utf8ByteSize - (text.next' (text.next' pos _) _).byteIdx < text.utf8ByteSize - pos.byteIdx
+      have k := Nat.gt_of_not_le <| mt decide_eq_true h
+      exact Nat.sub_lt_sub_left k (Nat.lt_trans (String.lt_next text pos) (String.lt_next _ _))
+    decreasing_with
+      show text.utf8ByteSize - (text.next' pos _).byteIdx < text.utf8ByteSize - pos.byteIdx
+      have k := Nat.gt_of_not_le <| mt decide_eq_true h
+      exact Nat.sub_lt_sub_left k (String.lt_next _ _)
+
 end String
diff --git a/src/Lean/Data/Position.lean b/src/Lean/Data/Position.lean
@@ -106,7 +106,7 @@ def ofPosition (text : FileMap) (pos : Position) : String.Pos :=
 
 /--
 Returns the position of the start of (1-based) line `line`.
-This gives the stame result as `map.ofPosition ⟨line, 0⟩`, but is more efficient.
+This gives the same result as `map.ofPosition ⟨line, 0⟩`, but is more efficient.
 -/
 def lineStart (map : FileMap) (line : Nat) : String.Pos :=
   if h : line - 1 < map.positions.size then

@@ -558,6 +558,8 @@ partial def whitespace : ParserFn := fun c s =>
     let curr := input.get' i h
     if curr == '\t' then
       s.mkUnexpectedError (pushMissing := false) "tabs are not allowed; please configure your editor to expand them"
+    else if curr == '\r' then
+      s.mkUnexpectedError (pushMissing := false) "isolated carriage returns are not allowed"
     else if curr.isWhitespace then whitespace c (s.next' input i h)
     else if curr == '-' then
       let i    := input.next' i h
@@ -621,9 +623,8 @@ def hexDigitFn : ParserFn := fun c s =>
 /--
 Parses the whitespace after the `\` when there is a string gap.
 Raises an error if the whitespace does not contain exactly one newline character.
-Processes `\r\n` as a newline.
 -/
-partial def stringGapFn (seenNewline afterCR : Bool) : ParserFn := fun c s =>
+partial def stringGapFn (seenNewline : Bool) : ParserFn := fun c s =>
   let i := s.pos
   if h : c.input.atEnd i then s -- let strLitFnAux handle the EOI error if !seenNewline
   else
@@ -633,13 +634,9 @@ partial def stringGapFn (seenNewline afterCR : Bool) : ParserFn := fun c s =>
         -- Having more than one newline in a string gap is visually confusing
         s.mkUnexpectedError "unexpected additional newline in string gap"
       else
-        stringGapFn true false c (s.next' c.input i h)
-    else if curr == '\r' then
-      stringGapFn seenNewline true c (s.next' c.input i h)
-    else if afterCR then
-      s.mkUnexpectedError "expecting newline after carriage return"
+        stringGapFn true c (s.next' c.input i h)
     else if curr.isWhitespace then
-      stringGapFn seenNewline false c (s.next' c.input i h)
+      stringGapFn seenNewline c (s.next' c.input i h)
     else if seenNewline then
       s
     else
@@ -663,8 +660,8 @@ def quotedCharCoreFn (isQuotable : Char → Bool) (inString : Bool) : ParserFn :
       andthenFn hexDigitFn hexDigitFn c (s.next' input i h)
     else if curr == 'u' then
       andthenFn hexDigitFn (andthenFn hexDigitFn (andthenFn hexDigitFn hexDigitFn)) c (s.next' input i h)
-    else if inString && (curr == '\n' || curr == '\r') then
-      stringGapFn false false c s
+    else if inString && curr == '\n' then
+      stringGapFn false c s
     else
       s.mkUnexpectedError "invalid escape sequence"
 

@@ -437,11 +437,12 @@ def getSyntaxNodeKinds (env : Environment) : List SyntaxNodeKind :=
 def getTokenTable (env : Environment) : TokenTable :=
   (parserExtension.getState env).tokens
 
-def mkInputContext (input : String) (fileName : String) : InputContext := {
-  input    := input,
-  fileName := fileName,
-  fileMap  := input.toFileMap
-}
+-- Note: `crlfToLf` preserves logical line and column numbers for each character.
+def mkInputContext (input : String) (fileName : String) (normalizeLineEndings := true) : InputContext :=
+  let input' := if normalizeLineEndings then input.crlfToLf else input
+  { input    := input',
+    fileName := fileName,
+    fileMap  := input'.toFileMap }
 
 def mkParserState (input : String) : ParserState :=
   { cache := initCacheForInput input }
@@ -453,7 +454,7 @@ def runParserCategory (env : Environment) (catName : Name) (input : String) (fil
   let s := p.run ictx { env, options := {} } (getTokenTable env) (mkParserState input)
   if !s.allErrors.isEmpty  then
     Except.error (s.toErrorMsg ictx)
-  else if input.atEnd s.pos then
+  else if ictx.input.atEnd s.pos then
     Except.ok s.stxStack.back
   else
     Except.error ((s.mkError "end of input").toErrorMsg ictx)

@@ -705,12 +705,9 @@ def initAndRunWorker (i o e : FS.Stream) (opts : Options) : IO UInt32 := do
   let initParams ← i.readLspRequestAs "initialize" InitializeParams
   let ⟨_, param⟩ ← i.readLspNotificationAs "textDocument/didOpen" LeanDidOpenTextDocumentParams
   let doc := param.textDocument
-  /- NOTE(WN): `toFileMap` marks line beginnings as immediately following
-    "\n", which should be enough to handle both LF and CRLF correctly.
-    This is because LSP always refers to characters by (line, column),
-    so if we get the line number correct it shouldn't matter that there
-    is a CR there. -/
-  let meta : DocumentMeta := ⟨doc.uri, doc.version, doc.text.toFileMap, param.dependencyBuildMode?.getD .always⟩
+  /- Note (kmill): LSP always refers to characters by (line, column),
+     so converting CRLF to LF preserves line and column numbers. -/
+  let meta : DocumentMeta := ⟨doc.uri, doc.version, doc.text.crlfToLf.toFileMap, param.dependencyBuildMode?.getD .always⟩
   let e := e.withPrefix s!"[{param.textDocument.uri}] "
   let _ ← IO.setStderr e
   let (ctx, st) ← try

@@ -75,7 +75,10 @@ structure DocumentMeta where
   uri                 : Lsp.DocumentUri
   /-- Version number of the document. Incremented whenever the document is edited. -/
   version             : Nat
-  /-- Current text of the document. -/
+  /--
+  Current text of the document.
+  It is maintained such that it is normalized using `String.crlfToLf`, which preserves logical line/column numbers.
+  (Note: we assume that edit operations never split or merge `\r\n` line endings.) -/
   text                : FileMap
   /--
   Controls when dependencies of the document are built on `textDocument/didOpen` notifications.
@@ -98,7 +101,11 @@ def replaceLspRange (text : FileMap) (r : Lsp.Range) (newText : String) : FileMa
   let «end» := text.lspPosToUtf8Pos r.«end»
   let pre := text.source.extract 0 start
   let post := text.source.extract «end» text.source.endPos
-  (pre ++ newText ++ post).toFileMap
+  -- `pre` and `post` already have normalized line endings, so only `newText` needs its endings normalized.
+  -- Note: this assumes that editing never separates a `\r\n`.
+  -- If `pre` ends with `\r` and `newText` begins with `\n`, the result is potentially inaccurate.
+  -- If this is ever a problem, we could store a second unnormalized FileMap, edit it, and normalize it here.
+  (pre ++ newText.crlfToLf ++ post).toFileMap
 
 open IO
 
@@ -125,7 +132,7 @@ def applyDocumentChange (oldText : FileMap) : (change : Lsp.TextDocumentContentC
   | TextDocumentContentChangeEvent.rangeChange (range : Range) (newText : String) =>
     replaceLspRange oldText range newText
   | TextDocumentContentChangeEvent.fullChange (newText : String) =>
-    newText.toFileMap
+    newText.crlfToLf.toFileMap
 
 /-- Returns the document contents with all changes applied. -/
 def foldDocumentChanges (changes : Array Lsp.TextDocumentContentChangeEvent) (oldText : FileMap) : FileMap :=

@@ -728,12 +728,9 @@ end RequestHandling
 section NotificationHandling
   def handleDidOpen (p : LeanDidOpenTextDocumentParams) : ServerM Unit :=
     let doc := p.textDocument
-    /- NOTE(WN): `toFileMap` marks line beginnings as immediately following
-       "\n", which should be enough to handle both LF and CRLF correctly.
-       This is because LSP always refers to characters by (line, column),
-       so if we get the line number correct it shouldn't matter that there
-       is a CR there. -/
-    startFileWorker ⟨doc.uri, doc.version, doc.text.toFileMap, p.dependencyBuildMode?.getD .always⟩
+    /- Note (kmill): LSP always refers to characters by (line, column),
+      so converting CRLF to LF preserves line and column numbers. -/
+    startFileWorker ⟨doc.uri, doc.version, doc.text.crlfToLf.toFileMap, p.dependencyBuildMode?.getD .always⟩
 
   def handleDidChange (p : DidChangeTextDocumentParams) : ServerM Unit := do
     let doc := p.textDocument

@@ -3,7 +3,6 @@ Copyright (c) 2021 Mac Malone. All rights reserved.
 Released under Apache 2.0 license as described in the file LICENSE.
 Authors: Mac Malone
 -/
-import Lake.Util.Newline
 
 open System
 namespace Lake
@@ -135,7 +134,7 @@ instance : ComputeHash FilePath IO := ⟨computeFileHash⟩
 
 def computeTextFileHash (file : FilePath) : IO Hash := do
   let text ← IO.FS.readFile file
-  let text := crlf2lf text
+  let text := text.crlfToLf
   return Hash.ofString text
 
 /--

@@ -5,7 +5,6 @@ Authors: Mac Malone
 -/
 import Lake.Toml
 import Lake.Util.Message
-import Lake.Util.Newline
 
 /-!
 ## TOML Test Runner
@@ -30,7 +29,7 @@ nonrec def loadToml (tomlFile : FilePath) : BaseIO TomlOutcome := do
     match (← IO.FS.readBinFile tomlFile |>.toBaseIO) with
     | .ok bytes =>
       if let some input := String.fromUTF8? bytes then
-        pure (crlf2lf input)
+        pure input.crlfToLf
       else
         return .fail <| MessageLog.empty.add
           {fileName, pos := ⟨1,0⟩, data := m!"file contains invalid characters"}

diff --git a/tests/lean/run/crlfToLf.lean b/tests/lean/run/crlfToLf.lean
@@ -0,0 +1,39 @@
+/-!
+# Test `String.crlfToLf`
+-/
+
+/-!
+Leaves single `\n`'s alone.
+-/
+/-- info: "hello\nworld" -/
+#guard_msgs in #eval String.crlfToLf "hello\nworld"
+
+/-!
+Turns `\r\n` into `\n`.
+-/
+/-- info: "hello\nworld" -/
+#guard_msgs in #eval String.crlfToLf "hello\r\nworld"
+
+/-!
+In a string of `\r...\r\n`, only normalizes the last `\r\n`.
+-/
+/-- info: "hello\x0d\nworld" -/
+#guard_msgs in #eval String.crlfToLf "hello\r\r\nworld"
+
+/-!
+Two in a row.
+-/
+/-- info: "hello\n\nworld" -/
+#guard_msgs in #eval String.crlfToLf "hello\r\n\r\nworld"
+
+/-!
+Normalizes at the end.
+-/
+/-- info: "hello\nworld\n" -/
+#guard_msgs in #eval String.crlfToLf "hello\r\nworld\r\n"
+
+/-!
+Can handle a loose `\r` as the last character.
+-/
+/-- info: "hello\nworld\x0d" -/
+#guard_msgs in #eval String.crlfToLf "hello\r\nworld\r"