Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: make frontend normalize line endings to LF #3903

Merged
merged 8 commits into from
May 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 31 additions & 0 deletions src/Init/Data/String/Extra.lean
Original file line number Diff line number Diff line change
Expand Up @@ -198,4 +198,35 @@ def removeLeadingSpaces (s : String) : String :=
let n := findLeadingSpacesSize s
if n == 0 then s else removeNumLeadingSpaces n s

/--
kmill marked this conversation as resolved.
Show resolved Hide resolved
Replaces each `\r\n` with `\n` to normalize line endings,
but does not validate that there are no isolated `\r` characters.
It is an optimized version of `String.replace text "\r\n" "\n"`.
-/
def crlfToLf (text : String) : String :=
kmill marked this conversation as resolved.
Show resolved Hide resolved
go "" 0 0
where
go (acc : String) (accStop pos : String.Pos) : String :=
if h : text.atEnd pos then
-- note: if accStop = 0 then acc is empty
if accStop = 0 then text else acc ++ text.extract accStop pos
else
let c := text.get' pos h
let pos' := text.next' pos h
if h' : ¬ text.atEnd pos' ∧ c == '\r' ∧ text.get pos' == '\n' then
let acc := acc ++ text.extract accStop pos
go acc pos' (text.next' pos' h'.1)
else
go acc accStop pos'
termination_by text.utf8ByteSize - pos.byteIdx
decreasing_by
decreasing_with
show text.utf8ByteSize - (text.next' (text.next' pos _) _).byteIdx < text.utf8ByteSize - pos.byteIdx
have k := Nat.gt_of_not_le <| mt decide_eq_true h
exact Nat.sub_lt_sub_left k (Nat.lt_trans (String.lt_next text pos) (String.lt_next _ _))
decreasing_with
show text.utf8ByteSize - (text.next' pos _).byteIdx < text.utf8ByteSize - pos.byteIdx
have k := Nat.gt_of_not_le <| mt decide_eq_true h
exact Nat.sub_lt_sub_left k (String.lt_next _ _)

end String
2 changes: 1 addition & 1 deletion src/Lean/Data/Position.lean
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ def ofPosition (text : FileMap) (pos : Position) : String.Pos :=

/--
Returns the position of the start of (1-based) line `line`.
This gives the stame result as `map.ofPosition ⟨line, 0⟩`, but is more efficient.
This gives the same result as `map.ofPosition ⟨line, 0⟩`, but is more efficient.
-/
def lineStart (map : FileMap) (line : Nat) : String.Pos :=
if h : line - 1 < map.positions.size then
Expand Down
17 changes: 7 additions & 10 deletions src/Lean/Parser/Basic.lean
Original file line number Diff line number Diff line change
Expand Up @@ -558,6 +558,8 @@ partial def whitespace : ParserFn := fun c s =>
let curr := input.get' i h
if curr == '\t' then
s.mkUnexpectedError (pushMissing := false) "tabs are not allowed; please configure your editor to expand them"
else if curr == '\r' then
s.mkUnexpectedError (pushMissing := false) "isolated carriage returns are not allowed"
else if curr.isWhitespace then whitespace c (s.next' input i h)
else if curr == '-' then
let i := input.next' i h
Expand Down Expand Up @@ -621,9 +623,8 @@ def hexDigitFn : ParserFn := fun c s =>
/--
Parses the whitespace after the `\` when there is a string gap.
Raises an error if the whitespace does not contain exactly one newline character.
Processes `\r\n` as a newline.
-/
partial def stringGapFn (seenNewline afterCR : Bool) : ParserFn := fun c s =>
partial def stringGapFn (seenNewline : Bool) : ParserFn := fun c s =>
let i := s.pos
if h : c.input.atEnd i then s -- let strLitFnAux handle the EOI error if !seenNewline
else
Expand All @@ -633,13 +634,9 @@ partial def stringGapFn (seenNewline afterCR : Bool) : ParserFn := fun c s =>
-- Having more than one newline in a string gap is visually confusing
s.mkUnexpectedError "unexpected additional newline in string gap"
else
stringGapFn true false c (s.next' c.input i h)
else if curr == '\r' then
stringGapFn seenNewline true c (s.next' c.input i h)
else if afterCR then
s.mkUnexpectedError "expecting newline after carriage return"
stringGapFn true c (s.next' c.input i h)
else if curr.isWhitespace then
stringGapFn seenNewline false c (s.next' c.input i h)
stringGapFn seenNewline c (s.next' c.input i h)
else if seenNewline then
s
else
Expand All @@ -663,8 +660,8 @@ def quotedCharCoreFn (isQuotable : Char → Bool) (inString : Bool) : ParserFn :
andthenFn hexDigitFn hexDigitFn c (s.next' input i h)
else if curr == 'u' then
andthenFn hexDigitFn (andthenFn hexDigitFn (andthenFn hexDigitFn hexDigitFn)) c (s.next' input i h)
else if inString && (curr == '\n' || curr == '\r') then
stringGapFn false false c s
else if inString && curr == '\n' then
stringGapFn false c s
else
s.mkUnexpectedError "invalid escape sequence"

Expand Down
13 changes: 7 additions & 6 deletions src/Lean/Parser/Extension.lean
Original file line number Diff line number Diff line change
Expand Up @@ -437,11 +437,12 @@ def getSyntaxNodeKinds (env : Environment) : List SyntaxNodeKind :=
def getTokenTable (env : Environment) : TokenTable :=
(parserExtension.getState env).tokens

def mkInputContext (input : String) (fileName : String) : InputContext := {
input := input,
fileName := fileName,
fileMap := input.toFileMap
}
-- Note: `crlfToLf` preserves logical line and column numbers for each character.
def mkInputContext (input : String) (fileName : String) (normalizeLineEndings := true) : InputContext :=
let input' := if normalizeLineEndings then input.crlfToLf else input
{ input := input',
fileName := fileName,
fileMap := input'.toFileMap }

def mkParserState (input : String) : ParserState :=
{ cache := initCacheForInput input }
Expand All @@ -453,7 +454,7 @@ def runParserCategory (env : Environment) (catName : Name) (input : String) (fil
let s := p.run ictx { env, options := {} } (getTokenTable env) (mkParserState input)
if !s.allErrors.isEmpty then
Except.error (s.toErrorMsg ictx)
else if input.atEnd s.pos then
else if ictx.input.atEnd s.pos then
Except.ok s.stxStack.back
else
Except.error ((s.mkError "end of input").toErrorMsg ictx)
Expand Down
9 changes: 3 additions & 6 deletions src/Lean/Server/FileWorker.lean
Original file line number Diff line number Diff line change
Expand Up @@ -705,12 +705,9 @@ def initAndRunWorker (i o e : FS.Stream) (opts : Options) : IO UInt32 := do
let initParams ← i.readLspRequestAs "initialize" InitializeParams
let ⟨_, param⟩ ← i.readLspNotificationAs "textDocument/didOpen" LeanDidOpenTextDocumentParams
let doc := param.textDocument
/- NOTE(WN): `toFileMap` marks line beginnings as immediately following
"\n", which should be enough to handle both LF and CRLF correctly.
This is because LSP always refers to characters by (line, column),
so if we get the line number correct it shouldn't matter that there
is a CR there. -/
let meta : DocumentMeta := ⟨doc.uri, doc.version, doc.text.toFileMap, param.dependencyBuildMode?.getD .always⟩
/- Note (kmill): LSP always refers to characters by (line, column),
so converting CRLF to LF preserves line and column numbers. -/
let meta : DocumentMeta := ⟨doc.uri, doc.version, doc.text.crlfToLf.toFileMap, param.dependencyBuildMode?.getD .always⟩
let e := e.withPrefix s!"[{param.textDocument.uri}] "
let _ ← IO.setStderr e
let (ctx, st) ← try
Expand Down
13 changes: 10 additions & 3 deletions src/Lean/Server/Utils.lean
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,10 @@ structure DocumentMeta where
uri : Lsp.DocumentUri
/-- Version number of the document. Incremented whenever the document is edited. -/
version : Nat
/-- Current text of the document. -/
/--
Current text of the document.
It is maintained such that it is normalized using `String.crlfToLf`, which preserves logical line/column numbers.
(Note: we assume that edit operations never split or merge `\r\n` line endings.) -/
text : FileMap
/--
Controls when dependencies of the document are built on `textDocument/didOpen` notifications.
Expand All @@ -98,7 +101,11 @@ def replaceLspRange (text : FileMap) (r : Lsp.Range) (newText : String) : FileMa
let «end» := text.lspPosToUtf8Pos r.«end»
let pre := text.source.extract 0 start
let post := text.source.extract «end» text.source.endPos
(pre ++ newText ++ post).toFileMap
-- `pre` and `post` already have normalized line endings, so only `newText` needs its endings normalized.
-- Note: this assumes that editing never separates a `\r\n`.
-- If `pre` ends with `\r` and `newText` begins with `\n`, the result is potentially inaccurate.
-- If this is ever a problem, we could store a second unnormalized FileMap, edit it, and normalize it here.
(pre ++ newText.crlfToLf ++ post).toFileMap

open IO

Expand All @@ -125,7 +132,7 @@ def applyDocumentChange (oldText : FileMap) : (change : Lsp.TextDocumentContentC
| TextDocumentContentChangeEvent.rangeChange (range : Range) (newText : String) =>
replaceLspRange oldText range newText
| TextDocumentContentChangeEvent.fullChange (newText : String) =>
newText.toFileMap
newText.crlfToLf.toFileMap
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would it make sense to move crlfToLf to toFileMap?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The replaceLspRange case made me decide not to incorporate crlfToLf into toFileMap. We only want to normalize the line endings of the new text, and it's worth making it explicit where we're normalizing.

There's still something unsatisfactory however, which is that if somehow a \r\n is split and re-merged by edit operations, then we can get into a situation where there's an unnormalized \r\n. I'm not sure if this is an issue that could crop up in practice.

If we're not happy with leaving it in the current form, here are some solutions that came to mind:

  1. Have a second FileMap with the unnormalized text. Make sure LSP operations edit that, and then re-normalize it to create the actual FileMap used by elaboration each time.
  2. Do that, but accelerate it somehow. There's probably some fancy data structure supporting inserting-and-normalizing.
  3. Make the LSP throw an error when it detects a loose \r. It's not clear how to get this information back to the user in a nice way.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The current state seems fine. I'm more worried about accidentally forgetting crlfToLf somewhere in the future, but I agree that this is not that important and explicitly seeing the normalization in the code is helpful, too.


/-- Returns the document contents with all changes applied. -/
def foldDocumentChanges (changes : Array Lsp.TextDocumentContentChangeEvent) (oldText : FileMap) : FileMap :=
Expand Down
9 changes: 3 additions & 6 deletions src/Lean/Server/Watchdog.lean
Original file line number Diff line number Diff line change
Expand Up @@ -728,12 +728,9 @@ end RequestHandling
section NotificationHandling
def handleDidOpen (p : LeanDidOpenTextDocumentParams) : ServerM Unit :=
let doc := p.textDocument
/- NOTE(WN): `toFileMap` marks line beginnings as immediately following
"\n", which should be enough to handle both LF and CRLF correctly.
This is because LSP always refers to characters by (line, column),
so if we get the line number correct it shouldn't matter that there
is a CR there. -/
startFileWorker ⟨doc.uri, doc.version, doc.text.toFileMap, p.dependencyBuildMode?.getD .always⟩
/- Note (kmill): LSP always refers to characters by (line, column),
so converting CRLF to LF preserves line and column numbers. -/
startFileWorker ⟨doc.uri, doc.version, doc.text.crlfToLf.toFileMap, p.dependencyBuildMode?.getD .always⟩

def handleDidChange (p : DidChangeTextDocumentParams) : ServerM Unit := do
let doc := p.textDocument
Expand Down
3 changes: 1 addition & 2 deletions src/lake/Lake/Build/Trace.lean
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@ Copyright (c) 2021 Mac Malone. All rights reserved.
Released under Apache 2.0 license as described in the file LICENSE.
Authors: Mac Malone
-/
import Lake.Util.Newline

open System
namespace Lake
Expand Down Expand Up @@ -135,7 +134,7 @@ instance : ComputeHash FilePath IO := ⟨computeFileHash⟩

def computeTextFileHash (file : FilePath) : IO Hash := do
let text ← IO.FS.readFile file
let text := crlf2lf text
let text := text.crlfToLf
return Hash.ofString text

/--
Expand Down
24 changes: 0 additions & 24 deletions src/lake/Lake/Util/Newline.lean

This file was deleted.

3 changes: 1 addition & 2 deletions src/lake/tests/toml/Test.lean
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ Authors: Mac Malone
-/
import Lake.Toml
import Lake.Util.Message
import Lake.Util.Newline

/-!
## TOML Test Runner
Expand All @@ -30,7 +29,7 @@ nonrec def loadToml (tomlFile : FilePath) : BaseIO TomlOutcome := do
match (← IO.FS.readBinFile tomlFile |>.toBaseIO) with
| .ok bytes =>
if let some input := String.fromUTF8? bytes then
pure (crlf2lf input)
pure input.crlfToLf
else
return .fail <| MessageLog.empty.add
{fileName, pos := ⟨1,0⟩, data := m!"file contains invalid characters"}
Expand Down
39 changes: 39 additions & 0 deletions tests/lean/run/crlfToLf.lean
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
/-!
# Test `String.crlfToLf`
-/

/-!
Leaves single `\n`'s alone.
-/
/-- info: "hello\nworld" -/
#guard_msgs in #eval String.crlfToLf "hello\nworld"

/-!
Turns `\r\n` into `\n`.
-/
/-- info: "hello\nworld" -/
#guard_msgs in #eval String.crlfToLf "hello\r\nworld"

/-!
In a string of `\r...\r\n`, only normalizes the last `\r\n`.
-/
/-- info: "hello\x0d\nworld" -/
#guard_msgs in #eval String.crlfToLf "hello\r\r\nworld"

/-!
Two in a row.
-/
/-- info: "hello\n\nworld" -/
#guard_msgs in #eval String.crlfToLf "hello\r\n\r\nworld"

/-!
Normalizes at the end.
-/
/-- info: "hello\nworld\n" -/
#guard_msgs in #eval String.crlfToLf "hello\r\nworld\r\n"

/-!
Can handle a loose `\r` as the last character.
-/
/-- info: "hello\nworld\x0d" -/
#guard_msgs in #eval String.crlfToLf "hello\r\nworld\r"
Loading
Loading