From 35944c367b9468fc49ac57aa321e9dd69fa9b586 Mon Sep 17 00:00:00 2001 From: Marc Huisinga Date: Fri, 6 Mar 2026 13:46:44 +0100 Subject: [PATCH] feat: leading whitespace on first token (#12662) This PR adjusts the module parser to set the leading whitespace of the first token to the whitespace up to that token. If there are no actual tokens in the file, the leading whitespace is set on the final (empty) EOI token. This ensures that we do not lose the initial whitespace (e.g. comments) of a file in `Syntax`. (Tests generated/adjusted by Claude) Co-authored-by: Claude Sonnet 4.6 --- src/Lean/Elab/DocString/Builtin.lean | 6 +- src/Lean/Parser/Module.lean | 36 +++++-- tests/elab/Reformat.lean | 15 ++- tests/elab/Reformat.lean.out.expected | 2 + tests/elab/Reparen.lean | 7 +- .../run/moduleHeaderLeadingWhitespace.lean | 95 +++++++++++++++++++ 6 files changed, 144 insertions(+), 17 deletions(-) create mode 100644 tests/lean/run/moduleHeaderLeadingWhitespace.lean diff --git a/src/Lean/Elab/DocString/Builtin.lean b/src/Lean/Elab/DocString/Builtin.lean index 093dbaa447..42a977ff63 100644 --- a/src/Lean/Elab/DocString/Builtin.lean +++ b/src/Lean/Elab/DocString/Builtin.lean @@ -917,7 +917,11 @@ def lean (name : Option Ident := none) (error warning : flag false) («show» : let scopes := (← get).scopes let (cmds, cmdState, trees) ← withSaveInfoContext do let mut cmdState : Command.State := { env, maxRecDepth := ← MonadRecDepth.getMaxRecDepth, scopes } - let mut pstate : Parser.ModuleParserState := {pos := pos, recovering := false} + let mut pstate : Parser.ModuleParserState := { + pos + recovering := false + hasLeading := false + } let mut cmds := #[] repeat let scope := cmdState.scopes.head! diff --git a/src/Lean/Parser/Module.lean b/src/Lean/Parser/Module.lean index cd82f7496c..61d8f4e236 100644 --- a/src/Lean/Parser/Module.lean +++ b/src/Lean/Parser/Module.lean @@ -26,7 +26,12 @@ end Module structure ModuleParserState where pos : String.Pos.Raw := 0 - recovering : Bool := false + recovering : Bool := false + /-- + Whether there is leading whitespace before `pos`. + Used to associate whitespace at the start of the file with the first token of the file. + -/ + hasLeading : Bool := true deriving Inhabited private partial def mkErrorMessage (c : InputContext) (pos : String.Pos.Raw) (stk : SyntaxStack) (e : Parser.Error) : Message := Id.run do @@ -60,6 +65,12 @@ where if let .original (trailing := trailing) .. := stx.getTailInfo then pure (some trailing) else none +private def setStartOfFileLeading (stx : Syntax) : Syntax × Bool := Id.run do + let some (.original leading pos trailing endPos) ← stx.getHeadInfo? + | return (stx, false) + let info := .original { leading with startPos := 0 } pos trailing endPos + return (stx.setHeadInfo info, true) + def parseHeader (inputCtx : InputContext) : IO (TSyntax ``Module.header × ModuleParserState × MessageLog) := do let dummyEnv ← mkEmptyEnvironment let p := andthenFn whitespace Module.header.fn @@ -96,10 +107,12 @@ def parseHeader (inputCtx : InputContext) : IO (TSyntax ``Module.header × Modul consider using separate `public import {mod}` and `import all {mod}` directives \ in order to import public data into the public scope and private data into the \ private scope." - pure (⟨stx⟩, {pos := s.pos, recovering := s.hasError}, messages) + let (stx, isLeadingSet) := setStartOfFileLeading stx + pure (⟨stx⟩, { pos := s.pos, recovering := s.hasError, hasLeading := ! isLeadingSet }, messages) -private def mkEOI (pos : String.Pos.Raw) : Syntax := - let atom := mkAtom (SourceInfo.original "".toRawSubstring pos "".toRawSubstring pos) "" +private def mkEOI (inputCtx : InputContext) (pos : String.Pos.Raw) : Syntax := + let emptyAt := inputCtx.substring pos pos + let atom := mkAtom (SourceInfo.original emptyAt pos emptyAt pos) "" mkNode ``Command.eoi #[atom] def isTerminalCommand (s : Syntax) : Bool := @@ -122,7 +135,7 @@ partial def parseCommand (inputCtx : InputContext) (pmctx : ParserModuleContext) let mut stx := Syntax.missing -- will always be assigned below repeat if inputCtx.atEnd pos then - stx := mkEOI pos + stx := mkEOI inputCtx pos break let pos' := pos let p := andthenFn whitespace topLevelCommandParserFn @@ -155,7 +168,10 @@ partial def parseCommand (inputCtx : InputContext) (pmctx : ParserModuleContext) else stx := s.stxStack.back break - return (stx, { pos, recovering }, messages) + if mps.hasLeading then + -- File header was empty - set start-of-file leading whitespace on first token in `stx` + (stx, _) := setStartOfFileLeading stx + return (stx, { pos, recovering, hasLeading := false }, messages) -- only useful for testing since most Lean files cannot be parsed without elaboration @@ -165,7 +181,7 @@ partial def testParseModuleAux (env : Environment) (inputCtx : InputContext) (s | (stx, state, msgs) => if isTerminalCommand stx then if !msgs.hasUnreported then - pure stxs + pure (stxs.push stx) else do msgs.forM fun msg => msg.toString >>= IO.println throw (IO.userError "failed to parse file") @@ -173,14 +189,14 @@ partial def testParseModuleAux (env : Environment) (inputCtx : InputContext) (s parse state msgs (stxs.push stx) parse s msgs stxs -def testParseModule (env : Environment) (fname contents : String) : IO (TSyntax ``Parser.Module.module) := do +def testParseModule (env : Environment) (fname contents : String) : IO Syntax := do let inputCtx := mkInputContext contents fname let (header, state, messages) ← parseHeader inputCtx let cmds ← testParseModuleAux env inputCtx state messages #[] let stx := mkNode `Lean.Parser.Module.module #[header, mkListNode cmds] - pure ⟨stx.raw.updateLeading⟩ + pure stx -def testParseFile (env : Environment) (fname : System.FilePath) : IO (TSyntax ``Parser.Module.module) := do +def testParseFile (env : Environment) (fname : System.FilePath) : IO Syntax := do let contents ← IO.FS.readFile fname testParseModule env fname.toString contents diff --git a/tests/elab/Reformat.lean b/tests/elab/Reformat.lean index 544b85e500..9a16c73ce9 100644 --- a/tests/elab/Reformat.lean +++ b/tests/elab/Reformat.lean @@ -12,20 +12,27 @@ unsafe def main (args : List String) : IO Unit := do | [f] => (false, f) | _ => panic! "usage: file [-d]" let env ← mkEmptyEnvironment - let stx ← Lean.Parser.testParseFile env fn + let stx₀ ← Lean.Parser.testParseFile env fn + -- `testParseFile` matches the real pipeline (no `updateLeading`), but reformatting + -- needs leading info to correctly place inter-declaration comments, just like + -- `script/reformat.lean` which explicitly calls `updateLeading`. + let stx : Syntax := stx₀.updateLeading let act : CoreM Format := do withOptions (fun opts => opts -- Name sanitization clears inline comments attached to identifiers. |>.set `pp.sanitizeNames false |>.set `trace.PrettyPrinter.format debug) do - tryFinally (PrettyPrinter.ppModule stx) printTraces + tryFinally (PrettyPrinter.ppModule ⟨stx⟩) printTraces let (f, _) ← act.toIO { fileName := "", fileMap := default } { env := env } + -- `testParseModule` includes the EOI token in the command list (matching the real + -- pipeline), so `ppModule` emits two trailing newlines via the `ppLine >> ppLine` + -- separator for EOI's empty command text. IO.print f let stx' ← Lean.Parser.testParseModule env fn (toString f) if stx' != stx then - let stx := stx.raw.getArg 1 - let stx' := stx'.raw.getArg 1 + let stx := stx.getArg 1 + let stx' := stx'.getArg 1 stx.getArgs.size.forM fun i _ => do if stx.getArg i != stx'.getArg i then throw $ IO.userError s!"reparsing failed:\n{stx.getArg i}\n{stx'.getArg i}" diff --git a/tests/elab/Reformat.lean.out.expected b/tests/elab/Reformat.lean.out.expected index f1ef8630d2..2f5a1d8483 100644 --- a/tests/elab/Reformat.lean.out.expected +++ b/tests/elab/Reformat.lean.out.expected @@ -326,3 +326,5 @@ instance {α : Type u} [DecidableEq α] : BEq α := def dite {α : Sort u} (c : Prop) [h : Decidable c] (t : c → α) (e : Not c → α) : α := Decidable.casesOn (motive := fun _ => α) h e t + + diff --git a/tests/elab/Reparen.lean b/tests/elab/Reparen.lean index f9a95be3d6..765e0c91e5 100644 --- a/tests/elab/Reparen.lean +++ b/tests/elab/Reparen.lean @@ -33,10 +33,13 @@ let (debug, f) : Bool × String := match args with | _ => panic! "usage: file [-d]"; let env ← mkEmptyEnvironment; let stx ← Lean.Parser.testParseFile env args.head!; -let header := stx.raw.getArg 0; +-- `testParseFile` matches the real pipeline (no `updateLeading`), but reprinting +-- needs leading info to correctly place inter-declaration whitespace. +let stx := stx.updateLeading; +let header := stx.getArg 0; let some s ← pure header.reprint | throw $ IO.userError "header reprint failed"; IO.print s; -let cmds := (stx.raw.getArg 1).getArgs; +let cmds := (stx.getArg 1).getArgs; cmds.forM $ fun cmd => do let cmd := unparen cmd; let (cmd, _) ← (tryFinally (PrettyPrinter.parenthesizeCommand cmd) printTraces).toIO { options := Options.empty.set `trace.PrettyPrinter.parenthesize debug, fileName := "", fileMap := default } { env := env }; diff --git a/tests/lean/run/moduleHeaderLeadingWhitespace.lean b/tests/lean/run/moduleHeaderLeadingWhitespace.lean new file mode 100644 index 0000000000..ac11f46942 --- /dev/null +++ b/tests/lean/run/moduleHeaderLeadingWhitespace.lean @@ -0,0 +1,95 @@ +-- Tests that file-start whitespace and comments are associated with the leading +-- of the first actual token in the file (in the module header or first command). +import Lean + +open Lean Parser + +/-- Parse a header string and return the leading Substring.Raw of the first token. -/ +def getFirstTokenLeading (input : String) : IO (Option Substring.Raw) := do + let inputCtx := mkInputContext input "" + let (header, _, _) ← parseHeader inputCtx + return match header.raw.getHeadInfo? with + | some (.original leading ..) => some leading + | _ => none + +-- Test 1: file-start line comment before import +-- The leading of the `import` keyword should start at position 0 and contain the comment. +/-- +info: leading startPos: 0 +leading content: -- file-start comment + -/ +#guard_msgs in +#eval do + let some leading ← getFirstTokenLeading "-- file-start comment\nimport Foo\n" + | IO.println "no leading found" + IO.println s!"leading startPos: {leading.startPos.byteIdx}" + IO.println s!"leading content: {Substring.Raw.toString leading}" + +-- Test 2: file-start block comment before import +/-- +info: leading startPos: 0 +leading content: /- block comment -/ + -/ +#guard_msgs in +#eval do + let some leading ← getFirstTokenLeading "/- block comment -/\nimport Foo\n" + | IO.println "no leading found" + IO.println s!"leading startPos: {leading.startPos.byteIdx}" + IO.println s!"leading content: {Substring.Raw.toString leading}" + +-- Test 3: no file-start whitespace (import is the very first thing) +-- The leading should start at position 0 with stopPos 0 (empty leading). +/-- +info: leading startPos: 0 +leading stopPos: 0 -/ +#guard_msgs in +#eval do + let some leading ← getFirstTokenLeading "import Foo\n" + | IO.println "no leading found" + IO.println s!"leading startPos: {leading.startPos.byteIdx}" + IO.println s!"leading stopPos: {leading.stopPos.byteIdx}" + +-- Test 4: file with only newlines at the start (no comments) +/-- +info: leading startPos: 0 +leading content: + + -/ +#guard_msgs in +#eval do + let some leading ← getFirstTokenLeading "\n\nimport Foo\n" + | IO.println "no leading found" + IO.println s!"leading startPos: {leading.startPos.byteIdx}" + IO.println s!"leading content: {Substring.Raw.toString leading}" + +-- Test 5: entirely empty file (only a comment, no header or body) +-- The EOI token's leading should include the file-start comment. +/-- +info: first token leading startPos: 0 +first token leading content: -- just a comment + -/ +#guard_msgs in +#eval do + let env ← mkEmptyEnvironment + let stx ← testParseModule env "" "-- just a comment\n" + match stx.getHeadInfo? with + | some (.original leading ..) => + IO.println s!"first token leading startPos: {leading.startPos.byteIdx}" + IO.println s!"first token leading content: {Substring.Raw.toString leading}" + | _ => IO.println "no original leading found" + +-- Test 6: empty header (no imports) - file-start comment before a def +-- The leading of the first command token should include the file-start comment. +/-- +info: first token leading startPos: 0 +first token leading content: -- file-start comment + -/ +#guard_msgs in +#eval do + let env ← mkEmptyEnvironment + let stx ← testParseModule env "" "-- file-start comment\ndef foo := 1\n" + match stx.getHeadInfo? with + | some (.original leading ..) => + IO.println s!"first token leading startPos: {leading.startPos.byteIdx}" + IO.println s!"first token leading content: {Substring.Raw.toString leading}" + | _ => IO.println "no original leading found"