fix: apply newlines before and after comments when formatting syntax (#8626)

This PR closes #3791, making sure that the Syntax formatter inserts whitespace before and after comments in the leading and trailing text of Syntax to avoid having comments comment out any following syntax, and to avoid comments' lexical syntax from being interpreted as being part of another syntax. If the text contains newlines before or after any comments, they are formatted as hard newlines rather than soft newlines. For example, `--` comments will have a hard newline after them. Note: metaprograms generating Syntax with comments should be sure to include newlines at the ends of `--` comments.
2025-06-26 12:23:35 -07:00 · 2025-06-26 12:23:35 -07:00 · b56ad5a7d2
commit b56ad5a7d2
parent 7ed716f904
6 changed files with 235 additions and 42 deletions
--- a/src/Lean/PrettyPrinter/Formatter.lean
+++ b/src/Lean/PrettyPrinter/Formatter.lean
@ -31,7 +31,7 @@ structure Context where
 structure State where
  stxTrav  : Syntax.Traverser
  /-- Textual content of `stack` up to the first whitespace (not enclosed in an escaped ident). We assume that the textual
-  content of `stack` is modified only by `pushText` and `pushLine`, so `leadWord` is adjusted there accordingly. -/
+  content of `stack` is modified only by `push` and `pushWhitespace`, so `leadWord` is adjusted there accordingly. -/
  leadWord : String := ""
  /-- When the `leadWord` is nonempty, whether it is an identifier. Identifiers get space inserted between them. -/
  leadWordIdent : Bool := false
@ -351,18 +351,21 @@ def parseToken (s : String) : FormatterM ParserState :=
  } ((← read).table) (Parser.mkParserState s)

 def pushToken (info : SourceInfo) (tk : String) (ident : Bool) : FormatterM Unit := do
-  match info with
-  | SourceInfo.original _ _ ss _ =>
-    -- preserve non-whitespace content (i.e. comments)
+  if let SourceInfo.original _ _ ss _ := info then
+    -- preserve non-whitespace content (comments)
    let ss' := ss.trim
-    if !ss'.isEmpty then
-      let ws := { ss with startPos := ss'.stopPos }
-      if ws.contains '\n' then
-        push s!"\n{ss'}"
+    unless ss'.isEmpty do
+      let preNL := Substring.contains { ss with stopPos := ss'.startPos } '\n'
+      let postNL := Substring.contains { ss with startPos := ss'.stopPos } '\n'
+      if postNL then
+        pushWhitespace "\n"
+      else if !(← get).leadWord.isEmpty then
+        pushLine
+      push ss'.toString
+      if preNL then
+        pushWhitespace "\n"
      else
-        push s!"  {ss'}"
-      modify fun st => { st with leadWord := "", leadWordIdent := false }
-  | _ => pure ()
+        pushLine

  let st ← get
  -- If there is no space between `tk` and the next word, see if we should insert a discretionary space.
@ -400,21 +403,27 @@ def pushToken (info : SourceInfo) (tk : String) (ident : Bool) : FormatterM Unit
      push tk -- preserve special whitespace for tokens like ":=\n"
    modify fun st => { st with leadWord := if tk.trimLeft == tk then tk else "", leadWordIdent := ident }

-  match info with
-  | SourceInfo.original ss _ _ _ =>
-    -- preserve non-whitespace content (i.e. comments)
+  if let SourceInfo.original ss _ _ _ := info then
+    -- preserve non-whitespace content (comments)
    let ss' := ss.trim
-    if !ss'.isEmpty then
-      let ws := { ss with startPos := ss'.stopPos }
-      if ws.contains '\n' then do
-        -- Indentation is automatically increased when entering a category, but comments should be aligned
-        -- with the actual token, so dedent
-        indent (push s!"{ss'}\n") (some ((0:Int) - Std.Format.getIndent (← getOptions)))
-      else
-        pushLine
-        push ss'.toString
-      modify fun st => { st with leadWord := "" }
-  | _ => pure ()
+    unless ss'.isEmpty do
+      let preNL := Substring.contains { ss with stopPos := ss'.startPos } '\n'
+      let postNL := Substring.contains { ss with startPos := ss'.stopPos } '\n'
+      -- Indentation is automatically increased when entering a category, but comments should be aligned
+      -- with the actual token, so dedent
+      indent (indent := some (-Std.Format.getIndent (← getOptions))) do
+        if postNL then
+          pushWhitespace "\n"
+        else
+          pushLine
+        pushWhitespace ss'.toString
+        if preNL then
+          pushWhitespace "\n"
+        else
+          -- It is conceivable that the start of comment syntax could be misinterpreted as part of a token,
+          -- so add the beginning of it as the leadWord.
+          let ctk := ss' |>.takeWhile (!·.isWhitespace) |>.toString
+          modify fun st => { st with leadWord := ctk }

@[combinator_formatter symbolNoAntiquot]
 def symbolNoAntiquot.formatter (sym : String) : Formatter := do
--- a/tests/lean/Reformat.lean
+++ b/tests/lean/Reformat.lean
@ -7,21 +7,28 @@ open Lean.Elab.Term
 open Std.Format open Std

 unsafe def main (args : List String) : IO Unit := do
-let (debug, f) : Bool × String := match args with
-  | [f, "-d"] => (true, f)
-  | [f]       => (false, f)
-  | _         => panic! "usage: file [-d]"
-let env ← mkEmptyEnvironment
-let stx ← Lean.Parser.testParseFile env args.head!
-let (f, _) ← (tryFinally (PrettyPrinter.ppModule stx) printTraces).toIO { options := Options.empty.setBool `trace.PrettyPrinter.format debug, fileName := "", fileMap := default } { env := env }
-IO.print f
-let stx' ← Lean.Parser.testParseModule env args.head! (toString f)
-if stx' != stx then
-  let stx := stx.raw.getArg 1
-  let stx' := stx'.raw.getArg 1
-  stx.getArgs.size.forM fun i _ => do
-    if stx.getArg i != stx'.getArg i then
-      throw $ IO.userError s!"reparsing failed:\n{stx.getArg i}\n{stx'.getArg i}"
+  let (debug, fn) : Bool × String := match args with
+    | [f, "-d"] => (true, f)
+    | [f]       => (false, f)
+    | _         => panic! "usage: file [-d]"
+  let env ← mkEmptyEnvironment
+  let stx ← Lean.Parser.testParseFile env fn
+  let act : CoreM Format := do
+    withOptions (fun opts =>
+        opts
+          -- Name sanitization clears inline comments attached to identifiers.
+          |>.set `pp.sanitizeNames false
+          |>.set `trace.PrettyPrinter.format debug) do
+      tryFinally (PrettyPrinter.ppModule stx) printTraces
+  let (f, _) ← act.toIO { fileName := "", fileMap := default } { env := env }
+  IO.print f
+  let stx' ← Lean.Parser.testParseModule env fn (toString f)
+  if stx' != stx then
+    let stx := stx.raw.getArg 1
+    let stx' := stx'.raw.getArg 1
+    stx.getArgs.size.forM fun i _ => do
+      if stx.getArg i != stx'.getArg i then
+        throw $ IO.userError s!"reparsing failed:\n{stx.getArg i}\n{stx'.getArg i}"

 -- abbreviated Prelude.lean, which can be parsed without elaboration
 #eval main ["Reformat/Input.lean"]
--- a/tests/lean/Reformat.lean.expected.out
+++ b/tests/lean/Reformat.lean.expected.out
@ -11,6 +11,7 @@ universe u v w
 def id {α : Sort u} (a : α) : α :=
  a

+
 /-
 The kernel definitional equality test (t =?= s) has special support for idDelta applications.
 It implements the following rules
@ -29,6 +30,7 @@ theorems generated by the equation Compiler.
 def idDelta {α : Sort u} (a : α) : α :=
  a

+
 /- `idRhs` is an auxiliary declaration used to implement "smart unfolding". It is used as a marker. -/
@[macro_inline, reducible]
 def idRhs (α : Sort u) (a : α) : α :=
@ -107,6 +109,7 @@ def cast {α β : Sort u} (h : Eq α β) (a : α) : β :=
 theorem congrArg {α : Sort u} {β : Sort v} {a₁ a₂ : α} (f : α → β) (h : Eq a₁ a₂) : Eq (f a₁) (f a₂) :=
  h ▸ rfl

+
 /-
 Initialize the Quotient Module, which effectively adds the following definitions:

@ -165,6 +168,7 @@ inductive Bool : Type

 export Bool (false true)

+
 /- Remark: Subtype must take a Sort instead of Type because of the axiom strongIndefiniteDescription. -/
 structure Subtype {α : Sort u} (p : α → Prop) :=
  (val : α)
@ -190,6 +194,7 @@ def typedExpr (α : Sort u) (a : α) : α :=
 def namedPattern {α : Sort u} (x a : α) : α :=
  a

+
 /- Auxiliary axiom used to implement `sorry`. -/
 axiom sorryAx (α : Sort u) (synthetic := true) : α

@ -225,6 +230,7 @@ instance (α : Sort u) {β : α → Sort v} [(a : α) → Inhabited (β a)] : In
 structure PLift (α : Sort u) : Type u := up ::
  (down : α)

+
 /- Bijection between α and PLift α -/
 theorem PLift.upDown {α : Sort u} : ∀ (b : PLift α), Eq (up (down b)) b
  | up a => rfl
@ -232,6 +238,7 @@ theorem PLift.upDown {α : Sort u} : ∀ (b : PLift α), Eq (up (down b)) b
 theorem PLift.downUp {α : Sort u} (a : α) : Eq (down (up a)) a :=
  rfl

+
 /- Pointed types -/
 structure PointedType :=
  (type : Type u)
@ -244,6 +251,7 @@ instance : Inhabited PointedType.{u} :=
 structure ULift.{r, s} (α : Type s) : Type (max s r) := up ::
  (down : α)

+
 /- Bijection between α and ULift.{v} α -/
 theorem ULift.upDown {α : Type u} : ∀ (b : ULift.{v} α), Eq (up (down b)) b
  | up a => rfl
@ -311,6 +319,7 @@ open BEq (beq)
 instance {α : Type u} [DecidableEq α] : BEq α :=
  ⟨fun a b => decide (Eq a b)⟩

+
 -- We use "dependent" if-then-else to be able to communicate the if-then-else condition
 -- to the branches
@[macro_inline]
--- a/tests/lean/binopInfoTree.lean.expected.out
+++ b/tests/lean/binopInfoTree.lean.expected.out
@ -10,10 +10,12 @@
        1 + 2 +
          3
            -- should propagate through multiple macro expansions
+            
        ===>
        binop% HAdd.hAdd✝ (1 + 2)
          3
            -- should propagate through multiple macro expansions
+            
        • [Term] 1 + 2 + 3 : Nat @ ⟨5, 7⟩†-⟨5, 16⟩† @ Lean.Elab.Term.Op.elabBinOp
          • [Term] 1 + 2 + 3 : Nat @ ⟨5, 7⟩†-⟨5, 16⟩†
            • [Term] 1 + 2 : Nat @ ⟨5, 7⟩-⟨5, 12⟩ @ «_aux_Init_Notation___macroRules_term_+__2»
--- a/tests/lean/run/3791.lean
+++ b/tests/lean/run/3791.lean
@ -0,0 +1,165 @@
+import Lean
+/-!
+# Formatter: preserve hard newlines around comments (#3791)
+
+This is a regression test for https://github.com/leanprover/lean4/issues/3791
+
+The issue was that the formatter was not preserving whether there were hard newlines
+around comments in Syntax.
+
+Note: one should turn off the sanitizer when pretty printing user syntax,
+since the sanitizer clears comments from all identifiers.
+-/
+
+open Lean
+
+deriving instance Repr for Std.Format.FlattenBehavior, Format
+
+elab "#parse_cmd " update?:("!")? dbg?:("?")? s:str : command => Elab.Command.liftTermElabM do
+  withOptions (fun opts => opts.set pp.sanitizeNames.name false) do
+    let s := s.getString
+    let mut cmd ← ofExcept <| Parser.runParserCategory (← getEnv) `command s
+    if update?.isSome then
+      cmd := cmd.updateLeading
+    let fmt ← PrettyPrinter.ppCommand ⟨cmd⟩
+    if dbg?.isSome then
+      dbg_trace "{repr cmd}"; dbg_trace "{repr fmt}"
+    logInfo (toString f!"‹{fmt}›")
+
+elab "#parse_term " update?:("!")? dbg?:("?")? s:str : command => Elab.Command.liftTermElabM do
+  withOptions (fun opts => opts.set pp.sanitizeNames.name false) do
+    let s := s.getString
+    let mut term ← ofExcept <| Parser.runParserCategory (← getEnv) `term s
+    if update?.isSome then
+      term := term.updateLeading
+    let fmt ← PrettyPrinter.ppTerm ⟨term⟩
+    if dbg?.isSome then
+      dbg_trace "{repr term}"; dbg_trace "{repr fmt}"
+    logInfo (toString f!"‹{fmt}›")
+
+/-!
+Newline before and after comment, puts hard newlines before and after.
+("commented out" comes from #3791; formerly `1 = 1` was being commented out by the comment
+in the output.)
+-/
+/--
+info: ‹theorem ohno :
+    -- commented out:
+    1 = 1 :=
+  trivial›
+-/
+#guard_msgs in #parse_cmd "theorem ohno : \n\
+    -- commented out:\n\
+    1 = 1 := trivial"
+
+/-!
+No newline before but a newline after, puts hard newline after.
+-/
+/--
+info: ‹theorem ohno : /- commented out: -/
+    1 = 1 :=
+  trivial›
+-/
+#guard_msgs in #parse_cmd "theorem ohno :   /- commented out: -/\n\
+    1 = 1 := trivial"
+
+/-!
+Now newlines before or after, puts no newlines.
+-/
+/--
+info: ‹theorem ohno : /- commented out: -/ 1 = 1 :=
+  trivial›
+-/
+#guard_msgs in #parse_cmd "theorem ohno :  /- commented out: -/  1 = 1 := trivial"
+
+/-!
+Inserts whitespace before comment, but not a hard newline.
+There's a hard newline after the comment.
+-/
+/--
+info: ‹∀ x : ℕ, -- commented out:
+  x = x›
+-/
+#guard_msgs in #parse_term "∀ x : ℕ,-- commented out:\n\
+    x = x"
+
+/-!
+When it's wide enough, gets a discretionary newline after.
+-/
+/--
+info: ‹∀ xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx, /- commented out: -/
+  x = x›
+-/
+#guard_msgs in
+#parse_term "∀ xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx,/- commented out: -/x = x"
+
+/-!
+When it's wide enough, gets a discretionary newline before.
+-/
+/--
+info: ‹∀ xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx,
+  /- commented out: -/ x = x›
+-/
+#guard_msgs in
+#parse_term "∀ xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx,/- commented out: -/x = x"
+
+/-!
+Hard newlines before and after in terms.
+-/
+/--
+info: ‹∀ x : ℕ,
+  -- commented out:
+  x = x›
+-/
+#guard_msgs in #parse_term "∀ x : ℕ, \n\
+    -- commented out:\n\
+    x = x"
+
+/-!
+No newlines before or after, but discretionary whitespace.
+-/
+/-- info: ‹∀ x : ℕ, /- commented out: -/ x = x› -/
+#guard_msgs in #parse_term "∀ x : ℕ, \
+    /- commented out: -/\
+    x = x"
+
+/-!
+Inline comments
+-/
+/-- info: ‹[1, 2, 3 /- last comment -/ ]› -/
+#guard_msgs in #parse_term "[\n1,2,3 /- last comment -/]"
+/--
+info: ‹[1, 2,
+  3 -- last comment
+    ]›
+-/
+#guard_msgs in #parse_term "[\n1,2,3 -- last comment\n]"
+/-- info: ‹(f /- c -/ x)› -/
+#guard_msgs in #parse_term "(f /- c -/ x)"
+
+/-!
+Comments after a term.
+-/
+/-- info: ‹(x /-comment-/ )› -/
+#guard_msgs in #parse_term "(x /-comment-/)"
+/--
+info: ‹(x --comment
+  )›
+-/
+#guard_msgs in #parse_term "(x --comment\n)"
+
+/-!
+Comments before a term. The `!` mode applies `updateLeading`, and the newline causes the comment
+to be applied to the succeeding term.
+-/
+/--
+info: ‹(
+/-comment-/ 1)›
+-/
+#guard_msgs in #parse_term ! "(\n/-comment-/ 1)"
+/--
+info: ‹(
+-- comment
+1)›
+-/
+#guard_msgs in #parse_term ! "(\n-- comment\n 1)"
--- a/tests/lean/run/PPTopDownAnalyze.lean
+++ b/tests/lean/run/PPTopDownAnalyze.lean
@ -14,8 +14,9 @@ def checkDelab (e : Expr) (tgt? : Option Term) (name? : Option Name := none) : T
  let stx ← delab e
  match tgt? with
  | some tgt =>
-    if toString (← PrettyPrinter.ppTerm stx) != toString (← PrettyPrinter.ppTerm tgt?.get!) then
-      throwError "{pfix} missed target\n{← PrettyPrinter.ppTerm stx}\n!=\n{← PrettyPrinter.ppTerm tgt}\n\nExpr: {e}\nType: {← inferType e}"
+    let tgt := tgt.raw.rewriteBottomUp Syntax.unsetTrailing
+    if toString (← PrettyPrinter.ppTerm stx) != toString (← PrettyPrinter.ppTerm ⟨tgt⟩) then
+      throwError "{pfix} missed target\n{← PrettyPrinter.ppTerm stx}\n!=\n{← PrettyPrinter.ppTerm ⟨tgt⟩}\n\nExpr: {e}\nType: {← inferType e}"
  | _ => pure ()

  let e' ←