feat: add unicode(...) parser syntax and pp.unicode option (#10373)

This PR adds a `pp.unicode` option and a `unicode("→", "->")` syntax description alias for the lower-level `unicodeSymbol "→" "->"` parser. The syntax is added to the `notation` command as well. When `pp.unicode` is true (the default) then the first form is used when pretty printing, and otherwise the second ASCII form is used. A variant, `unicode("→", "->", preserveForPP)` causes the `->` form to be preferred; delaborators can insert `→` directly into the syntax, which will be pretty printed as-is; this allows notations like `fun` to use custom options such as `pp.unicode.fun` to opt into the unicode form when pretty printing. Additionally: - Adds more documentation for the `symbol` and `nonReservedSymbol` parser descriptions. - Adds documentation for the `infix`/`infixr`/`infixl`/`prefix`/`postfix` commands. - The parenthesizers for symbols are improved to backtrack if the atom doesn't match. - Fixes a bug where `&"..."` symbols aren't validated. This is partial progress for issue #1056. What remains is enabling `unicode(...)` for mixfix commands and then making use of it for core notation.
2025-09-13 21:40:03 -07:00 · 2025-09-13 21:40:03 -07:00 · 3e4fa12c72
commit 3e4fa12c72
parent ed5dc328d9
17 changed files with 275 additions and 56 deletions
--- a/src/Init/Prelude.lean
+++ b/src/Init/Prelude.lean
@ -5125,12 +5125,32 @@ inductive ParserDescr where
  The precedence `prec` and `lhsPrec` are used to determine whether the parser
  should apply. -/
  | trailingNode (kind : SyntaxNodeKind) (prec lhsPrec : Nat) (p : ParserDescr)
-  /-- A literal symbol parser: parses `val` as a literal.
-  This parser does not work on identifiers, so `symbol` arguments are declared
-  as "keywords" and cannot be used as identifiers anywhere in the file. -/
+  /--
+  Parses the literal symbol.
+
+  The symbol is automatically included in the set of reserved tokens ("keywords").
+  Keywords cannot be used as identifiers, unless the identifier is otherwise escaped.
+  For example, `"fun"` reserves `fun` as a keyword; to refer an identifier named `fun` one can write `«fun»`.
+  Adding a `&` prefix prevents it from being reserved, for example `&"true"`.
+
+  Whitespace before or after the atom is used as a pretty printing hint.
+  For example, `" + "` parses `+` and pretty prints it with whitespace on both sides.
+  The whitespace has no effect on parsing behavior.
+  -/
  | symbol (val : String)
-  /-- Like `symbol`, but without reserving `val` as a keyword.
-  If `includeIdent` is true then `ident` will be reinterpreted as `atom` if it matches. -/
+  /--
+  Parses a literal symbol. The `&` prefix prevents it from being included in the set of reserved tokens ("keywords").
+  This means that the symbol can still be recognized as an identifier by other parsers.
+
+  Some syntax categories, such as `tactic`, automatically apply `&` to the first symbol.
+
+  Whitespace before or after the atom is used as a pretty printing hint.
+  For example, `" + "` parses `+` and pretty prints it with whitespace on both sides.
+  The whitespace has no effect on parsing behavior.
+
+  (Not exposed by parser description syntax:
+  If the `includeIdent` argument is true, lets `ident` be reinterpreted as `atom` if it matches.)
+  -/
  | nonReservedSymbol (val : String) (includeIdent : Bool)
  /-- Parses using the category parser `catName` with right binding power
  (i.e. precedence) `rbp`. -/
@ -5150,6 +5170,19 @@ inductive ParserDescr where
  /-- `sepBy1` is just like `sepBy`, except it takes 1 or more instead of
  0 or more occurrences of `p`. -/
  | sepBy1 (p : ParserDescr) (sep : String) (psep : ParserDescr) (allowTrailingSep : Bool := false)
+  /--
+  - `unicode("→", "->")` parses a symbol matching either `→` or `->`. Each symbol is reserved.
+    The second symbol is an ASCII version of the first.
+    The  `pp.unicode` option controls which is used when pretty printing.
+  - `unicode("→", "->", preserveForPP)` is the same except for pretty printing behavior.
+    When the `pp.unicode` option is enabled, then the pretty printer uses whichever symbol
+    matches the underlying atom in the syntax.
+    The intent is that `preserveForPP` means that the ASCII variant is preferred.
+    For example, `fun` notation uses `preserveForPP` for its arrow; the delaborator chooses
+    `↦` or `=>` depending on the value of `pp.unicode.fun`, letting users opt-in to formatting with `↦`.
+    Note that `notation` creates a pretty printer preferring the ASCII version.
+  -/
+  | unicodeSymbol (val asciiVal : String) (preserveForPP : Bool)

 instance : Inhabited ParserDescr where
  default := ParserDescr.symbol ""
--- a/src/Lean/Elab/DocString/Builtin.lean
+++ b/src/Lean/Elab/DocString/Builtin.lean
@ -424,6 +424,8 @@ private partial def containsAtom (e : Expr) (atom : String) : MetaM Bool := do
    | (``ParserDescr.unary, #[.app _ (.lit (.strVal _)), p]) => containsAtom p atom
    | (``ParserDescr.binary, #[.app _ (.lit (.strVal "andthen")), p, _]) => containsAtom p atom
    | (``ParserDescr.nonReservedSymbol, #[.lit (.strVal tk), _]) => pure (tk.trim == atom)
+    | (``ParserDescr.unicodeSymbol, #[.lit (.strVal tk), .lit (.strVal asciiTk), _]) =>
+      pure (tk.trim == atom || asciiTk.trim == atom)
    | (``ParserDescr.symbol, #[.lit (.strVal tk)]) => pure (tk.trim == atom)
    | (``Parser.withAntiquot, #[_, p]) => containsAtom p atom
    | (``Parser.leadingNode, #[_, _, p]) => containsAtom p atom
@ -432,6 +434,8 @@ private partial def containsAtom (e : Expr) (atom : String) : MetaM Bool := do
    | (``Parser.nonReservedSymbol, #[.lit (.strVal tk), _]) => pure (tk.trim == atom)
    | (``Parser.symbol, #[.lit (.strVal tk)]) => pure (tk.trim == atom)
    | (``Parser.symbol, #[_nonlit]) => pure false
+    | (``Parser.unicodeSymbol, #[.lit (.strVal tk), .lit (.strVal asciiTk), _]) =>
+      pure (tk.trim == atom || asciiTk.trim == atom)
    | (``Parser.withCache, #[_, p]) => containsAtom p atom
    | _ => if tryWhnf then attempt (← Meta.whnf p) false else pure false
  attempt e true
--- a/src/Lean/Elab/Mixfix.lean
+++ b/src/Lean/Elab/Mixfix.lean
@ -15,18 +15,18 @@ namespace Lean.Elab.Command
@[builtin_macro Lean.Parser.Command.mixfix] def expandMixfix : Macro := fun stx =>
  withAttrKindGlobal stx fun stx => do
    match stx with
-    | `($[$doc?:docComment]? $[@[$attrs?,*]]? infixl:$prec $[(name := $name)]? $[(priority := $prio)]? $op => $f) =>
+    | `($[$doc?:docComment]? $[@[$attrs?,*]]? infixl:$prec $[(name := $name)]? $[(priority := $prio)]? $op:str => $f) =>
      let prec1 := quote <| (← evalPrec prec) + 1
      `($[$doc?:docComment]? $[@[$attrs?,*]]? notation:$prec $[(name := $name)]? $[(priority := $prio)]? lhs:$prec $op:str rhs:$prec1 => $f lhs rhs)
-    | `($[$doc?:docComment]? $[@[$attrs?,*]]? infix:$prec $[(name := $name)]? $[(priority := $prio)]? $op => $f) =>
+    | `($[$doc?:docComment]? $[@[$attrs?,*]]? infix:$prec $[(name := $name)]? $[(priority := $prio)]? $op:str => $f) =>
      let prec1 := quote <| (← evalPrec prec) + 1
      `($[$doc?:docComment]? $[@[$attrs?,*]]? notation:$prec $[(name := $name)]? $[(priority := $prio)]? lhs:$prec1 $op:str rhs:$prec1 => $f lhs rhs)
-    | `($[$doc?:docComment]? $[@[$attrs?,*]]? infixr:$prec $[(name := $name)]? $[(priority := $prio)]? $op => $f) =>
+    | `($[$doc?:docComment]? $[@[$attrs?,*]]? infixr:$prec $[(name := $name)]? $[(priority := $prio)]? $op:str => $f) =>
      let prec1 := quote <| (← evalPrec prec) + 1
      `($[$doc?:docComment]? $[@[$attrs?,*]]? notation:$prec $[(name := $name)]? $[(priority := $prio)]? lhs:$prec1 $op:str rhs:$prec => $f lhs rhs)
-    | `($[$doc?:docComment]? $[@[$attrs?,*]]? prefix:$prec $[(name := $name)]? $[(priority := $prio)]? $op => $f) =>
+    | `($[$doc?:docComment]? $[@[$attrs?,*]]? prefix:$prec $[(name := $name)]? $[(priority := $prio)]? $op:str => $f) =>
      `($[$doc?:docComment]? $[@[$attrs?,*]]? notation:$prec $[(name := $name)]? $[(priority := $prio)]? $op:str arg:$prec => $f arg)
-    | `($[$doc?:docComment]? $[@[$attrs?,*]]? postfix:$prec $[(name := $name)]? $[(priority := $prio)]? $op => $f) =>
+    | `($[$doc?:docComment]? $[@[$attrs?,*]]? postfix:$prec $[(name := $name)]? $[(priority := $prio)]? $op:str => $f) =>
      `($[$doc?:docComment]? $[@[$attrs?,*]]? notation:$prec $[(name := $name)]? $[(priority := $prio)]? arg:$prec $op:str => $f arg)
    | _ => Macro.throwUnsupported
 where
--- a/src/Lean/Elab/Notation.lean
+++ b/src/Lean/Elab/Notation.lean
@ -46,15 +46,30 @@ private partial def antiquote (vars : Array Syntax) : Syntax → Syntax
 def expandNotationItemIntoSyntaxItem : TSyntax ``notationItem → MacroM (TSyntax `stx)
  | `(notationItem| $_:ident$[:$prec?]?) => `(stx| term $[:$prec?]?)
  | `(notationItem| $s:str)              => `(stx| $s:str)
-  | _                                    => Macro.throwUnsupported
+  -- TODO(kmill): use after stage0 update
+  -- | `(notationItem| $u:unicodeAtom)      => `(stx| $u:unicodeAtom)
+  -- | _                                    => Macro.throwUnsupported
+  | stx =>
+    if stx.raw.isOfKind ``Parser.Syntax.unicodeAtom then
+      return ⟨stx.raw⟩
+    else
+      Macro.throwUnsupported

 /-- Convert `notation` command lhs item into a pattern element -/
 def expandNotationItemIntoPattern (stx : Syntax) : MacroM Syntax :=
  let k := stx.getKind
-  if k == `Lean.Parser.Command.identPrec then
+  if k == ``Lean.Parser.Command.identPrec then
    return mkAntiquotNode stx[0] (kind := `term) (isPseudoKind := true)
  else if k == strLitKind then
    strLitToPattern stx
+  else if k == ``Parser.Syntax.unicodeAtom then
+    let preserveForPP := !stx[4].isNone
+    if preserveForPP then
+      -- Use the ascii atom for the pattern; that way delaboration gives the ASCII version,
+      -- which is the preferred form according to `ParserDescr.unicodeSymbol`.
+      strLitToPattern stx[3]
+    else
+      strLitToPattern stx[1]
  else
    Macro.throwUnsupported

--- a/src/Lean/Elab/Syntax.lean
+++ b/src/Lean/Elab/Syntax.lean
@ -136,6 +136,8 @@ where
      processAtom stx
    else if kind == ``Lean.Parser.Syntax.nonReserved then
      processNonReserved stx
+    else if kind == ``Lean.Parser.Syntax.unicodeAtom then
+      processUnicode stx
    else
      let stxNew? ← liftM (liftMacroM (expandMacro? stx) : TermElabM _)
      match stxNew? with
@ -248,23 +250,30 @@ where
    !s.front.isDigit &&
    !(s.any Char.isWhitespace)

+  validAtom (stx : Syntax) : ToParserDescrM String := do
+    let some atom := stx.isStrLit? | throwUnsupportedSyntax
+    unless isValidAtom atom do
+      throwErrorAt stx "invalid atom"
+    return atom
+
  processAtom (stx : Syntax) := do
-    match stx[0].isStrLit? with
-    | some atom =>
-      unless isValidAtom atom do
-        throwErrorAt stx "invalid atom"
-      /- For syntax categories where initialized with `LeadingIdentBehavior` different from default (e.g., `tactic`), we automatically mark
-         the first symbol as nonReserved. -/
-      if (← read).behavior != Parser.LeadingIdentBehavior.default && (← read).first then
-        return (← `(ParserDescr.nonReservedSymbol $(quote atom) false), 1)
-      else
-        return (← `(ParserDescr.symbol $(quote atom)), 1)
-    | none => throwUnsupportedSyntax
+    let atom ← validAtom stx[0]
+    /- For syntax categories where initialized with `LeadingIdentBehavior` different from default (e.g., `tactic`), we automatically mark
+       the first symbol as nonReserved. -/
+    if (← read).behavior != Parser.LeadingIdentBehavior.default && (← read).first then
+      return (← `(ParserDescr.nonReservedSymbol $(quote atom) false), 1)
+    else
+      return (← `(ParserDescr.symbol $(quote atom)), 1)

  processNonReserved (stx : Syntax) := do
-    let some atom := stx[1].isStrLit? | throwUnsupportedSyntax
+    let atom ← validAtom stx[1]
    return (← `((with_annotate_term $(stx[0]) @ParserDescr.nonReservedSymbol) $(quote atom) false), 1)

+  processUnicode (stx : Syntax) := do
+    let atom ← validAtom stx[1]
+    let asciiAtom ← validAtom stx[3]
+    let preserveForPP := !stx[4].isNone
+    return (← `((with_annotate_term $(stx[0]) @ParserDescr.unicodeSymbol) $(quote atom) $(quote asciiAtom) $(quote preserveForPP)), 1)

 end Term

--- a/src/Lean/Parser.lean
+++ b/src/Lean/Parser.lean
@ -92,6 +92,7 @@ unsafe def interpretParserDescr : ParserDescr → CoreM Parenthesizer
  | ParserDescr.trailingNode k prec lhsPrec d       => return trailingNode.parenthesizer k prec lhsPrec (← interpretParserDescr d)
  | ParserDescr.symbol tk                           => return symbol.parenthesizer tk
  | ParserDescr.nonReservedSymbol tk includeIdent   => return nonReservedSymbol.parenthesizer tk includeIdent
+  | ParserDescr.unicodeSymbol tk asciiTk preserve   => return unicodeSymbol.parenthesizer tk asciiTk preserve
  | ParserDescr.parser constName                    => combinatorParenthesizerAttribute.runDeclFor constName
  | ParserDescr.cat catName prec                    => return categoryParser.parenthesizer catName prec

@ -124,6 +125,7 @@ unsafe def interpretParserDescr : ParserDescr → CoreM Formatter
  | ParserDescr.trailingNode k prec lhsPrec d       => return trailingNode.formatter k prec lhsPrec (← interpretParserDescr d)
  | ParserDescr.symbol tk                           => return symbol.formatter tk
  | ParserDescr.nonReservedSymbol tk _              => return nonReservedSymbol.formatter tk
+  | ParserDescr.unicodeSymbol tk asciiTk preserve   => return unicodeSymbol.formatter tk asciiTk preserve
  | ParserDescr.parser constName                    => combinatorFormatterAttribute.runDeclFor constName
  | ParserDescr.cat catName _                       => return categoryParser.formatter catName

--- a/src/Lean/Parser/Basic.lean
+++ b/src/Lean/Parser/Basic.lean
@ -1229,7 +1229,8 @@ def unicodeSymbolInfo (sym asciiSym : String) : ParserInfo := {
 def unicodeSymbolFn (sym asciiSym : String) : ParserFn :=
  unicodeSymbolFnAux sym asciiSym ["'" ++ sym ++ "', '" ++ asciiSym ++ "'"]

-def unicodeSymbolNoAntiquot (sym asciiSym : String) : Parser :=
+set_option linter.unusedVariables false in
+def unicodeSymbolNoAntiquot (sym asciiSym : String) (preserveForPP : Bool) : Parser :=
  let sym := sym.trim
  let asciiSym := asciiSym.trim
  { info := unicodeSymbolInfo sym asciiSym
@ -1786,8 +1787,19 @@ instance : Coe String Parser where
 def nonReservedSymbol (sym : String) (includeIdent := false) : Parser :=
  tokenWithAntiquot (nonReservedSymbolNoAntiquot sym includeIdent)

-def unicodeSymbol (sym asciiSym : String) : Parser :=
-  tokenWithAntiquot (unicodeSymbolNoAntiquot sym asciiSym)
+/--
+`unicodeSymbol sym asciiSym` parses either `sym` or `asciiSym` as a reserved symbol.
+The `asciiSym` argument should be an ASCII alternative for `sym`.
+
+- If the `pp.unicode` option is false, then pretty prints using `asciiSym`.
+- If the `pp.unicode` option is true, then pretty prints as `sym`,
+  unless the `preserveForPP` argument is true, in which case the underlying atom
+  is used to decide whether to print using the unicode or ASCII form.
+  We take the ASCII form to be the preferred form in this case.
+  For example, `pp.unicode.fun` causes the delaborator to use `↦` instead of `=>` for the `fun` arrow.
+-/
+def unicodeSymbol (sym asciiSym : String) (preserveForPP : Bool := false) : Parser :=
+  tokenWithAntiquot (unicodeSymbolNoAntiquot sym asciiSym preserveForPP)

 /--
  Define parser for `$e` (if `anonymous == true`) and `$e:name`.
--- a/src/Lean/Parser/Extension.lean
+++ b/src/Lean/Parser/Extension.lean
@ -291,6 +291,7 @@ partial def compileParserDescr (categories : ParserCategories) (d : ParserDescr)
    | ParserDescr.trailingNode k prec lhsPrec d       => return trailingNode k prec lhsPrec (← visit d)
    | ParserDescr.symbol tk                           => return symbol tk
    | ParserDescr.nonReservedSymbol tk includeIdent   => return nonReservedSymbol tk includeIdent
+    | ParserDescr.unicodeSymbol tk asciiTk preserve   => return unicodeSymbol tk asciiTk preserve
    | ParserDescr.parser constName                    => do
      let (_, p) ← mkParserOfConstantAux constName visit;
      pure p
--- a/src/Lean/Parser/Syntax.lean
+++ b/src/Lean/Parser/Syntax.lean
@ -48,10 +48,15 @@ namespace Syntax
@[builtin_syntax_parser] def sepBy1          := leading_parser
  "sepBy1(" >> withoutPosition (many1 syntaxParser >> ", " >> strLit >>
    optional (", " >> many1 syntaxParser) >> optional (", " >> nonReservedSymbol "allowTrailingSep")) >> ")"
-@[builtin_syntax_parser] def atom            := leading_parser
+@[builtin_syntax_parser, inherit_doc ParserDescr.symbol]
+def atom := leading_parser
  strLit
-@[builtin_syntax_parser] def nonReserved     := leading_parser
+@[builtin_syntax_parser, inherit_doc ParserDescr.nonReservedSymbol]
+def nonReserved := leading_parser
  "&" >> strLit
+@[builtin_syntax_parser, inherit_doc ParserDescr.unicodeSymbol]
+def unicodeAtom := leading_parser
+  "unicode(" >> strLit >> ", " >> strLit >> optional (", " >> nonReservedSymbol "preserveForPP") >> ")"

 end Syntax

@ -61,25 +66,39 @@ def namedName := leading_parser
  atomic (" (" >> nonReservedSymbol "name") >> " := " >> ident >> ")"
 def optNamedName := optional namedName

+def identPrec  := leading_parser ident >> optPrecedence
+def notationItem := withAntiquot (mkAntiquot "notationItem" decl_name%) (strLit <|> Syntax.unicodeAtom <|> identPrec)
+/--
+`prefix:prec "op" => f` is equivalent to `notation:prec "op" x:prec => f x`.
+-/
 def «prefix»   := leading_parser "prefix"
+/--
+`infix:prec "op" => f` is equivalent to `notation:prec x:prec1 "op" y:prec1 => f x y`, where `prec1 := prec + 1`.
+-/
 def «infix»    := leading_parser "infix"
+/--
+`infixl:prec "op" => f` is equivalent to `notation:prec x:prec "op" y:prec1 => f x y`, where `prec1 := prec + 1`.
+-/
 def «infixl»   := leading_parser "infixl"
+/--
+`infixr:prec "op" => f` is equivalent to `notation:prec x:prec1 "op" y:prec => f x y`, where `prec1 := prec + 1`.
+-/
 def «infixr»   := leading_parser "infixr"
+/--
+`postfix:prec "op" => f` is equivalent to `notation:prec x:prec "op" => f x`.
+-/
 def «postfix»  := leading_parser "postfix"
 def mixfixKind := «prefix» <|> «infix» <|> «infixl» <|> «infixr» <|> «postfix»
@[builtin_command_parser] def «mixfix»   := leading_parser
  optional docComment >> optional Term.«attributes» >> Term.attrKind >> mixfixKind >>
-  precedence >> optNamedName >> optNamedPrio >> ppSpace >> strLit >> darrow >> termParser
-- NOTE: We use `suppressInsideQuot` in the following parsers because quotations inside them are evaluated in the same stage and
-- thus should be ignored when we use `checkInsideQuot` to prepare the next stage for a builtin syntax change
-def identPrec  := leading_parser ident >> optPrecedence
-
-def optKind : Parser := optional (" (" >> nonReservedSymbol "kind" >> ":=" >> ident >> ")")
-
-def notationItem := ppSpace >> withAntiquot (mkAntiquot "notationItem" decl_name%) (strLit <|> identPrec)
+  precedence >> optNamedName >> optNamedPrio >> ppSpace >> notationItem >> darrow >> termParser
@[builtin_command_parser] def «notation»    := leading_parser
  optional docComment >> optional Term.«attributes» >> Term.attrKind >>
-  "notation" >> optPrecedence >> optNamedName >> optNamedPrio >> many notationItem >> darrow >> termParser
+  "notation" >> optPrecedence >> optNamedName >> optNamedPrio >> many (ppSpace >> notationItem) >> darrow >> termParser
+
+-- NOTE: We use `suppressInsideQuot` in the following parsers because quotations inside them are evaluated in the same stage and
+-- thus should be ignored when we use `checkInsideQuot` to prepare the next stage for a builtin syntax change
+def optKind : Parser := optional (" (" >> nonReservedSymbol "kind" >> ":=" >> ident >> ")")
@[builtin_command_parser] def «macro_rules» := suppressInsideQuot <| leading_parser
  optional docComment >> optional Term.«attributes» >> Term.attrKind >>
  "macro_rules" >> optKind >> Term.matchAlts
--- a/src/Lean/Parser/Term.lean
+++ b/src/Lean/Parser/Term.lean
@ -548,9 +548,9 @@ def funBinder : Parser :=
 -- NOTE: we disable anonymous antiquotations to ensure that `fun $b => ...`
 -- remains a `term` antiquotation
 def basicFun : Parser := leading_parser (withAnonymousAntiquot := false)
-  ppGroup (many1 (ppSpace >> funBinder) >> optType >> unicodeSymbol " ↦" " =>") >> ppSpace >> termParser
+  ppGroup (many1 (ppSpace >> funBinder) >> optType >> unicodeSymbol " ↦" " =>" (preserveForPP := true)) >> ppSpace >> termParser
@[builtin_term_parser] def «fun» := leading_parser:maxPrec
-  ppAllowUngrouped >> unicodeSymbol "λ" "fun" >> (basicFun <|> matchAlts)
+  ppAllowUngrouped >> unicodeSymbol "λ" "fun" (preserveForPP := true) >> (basicFun <|> matchAlts)

 def optExprPrecedence := optional (atomic ":" >> termParser maxPrec)
 def withAnonymousAntiquot := leading_parser
--- a/src/Lean/ParserCompiler.lean
+++ b/src/Lean/ParserCompiler.lean
@ -141,6 +141,7 @@ def compileEmbeddedParsers : ParserDescr → MetaM Unit
  | ParserDescr.trailingNode _ _ _ d   => compileEmbeddedParsers d
  | ParserDescr.symbol _               => pure ()
  | ParserDescr.nonReservedSymbol _ _  => pure ()
+  | ParserDescr.unicodeSymbol _ _ _    => pure ()
  | ParserDescr.cat _ _                => pure ()

 /-- Precondition: `α` must match `ctx.tyName`. -/
--- a/src/Lean/PrettyPrinter/Delaborator/Options.lean
+++ b/src/Lean/PrettyPrinter/Delaborator/Options.lean
@ -33,10 +33,15 @@ register_builtin_option pp.parens : Bool := {
  group    := "pp"
  descr    := "(pretty printer) if set to true, notation is wrapped in parentheses regardless of precedence"
 }
+register_builtin_option pp.unicode : Bool := {
+  defValue := true
+  group    := "pp"
+  descr    := "(pretty printer) if set to false, avoid using non-unicode symbols when formatting"
+}
 register_builtin_option pp.unicode.fun : Bool := {
  defValue := false
  group    := "pp"
-  descr    := "(pretty printer) disable/enable unicode ↦ notation for functions"
+  descr    := "(pretty printer) if set to true, use unicode `↦` notation for functions"
 }
 register_builtin_option pp.match : Bool := {
  defValue := true
@ -280,7 +285,8 @@ def getPPExplicit (o : Options) : Bool := o.get pp.explicit.name (getPPAll o)
 def getPPForalls (o : Options) : Bool := o.get pp.foralls.name pp.foralls.defValue
 def getPPNotation (o : Options) : Bool := o.get pp.notation.name (!getPPAll o)
 def getPPParens (o : Options) : Bool := o.get pp.parens.name pp.parens.defValue
-def getPPUnicodeFun (o : Options) : Bool := o.get pp.unicode.fun.name false
+def getPPUnicode (o : Options) : Bool := o.get pp.unicode.name pp.unicode.defValue
+def getPPUnicodeFun (o : Options) : Bool := o.get pp.unicode.fun.name pp.unicode.fun.defValue
 def getPPMatch (o : Options) : Bool := o.get pp.match.name (!getPPAll o)
 def getPPSorrySource (o : Options) : Bool := o.get pp.sorrySource.name pp.sorrySource.defValue
 def getPPFieldNotation (o : Options) : Bool := o.get pp.fieldNotation.name (!getPPAll o)
--- a/src/Lean/PrettyPrinter/Formatter.lean
+++ b/src/Lean/PrettyPrinter/Formatter.lean
@ -12,6 +12,7 @@ public import Lean.Parser.StrInterpolation
 public import Lean.KeyedDeclsAttribute
 public import Lean.ParserCompiler.Attribute
 public import Lean.PrettyPrinter.Basic
+public import Lean.PrettyPrinter.Delaborator.Options

 public section

@ -437,11 +438,11 @@ def pushToken (info : SourceInfo) (tk : String) (ident : Bool) : FormatterM Unit
@[combinator_formatter symbolNoAntiquot, expose]
 def symbolNoAntiquot.formatter (sym : String) : Formatter := do
  let stx ← getCur
-  if stx.isToken sym then do
-    let (Syntax.atom info _) ← pure stx | unreachable!
-    withMaybeTag (getExprPos? stx) (pushToken info sym false)
+  if stx.isToken sym then
+    let (Syntax.atom info _) := stx | unreachable!
+    withMaybeTag (getExprPos? stx) <| pushToken info sym false
    goLeft
-  else do
+  else
    trace[PrettyPrinter.format.backtrack] "unexpected syntax '{format stx}', expected symbol '{sym}'"
    throwBacktrack

@ -453,14 +454,21 @@ def symbolNoAntiquot.formatter (sym : String) : Formatter := do
  symbolNoAntiquot.formatter ch.toString

@[combinator_formatter unicodeSymbolNoAntiquot, expose]
-def unicodeSymbolNoAntiquot.formatter (sym asciiSym : String) : Formatter := do
-  let Syntax.atom info val ← getCur
-    | throwError m!"not an atom: {← getCur}"
-  if val == sym.trim then
-    pushToken info sym false
+def unicodeSymbolNoAntiquot.formatter (sym asciiSym : String) (preserveForPP : Bool) : Formatter := do
+  let stx ← getCur
+  let usesUnicode := stx.isToken sym
+  let usesAscii := stx.isToken asciiSym
+  if usesUnicode || usesAscii then
+    let (Syntax.atom info _) := stx | unreachable!
+    -- Use unicode version if pp.unicode is enabled and either preserveForPP is false or the syntax contains the unicode version
+    if getPPUnicode (← getOptions) && (!preserveForPP || usesUnicode) then
+      withMaybeTag (getExprPos? stx) <| pushToken info sym false
+    else
+      withMaybeTag (getExprPos? stx) <| pushToken info asciiSym false
+    goLeft
  else
-    pushToken info asciiSym false
-  goLeft
+    trace[PrettyPrinter.format.backtrack] "unexpected syntax '{format stx}', expected symbol '{sym}' or '{asciiSym}'"
+    throwBacktrack

@[combinator_formatter identNoAntiquot, expose]
 def identNoAntiquot.formatter : Formatter := do
--- a/src/Lean/PrettyPrinter/Parenthesizer.lean
+++ b/src/Lean/PrettyPrinter/Parenthesizer.lean
@ -469,13 +469,25 @@ def trailingNode.parenthesizer (k : SyntaxNodeKind) (prec lhsPrec : Nat) (p : Pa

@[combinator_parenthesizer rawCh, expose] def rawCh.parenthesizer (_ch : Char) := visitToken

-@[combinator_parenthesizer symbolNoAntiquot, expose] def symbolNoAntiquot.parenthesizer (_sym : String) := visitToken
-@[combinator_parenthesizer unicodeSymbolNoAntiquot, expose] def unicodeSymbolNoAntiquot.parenthesizer (_sym _asciiSym : String) := visitToken
+@[combinator_parenthesizer symbolNoAntiquot, expose] def symbolNoAntiquot.parenthesizer (sym : String) := do
+  let stx ← getCur
+  if stx.isToken sym then
+    visitToken
+  else
+    trace[PrettyPrinter.parenthesize.backtrack] "unexpected syntax '{format stx}', expected symbol '{sym}'"
+    throwBacktrack
+@[combinator_parenthesizer nonReservedSymbolNoAntiquot, expose] def nonReservedSymbolNoAntiquot.parenthesizer := symbolNoAntiquot.parenthesizer
+@[combinator_parenthesizer unicodeSymbolNoAntiquot, expose] def unicodeSymbolNoAntiquot.parenthesizer (sym asciiSym : String) (_preserveForPP : Bool) := do
+  let stx ← getCur
+  if stx.isToken sym || stx.isToken asciiSym then
+    visitToken
+  else
+    trace[PrettyPrinter.parenthesize.backtrack] "unexpected syntax '{format stx}', expected symbol '{sym}' or '{asciiSym}'"
+    throwBacktrack

@[combinator_parenthesizer identNoAntiquot, expose] def identNoAntiquot.parenthesizer := do checkKind identKind; visitToken
@[combinator_parenthesizer rawIdentNoAntiquot, expose] def rawIdentNoAntiquot.parenthesizer := visitToken
@[combinator_parenthesizer identEq, expose] def identEq.parenthesizer (_id : Name) := visitToken
-@[combinator_parenthesizer nonReservedSymbolNoAntiquot, expose] def nonReservedSymbolNoAntiquot.parenthesizer (_sym : String) (_includeIdent : Bool) := visitToken

@[combinator_parenthesizer charLitNoAntiquot, expose] def charLitNoAntiquot.parenthesizer := visitToken
@[combinator_parenthesizer strLitNoAntiquot, expose] def strLitNoAntiquot.parenthesizer := visitToken
--- a/tests/lean/242.lean
+++ b/tests/lean/242.lean
@ -6,3 +6,12 @@ syntax " 0" : term
 syntax " 'a'" : term
 syntax " `a" : term
 syntax " \"a" : term
+
+syntax &"0" : term
+syntax &"'a'" : term
+syntax &"`a" : term
+syntax &"\"a" : term
+syntax &" 0" : term
+syntax &" 'a'" : term
+syntax &" `a" : term
+syntax &" \"a" : term
--- a/tests/lean/242.lean.expected.out
+++ b/tests/lean/242.lean.expected.out
@ -6,3 +6,11 @@
 242.lean:6:7-6:13: error: invalid atom
 242.lean:7:7-7:12: error: invalid atom
 242.lean:8:7-8:13: error: invalid atom
+242.lean:10:8-10:11: error: invalid atom
+242.lean:11:8-11:13: error: invalid atom
+242.lean:12:8-12:12: error: invalid atom
+242.lean:13:8-13:13: error: invalid atom
+242.lean:14:8-14:12: error: invalid atom
+242.lean:15:8-15:14: error: invalid atom
+242.lean:16:8-16:13: error: invalid atom
+242.lean:17:8-17:14: error: invalid atom
--- a/tests/lean/run/ppUnicode.lean
+++ b/tests/lean/run/ppUnicode.lean
@ -0,0 +1,80 @@
+import Lean
+/-!
+# Tests of `pp.unicode` pretty printer option
+-/
+
+open Lean PrettyPrinter
+
+/-!
+Testing a notation with a `unicode` operator.
+
+Respects the current setting of `pp.unicode` exactly.
+-/
+def And' := And
+
+notation:35 x:36 unicode(" ∧' ", " /\\' ") y:35 => And' x y
+
+/-- info: True ∧' False : Prop -/
+#guard_msgs in #check True ∧' False
+
+/-- info: True /\' False : Prop -/
+#guard_msgs in set_option pp.unicode false in #check True ∧' False
+
+/-- info: True✝ ∧' False✝ -/
+#guard_msgs in #eval do ppTerm (← `(True ∧' False))
+/-- info: True✝ ∧' False✝ -/
+#guard_msgs in #eval do ppTerm (← `(True /\' False))
+/-- info: True✝ /\' False✝ -/
+#guard_msgs in set_option pp.unicode false in #eval do ppTerm (← `(True ∧' False))
+/-- info: True✝ /\' False✝ -/
+#guard_msgs in set_option pp.unicode false in #eval do ppTerm (← `(True /\' False))
+
+/-!
+Testing a notation with a `unicode` operator with `preserveForPP`.
+
+Respects the current setting of `pp.unicode` *if* the underlying atom is in the unicode form.
+The `notation` command creates a pretty printer using the ASCII form,
+so this is only possible if a delaborator creates a syntax quotation with the unicode form.
+-/
+def And'' := And
+
+notation:35 x:36 unicode(" ∧'' ", " /\\'' ", preserveForPP) y:35 => And'' x y
+
+/-- info: True /\'' False : Prop -/
+#guard_msgs in #check True ∧'' False
+
+/-- info: True /\'' False : Prop -/
+#guard_msgs in set_option pp.unicode false in #check True ∧'' False
+
+/-- info: True✝ ∧'' False✝ -/
+#guard_msgs in #eval do ppTerm (← `(True ∧'' False))
+/-- info: True✝ /\'' False✝ -/
+#guard_msgs in #eval do ppTerm (← `(True /\'' False))
+/-- info: True✝ /\'' False✝ -/
+#guard_msgs in set_option pp.unicode false in #eval do ppTerm (← `(True ∧'' False))
+/-- info: True✝ /\'' False✝ -/
+#guard_msgs in set_option pp.unicode false in #eval do ppTerm (← `(True /\'' False))
+
+/-!
+The `fun` notation uses `preserveForPP`.
+Whne `pp.unicode.fun` is true, its elaborator uses `↦`.
+-/
+/-- info: fun x => x : ?m.1 → ?m.1 -/
+#guard_msgs in #check fun x => x
+
+/-- info: fun x ↦ x : ?m.1 → ?m.1 -/
+#guard_msgs in set_option pp.unicode.fun true in #check fun x => x
+
+/-- info: fun x => x : ?m.1 -> ?m.1 -/
+#guard_msgs in set_option pp.unicode.fun true in set_option pp.unicode false in #check fun x => x
+
+-- The delaborator is what uses `↦`, so the option has no effect here.
+/-- info: fun x✝ => x✝ -/
+#guard_msgs in set_option pp.unicode.fun true in #eval do ppTerm (← `(fun x => x))
+
+/-- info: fun x✝ => x✝ -/
+#guard_msgs in #eval do ppTerm (← `(fun x => x))
+/-- info: fun x✝ ↦ x✝ -/
+#guard_msgs in #eval do ppTerm (← `(fun x ↦ x))
+/-- info: fun x✝ => x✝ -/
+#guard_msgs in set_option pp.unicode false in #eval do ppTerm (← `(fun x ↦ x))