diff --git a/RELEASES.md b/RELEASES.md index 6736fe1254..a2d912f10c 100644 --- a/RELEASES.md +++ b/RELEASES.md @@ -11,6 +11,15 @@ of each version. v4.5.0 (development in progress) --------- +* Modify the lexical syntax of string literals to have string gaps, which are escape sequences of the form `"\" newline whitespace*`. + These have the interpetation of an empty string and allow a string to flow across multiple lines without introducing additional whitespace. + The following is equivalent to `"this is a string"`. + ```lean + "this is \ + a string" + ``` + [PR #2821](https://github.com/leanprover/lean4/pull/2821) and [RFC #2838](https://github.com/leanprover/lean4/issues/2838). + v4.4.0 --------- diff --git a/doc/lexical_structure.md b/doc/lexical_structure.md index 3b62d226bd..d6d924b419 100644 --- a/doc/lexical_structure.md +++ b/doc/lexical_structure.md @@ -79,15 +79,19 @@ special characters: [Unicode table](https://unicode-table.com/en/) so "\xA9 Copyright 2021" is "© Copyright 2021". - `\uHHHH` puts the character represented by the 4 digit hexadecimal into the string, so the following string "\u65e5\u672c" will become "日本" which means "Japan". +- `\` followed by a newline and then any amount of whitespace is a "gap" that is equivalent to the empty string, +useful for letting a string literal span across multiple lines. Gaps spanning multiple lines can be confusing, +so the parser raises an error if the trailing whitespace contains any newlines. So the complete syntax is: ``` string : '"' string_item '"' - string_item : string_char | string_escape - string_char : [^\\] - string_escape: "\" ("\" | '"' | "'" | "n" | "t" | "x" hex_char{2} | "u" hex_char{4} ) + string_item : string_char | char_escape | string_gap + string_char : [^"\\] + char_escape : "\" ("\" | '"' | "'" | "n" | "t" | "x" hex_char{2} | "u" hex_char{4}) hex_char : [0-9a-fA-F] + string_gap : "\" newline whitespace* ``` Char Literals @@ -96,7 +100,9 @@ Char Literals Char literals are enclosed by single quotes (``'``). ``` - char: "'" string_item "'" + char : "'" char_item "'" + char_item : char_char | char_escape + char_char : [^'\\] ``` Numeric Literals diff --git a/src/Init/Meta.lean b/src/Init/Meta.lean index dee5526ebb..2d6a8a1e3b 100644 --- a/src/Init/Meta.lean +++ b/src/Init/Meta.lean @@ -773,6 +773,16 @@ def decodeQuotedChar (s : String) (i : String.Pos) : Option (Char × String.Pos) else none +/-- +Decodes a valid string gap after the `\`. +Note that this function matches `"\" whitespace+` rather than +the more restrictive `"\" newline whitespace*` since this simplifies the implementation. +Justification: this does not overlap with any other sequences beginning with `\`. +-/ +def decodeStringGap (s : String) (i : String.Pos) : Option String.Pos := do + guard <| (s.get i).isWhitespace + s.nextWhile Char.isWhitespace (s.next i) + partial def decodeStrLitAux (s : String) (i : String.Pos) (acc : String) : Option String := do let c := s.get i let i := s.next i @@ -781,8 +791,12 @@ partial def decodeStrLitAux (s : String) (i : String.Pos) (acc : String) : Optio else if s.atEnd i then none else if c == '\\' then do - let (c, i) ← decodeQuotedChar s i - decodeStrLitAux s i (acc.push c) + if let some (c, i) := decodeQuotedChar s i then + decodeStrLitAux s i (acc.push c) + else if let some i := decodeStringGap s i then + decodeStrLitAux s i acc + else + none else decodeStrLitAux s i (acc.push c) @@ -1162,8 +1176,12 @@ private partial def decodeInterpStrLit (s : String) : Option String := else if s.atEnd i then none else if c == '\\' then do - let (c, i) ← decodeInterpStrQuotedChar s i - loop i (acc.push c) + if let some (c, i) := decodeInterpStrQuotedChar s i then + loop i (acc.push c) + else if let some i := decodeStringGap s i then + loop i acc + else + none else loop i (acc.push c) loop ⟨1⟩ "" diff --git a/src/Lean/Parser/Basic.lean b/src/Lean/Parser/Basic.lean index d34063e055..03c0b3aa19 100644 --- a/src/Lean/Parser/Basic.lean +++ b/src/Lean/Parser/Basic.lean @@ -564,7 +564,40 @@ def hexDigitFn : ParserFn := fun c s => if curr.isDigit || ('a' <= curr && curr <= 'f') || ('A' <= curr && curr <= 'F') then s.setPos i else s.mkUnexpectedError "invalid hexadecimal numeral" -def quotedCharCoreFn (isQuotable : Char → Bool) : ParserFn := fun c s => +/-- +Parses the whitespace after the `\` when there is a string gap. +Raises an error if the whitespace does not contain exactly one newline character. +Processes `\r\n` as a newline. +-/ +partial def stringGapFn (seenNewline afterCR : Bool) : ParserFn := fun c s => + let i := s.pos + if h : c.input.atEnd i then s -- let strLitFnAux handle the EOI error if !seenNewline + else + let curr := c.input.get' i h + if curr == '\n' then + if seenNewline then + -- Having more than one newline in a string gap is visually confusing + s.mkUnexpectedError "unexpected additional newline in string gap" + else + stringGapFn true false c (s.next' c.input i h) + else if curr == '\r' then + stringGapFn seenNewline true c (s.next' c.input i h) + else if afterCR then + s.mkUnexpectedError "expecting newline after carriage return" + else if curr.isWhitespace then + stringGapFn seenNewline false c (s.next' c.input i h) + else if seenNewline then + s + else + s.mkUnexpectedError "expecting newline in string gap" + +/-- +Parses a string quotation after a `\`. +- `isQuotable` determines which characters are valid escapes +- `inString` enables features that are only valid within strings, + in particular `"\" newline whitespace*` gaps. +-/ +def quotedCharCoreFn (isQuotable : Char → Bool) (inString : Bool) : ParserFn := fun c s => let input := c.input let i := s.pos if h : input.atEnd i then s.mkEOIError @@ -576,6 +609,8 @@ def quotedCharCoreFn (isQuotable : Char → Bool) : ParserFn := fun c s => andthenFn hexDigitFn hexDigitFn c (s.next' input i h) else if curr == 'u' then andthenFn hexDigitFn (andthenFn hexDigitFn (andthenFn hexDigitFn hexDigitFn)) c (s.next' input i h) + else if inString && (curr == '\n' || curr == '\r') then + stringGapFn false false c s else s.mkUnexpectedError "invalid escape sequence" @@ -583,7 +618,14 @@ def isQuotableCharDefault (c : Char) : Bool := c == '\\' || c == '\"' || c == '\'' || c == 'r' || c == 'n' || c == 't' def quotedCharFn : ParserFn := - quotedCharCoreFn isQuotableCharDefault + quotedCharCoreFn isQuotableCharDefault false + +/-- +Like `quotedCharFn` but enables escapes that are only valid inside strings. +In particular, string gaps (`"\" newline whitespace*`). +-/ +def quotedStringFn : ParserFn := + quotedCharCoreFn isQuotableCharDefault true /-- Push `(Syntax.node tk )` onto syntax stack if parse was successful. -/ def mkNodeToken (n : SyntaxNodeKind) (startPos : String.Pos) : ParserFn := fun c s => Id.run do @@ -624,7 +666,7 @@ partial def strLitFnAux (startPos : String.Pos) : ParserFn := fun c s => let s := s.setPos (input.next' i h) if curr == '\"' then mkNodeToken strLitKind startPos c s - else if curr == '\\' then andthenFn quotedCharFn (strLitFnAux startPos) c s + else if curr == '\\' then andthenFn quotedStringFn (strLitFnAux startPos) c s else strLitFnAux startPos c s def decimalNumberFn (startPos : String.Pos) (c : ParserContext) : ParserState → ParserState := fun s => diff --git a/src/Lean/Parser/StrInterpolation.lean b/src/Lean/Parser/StrInterpolation.lean index 421a22341a..dc9ff6d92d 100644 --- a/src/Lean/Parser/StrInterpolation.lean +++ b/src/Lean/Parser/StrInterpolation.lean @@ -24,7 +24,7 @@ partial def interpolatedStrFn (p : ParserFn) : ParserFn := fun c s => let s := mkNodeToken interpolatedStrLitKind startPos c s s.mkNode interpolatedStrKind stackSize else if curr == '\\' then - andthenFn (quotedCharCoreFn isQuotableCharForStrInterpolant) (parse startPos) c s + andthenFn (quotedCharCoreFn isQuotableCharForStrInterpolant true) (parse startPos) c s else if curr == '{' then let s := mkNodeToken interpolatedStrLitKind startPos c s let s := p c s diff --git a/tests/lean/string_gaps.lean b/tests/lean/string_gaps.lean new file mode 100644 index 0000000000..b6aadbe09c --- /dev/null +++ b/tests/lean/string_gaps.lean @@ -0,0 +1,140 @@ +import Lean.Parser.Extension +import Lean.Elab.Term + +/-! +# Testing string gaps in string literals + +String gaps are described in RFC #2838 +-/ + +/-! +A string gap with no trailing space. +-/ +#eval "a\ +b" + +/-! +A string gap with trailing space before the `b`, which is consumed. +-/ +#eval "a\ + b" + +/-! +A string gap with space before the gap, which is not consumed. +-/ +#eval "a \ + b" + +/-! +Multiple string gaps in a row. +-/ +#eval "a \ + \ + \ + b" + +/-! +Two tests from the RFC. +-/ +#eval "this is \ + a string" +#eval "this is \ + a string" + +/-! +Two examples of how spaces are accounted for in string gaps. `\x20` is a way to force a leading space. +-/ +#eval "there are three spaces between the brackets < \ + >" +#eval "there are three spaces between the brackets <\ + \x20 >" + +/-! +Using `\n` to terminate a string gap, which is a technique suggested by Mario for using string gaps to write +multiline literals in an indented context. +-/ +#eval "this is\ + \n a string with two space indent" + +/-! +Similar tests but for interpolated strings. +-/ +#eval s!"a\ +b" +#eval s!"a\ + b" +#eval s!"a\ + b" + +/-! +The `{` terminates the string gap. +-/ +#eval s!"a\ + {"b"}\ + " + +open Lean + +/-! +## Testing whitespace handling with specific line terminators +-/ + +/-! +Standard string gap, with LF +-/ +#eval show MetaM String from do + let stx ← ofExcept <| Parser.runParserCategory (← getEnv) `term "\"a\\\n b\"" + let some s := stx.isStrLit? | failure + return s + +/-! +Standard string gap, with CRLF +-/ +#eval show MetaM String from do + let stx ← ofExcept <| Parser.runParserCategory (← getEnv) `term "\"a\\\r\n b\"" + let some s := stx.isStrLit? | failure + return s + +/-! +Isolated CR, which is an error +-/ +#eval show MetaM String from do + let stx ← ofExcept <| Parser.runParserCategory (← getEnv) `term "\"a\\\r b\"" + let some s := stx.isStrLit? | failure + return s + +/-! +Not a string gap since there's no end-of-line. +-/ +#eval show MetaM String from do + let stx ← ofExcept <| Parser.runParserCategory (← getEnv) `term "\"a\\ b\"" + let some s := stx.isStrLit? | failure + return s + +/-! +## Scala-style stripMargin + +This is a test that string gaps could be paired with a new string elaboration syntax +for indented multiline string literals. +-/ + +def String.dedent (s : String) : Option String := + let parts := s.split (· == '\n') |>.map String.trimLeft + match parts with + | [] => "" + | [p] => p + | p₀ :: parts => + if !parts.all (·.startsWith "|") then + none + else + p₀ ++ "\n" ++ String.intercalate "\n" (parts.map fun p => p.drop 1) + +elab "d!" s:str : term => do + let some s := s.raw.isStrLit? | Lean.Elab.throwIllFormedSyntax + let some s := String.dedent s | Lean.Elab.throwIllFormedSyntax + pure $ Lean.mkStrLit s + +#eval d!"this is \ + line 1 + | line 2, indented + |line 3" diff --git a/tests/lean/string_gaps.lean.expected.out b/tests/lean/string_gaps.lean.expected.out new file mode 100644 index 0000000000..386346a9ab --- /dev/null +++ b/tests/lean/string_gaps.lean.expected.out @@ -0,0 +1,20 @@ +"ab" +"ab" +"a b" +"a b" +"this is a string" +"this is a string" +"there are three spaces between the brackets < >" +"there are three spaces between the brackets < >" +"this is\n a string with two space indent" +"ab" +"ab" +"ab" +"ab" +"ab" +"ab" + +string_gaps.lean:101:0-104:10: error: :1:4: expecting newline after carriage return + +string_gaps.lean:109:0-112:10: error: :1:3: invalid escape sequence +"this is line 1\n line 2, indented\nline 3" diff --git a/tests/lean/string_gaps_err_newline.lean b/tests/lean/string_gaps_err_newline.lean new file mode 100644 index 0000000000..14820ef488 --- /dev/null +++ b/tests/lean/string_gaps_err_newline.lean @@ -0,0 +1,9 @@ +/-! +# String gaps error test + +A string gap shouldn't have more than one trailing newline according to RFC #2838 +-/ + +#eval "a\ + +b" diff --git a/tests/lean/string_gaps_err_newline.lean.expected.out b/tests/lean/string_gaps_err_newline.lean.expected.out new file mode 100644 index 0000000000..a5d95aa550 --- /dev/null +++ b/tests/lean/string_gaps_err_newline.lean.expected.out @@ -0,0 +1 @@ +string_gaps_err_newline.lean:8:0: error: unexpected additional newline in string gap