feat: string gaps for continuing string literals across multiple lines (#2821)
Implements "gaps" in string literals. These are escape sequences of the
form `"\" newline whitespace+` that have the interpretation of an empty
string. For example,
```
"this is \
a string"
```
is equivalent to `"this is a string"`. These are modeled after string
continuations in
[Rust](https://doc.rust-lang.org/beta/reference/tokens.html#string-literals).
Implements RFC #2838
This commit is contained in:
parent
ec8811a75a
commit
bcbcf50442
9 changed files with 257 additions and 12 deletions
|
|
@ -11,6 +11,15 @@ of each version.
|
|||
v4.5.0 (development in progress)
|
||||
---------
|
||||
|
||||
* Modify the lexical syntax of string literals to have string gaps, which are escape sequences of the form `"\" newline whitespace*`.
|
||||
These have the interpetation of an empty string and allow a string to flow across multiple lines without introducing additional whitespace.
|
||||
The following is equivalent to `"this is a string"`.
|
||||
```lean
|
||||
"this is \
|
||||
a string"
|
||||
```
|
||||
[PR #2821](https://github.com/leanprover/lean4/pull/2821) and [RFC #2838](https://github.com/leanprover/lean4/issues/2838).
|
||||
|
||||
v4.4.0
|
||||
---------
|
||||
|
||||
|
|
|
|||
|
|
@ -79,15 +79,19 @@ special characters:
|
|||
[Unicode table](https://unicode-table.com/en/) so "\xA9 Copyright 2021" is "© Copyright 2021".
|
||||
- `\uHHHH` puts the character represented by the 4 digit hexadecimal into the string, so the following
|
||||
string "\u65e5\u672c" will become "日本" which means "Japan".
|
||||
- `\` followed by a newline and then any amount of whitespace is a "gap" that is equivalent to the empty string,
|
||||
useful for letting a string literal span across multiple lines. Gaps spanning multiple lines can be confusing,
|
||||
so the parser raises an error if the trailing whitespace contains any newlines.
|
||||
|
||||
So the complete syntax is:
|
||||
|
||||
```
|
||||
string : '"' string_item '"'
|
||||
string_item : string_char | string_escape
|
||||
string_char : [^\\]
|
||||
string_escape: "\" ("\" | '"' | "'" | "n" | "t" | "x" hex_char{2} | "u" hex_char{4} )
|
||||
string_item : string_char | char_escape | string_gap
|
||||
string_char : [^"\\]
|
||||
char_escape : "\" ("\" | '"' | "'" | "n" | "t" | "x" hex_char{2} | "u" hex_char{4})
|
||||
hex_char : [0-9a-fA-F]
|
||||
string_gap : "\" newline whitespace*
|
||||
```
|
||||
|
||||
Char Literals
|
||||
|
|
@ -96,7 +100,9 @@ Char Literals
|
|||
Char literals are enclosed by single quotes (``'``).
|
||||
|
||||
```
|
||||
char: "'" string_item "'"
|
||||
char : "'" char_item "'"
|
||||
char_item : char_char | char_escape
|
||||
char_char : [^'\\]
|
||||
```
|
||||
|
||||
Numeric Literals
|
||||
|
|
|
|||
|
|
@ -773,6 +773,16 @@ def decodeQuotedChar (s : String) (i : String.Pos) : Option (Char × String.Pos)
|
|||
else
|
||||
none
|
||||
|
||||
/--
|
||||
Decodes a valid string gap after the `\`.
|
||||
Note that this function matches `"\" whitespace+` rather than
|
||||
the more restrictive `"\" newline whitespace*` since this simplifies the implementation.
|
||||
Justification: this does not overlap with any other sequences beginning with `\`.
|
||||
-/
|
||||
def decodeStringGap (s : String) (i : String.Pos) : Option String.Pos := do
|
||||
guard <| (s.get i).isWhitespace
|
||||
s.nextWhile Char.isWhitespace (s.next i)
|
||||
|
||||
partial def decodeStrLitAux (s : String) (i : String.Pos) (acc : String) : Option String := do
|
||||
let c := s.get i
|
||||
let i := s.next i
|
||||
|
|
@ -781,8 +791,12 @@ partial def decodeStrLitAux (s : String) (i : String.Pos) (acc : String) : Optio
|
|||
else if s.atEnd i then
|
||||
none
|
||||
else if c == '\\' then do
|
||||
let (c, i) ← decodeQuotedChar s i
|
||||
decodeStrLitAux s i (acc.push c)
|
||||
if let some (c, i) := decodeQuotedChar s i then
|
||||
decodeStrLitAux s i (acc.push c)
|
||||
else if let some i := decodeStringGap s i then
|
||||
decodeStrLitAux s i acc
|
||||
else
|
||||
none
|
||||
else
|
||||
decodeStrLitAux s i (acc.push c)
|
||||
|
||||
|
|
@ -1162,8 +1176,12 @@ private partial def decodeInterpStrLit (s : String) : Option String :=
|
|||
else if s.atEnd i then
|
||||
none
|
||||
else if c == '\\' then do
|
||||
let (c, i) ← decodeInterpStrQuotedChar s i
|
||||
loop i (acc.push c)
|
||||
if let some (c, i) := decodeInterpStrQuotedChar s i then
|
||||
loop i (acc.push c)
|
||||
else if let some i := decodeStringGap s i then
|
||||
loop i acc
|
||||
else
|
||||
none
|
||||
else
|
||||
loop i (acc.push c)
|
||||
loop ⟨1⟩ ""
|
||||
|
|
|
|||
|
|
@ -564,7 +564,40 @@ def hexDigitFn : ParserFn := fun c s =>
|
|||
if curr.isDigit || ('a' <= curr && curr <= 'f') || ('A' <= curr && curr <= 'F') then s.setPos i
|
||||
else s.mkUnexpectedError "invalid hexadecimal numeral"
|
||||
|
||||
def quotedCharCoreFn (isQuotable : Char → Bool) : ParserFn := fun c s =>
|
||||
/--
|
||||
Parses the whitespace after the `\` when there is a string gap.
|
||||
Raises an error if the whitespace does not contain exactly one newline character.
|
||||
Processes `\r\n` as a newline.
|
||||
-/
|
||||
partial def stringGapFn (seenNewline afterCR : Bool) : ParserFn := fun c s =>
|
||||
let i := s.pos
|
||||
if h : c.input.atEnd i then s -- let strLitFnAux handle the EOI error if !seenNewline
|
||||
else
|
||||
let curr := c.input.get' i h
|
||||
if curr == '\n' then
|
||||
if seenNewline then
|
||||
-- Having more than one newline in a string gap is visually confusing
|
||||
s.mkUnexpectedError "unexpected additional newline in string gap"
|
||||
else
|
||||
stringGapFn true false c (s.next' c.input i h)
|
||||
else if curr == '\r' then
|
||||
stringGapFn seenNewline true c (s.next' c.input i h)
|
||||
else if afterCR then
|
||||
s.mkUnexpectedError "expecting newline after carriage return"
|
||||
else if curr.isWhitespace then
|
||||
stringGapFn seenNewline false c (s.next' c.input i h)
|
||||
else if seenNewline then
|
||||
s
|
||||
else
|
||||
s.mkUnexpectedError "expecting newline in string gap"
|
||||
|
||||
/--
|
||||
Parses a string quotation after a `\`.
|
||||
- `isQuotable` determines which characters are valid escapes
|
||||
- `inString` enables features that are only valid within strings,
|
||||
in particular `"\" newline whitespace*` gaps.
|
||||
-/
|
||||
def quotedCharCoreFn (isQuotable : Char → Bool) (inString : Bool) : ParserFn := fun c s =>
|
||||
let input := c.input
|
||||
let i := s.pos
|
||||
if h : input.atEnd i then s.mkEOIError
|
||||
|
|
@ -576,6 +609,8 @@ def quotedCharCoreFn (isQuotable : Char → Bool) : ParserFn := fun c s =>
|
|||
andthenFn hexDigitFn hexDigitFn c (s.next' input i h)
|
||||
else if curr == 'u' then
|
||||
andthenFn hexDigitFn (andthenFn hexDigitFn (andthenFn hexDigitFn hexDigitFn)) c (s.next' input i h)
|
||||
else if inString && (curr == '\n' || curr == '\r') then
|
||||
stringGapFn false false c s
|
||||
else
|
||||
s.mkUnexpectedError "invalid escape sequence"
|
||||
|
||||
|
|
@ -583,7 +618,14 @@ def isQuotableCharDefault (c : Char) : Bool :=
|
|||
c == '\\' || c == '\"' || c == '\'' || c == 'r' || c == 'n' || c == 't'
|
||||
|
||||
def quotedCharFn : ParserFn :=
|
||||
quotedCharCoreFn isQuotableCharDefault
|
||||
quotedCharCoreFn isQuotableCharDefault false
|
||||
|
||||
/--
|
||||
Like `quotedCharFn` but enables escapes that are only valid inside strings.
|
||||
In particular, string gaps (`"\" newline whitespace*`).
|
||||
-/
|
||||
def quotedStringFn : ParserFn :=
|
||||
quotedCharCoreFn isQuotableCharDefault true
|
||||
|
||||
/-- Push `(Syntax.node tk <new-atom>)` onto syntax stack if parse was successful. -/
|
||||
def mkNodeToken (n : SyntaxNodeKind) (startPos : String.Pos) : ParserFn := fun c s => Id.run do
|
||||
|
|
@ -624,7 +666,7 @@ partial def strLitFnAux (startPos : String.Pos) : ParserFn := fun c s =>
|
|||
let s := s.setPos (input.next' i h)
|
||||
if curr == '\"' then
|
||||
mkNodeToken strLitKind startPos c s
|
||||
else if curr == '\\' then andthenFn quotedCharFn (strLitFnAux startPos) c s
|
||||
else if curr == '\\' then andthenFn quotedStringFn (strLitFnAux startPos) c s
|
||||
else strLitFnAux startPos c s
|
||||
|
||||
def decimalNumberFn (startPos : String.Pos) (c : ParserContext) : ParserState → ParserState := fun s =>
|
||||
|
|
|
|||
|
|
@ -24,7 +24,7 @@ partial def interpolatedStrFn (p : ParserFn) : ParserFn := fun c s =>
|
|||
let s := mkNodeToken interpolatedStrLitKind startPos c s
|
||||
s.mkNode interpolatedStrKind stackSize
|
||||
else if curr == '\\' then
|
||||
andthenFn (quotedCharCoreFn isQuotableCharForStrInterpolant) (parse startPos) c s
|
||||
andthenFn (quotedCharCoreFn isQuotableCharForStrInterpolant true) (parse startPos) c s
|
||||
else if curr == '{' then
|
||||
let s := mkNodeToken interpolatedStrLitKind startPos c s
|
||||
let s := p c s
|
||||
|
|
|
|||
140
tests/lean/string_gaps.lean
Normal file
140
tests/lean/string_gaps.lean
Normal file
|
|
@ -0,0 +1,140 @@
|
|||
import Lean.Parser.Extension
|
||||
import Lean.Elab.Term
|
||||
|
||||
/-!
|
||||
# Testing string gaps in string literals
|
||||
|
||||
String gaps are described in RFC #2838
|
||||
-/
|
||||
|
||||
/-!
|
||||
A string gap with no trailing space.
|
||||
-/
|
||||
#eval "a\
|
||||
b"
|
||||
|
||||
/-!
|
||||
A string gap with trailing space before the `b`, which is consumed.
|
||||
-/
|
||||
#eval "a\
|
||||
b"
|
||||
|
||||
/-!
|
||||
A string gap with space before the gap, which is not consumed.
|
||||
-/
|
||||
#eval "a \
|
||||
b"
|
||||
|
||||
/-!
|
||||
Multiple string gaps in a row.
|
||||
-/
|
||||
#eval "a \
|
||||
\
|
||||
\
|
||||
b"
|
||||
|
||||
/-!
|
||||
Two tests from the RFC.
|
||||
-/
|
||||
#eval "this is \
|
||||
a string"
|
||||
#eval "this is \
|
||||
a string"
|
||||
|
||||
/-!
|
||||
Two examples of how spaces are accounted for in string gaps. `\x20` is a way to force a leading space.
|
||||
-/
|
||||
#eval "there are three spaces between the brackets < \
|
||||
>"
|
||||
#eval "there are three spaces between the brackets <\
|
||||
\x20 >"
|
||||
|
||||
/-!
|
||||
Using `\n` to terminate a string gap, which is a technique suggested by Mario for using string gaps to write
|
||||
multiline literals in an indented context.
|
||||
-/
|
||||
#eval "this is\
|
||||
\n a string with two space indent"
|
||||
|
||||
/-!
|
||||
Similar tests but for interpolated strings.
|
||||
-/
|
||||
#eval s!"a\
|
||||
b"
|
||||
#eval s!"a\
|
||||
b"
|
||||
#eval s!"a\
|
||||
b"
|
||||
|
||||
/-!
|
||||
The `{` terminates the string gap.
|
||||
-/
|
||||
#eval s!"a\
|
||||
{"b"}\
|
||||
"
|
||||
|
||||
open Lean
|
||||
|
||||
/-!
|
||||
## Testing whitespace handling with specific line terminators
|
||||
-/
|
||||
|
||||
/-!
|
||||
Standard string gap, with LF
|
||||
-/
|
||||
#eval show MetaM String from do
|
||||
let stx ← ofExcept <| Parser.runParserCategory (← getEnv) `term "\"a\\\n b\""
|
||||
let some s := stx.isStrLit? | failure
|
||||
return s
|
||||
|
||||
/-!
|
||||
Standard string gap, with CRLF
|
||||
-/
|
||||
#eval show MetaM String from do
|
||||
let stx ← ofExcept <| Parser.runParserCategory (← getEnv) `term "\"a\\\r\n b\""
|
||||
let some s := stx.isStrLit? | failure
|
||||
return s
|
||||
|
||||
/-!
|
||||
Isolated CR, which is an error
|
||||
-/
|
||||
#eval show MetaM String from do
|
||||
let stx ← ofExcept <| Parser.runParserCategory (← getEnv) `term "\"a\\\r b\""
|
||||
let some s := stx.isStrLit? | failure
|
||||
return s
|
||||
|
||||
/-!
|
||||
Not a string gap since there's no end-of-line.
|
||||
-/
|
||||
#eval show MetaM String from do
|
||||
let stx ← ofExcept <| Parser.runParserCategory (← getEnv) `term "\"a\\ b\""
|
||||
let some s := stx.isStrLit? | failure
|
||||
return s
|
||||
|
||||
/-!
|
||||
## Scala-style stripMargin
|
||||
|
||||
This is a test that string gaps could be paired with a new string elaboration syntax
|
||||
for indented multiline string literals.
|
||||
-/
|
||||
|
||||
def String.dedent (s : String) : Option String :=
|
||||
let parts := s.split (· == '\n') |>.map String.trimLeft
|
||||
match parts with
|
||||
| [] => ""
|
||||
| [p] => p
|
||||
| p₀ :: parts =>
|
||||
if !parts.all (·.startsWith "|") then
|
||||
none
|
||||
else
|
||||
p₀ ++ "\n" ++ String.intercalate "\n" (parts.map fun p => p.drop 1)
|
||||
|
||||
elab "d!" s:str : term => do
|
||||
let some s := s.raw.isStrLit? | Lean.Elab.throwIllFormedSyntax
|
||||
let some s := String.dedent s | Lean.Elab.throwIllFormedSyntax
|
||||
pure $ Lean.mkStrLit s
|
||||
|
||||
#eval d!"this is \
|
||||
line 1
|
||||
| line 2, indented
|
||||
|line 3"
|
||||
20
tests/lean/string_gaps.lean.expected.out
Normal file
20
tests/lean/string_gaps.lean.expected.out
Normal file
|
|
@ -0,0 +1,20 @@
|
|||
"ab"
|
||||
"ab"
|
||||
"a b"
|
||||
"a b"
|
||||
"this is a string"
|
||||
"this is a string"
|
||||
"there are three spaces between the brackets < >"
|
||||
"there are three spaces between the brackets < >"
|
||||
"this is\n a string with two space indent"
|
||||
"ab"
|
||||
"ab"
|
||||
"ab"
|
||||
"ab"
|
||||
"ab"
|
||||
"ab"
|
||||
|
||||
string_gaps.lean:101:0-104:10: error: <input>:1:4: expecting newline after carriage return
|
||||
|
||||
string_gaps.lean:109:0-112:10: error: <input>:1:3: invalid escape sequence
|
||||
"this is line 1\n line 2, indented\nline 3"
|
||||
9
tests/lean/string_gaps_err_newline.lean
Normal file
9
tests/lean/string_gaps_err_newline.lean
Normal file
|
|
@ -0,0 +1,9 @@
|
|||
/-!
|
||||
# String gaps error test
|
||||
|
||||
A string gap shouldn't have more than one trailing newline according to RFC #2838
|
||||
-/
|
||||
|
||||
#eval "a\
|
||||
|
||||
b"
|
||||
1
tests/lean/string_gaps_err_newline.lean.expected.out
Normal file
1
tests/lean/string_gaps_err_newline.lean.expected.out
Normal file
|
|
@ -0,0 +1 @@
|
|||
string_gaps_err_newline.lean:8:0: error: unexpected additional newline in string gap
|
||||
Loading…
Add table
Reference in a new issue