feat: Rust-style raw string literals (#2929)
For example, `r"\n"` and `r#"The word "this" is in quotes."#`. Implements #1422
This commit is contained in:
parent
79c7b27034
commit
ae6fe098cb
7 changed files with 219 additions and 3 deletions
|
|
@ -20,6 +20,11 @@ v4.5.0 (development in progress)
|
|||
```
|
||||
[PR #2821](https://github.com/leanprover/lean4/pull/2821) and [RFC #2838](https://github.com/leanprover/lean4/issues/2838).
|
||||
|
||||
* Add raw string literal syntax. For example, `r"\n"` is equivalent to `"\\n"`, with no escape processing.
|
||||
To include double quote characters in a raw string one can add sufficiently many `#` characters before and after
|
||||
the bounding `"`s, as in `r#"the "the" is in quotes"#` for `"the \"the\" is in quotes"`.
|
||||
[PR #2929](https://github.com/leanprover/lean4/pull/2929) and [issue #1422](https://github.com/leanprover/lean4/issues/1422).
|
||||
|
||||
* The low-level `termination_by'` clause is no longer supported.
|
||||
|
||||
Migration guide: Use `termination_by` instead, e.g.:
|
||||
|
|
|
|||
|
|
@ -8,7 +8,7 @@ A Lean program consists of a stream of UTF-8 tokens where each token
|
|||
is one of the following:
|
||||
|
||||
```
|
||||
token: symbol | command | ident | string | char | numeral |
|
||||
token: symbol | command | ident | string | raw_string | char | numeral |
|
||||
: decimal | doc_comment | mod_doc_comment | field_notation
|
||||
```
|
||||
|
||||
|
|
@ -94,6 +94,22 @@ So the complete syntax is:
|
|||
string_gap : "\" newline whitespace*
|
||||
```
|
||||
|
||||
Raw String Literals
|
||||
===================
|
||||
|
||||
Raw string literals are string literals without any escape character processing.
|
||||
They begin with `r##...#"` (with zero or more `#` characters) and end with `"#...##` (with the same number of `#` characters).
|
||||
The contents of a raw string literal may contain `"##..#` so long as the number of `#` characters
|
||||
is less than the number of `#` characters used to begin the raw string literal.
|
||||
|
||||
```
|
||||
raw_string : raw_string_aux(0) | raw_string_aux(1) | raw_string_aux(2) | ...
|
||||
raw_string_aux(n) : 'r' '#'{n} '"' raw_string_item '"' '#'{n}
|
||||
raw_string_item(n) : raw_string_char | raw_string_quote(n)
|
||||
raw_string_char : [^"]
|
||||
raw_string_quote(n) : '"' '#'{0..n-1}
|
||||
```
|
||||
|
||||
Char Literals
|
||||
=============
|
||||
|
||||
|
|
|
|||
|
|
@ -800,9 +800,40 @@ partial def decodeStrLitAux (s : String) (i : String.Pos) (acc : String) : Optio
|
|||
else
|
||||
decodeStrLitAux s i (acc.push c)
|
||||
|
||||
def decodeStrLit (s : String) : Option String :=
|
||||
decodeStrLitAux s ⟨1⟩ ""
|
||||
/--
|
||||
Takes a raw string literal, counts the number of `#`'s after the `r`, and interprets it as a string.
|
||||
The position `i` should start at `1`, which is the character after the leading `r`.
|
||||
The algorithm is simple: we are given `r##...#"...string..."##...#` with zero or more `#`s.
|
||||
By counting the number of leading `#`'s, we can extract the `...string...`.
|
||||
-/
|
||||
partial def decodeRawStrLitAux (s : String) (i : String.Pos) (num : Nat) : String :=
|
||||
let c := s.get i
|
||||
let i := s.next i
|
||||
if c == '#' then
|
||||
decodeRawStrLitAux s i (num + 1)
|
||||
else
|
||||
s.extract i ⟨s.utf8ByteSize - (num + 1)⟩
|
||||
|
||||
/--
|
||||
Takes the string literal lexical syntax parsed by the parser and interprets it as a string.
|
||||
This is where escape sequences are processed for example.
|
||||
The string `s` is is either a plain string literal or a raw string literal.
|
||||
|
||||
If it returns `none` then the string literal is ill-formed, which indicates a bug in the parser.
|
||||
The function is not required to return `none` if the string literal is ill-formed.
|
||||
-/
|
||||
def decodeStrLit (s : String) : Option String :=
|
||||
if s.get 0 == 'r' then
|
||||
decodeRawStrLitAux s ⟨1⟩ 0
|
||||
else
|
||||
decodeStrLitAux s ⟨1⟩ ""
|
||||
|
||||
/--
|
||||
If the provided `Syntax` is a string literal, returns the string it represents.
|
||||
|
||||
Even if the `Syntax` is a `str` node, the function may return `none` if its internally ill-formed.
|
||||
The parser should always create well-formed `str` nodes.
|
||||
-/
|
||||
def isStrLit? (stx : Syntax) : Option String :=
|
||||
match isLit? strLitKind stx with
|
||||
| some val => decodeStrLit val
|
||||
|
|
|
|||
|
|
@ -669,6 +669,90 @@ partial def strLitFnAux (startPos : String.Pos) : ParserFn := fun c s =>
|
|||
else if curr == '\\' then andthenFn quotedStringFn (strLitFnAux startPos) c s
|
||||
else strLitFnAux startPos c s
|
||||
|
||||
/--
|
||||
Raw strings have the syntax `r##...#"..."#...##` with zero or more `#`'s.
|
||||
If we are looking at a valid start to a raw string (`r##...#"`),
|
||||
returns true.
|
||||
We assume `i` begins at the position immediately after `r`.
|
||||
-/
|
||||
partial def isRawStrLitStart (input : String) (i : String.Pos) : Bool :=
|
||||
if h : input.atEnd i then false
|
||||
else
|
||||
let curr := input.get' i h
|
||||
if curr == '#' then
|
||||
isRawStrLitStart input (input.next' i h)
|
||||
else
|
||||
curr == '"'
|
||||
|
||||
/--
|
||||
Parses a raw string literal assuming `isRawStrLitStart` has returned true.
|
||||
The `startPos` is the start of the raw string (at the `r`).
|
||||
The parser state is assumed to be immediately after the `r`.
|
||||
-/
|
||||
partial def rawStrLitFnAux (startPos : String.Pos) : ParserFn := initState 0
|
||||
where
|
||||
/--
|
||||
Gives the "unterminated raw string literal" error.
|
||||
-/
|
||||
errorUnterminated (s : ParserState) := s.mkUnexpectedErrorAt "unterminated raw string literal" startPos
|
||||
/--
|
||||
Parses the `#`'s and `"` at the beginning of the raw string.
|
||||
The `num` variable counts the number of `#`s after the `r`.
|
||||
-/
|
||||
initState (num : Nat) : ParserFn := fun c s =>
|
||||
let input := c.input
|
||||
let i := s.pos
|
||||
if h : input.atEnd i then errorUnterminated s
|
||||
else
|
||||
let curr := input.get' i h
|
||||
let s := s.setPos (input.next' i h)
|
||||
if curr == '#' then
|
||||
initState (num + 1) c s
|
||||
else if curr == '"' then
|
||||
normalState num c s
|
||||
else
|
||||
-- This should not occur, since we assume `isRawStrLitStart` succeeded.
|
||||
errorUnterminated s
|
||||
/--
|
||||
Parses characters after the first `"`. If we need to start counting `#`'s to decide if we are closing
|
||||
the raw string literal, we switch to `closingState`.
|
||||
-/
|
||||
normalState (num : Nat) : ParserFn := fun c s =>
|
||||
let input := c.input
|
||||
let i := s.pos
|
||||
if h : input.atEnd i then errorUnterminated s
|
||||
else
|
||||
let curr := input.get' i h
|
||||
let s := s.setPos (input.next' i h)
|
||||
if curr == '\"' then
|
||||
if num == 0 then
|
||||
mkNodeToken strLitKind startPos c s
|
||||
else
|
||||
closingState num 0 c s
|
||||
else
|
||||
normalState num c s
|
||||
/--
|
||||
Parses `#` characters immediately after a `"`.
|
||||
The `closingNum` variable counts the number of `#`s seen after the `"`.
|
||||
Note: `num > 0` since the `num = 0` case is entirely handled by `normalState`.
|
||||
-/
|
||||
closingState (num : Nat) (closingNum : Nat) : ParserFn := fun c s =>
|
||||
let input := c.input
|
||||
let i := s.pos
|
||||
if h : input.atEnd i then errorUnterminated s
|
||||
else
|
||||
let curr := input.get' i h
|
||||
let s := s.setPos (input.next' i h)
|
||||
if curr == '#' then
|
||||
if closingNum + 1 == num then
|
||||
mkNodeToken strLitKind startPos c s
|
||||
else
|
||||
closingState num (closingNum + 1) c s
|
||||
else if curr == '\"' then
|
||||
closingState num 0 c s
|
||||
else
|
||||
normalState num c s
|
||||
|
||||
def decimalNumberFn (startPos : String.Pos) (c : ParserContext) : ParserState → ParserState := fun s =>
|
||||
let s := takeWhileFn (fun c => c.isDigit) c s
|
||||
let input := c.input
|
||||
|
|
@ -862,6 +946,8 @@ private def tokenFnAux : ParserFn := fun c s =>
|
|||
numberFnAux c s
|
||||
else if curr == '`' && isIdFirstOrBeginEscape (getNext input i) then
|
||||
nameLitAux i c s
|
||||
else if curr == 'r' && isRawStrLitStart input (input.next i) then
|
||||
rawStrLitFnAux i c (s.next input i)
|
||||
else
|
||||
let tk := c.tokens.matchPrefix input i
|
||||
identFnAux i tk .anonymous c s
|
||||
|
|
|
|||
5
tests/lean/rawStringEOF.lean
Normal file
5
tests/lean/rawStringEOF.lean
Normal file
|
|
@ -0,0 +1,5 @@
|
|||
/-!
|
||||
# Testing an unterminated raw string literal
|
||||
-/
|
||||
|
||||
#check r###"this is a raw string, unterminated"##
|
||||
1
tests/lean/rawStringEOF.lean.expected.out
Normal file
1
tests/lean/rawStringEOF.lean.expected.out
Normal file
|
|
@ -0,0 +1 @@
|
|||
rawStringEOF.lean:5:7: error: unterminated raw string literal
|
||||
72
tests/lean/run/rawStrings.lean
Normal file
72
tests/lean/run/rawStrings.lean
Normal file
|
|
@ -0,0 +1,72 @@
|
|||
/-!
|
||||
# Testing raw string literals
|
||||
|
||||
Implemented in PR #2929 for issue #1422.
|
||||
-/
|
||||
|
||||
/-!
|
||||
Empty raw string
|
||||
-/
|
||||
example : r"" = "" := rfl
|
||||
|
||||
/-!
|
||||
Empty raw string with at least one `#`
|
||||
-/
|
||||
example : r#""# = "" := rfl
|
||||
|
||||
/-!
|
||||
A nonempty raw string
|
||||
-/
|
||||
example : r"hi" = "hi" := rfl
|
||||
|
||||
/-!
|
||||
A nonempty raw string with at least one `#`
|
||||
-/
|
||||
example : r#"hi"# = "hi" := rfl
|
||||
|
||||
/-!
|
||||
A nonempty raw string with `#`, testing embedded `"`'s
|
||||
-/
|
||||
example : r#""hello""# = "\"hello\"" := rfl
|
||||
|
||||
/-!
|
||||
A nonempty raw string with `#`, testing embedded `"`'s, one not at the end of the string
|
||||
-/
|
||||
example : r#""hello" said the world"# = "\"hello\" said the world" := rfl
|
||||
|
||||
/-!
|
||||
A nonempty raw string for just `"`
|
||||
-/
|
||||
example : r#"""# = "\"" := rfl
|
||||
|
||||
/-!
|
||||
A raw string with a `\`, which does not get interpreted as an escape
|
||||
-/
|
||||
example : r"\n" = "\\n" := rfl
|
||||
|
||||
/-!
|
||||
A raw string for just `\`, and it doesn't escape the final `"`
|
||||
-/
|
||||
example : r"\" = "\\" := rfl
|
||||
|
||||
/-!
|
||||
A raw string with `#` inside, testing that the first `"` doesn't get double-interpreted
|
||||
as both the start and end.
|
||||
-/
|
||||
example : r#"#"# = "#" := rfl
|
||||
|
||||
/-!
|
||||
Testing using `##` raw strings to allow `"#` inside the string.
|
||||
-/
|
||||
example : r##"need two #'s in "# to close"## = "need two #'s in \"# to close" := rfl
|
||||
|
||||
/-!
|
||||
From Rust reference
|
||||
-/
|
||||
example : r##"foo #"# bar"## = "foo #\"# bar" := rfl
|
||||
|
||||
/-!
|
||||
Testing that we are conservative when counting closing `#`s.
|
||||
-/
|
||||
infix:100 " # " => Prod.mk
|
||||
example : r#"a"##"b" = ("a", "b") := rfl
|
||||
Loading…
Add table
Reference in a new issue