diff --git a/RELEASES.md b/RELEASES.md index 47c52e8794..2e36c134d1 100644 --- a/RELEASES.md +++ b/RELEASES.md @@ -20,6 +20,11 @@ v4.5.0 (development in progress) ``` [PR #2821](https://github.com/leanprover/lean4/pull/2821) and [RFC #2838](https://github.com/leanprover/lean4/issues/2838). +* Add raw string literal syntax. For example, `r"\n"` is equivalent to `"\\n"`, with no escape processing. + To include double quote characters in a raw string one can add sufficiently many `#` characters before and after + the bounding `"`s, as in `r#"the "the" is in quotes"#` for `"the \"the\" is in quotes"`. + [PR #2929](https://github.com/leanprover/lean4/pull/2929) and [issue #1422](https://github.com/leanprover/lean4/issues/1422). + * The low-level `termination_by'` clause is no longer supported. Migration guide: Use `termination_by` instead, e.g.: diff --git a/doc/lexical_structure.md b/doc/lexical_structure.md index d6d924b419..93febbdb28 100644 --- a/doc/lexical_structure.md +++ b/doc/lexical_structure.md @@ -8,7 +8,7 @@ A Lean program consists of a stream of UTF-8 tokens where each token is one of the following: ``` - token: symbol | command | ident | string | char | numeral | + token: symbol | command | ident | string | raw_string | char | numeral | : decimal | doc_comment | mod_doc_comment | field_notation ``` @@ -94,6 +94,22 @@ So the complete syntax is: string_gap : "\" newline whitespace* ``` +Raw String Literals +=================== + +Raw string literals are string literals without any escape character processing. +They begin with `r##...#"` (with zero or more `#` characters) and end with `"#...##` (with the same number of `#` characters). +The contents of a raw string literal may contain `"##..#` so long as the number of `#` characters +is less than the number of `#` characters used to begin the raw string literal. + +``` + raw_string : raw_string_aux(0) | raw_string_aux(1) | raw_string_aux(2) | ... + raw_string_aux(n) : 'r' '#'{n} '"' raw_string_item '"' '#'{n} + raw_string_item(n) : raw_string_char | raw_string_quote(n) + raw_string_char : [^"] + raw_string_quote(n) : '"' '#'{0..n-1} +``` + Char Literals ============= diff --git a/src/Init/Meta.lean b/src/Init/Meta.lean index 2d6a8a1e3b..226d6b552a 100644 --- a/src/Init/Meta.lean +++ b/src/Init/Meta.lean @@ -800,9 +800,40 @@ partial def decodeStrLitAux (s : String) (i : String.Pos) (acc : String) : Optio else decodeStrLitAux s i (acc.push c) -def decodeStrLit (s : String) : Option String := - decodeStrLitAux s ⟨1⟩ "" +/-- +Takes a raw string literal, counts the number of `#`'s after the `r`, and interprets it as a string. +The position `i` should start at `1`, which is the character after the leading `r`. +The algorithm is simple: we are given `r##...#"...string..."##...#` with zero or more `#`s. +By counting the number of leading `#`'s, we can extract the `...string...`. +-/ +partial def decodeRawStrLitAux (s : String) (i : String.Pos) (num : Nat) : String := + let c := s.get i + let i := s.next i + if c == '#' then + decodeRawStrLitAux s i (num + 1) + else + s.extract i ⟨s.utf8ByteSize - (num + 1)⟩ +/-- +Takes the string literal lexical syntax parsed by the parser and interprets it as a string. +This is where escape sequences are processed for example. +The string `s` is is either a plain string literal or a raw string literal. + +If it returns `none` then the string literal is ill-formed, which indicates a bug in the parser. +The function is not required to return `none` if the string literal is ill-formed. +-/ +def decodeStrLit (s : String) : Option String := + if s.get 0 == 'r' then + decodeRawStrLitAux s ⟨1⟩ 0 + else + decodeStrLitAux s ⟨1⟩ "" + +/-- +If the provided `Syntax` is a string literal, returns the string it represents. + +Even if the `Syntax` is a `str` node, the function may return `none` if its internally ill-formed. +The parser should always create well-formed `str` nodes. +-/ def isStrLit? (stx : Syntax) : Option String := match isLit? strLitKind stx with | some val => decodeStrLit val diff --git a/src/Lean/Parser/Basic.lean b/src/Lean/Parser/Basic.lean index 03c0b3aa19..a767cce458 100644 --- a/src/Lean/Parser/Basic.lean +++ b/src/Lean/Parser/Basic.lean @@ -669,6 +669,90 @@ partial def strLitFnAux (startPos : String.Pos) : ParserFn := fun c s => else if curr == '\\' then andthenFn quotedStringFn (strLitFnAux startPos) c s else strLitFnAux startPos c s +/-- +Raw strings have the syntax `r##...#"..."#...##` with zero or more `#`'s. +If we are looking at a valid start to a raw string (`r##...#"`), +returns true. +We assume `i` begins at the position immediately after `r`. +-/ +partial def isRawStrLitStart (input : String) (i : String.Pos) : Bool := + if h : input.atEnd i then false + else + let curr := input.get' i h + if curr == '#' then + isRawStrLitStart input (input.next' i h) + else + curr == '"' + +/-- +Parses a raw string literal assuming `isRawStrLitStart` has returned true. +The `startPos` is the start of the raw string (at the `r`). +The parser state is assumed to be immediately after the `r`. +-/ +partial def rawStrLitFnAux (startPos : String.Pos) : ParserFn := initState 0 +where + /-- + Gives the "unterminated raw string literal" error. + -/ + errorUnterminated (s : ParserState) := s.mkUnexpectedErrorAt "unterminated raw string literal" startPos + /-- + Parses the `#`'s and `"` at the beginning of the raw string. + The `num` variable counts the number of `#`s after the `r`. + -/ + initState (num : Nat) : ParserFn := fun c s => + let input := c.input + let i := s.pos + if h : input.atEnd i then errorUnterminated s + else + let curr := input.get' i h + let s := s.setPos (input.next' i h) + if curr == '#' then + initState (num + 1) c s + else if curr == '"' then + normalState num c s + else + -- This should not occur, since we assume `isRawStrLitStart` succeeded. + errorUnterminated s + /-- + Parses characters after the first `"`. If we need to start counting `#`'s to decide if we are closing + the raw string literal, we switch to `closingState`. + -/ + normalState (num : Nat) : ParserFn := fun c s => + let input := c.input + let i := s.pos + if h : input.atEnd i then errorUnterminated s + else + let curr := input.get' i h + let s := s.setPos (input.next' i h) + if curr == '\"' then + if num == 0 then + mkNodeToken strLitKind startPos c s + else + closingState num 0 c s + else + normalState num c s + /-- + Parses `#` characters immediately after a `"`. + The `closingNum` variable counts the number of `#`s seen after the `"`. + Note: `num > 0` since the `num = 0` case is entirely handled by `normalState`. + -/ + closingState (num : Nat) (closingNum : Nat) : ParserFn := fun c s => + let input := c.input + let i := s.pos + if h : input.atEnd i then errorUnterminated s + else + let curr := input.get' i h + let s := s.setPos (input.next' i h) + if curr == '#' then + if closingNum + 1 == num then + mkNodeToken strLitKind startPos c s + else + closingState num (closingNum + 1) c s + else if curr == '\"' then + closingState num 0 c s + else + normalState num c s + def decimalNumberFn (startPos : String.Pos) (c : ParserContext) : ParserState → ParserState := fun s => let s := takeWhileFn (fun c => c.isDigit) c s let input := c.input @@ -862,6 +946,8 @@ private def tokenFnAux : ParserFn := fun c s => numberFnAux c s else if curr == '`' && isIdFirstOrBeginEscape (getNext input i) then nameLitAux i c s + else if curr == 'r' && isRawStrLitStart input (input.next i) then + rawStrLitFnAux i c (s.next input i) else let tk := c.tokens.matchPrefix input i identFnAux i tk .anonymous c s diff --git a/tests/lean/rawStringEOF.lean b/tests/lean/rawStringEOF.lean new file mode 100644 index 0000000000..9079f5c5d2 --- /dev/null +++ b/tests/lean/rawStringEOF.lean @@ -0,0 +1,5 @@ +/-! +# Testing an unterminated raw string literal +-/ + +#check r###"this is a raw string, unterminated"## diff --git a/tests/lean/rawStringEOF.lean.expected.out b/tests/lean/rawStringEOF.lean.expected.out new file mode 100644 index 0000000000..17dfc7afd1 --- /dev/null +++ b/tests/lean/rawStringEOF.lean.expected.out @@ -0,0 +1 @@ +rawStringEOF.lean:5:7: error: unterminated raw string literal diff --git a/tests/lean/run/rawStrings.lean b/tests/lean/run/rawStrings.lean new file mode 100644 index 0000000000..fe83d2b8cb --- /dev/null +++ b/tests/lean/run/rawStrings.lean @@ -0,0 +1,72 @@ +/-! +# Testing raw string literals + +Implemented in PR #2929 for issue #1422. +-/ + +/-! +Empty raw string +-/ +example : r"" = "" := rfl + +/-! +Empty raw string with at least one `#` +-/ +example : r#""# = "" := rfl + +/-! +A nonempty raw string +-/ +example : r"hi" = "hi" := rfl + +/-! +A nonempty raw string with at least one `#` +-/ +example : r#"hi"# = "hi" := rfl + +/-! +A nonempty raw string with `#`, testing embedded `"`'s +-/ +example : r#""hello""# = "\"hello\"" := rfl + +/-! +A nonempty raw string with `#`, testing embedded `"`'s, one not at the end of the string +-/ +example : r#""hello" said the world"# = "\"hello\" said the world" := rfl + +/-! +A nonempty raw string for just `"` +-/ +example : r#"""# = "\"" := rfl + +/-! +A raw string with a `\`, which does not get interpreted as an escape +-/ +example : r"\n" = "\\n" := rfl + +/-! +A raw string for just `\`, and it doesn't escape the final `"` +-/ +example : r"\" = "\\" := rfl + +/-! +A raw string with `#` inside, testing that the first `"` doesn't get double-interpreted +as both the start and end. +-/ +example : r#"#"# = "#" := rfl + +/-! +Testing using `##` raw strings to allow `"#` inside the string. +-/ +example : r##"need two #'s in "# to close"## = "need two #'s in \"# to close" := rfl + +/-! +From Rust reference +-/ +example : r##"foo #"# bar"## = "foo #\"# bar" := rfl + +/-! +Testing that we are conservative when counting closing `#`s. +-/ +infix:100 " # " => Prod.mk +example : r#"a"##"b" = ("a", "b") := rfl