octive-lean/OctiveLean/Lexer.lean
Maximus Gorog db79eb3fde
Some checks are pending
Lean Action CI / build (push) Waiting to run
Initial commit: Lean 4 reimplementation of GNU Octave
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-29 09:40:46 -06:00

364 lines
14 KiB
Text
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import OctiveLean.Error
namespace OctiveLean
/-! Token kinds -/
inductive TokenKind where
-- Literals
| LitInt : Int → TokenKind
| LitFloat : Float → TokenKind
| LitStr : String → TokenKind
-- Identifiers
| Ident : String → TokenKind
-- Keywords
| KwFor | KwWhile | KwDo | KwUntil
| KwIf | KwElseif | KwElse
| KwEnd | KwEndfor | KwEndwhile | KwEndif | KwEndfunction
| KwFunction | KwReturn | KwBreak | KwContinue
| KwSwitch | KwCase | KwOtherwise | KwEndswitch
| KwTry | KwCatch | KwEndTryCatch
| KwUnwindProtect | KwUnwindProtectCleanup | KwEndUnwindProtect
| KwGlobal | KwPersistent | KwClear
-- Arithmetic operators
| Plus | Minus | Star | Slash | Backslash | Caret
| DotStar | DotSlash | DotBackslash | DotCaret
-- Comparison
| Lt | Le | Gt | Ge | EqEq | Neq | TildeEq
-- Logical
| Amp | Pipe | AmpAmp | PipePipe | Tilde | Bang
-- Assignment operators
| Eq | PlusEq | MinusEq | StarEq | SlashEq | CaretEq
-- Postfix
| Transpose | HTranspose -- .' and '
-- Punctuation
| LParen | RParen
| LBracket | RBracket
| LBrace | RBrace
| Comma | Semi | Colon | Dot | At
-- Statement terminators
| Newline
| Eof
deriving Repr, BEq
structure Token where
kind : TokenKind
line : Nat
col : Nat
deriving Repr
instance : Inhabited Token := ⟨{ kind := .Eof, line := 0, col := 0 }⟩
/-! Lexer state -/
private structure LexState where
chars : Array Char -- source as char array for O(1) indexing
pos : Nat
line : Nat
col : Nat
matDepth : Nat -- depth of '[' nesting
prevCanTranspose : Bool -- last token permits ' → transpose
private def LexState.fromSrc (src : String) : LexState :=
{ chars := src.toList.toArray, pos := 0, line := 1, col := 1,
matDepth := 0, prevCanTranspose := false }
private def LexState.curr (s : LexState) : Option Char :=
if s.pos < s.chars.size then some s.chars[s.pos]! else none
private def LexState.peek (s : LexState) (offset : Nat := 1) : Option Char :=
let i := s.pos + offset
if i < s.chars.size then some s.chars[i]! else none
private def LexState.advance (s : LexState) : LexState :=
match s.curr with
| some '\n' => { s with pos := s.pos + 1, line := s.line + 1, col := 1 }
| some _ => { s with pos := s.pos + 1, col := s.col + 1 }
| none => s
private def LexState.advanceN (s : LexState) (n : Nat) : LexState :=
List.range n |>.foldl (fun acc _ => acc.advance) s
private def LexState.slice (s : LexState) (start stop : Nat) : String :=
String.ofList (s.chars.toList.drop start |>.take (stop - start))
/-! Keyword table -/
private def keyword? (w : String) : Option TokenKind :=
match w with
| "for" => some .KwFor | "while" => some .KwWhile
| "do" => some .KwDo | "until" => some .KwUntil
| "if" => some .KwIf | "elseif" => some .KwElseif
| "else" => some .KwElse
| "end" => some .KwEnd | "endfor" => some .KwEndfor
| "endwhile" => some .KwEndwhile | "endif" => some .KwEndif
| "endfunction" => some .KwEndfunction
| "function" => some .KwFunction | "return" => some .KwReturn
| "break" => some .KwBreak | "continue" => some .KwContinue
| "switch" => some .KwSwitch | "case" => some .KwCase
| "otherwise" => some .KwOtherwise | "endswitch" => some .KwEndswitch
| "try" => some .KwTry | "catch" => some .KwCatch
| "end_try_catch" => some .KwEndTryCatch
| "unwind_protect" => some .KwUnwindProtect
| "unwind_protect_cleanup" => some .KwUnwindProtectCleanup
| "end_unwind_protect" => some .KwEndUnwindProtect
| "global" => some .KwGlobal | "persistent" => some .KwPersistent
| "clear" => some .KwClear
| _ => none
/-! Recursive lexer helpers — all marked `partial` since Lean can't prove
termination through the LexState wrapper without significant effort. -/
private partial def skipHorizWS (s : LexState) : LexState :=
match s.curr with
| some ' ' | some '\t' | some '\r' => skipHorizWS s.advance
| _ => s
private partial def skipLineComment (s : LexState) : LexState :=
match s.curr with
| some '\n' | none => s
| _ => skipLineComment s.advance
private partial def skipBlockComment (s : LexState) : LexState :=
match s.curr with
| none => s
| some '%' => if s.peek == some '}' then s.advanceN 2
else skipBlockComment s.advance
| _ => skipBlockComment s.advance
private partial def skipLineContinuation (s : LexState) : LexState :=
match s.curr with
| some '\n' | none => s.advance
| _ => skipLineContinuation s.advance
/-! Number parsing -/
private partial def eatDigits (s : LexState) : LexState × String :=
let start := s.pos
let rec go (st : LexState) : LexState :=
match st.curr with
| some c => if c.isDigit then go st.advance else st
| none => st
let st := go s
(st, s.slice start st.pos)
-- Build a float from separate integer, fractional, sign, and exponent strings.
private def buildFloat (intStr fracStr : String) (negExp : Bool) (expStr : String) : Float :=
let iv := Float.ofNat (intStr.toNat? |>.getD 0)
let fv := if fracStr.isEmpty then 0.0
else Float.ofNat (fracStr.toNat? |>.getD 0) /
Float.ofNat (10 ^ fracStr.length)
let ev := expStr.toNat? |>.getD 0
let mlt := Float.ofNat (10 ^ ev)
let base := iv + fv
if negExp then base / mlt else base * mlt
private def lexNumber (s : LexState) : LexState × TokenKind :=
let (s1, intStr) := eatDigits s
-- optional '.' followed by more digits
let (s2, fracStr, hasDot) :=
if s1.curr == some '.' then
-- make sure it's not '..' range or '.*' etc.
let nextOk := match s1.peek with
| some '.' | some '*' | some '/' | some '\\' | some '^' | some '\'' => false
| _ => true
if nextOk then
let (s1', fs) := eatDigits s1.advance
(s1', fs, true)
else (s1, "", false)
else (s1, "", false)
-- optional exponent
let (s3, negExp, expStr, hasExp) :=
match s2.curr with
| some 'e' | some 'E' =>
let s2' := s2.advance
let (neg, s2'') := match s2'.curr with
| some '-' => (true, s2'.advance)
| some '+' => (false, s2'.advance)
| _ => (false, s2')
let (s2''', es) := eatDigits s2''
(s2''', neg, es, true)
| _ => (s2, false, "", false)
if hasDot || hasExp then
(s3, .LitFloat (buildFloat intStr fracStr negExp expStr))
else
(s3, .LitInt (intStr.toInt? |>.getD 0))
/-! String lexing -/
private partial def lexSQString (s : LexState) : LexState × String :=
let rec go (st : LexState) (acc : String) : LexState × String :=
match st.curr with
| none | some '\n' => (st, acc)
| some '\'' =>
if st.peek == some '\'' then go (st.advanceN 2) (acc.push '\'')
else (st.advance, acc)
| some c => go st.advance (acc.push c)
go s ""
private partial def lexDQString (s : LexState) : LexState × String :=
let rec go (st : LexState) (acc : String) : LexState × String :=
match st.curr with
| none | some '"' => (st.advance, acc)
| some '\\' =>
let c := match st.peek with
| some 'n' => '\n' | some 't' => '\t' | some 'r' => '\r'
| some '\'' => '\'' | some '"' => '"' | some '\\' => '\\'
| some '0' => '\x00'
| _ => '\\'
go (st.advanceN 2) (acc.push c)
| some c => go st.advance (acc.push c)
go s ""
/-! Token emission helpers -/
private def transposePrev : TokenKind → Bool
| .Ident _ | .LitInt _ | .LitFloat _ | .RParen | .RBracket | .RBrace
| .Transpose | .HTranspose => true
| _ => false
/-! Main tokeniser — partial since it advances through an arbitrary string -/
private partial def tokenizeFrom (s : LexState) (acc : Array Token) :
Except String (Array Token) :=
let s := skipHorizWS s
let ln := s.line
let cl := s.col
let emit (k : TokenKind) (s' : LexState) :=
tokenizeFrom { s' with prevCanTranspose := transposePrev k }
(acc.push { kind := k, line := ln, col := cl })
let emitNoPrev (k : TokenKind) (s' : LexState) :=
tokenizeFrom { s' with prevCanTranspose := false }
(acc.push { kind := k, line := ln, col := cl })
match s.curr with
| none => .ok (acc.push { kind := .Eof, line := ln, col := cl })
| some c =>
match c with
-- Comments
| '%' =>
if s.peek == some '{' then tokenizeFrom (skipBlockComment (s.advanceN 2)) acc
else tokenizeFrom (skipLineComment s.advance) acc
| '#' => tokenizeFrom (skipLineComment s.advance) acc
-- Newlines (statement separators, collapse runs)
| '\n' =>
let acc' := match acc.back? with
| some t =>
match t.kind with
| .Newline | .Semi | .Comma | .LBracket | .LBrace | .LParen
| .Plus | .Minus | .Star | .Slash | .Backslash | .Caret
| .DotStar | .DotSlash | .DotCaret | .Eq | .Colon
| .AmpAmp | .PipePipe | .Amp | .Pipe
| .KwElse | .KwElseif | .KwFor | .KwWhile | .KwDo
| .KwIf | .KwSwitch | .KwCase | .KwFunction
| .KwOtherwise | .KwTry | .KwCatch
| .KwUnwindProtect | .KwUnwindProtectCleanup => acc
| _ => acc.push { kind := .Newline, line := ln, col := cl }
| none => acc
tokenizeFrom s.advance acc'
-- Numbers
| d =>
if d.isDigit then
let (s', k) := lexNumber s
tokenizeFrom { s' with prevCanTranspose := true }
(acc.push { kind := k, line := ln, col := cl })
-- Identifiers / keywords
else if d.isAlpha || d == '_' then
let start := s.pos
let rec eatId (st : LexState) : LexState :=
match st.curr with
| some x => if x.isAlphanum || x == '_' then eatId st.advance else st
| none => st
let s' := eatId s
let word := s.slice start s'.pos
let k := keyword? word |>.getD (.Ident word)
tokenizeFrom { s' with prevCanTranspose := transposePrev k }
(acc.push { kind := k, line := ln, col := cl })
else
-- Everything else: single/multi-char tokens
match c with
| '\'' =>
if s.prevCanTranspose then emit .HTranspose s.advance
else
let (s', str) := lexSQString s.advance
emitNoPrev (.LitStr str) s'
| '"' =>
let (s', str) := lexDQString s.advance
emitNoPrev (.LitStr str) s'
| '.' =>
if s.peek == some '.' && s.peek (offset := 2) == some '.' then
tokenizeFrom (skipLineContinuation (s.advanceN 3)) acc
else if s.peek == some '\'' then emitNoPrev .Transpose (s.advanceN 2)
else if s.peek == some '*' then emitNoPrev .DotStar (s.advanceN 2)
else if s.peek == some '/' then emitNoPrev .DotSlash (s.advanceN 2)
else if s.peek == some '\\' then emitNoPrev .DotBackslash (s.advanceN 2)
else if s.peek == some '^' then emitNoPrev .DotCaret (s.advanceN 2)
else emitNoPrev .Dot s.advance
| '+' =>
if s.peek == some '=' then emitNoPrev .PlusEq (s.advanceN 2)
else emitNoPrev .Plus s.advance
| '-' =>
if s.peek == some '=' then emitNoPrev .MinusEq (s.advanceN 2)
else emitNoPrev .Minus s.advance
| '*' =>
if s.peek == some '=' then emitNoPrev .StarEq (s.advanceN 2)
else emitNoPrev .Star s.advance
| '/' =>
if s.peek == some '=' then emitNoPrev .SlashEq (s.advanceN 2)
else emitNoPrev .Slash s.advance
| '\\' => emitNoPrev .Backslash s.advance
| '^' =>
if s.peek == some '=' then emitNoPrev .CaretEq (s.advanceN 2)
else emitNoPrev .Caret s.advance
| '<' =>
if s.peek == some '=' then emitNoPrev .Le (s.advanceN 2)
else emitNoPrev .Lt s.advance
| '>' =>
if s.peek == some '=' then emitNoPrev .Ge (s.advanceN 2)
else emitNoPrev .Gt s.advance
| '=' =>
if s.peek == some '=' then emitNoPrev .EqEq (s.advanceN 2)
else emitNoPrev .Eq s.advance
| '!' =>
if s.peek == some '=' then emitNoPrev .Neq (s.advanceN 2)
else emitNoPrev .Bang s.advance
| '~' =>
if s.peek == some '=' then emitNoPrev .TildeEq (s.advanceN 2)
else emitNoPrev .Tilde s.advance
| '&' =>
if s.peek == some '&' then emitNoPrev .AmpAmp (s.advanceN 2)
else emitNoPrev .Amp s.advance
| '|' =>
if s.peek == some '|' then emitNoPrev .PipePipe (s.advanceN 2)
else emitNoPrev .Pipe s.advance
| '@' => emitNoPrev .At s.advance
| '(' => emitNoPrev .LParen s.advance
| ')' => emit .RParen s.advance
| '[' =>
tokenizeFrom { s.advance with prevCanTranspose := false,
matDepth := s.matDepth + 1 }
(acc.push { kind := .LBracket, line := ln, col := cl })
| ']' =>
tokenizeFrom { s.advance with prevCanTranspose := true,
matDepth := s.matDepth - min s.matDepth 1 }
(acc.push { kind := .RBracket, line := ln, col := cl })
| '{' => emitNoPrev .LBrace s.advance
| '}' => emit .RBrace s.advance
| ',' => emitNoPrev .Comma s.advance
| ';' =>
let acc' := match acc.back? with
| some t =>
match t.kind with
| .Newline => acc.set! (acc.size - 1) { kind := .Semi, line := ln, col := cl }
| .Semi => acc
| _ => acc.push { kind := .Semi, line := ln, col := cl }
| none => acc.push { kind := .Semi, line := ln, col := cl }
tokenizeFrom { s.advance with prevCanTranspose := false } acc'
| ':' => emitNoPrev .Colon s.advance
-- skip unrecognised chars (BOM etc.)
| _ => tokenizeFrom s.advance acc
/-- Tokenise an Octave source string. -/
def tokenize (src : String) : Except String (Array Token) :=
tokenizeFrom (LexState.fromSrc src) #[]
end OctiveLean