// vua/token.v
// 2025-06-24 03:00:44 -05:00
// 233 lines, 4.0 KiB, V
module token
import scanner { Scanner }
import constants
// ===== Tokenizer =====
// Token is one lexical unit of the input. It stores only the byte offset
// where the token begins plus a classification tag; the token's text is not
// kept here and must be re-read from the input starting at `start`.
@[packed]
pub struct Token {
pub:
start u32 // byte offset of the token's first character in the input
tag TokenType // classification of the token (see TokenType)
}
// TokenType classifies a Token. Backed by u8 so Token stays compact
// (see the @[packed] Token struct).
pub enum TokenType as u8 {
eof // end of input
nil // the literal `nil`
string // a double-quoted string literal
number // an integer literal (digits only)
decimal // a decimal literal (digits '.' digits)
true // the literal `true`
false // the literal `false`
table_start // `{`
table_end // `}`
identifier // a name that is not a keyword or literal
comma // `,`
newline // LF (or CR LF)
equal // NOTE(review): never produced by this file; `=` yields .operator — confirm intended
dot // a single `.`
angle_bracket_left // `[`
angle_bracket_right // `]`
keyword // an identifier listed in constants.keywords
operator // a 1-char operator: + - * / % ^ # =
operator_len2 // a 2-char operator: == ~= <= >= .. (also lone < and >)
operator_len3 // the 3-char operator `...`
}
// Tokenizer embeds Scanner and inherits its cursor/backtracking primitives
// (cur_pos, advance, push_frame, rollback, commit_frame, term, one_of,
// range, last, syntax_error, ...) used by the parse_* methods below.
pub struct Tokenizer {
Scanner
}
// next returns the byte at the current scanner position and advances past
// it, or `none` once the input is exhausted.
fn (mut t Tokenizer) next() ?u8 {
pos := t.cur_pos()
// `>=`, not `>`: when pos == input.len there is no byte at input[pos],
// so the old `>` guard let an out-of-bounds index through.
if t.end_of_input() || pos >= t.input.len {
return none
}
c := t.input[pos]
t.advance(1) // consume exactly one byte
return c
}
// parse_all skips leading whitespace and returns the next token from the
// input, dispatching on its first character; returns an .eof token when the
// input is exhausted. Multi-character tokens (strings, numbers, identifiers,
// operators) are rolled back and handed to their dedicated parse_* method.
pub fn (mut t Tokenizer) parse_all() !Token {
t.skip_ws()
start_index := t.push_frame()!
c := t.any() or {
return Token{ start_index, .eof }
}
match true {
c == `"` {
t.rollback()
t.commit_frame()
return t.parse_str()
}
c in "0123456789".bytes() {
t.rollback()
t.commit_frame()
return t.parse_num()
}
c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_".bytes() {
t.rollback()
t.commit_frame()
return t.parse_id()
}
c == `{` {
return Token{ start_index, .table_start }
}
c == `}` {
return Token{ start_index, .table_end }
}
c == `[` {
return Token{ start_index, .angle_bracket_left }
}
c == `]` {
return Token{ start_index, .angle_bracket_right }
}
c == 10 { // LF
return Token{ start_index, .newline }
}
else {
// CRLF line endings can occur in input files regardless of the host
// OS, so recognize them unconditionally. (This was previously gated
// behind `$if windows`, which made CRLF input fail to tokenize —
// the `\r` fell through to parse_op and errored — on other hosts.)
if c == 13 && t.term(10) { // CRLF
return Token{ start_index, .newline }
}
t.rollback()
t.commit_frame()
return t.parse_op()
}
}
}
// parse_op greedily consumes a maximal run of operator characters and
// classifies the resulting lexeme by exact spelling; an unrecognized run
// (e.g. one containing `$` or `?`) is a syntax error.
fn (mut t Tokenizer) parse_op() !Token {
start_index := t.push_frame()!
mut buf := []u8{len: 0, cap: 32, init: 0}
for t.one_of("+-*/%^<>=~#.$,?") {
buf << t.last()!
}
op := buf.bytestr()
match op {
"." {
return Token{ start_index, .dot }
}
"," {
return Token{ start_index, .comma }
}
"+", "-", "*", "/", "%", "^", "#", "=" {
return Token{ start_index, .operator }
}
"==", "~=", "<=", ">=", "<", ">", ".." {
return Token{ start_index, .operator_len2 }
}
"..." {
return Token{ start_index, .operator_len3 }
}
else {
return t.syntax_error("invalid operator '${op}'")
}
}
}
// parse_id consumes an identifier ([A-Za-z_][A-Za-z0-9_]*) and classifies
// it as a literal (nil/true/false), a language keyword (per
// constants.keywords), or a plain identifier.
fn (mut t Tokenizer) parse_id() !Token {
start_index := t.push_frame()!
mut name := []u8{len: 0, cap: 32, init: 0}
// the first character must be a letter or underscore
if !(t.range(`a`, `z`) || t.range(`A`, `Z`) || t.term(`_`)) {
return t.syntax_error("invalid identifier")
}
name << t.last() or { return t.syntax_error("invalid identifier") }
// subsequent characters may also be digits
for t.range(`a`, `z`) || t.range(`A`, `Z`) || t.range(`0`, `9`) || t.term(`_`) {
name << t.last() or { break }
}
word := name.bytestr()
match word {
"nil" { return Token{ start_index, .nil } }
"true" { return Token{ start_index, .true } }
"false" { return Token{ start_index, .false } }
else {}
}
if word in constants.keywords {
return Token{ start_index, .keyword }
}
return Token{ start_index, .identifier }
}
// parse_num consumes an integer literal (digits) or a decimal literal
// (digits '.' digits). A decimal requires at least one digit on each side
// of the single permitted '.'.
fn (mut t Tokenizer) parse_num() !Token {
start_index := t.push_frame()!
mut s := []u8{len: 0, cap: 32, init: 0}
// a number must begin with a digit
if t.one_of("0123456789") {
s << t.last()!
} else {
return t.syntax_error("invalid number")
}
mut is_decimal := false
loop:
for {
if t.one_of("0123456789") {
s << t.last() or { break loop }
continue
}
if t.term(`.`) {
// at most one decimal point per literal
if is_decimal {
return t.syntax_error("invalid number")
}
s << `.`
// the '.' must be followed by at least one digit
// NOTE(review): the '.' is consumed before this check, so input
// like `1..2` is rejected here rather than tokenized as a number
// followed by the `..` operator — confirm that is intended.
if !t.one_of("0123456789") {
return t.syntax_error("invalid number")
}
is_decimal = true
s << t.last() or { break loop }
continue
}
break
}
// The mandatory first digit guarantees s.len >= 1 here, so the old
// unreachable `if s.len == 0` guard has been removed.
return Token{
start: start_index,
tag: if is_decimal { .decimal } else { .number },
}
}
// parse_str consumes a double-quoted string literal, honoring backslash
// escapes (the character after a '\' is skipped unconditionally). Reaching
// end of input before the closing quote is a syntax error.
fn (mut t Tokenizer) parse_str() !Token {
start_index := t.push_frame()!
if !t.term(`"`) {
return t.syntax_error("invalid string")
}
for !t.end_of_input() {
if t.term(`"`) {
return Token{ start_index, .string }
}
if t.term(`\\`) {
// the backslash itself was consumed by term(); the advance below
// skips the character it escapes
// TODO: handle escape sequences properly instead of blind skipping
}
t.advance(1) // consume one character (the escaped or ordinary char)
}
return t.syntax_error("uncompleted string literal")
}