// vua/tokenizer.v

module tokenizer
import constants

@[packed]
pub struct Token {
pub:
	start u32
	tag   VuaToken
}

pub enum VuaToken {
	eof
	nil
	string
	number
	decimal
	true
	false
	table_start
	table_end
	identifier
	comma
	newline
	equal // note: parse_op currently returns `=` as .operator
	dot
	angle_bracket_left // produced for `[`
	angle_bracket_right // produced for `]`
	reserved_keyword
	operator
	operator_len2
	operator_len3
}
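
// For example, tokenizing the input `x = { 1, 2.5 }` would be expected to
// yield: identifier, operator (`=`), table_start, number, comma, decimal,
// table_end, eof. (Illustrative only; the exact stream depends on the
// character set accepted by parse_op below.)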

pub struct Parser {
pub:
	input string
pub mut:
	max_pos u32
	stack   []u32 = []u32{len: 256, cap: 256, init: 0}
	frame   u32
mut:
	lc int = 1 // current line (1-based)
	rc int // current column on that line
}

fn (mut p Parser) cur_pos() u32 {
	return p.stack[p.frame]
}

fn (mut p Parser) advance(delta u32) {
	p.rc += int(delta)
	p.stack[p.frame] += delta
	if p.max_pos < p.stack[p.frame] {
		p.max_pos = p.stack[p.frame]
	}
}

@[manualfree]
fn (mut p Parser) push_frame() !u32 {
	p.frame += 1
	if p.frame >= u32(p.stack.len) {
		// grow by 1.5x; copy the whole old stack (in bytes), not just
		// max_pos entries, so every saved frame survives the move
		new_size := p.stack.len + (p.stack.len >> 1)
		new_stack := []u32{len: new_size, cap: new_size, init: 0}
		unsafe { vmemcpy(new_stack.data, p.stack.data, p.stack.len * int(sizeof(u32))) }
		unsafe { p.stack.free() }
		p.stack = new_stack
	}
	if p.frame > u32(p.input.len) {
		return error('Buffer too small')
	}
	p.stack[p.frame] = p.stack[p.frame - 1]
	return p.cur_pos()
}

fn (mut p Parser) pop_frame() {
	if p.frame >= 1 {
		p.frame -= 1
	}
}

fn (mut p Parser) commit_frame() {
	p.frame -= 1
	p.stack[p.frame] = p.stack[p.frame + 1]
}

@[unsafe]
fn (mut p Parser) free() {
	p.stack.free()
}

struct SyntaxError {
	Error
	line int
	col  int
	m    string
}

fn (err SyntaxError) msg() string {
	return "SyntaxError: ${err.m} -- at line ${err.line}, column ${err.col}"
}

fn (mut p Parser) syntax_error(m string) SyntaxError {
	return SyntaxError{
		line: p.lc
		col: p.rc
		m: m
	}
}

fn (mut p Parser) rollback() {
	p.stack[p.frame] = if p.frame == 0 { u32(0) } else { p.stack[p.frame - 1] }
}
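
// The frame stack gives the tokenizer cheap speculative parsing: push a
// frame, try to match, then either commit the advanced position to the
// parent or roll back. A minimal sketch of that discipline (this helper is
// hypothetical, not part of the original file):
fn (mut p Parser) try_literal(s string) !bool {
	p.push_frame()!
	if p.many(s) {
		p.commit_frame() // success: fold the new position into the parent frame
		return true
	}
	p.rollback() // failure: restore this frame to the parent position
	p.pop_frame() // discard the speculative frame
	return false
}
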
// ===== Char matching =====
pub fn (mut p Parser) end_of_input() bool {
	return p.cur_pos() >= u32(p.input.len)
}

fn (mut p Parser) term(c u8) bool {
	if p.end_of_input() || p.input[p.cur_pos()] != c {
		return false
	}
	p.advance(1) // 1 char
	return true
}

fn (mut p Parser) one_of(s string) bool {
	if p.end_of_input() || p.input[p.cur_pos()] !in s.bytes() {
		return false
	}
	p.advance(1) // 1 char
	return true
}

fn (mut p Parser) none_of(s string) bool {
	if p.end_of_input() || p.input[p.cur_pos()] in s.bytes() {
		return false
	}
	p.advance(1) // 1 char
	return true
}

fn (mut p Parser) range(low u8, high u8) bool {
	if p.end_of_input() {
		return false
	}
	c := p.input[p.cur_pos()]
	if !(low <= c && c <= high) {
		return false
	}
	p.advance(1) // 1 char
	return true
}

// ===== Token extraction =====
fn (mut p Parser) save_stash(from u32) string {
	return p.input[from..p.cur_pos()]
}

fn (mut p Parser) skip_ws() {
	for !p.end_of_input() {
		$if windows {
			if p.input[p.cur_pos()] == u8(13) { // CR (Windows line endings)
				p.lc += 1
				p.rc = 0 // reset column
				p.advance(1) // skip CR
				if !p.end_of_input() && p.input[p.cur_pos()] == u8(10) {
					p.advance(1) // consume the LF of a CRLF pair as the same newline
				}
				continue
			}
		}
		if p.input[p.cur_pos()] == u8(10) { // LF
			p.lc += 1
			p.rc = 0 // reset column
		}
		if p.input[p.cur_pos()] in ' \r\n\t'.bytes() {
			p.advance(1)
			continue
		}
		break
	}
}

fn (mut p Parser) last() !u8 {
	// guard both ends: reading p.input[-1] before the first advance, and
	// reading past the end of the buffer
	if p.cur_pos() == 0 || p.cur_pos() > u32(p.input.len) {
		return error('Buffer too small')
	}
	return p.input[p.cur_pos() - 1]
}

fn (mut p Parser) pred(func fn (u8) bool) bool {
	if p.end_of_input() || !func(p.input[p.cur_pos()]) {
		return false
	}
	p.advance(1) // 1 char
	return true
}

fn (mut p Parser) many(s string) bool {
	if u32(p.input.len) < p.cur_pos() + u32(s.len) {
		return false
	}
	if s != p.input[p.cur_pos()..p.cur_pos() + u32(s.len)] {
		return false
	}
	p.advance(u32(s.len))
	return true
}

fn (mut p Parser) any() ?u8 {
	if p.end_of_input() {
		return none
	}
	c := p.input[p.cur_pos()]
	p.advance(1) // 1 char
	return c
}

// ===== Tokenizer =====
pub struct Tokenizer {
	Parser
}

// next makes Tokenizer iterable, yielding one byte at a time
fn (mut t Tokenizer) next() ?u8 {
	if t.end_of_input() {
		return none
	}
	c := t.input[t.cur_pos()]
	t.advance(1) // 1 char
	return c
}

pub fn (mut t Tokenizer) parse_all() !Token {
	t.skip_ws()
	start_index := t.push_frame()!
	c := t.any() or { return Token{ start_index, .eof } }
	// each matched class un-reads the byte (rollback) and folds the frame
	// back into the parent (commit_frame), so the sub-parser starts from
	// the first byte of the token
	match true {
		c == `"` {
			t.rollback()
			t.commit_frame()
			return t.parse_str()
		}
		c in "0123456789".bytes() {
			t.rollback()
			t.commit_frame()
			return t.parse_num()
		}
		c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_".bytes() {
			t.rollback()
			t.commit_frame()
			return t.parse_id()
		}
		c == `{` {
			return Token{ start_index, .table_start }
		}
		c == `}` {
			return Token{ start_index, .table_end }
		}
		c == `[` {
			return Token{ start_index, .angle_bracket_left }
		}
		c == `]` {
			return Token{ start_index, .angle_bracket_right }
		}
		c == 10 { // LF
			return Token{ start_index, .newline }
		}
		else {
			$if windows {
				if c == 13 && t.term(10) { // CRLF
					return Token{ start_index, .newline }
				}
			}
			t.rollback()
			t.commit_frame()
			return t.parse_op()
		}
	}
}

fn (mut t Tokenizer) parse_op() !Token {
	start_index := t.push_frame()!
	mut s := []u8{len: 0, cap: 32, init: 0}
	// `.` and `,` must be in the accepted set, otherwise the dot, comma,
	// `..` and `...` arms below are unreachable
	for t.one_of("+-*/%^<>=~#.,") {
		s << t.last()!
	}
	match s.bytestr() {
		"+", "-", "*", "/", "%", "^", "#", "=" {
			return Token{ start_index, .operator }
		}
		"==", "~=", "<=", ">=", "<", ">", ".." {
			return Token{ start_index, .operator_len2 }
		}
		"..." {
			return Token{ start_index, .operator_len3 }
		}
		"." {
			return Token{ start_index, .dot }
		}
		"," {
			return Token{ start_index, .comma }
		}
		else {
			return t.syntax_error("invalid operator '${s.bytestr()}'")
		}
	}
}

fn (mut t Tokenizer) parse_id() !Token {
	start_index := t.push_frame()!
	mut s := []u8{len: 0, cap: 32, init: 0}
	if t.range('a'[0], 'z'[0]) || t.range('A'[0], 'Z'[0]) || t.term('_'[0]) {
		s << t.last() or { return t.syntax_error("invalid identifier") }
	} else {
		return t.syntax_error("invalid identifier")
	}
	for t.range('a'[0], 'z'[0]) || t.range('A'[0], 'Z'[0]) || t.range('0'[0], '9'[0]) || t.term('_'[0]) {
		s << t.last() or { break }
	}
	return match s.bytestr() {
		"true" { Token{ start_index, .true } }
		"false" { Token{ start_index, .false } }
		"nil" { Token{ start_index, .nil } }
		else {
			if s.bytestr() in constants.keywords {
				return Token{ start_index, .reserved_keyword }
			}
			return Token{ start_index, .identifier }
		}
	}
}

fn (mut t Tokenizer) parse_num() !Token {
	start_index := t.push_frame()!
	mut s := []u8{len: 0, cap: 32, init: 0}
	if t.one_of("0123456789") {
		s << t.last()!
	} else {
		return t.syntax_error("invalid number")
	}
	mut is_decimal := false
	loop: for {
		if t.one_of("0123456789") {
			s << t.last() or { break loop }
			continue
		}
		if t.term('.'[0]) {
			if is_decimal {
				return t.syntax_error("invalid number")
			}
			s << '.'[0]
			if !t.one_of("0123456789") {
				return t.syntax_error("invalid number")
			}
			is_decimal = true
			s << t.last() or { break loop }
			continue
		}
		break
	}
	if s.len == 0 {
		return t.syntax_error("invalid number")
	}
	return Token{
		start: start_index
		tag: if is_decimal { .decimal } else { .number }
	}
}

fn (mut t Tokenizer) parse_str() !Token {
	start_index := t.push_frame()!
	mut s := []u8{len: 0, cap: 4096, init: 0}
	if !t.term('"'[0]) {
		return t.syntax_error("invalid string")
	}
	// consume bytes only until the closing quote, not to end of input
	for {
		c := t.next() or { return t.syntax_error("unclosed string") }
		if c == `"` {
			return Token{ start_index, .string }
		}
		s << c // note: escape sequences are not interpreted
	}
}
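
// A minimal driving loop (illustrative sketch; `tokenize_all` is a
// hypothetical helper, not part of the original file): pull tokens with
// parse_all until .eof and collect them.
pub fn tokenize_all(input string) ![]Token {
	mut t := Tokenizer{
		Parser: Parser{
			input: input
		}
	}
	mut tokens := []Token{}
	for {
		tok := t.parse_all()!
		tokens << tok
		if tok.tag == .eof {
			break
		}
	}
	return tokens
}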