module tokenizer

import constants

@[packed]
pub struct Token {
pub:
	start u32
	tag   VuaToken
}

pub enum VuaToken {
	eof
	nil
	string
	number
	decimal
	true
	false
	table_start
	table_end
	identifier
	comma
	newline
	equal
	dot
	angle_bracket_left
	angle_bracket_right
	reserved_keyword
	operator
	operator_len2
	operator_len3
}

pub struct Parser {
pub:
	input string
pub mut:
	max_pos u32
	stack   []u32 = []u32{len: 256, cap: 256, init: 0}
	frame   u32
mut:
	lc int = 1 // line counter
	rc int // column counter, reset at every newline
}

// cur_pos returns the read position of the current frame.
fn (mut p Parser) cur_pos() u32 {
	return p.stack[p.frame]
}

// advance moves the current frame forward by delta bytes.
fn (mut p Parser) advance(delta u32) {
	p.rc += int(delta)
	p.stack[p.frame] += delta
	if p.max_pos < p.stack[p.frame] {
		p.max_pos = p.stack[p.frame]
	}
}

// push_frame opens a new backtracking frame starting at the current
// position. The stack grows by 1.5x when it is full.
@[manualfree]
fn (mut p Parser) push_frame() !u32 {
	p.frame += 1
	if p.frame == u32(p.stack.len) {
		new_size := p.stack.len + (p.stack.len >> 1)
		mut new_stack := []u32{len: new_size, cap: new_size, init: 0}
		// copy the old stack's elements (byte count, not element count)
		unsafe { vmemcpy(new_stack.data, p.stack.data, p.stack.len * int(sizeof(u32))) }
		unsafe { p.stack.free() }
		p.stack = new_stack
	}
	if p.frame > u32(p.input.len) {
		return error('Buffer too small')
	}
	p.stack[p.frame] = p.stack[p.frame - 1]
	return p.cur_pos()
}

// pop_frame discards the current frame, rolling back to the parent's position.
fn (mut p Parser) pop_frame() {
	if p.frame >= 1 {
		p.frame -= 1
	}
}

// commit_frame folds the current frame's position into its parent.
fn (mut p Parser) commit_frame() {
	p.frame -= 1
	p.stack[p.frame] = p.stack[p.frame + 1]
}

@[unsafe]
fn (mut p Parser) free() {
	p.stack.free()
}

struct SyntaxError {
	Error
	line int
	col  int
	m    string
}

fn (err SyntaxError) msg() string {
	return 'SyntaxError: ${err.m} -- at line ${err.line}, column ${err.col}'
}

fn (mut p Parser) syntax_error(m string) SyntaxError {
	return SyntaxError{
		line: p.lc
		col:  p.rc
		m:    m
	}
}

// rollback resets the current frame to where it started.
fn (mut p Parser) rollback() {
	p.stack[p.frame] = if p.frame == 0 { u32(0) } else { p.stack[p.frame - 1] }
}

// ===== Char matching =====

pub fn (mut p Parser) end_of_input() bool {
	return p.cur_pos() >= u32(p.input.len)
}

// term consumes c if it is the next byte.
fn (mut p Parser) term(c u8) bool {
	if p.end_of_input() || p.input[p.cur_pos()] != c {
		return false
	}
	p.advance(1) // 1 char
	return true
}

// one_of consumes the next byte if it occurs in s.
fn (mut p Parser) one_of(s string) bool {
	if p.end_of_input() || p.input[p.cur_pos()] !in s.bytes() {
		return false
	}
	p.advance(1) // 1 char
	return true
}

// none_of consumes the next byte if it does not occur in s.
fn (mut p Parser) none_of(s string) bool {
	if p.end_of_input() || p.input[p.cur_pos()] in s.bytes() {
		return false
	}
	p.advance(1) // 1 char
	return true
}

// range consumes the next byte if low <= byte <= high.
fn (mut p Parser) range(low u8, high u8) bool {
	if p.end_of_input() {
		return false
	}
	c := p.input[p.cur_pos()]
	if !(low <= c && c <= high) {
		return false
	}
	p.advance(1) // 1 char
	return true
}

// ===== Token extraction =====

// save_stash returns the input consumed since `from`.
fn (mut p Parser) save_stash(from u32) string {
	return p.input[from..p.cur_pos()]
}

// skip_ws consumes whitespace, updating the line/column counters at
// every newline (CRLF on Windows, LF elsewhere).
fn (mut p Parser) skip_ws() {
	for !p.end_of_input() {
		$if windows {
			if p.input[p.cur_pos()] == u8(13) { // CR, WINDOWS ONLY
				p.lc += 1
				p.rc = 0 // reset column
				p.advance(1) // skip CR
				if !p.end_of_input() && p.input[p.cur_pos()] == u8(10) {
					p.advance(1) // skip the LF of a CRLF pair, without double-counting the line
				}
				continue
			}
		}
		if p.input[p.cur_pos()] == u8(10) { // LF
			p.lc += 1
			p.rc = 0 // reset column
		}
		if p.input[p.cur_pos()] in ' \r\n\t'.bytes() {
			p.advance(1)
			continue
		}
		break
	}
}

// last returns the byte consumed most recently.
fn (mut p Parser) last() !u8 {
	if p.cur_pos() < 1 || p.cur_pos() > u32(p.input.len) {
		return error('Buffer too small')
	}
	return p.input[p.cur_pos() - 1]
}

// pred consumes the next byte if func accepts it.
fn (mut p Parser) pred(func fn (u8) bool) bool {
	if p.end_of_input() || !func(p.input[p.cur_pos()]) {
		return false
	}
	p.advance(1) // 1 char
	return true
}

// many consumes s if the input continues with all of it.
fn (mut p Parser) many(s string) bool {
	if u32(p.input.len) < p.cur_pos() + u32(s.len) {
		return false
	}
	if s != p.input[p.cur_pos()..p.cur_pos() + u32(s.len)] {
		return false
	}
	p.advance(u32(s.len))
	return true
}

// any consumes and returns the next byte, if any.
fn (mut p Parser) any() ?u8 {
	if p.end_of_input() {
		return none
	}
	c := p.input[p.cur_pos()]
	p.advance(1) // 1 char
	return c
}

// ===== Tokenizer =====

pub struct Tokenizer {
	Parser
}

// next makes Tokenizer iterable: `for c in t` yields bytes until end of input.
fn (mut t Tokenizer) next() ?u8 {
	pos := t.cur_pos()
	if t.end_of_input() || pos > u32(t.input.len) {
		return none
	}
	c := t.input[pos]
	t.advance(1) // 1 char
	return c
}

// parse_all returns the next token, skipping leading whitespace.
pub fn (mut t Tokenizer) parse_all() !Token {
	t.skip_ws()
	start_index := t.push_frame()!
	c := t.any() or { return Token{ start_index, .eof } }
	match true {
		c == `"` {
			t.rollback()
			t.commit_frame()
			return t.parse_str()
		}
		c in '0123456789'.bytes() {
			t.rollback()
			t.commit_frame()
			return t.parse_num()
		}
		c in 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_'.bytes() {
			t.rollback()
			t.commit_frame()
			return t.parse_id()
		}
		c == `{` {
			return Token{ start_index, .table_start }
		}
		c == `}` {
			return Token{ start_index, .table_end }
		}
		c == `[` {
			return Token{ start_index, .angle_bracket_left }
		}
		c == `]` {
			return Token{ start_index, .angle_bracket_right }
		}
		c == 10 { // LF
			return Token{ start_index, .newline }
		}
		else {
			$if windows {
				if c == 13 && t.term(10) { // CRLF
					return Token{ start_index, .newline }
				}
			}
			t.rollback()
			t.commit_frame()
			return t.parse_op()
		}
	}
}

// parse_op consumes an operator or punctuation token.
fn (mut t Tokenizer) parse_op() !Token {
	start_index := t.push_frame()!
	mut s := []u8{cap: 32}
	// '.' and ',' must be in this set, or the `.dot`, `.comma`,
	// '..' and '...' arms below are unreachable
	for t.one_of('+-*/%^<>=~#.,') {
		s << t.last()!
	}
	match s.bytestr() {
		'+', '-', '*', '/', '%', '^', '#', '=', '<', '>' {
			return Token{ start_index, .operator }
		}
		'==', '~=', '<=', '>=', '..' {
			return Token{ start_index, .operator_len2 }
		}
		'...' {
			return Token{ start_index, .operator_len3 }
		}
		'.' {
			return Token{ start_index, .dot }
		}
		',' {
			return Token{ start_index, .comma }
		}
		else {
			return t.syntax_error("invalid operator '${s.bytestr()}'")
		}
	}
}

// parse_id consumes an identifier, reserved keyword, or literal name.
fn (mut t Tokenizer) parse_id() !Token {
	start_index := t.push_frame()!
	mut s := []u8{cap: 32}
	if t.range(`a`, `z`) || t.range(`A`, `Z`) || t.term(`_`) {
		s << t.last() or { return t.syntax_error('invalid identifier') }
	} else {
		return t.syntax_error('invalid identifier')
	}
	for t.range(`a`, `z`) || t.range(`A`, `Z`) || t.range(`0`, `9`) || t.term(`_`) {
		s << t.last() or { break }
	}
	return match s.bytestr() {
		'true' {
			Token{ start_index, .true }
		}
		'false' {
			Token{ start_index, .false }
		}
		'nil' {
			Token{ start_index, .nil }
		}
		else {
			if s.bytestr() in constants.keywords {
				Token{ start_index, .reserved_keyword }
			} else {
				Token{ start_index, .identifier }
			}
		}
	}
}

// parse_num consumes an integer or decimal number.
fn (mut t Tokenizer) parse_num() !Token {
	start_index := t.push_frame()!
	mut s := []u8{cap: 32}
	if t.one_of('0123456789') {
		s << t.last()!
	} else {
		return t.syntax_error('invalid number')
	}
	mut is_decimal := false
	loop: for {
		if t.one_of('0123456789') {
			s << t.last() or { break loop }
			continue
		}
		if t.term(`.`) {
			if is_decimal { // a second '.' is not a valid number
				return t.syntax_error('invalid number')
			}
			s << `.`
			if !t.one_of('0123456789') { // a digit must follow the '.'
				return t.syntax_error('invalid number')
			}
			is_decimal = true
			s << t.last() or { break loop }
			continue
		}
		break
	}
	if s.len == 0 {
		return t.syntax_error('invalid number')
	}
	return Token{
		start: start_index
		tag:   if is_decimal { .decimal } else { .number }
	}
}

// parse_str consumes a double-quoted string literal.
fn (mut t Tokenizer) parse_str() !Token {
	start_index := t.push_frame()!
	mut s := []u8{cap: 4096}
	if !t.term(`"`) {
		return t.syntax_error('invalid string')
	}
	for c in t {
		if c == `"` { // closing quote: the literal is complete
			return Token{ start_index, .string }
		}
		s << c
	}
	// end of input reached without a closing quote
	return t.syntax_error('unclosed string')
}
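
// Usage sketch: a minimal, hypothetical driver loop, assuming
// `constants.keywords` is a []string of reserved words supplied by the
// `constants` module. Tokens carry only a start offset and a tag, so a
// caller that needs the lexeme itself can recover it from the input
// starting at `tok.start`.
//
//	mut t := Tokenizer{
//		Parser: Parser{
//			input: 'x = { 1, 2.5, "three" }'
//		}
//	}
//	for {
//		tok := t.parse_all() or {
//			eprintln(err.msg())
//			break
//		}
//		if tok.tag == .eof {
//			break
//		}
//		println('${tok.tag} at byte ${tok.start}')
//	}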