// vua/tokenizer.v

module tokenizer
import constants

@[packed]
pub struct Token {
pub:
	start u32
	tag   VuaToken
}

pub enum VuaToken {
	eof
	nil
	string
	number
	decimal
	true
	false
	table_start
	table_end
	identifier
	comma
	newline
	equal // note: parse_op currently returns `=` as .operator
	dot
	angle_bracket_left // produced for `[`
	angle_bracket_right // produced for `]`
	reserved_keyword
	operator
	operator_len2
	operator_len3
}
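
// For example, tokenizing the input `x = { 1, 2.5 }` would be expected to
// yield: identifier, operator (`=`), table_start, number, comma, decimal,
// table_end, eof. (Illustrative only; the exact stream depends on the
// character set accepted by parse_op below.)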

pub struct Parser {
pub:
	input string
pub mut:
	max_pos u32
	stack   []u32 = []u32{len: 256, cap: 256, init: 0}
	frame   u32
mut:
	lc int = 1 // current line (1-based)
	rc int // current column on that line
}

fn (mut p Parser) cur_pos() u32 {
	return p.stack[p.frame]
}

fn (mut p Parser) advance(delta u32) {
	p.rc += int(delta)
	p.stack[p.frame] += delta
	if p.max_pos < p.stack[p.frame] {
		p.max_pos = p.stack[p.frame]
	}
}

@[manualfree]
fn (mut p Parser) push_frame() !u32 {
	p.frame += 1
	if p.frame >= u32(p.stack.len) {
		// grow by 1.5x; copy the whole old stack (in bytes), not just
		// max_pos entries, so every saved frame survives the move
		new_size := p.stack.len + (p.stack.len >> 1)
		new_stack := []u32{len: new_size, cap: new_size, init: 0}
		unsafe { vmemcpy(new_stack.data, p.stack.data, p.stack.len * int(sizeof(u32))) }
		unsafe { p.stack.free() }
		p.stack = new_stack
	}
	if p.frame > u32(p.input.len) {
		return error('Buffer too small')
	}
	p.stack[p.frame] = p.stack[p.frame - 1]
	return p.cur_pos()
}

fn (mut p Parser) pop_frame() {
	if p.frame >= 1 {
		p.frame -= 1
	}
}

fn (mut p Parser) commit_frame() {
	p.frame -= 1
	p.stack[p.frame] = p.stack[p.frame + 1]
}

@[unsafe]
fn (mut p Parser) free() {
	p.stack.free()
}

struct SyntaxError {
	Error
	line int
	col  int
	m    string
}

fn (err SyntaxError) msg() string {
	return "SyntaxError: ${err.m} -- at line ${err.line}, column ${err.col}"
}

fn (mut p Parser) syntax_error(m string) SyntaxError {
	return SyntaxError{
		line: p.lc
		col: p.rc
		m: m
	}
}

fn (mut p Parser) rollback() {
	p.stack[p.frame] = if p.frame == 0 { u32(0) } else { p.stack[p.frame - 1] }
}
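
// The frame stack gives the tokenizer cheap speculative parsing: push a
// frame, try to match, then either commit the advanced position to the
// parent or roll back. A minimal sketch of that discipline (this helper is
// hypothetical, not part of the original file):
fn (mut p Parser) try_literal(s string) !bool {
	p.push_frame()!
	if p.many(s) {
		p.commit_frame() // success: fold the new position into the parent frame
		return true
	}
	p.rollback() // failure: restore this frame to the parent position
	p.pop_frame() // discard the speculative frame
	return false
}
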
// ===== Char matching =====
pub fn (mut p Parser) end_of_input() bool {
	return p.cur_pos() >= u32(p.input.len)
}

fn (mut p Parser) term(c u8) bool {
	if p.end_of_input() || p.input[p.cur_pos()] != c {
		return false
	}
	p.advance(1) // 1 char
	return true
}

fn (mut p Parser) one_of(s string) bool {
	if p.end_of_input() || p.input[p.cur_pos()] !in s.bytes() {
		return false
	}
	p.advance(1) // 1 char
	return true
}

fn (mut p Parser) none_of(s string) bool {
	if p.end_of_input() || p.input[p.cur_pos()] in s.bytes() {
		return false
	}
	p.advance(1) // 1 char
	return true
}

fn (mut p Parser) range(low u8, high u8) bool {
	if p.end_of_input() {
		return false
	}
	c := p.input[p.cur_pos()]
	if !(low <= c && c <= high) {
		return false
	}
	p.advance(1) // 1 char
	return true
}

// ===== Token extraction =====
fn (mut p Parser) save_stash(from u32) string {
	return p.input[from..p.cur_pos()]
}

fn (mut p Parser) skip_ws() {
	for !p.end_of_input() {
		$if windows {
			if p.input[p.cur_pos()] == u8(13) { // CR (Windows line endings)
				p.lc += 1
				p.rc = 0 // reset column
				p.advance(1) // skip CR
				if !p.end_of_input() && p.input[p.cur_pos()] == u8(10) {
					p.advance(1) // consume the LF of a CRLF pair as the same newline
				}
				continue
			}
		}
		if p.input[p.cur_pos()] == u8(10) { // LF
			p.lc += 1
			p.rc = 0 // reset column
		}
		if p.input[p.cur_pos()] in ' \r\n\t'.bytes() {
			p.advance(1)
			continue
		}
		break
	}
}

fn (mut p Parser) last() !u8 {
	// guard both ends: reading p.input[-1] before the first advance, and
	// reading past the end of the buffer
	if p.cur_pos() == 0 || p.cur_pos() > u32(p.input.len) {
		return error('Buffer too small')
	}
	return p.input[p.cur_pos() - 1]
}

fn (mut p Parser) pred(func fn (u8) bool) bool {
	if p.end_of_input() || !func(p.input[p.cur_pos()]) {
		return false
	}
	p.advance(1) // 1 char
	return true
}

fn (mut p Parser) many(s string) bool {
	if u32(p.input.len) < p.cur_pos() + u32(s.len) {
		return false
	}
	if s != p.input[p.cur_pos()..p.cur_pos() + u32(s.len)] {
		return false
	}
	p.advance(u32(s.len))
	return true
}

fn (mut p Parser) any() ?u8 {
	if p.end_of_input() {
		return none
	}
	c := p.input[p.cur_pos()]
	p.advance(1) // 1 char
	return c
}

// ===== Tokenizer =====
pub struct Tokenizer {
	Parser
}

// next makes Tokenizer iterable, yielding one byte at a time
fn (mut t Tokenizer) next() ?u8 {
	if t.end_of_input() {
		return none
	}
	c := t.input[t.cur_pos()]
	t.advance(1) // 1 char
	return c
}

pub fn (mut t Tokenizer) parse_all() !Token {
	t.skip_ws()
	start_index := t.push_frame()!
	c := t.any() or { return Token{ start_index, .eof } }
	// each matched class un-reads the byte (rollback) and folds the frame
	// back into the parent (commit_frame), so the sub-parser starts from
	// the first byte of the token
	match true {
		c == `"` {
			t.rollback()
			t.commit_frame()
			return t.parse_str()
		}
		c in "0123456789".bytes() {
			t.rollback()
			t.commit_frame()
			return t.parse_num()
		}
		c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_".bytes() {
			t.rollback()
			t.commit_frame()
			return t.parse_id()
		}
		c == `{` {
			return Token{ start_index, .table_start }
		}
		c == `}` {
			return Token{ start_index, .table_end }
		}
		c == `[` {
			return Token{ start_index, .angle_bracket_left }
		}
		c == `]` {
			return Token{ start_index, .angle_bracket_right }
		}
		c == 10 { // LF
			return Token{ start_index, .newline }
		}
		else {
			$if windows {
				if c == 13 && t.term(10) { // CRLF
					return Token{ start_index, .newline }
				}
			}
			t.rollback()
			t.commit_frame()
			return t.parse_op()
		}
	}
}

fn (mut t Tokenizer) parse_op() !Token {
	start_index := t.push_frame()!
	mut s := []u8{len: 0, cap: 32, init: 0}
	// `.` and `,` must be in the accepted set, otherwise the dot, comma,
	// `..` and `...` arms below are unreachable
	for t.one_of("+-*/%^<>=~#.,") {
		s << t.last()!
	}
	match s.bytestr() {
		"+", "-", "*", "/", "%", "^", "#", "=" {
			return Token{ start_index, .operator }
		}
		"==", "~=", "<=", ">=", "<", ">", ".." {
			return Token{ start_index, .operator_len2 }
		}
		"..." {
			return Token{ start_index, .operator_len3 }
		}
		"." {
			return Token{ start_index, .dot }
		}
		"," {
			return Token{ start_index, .comma }
		}
		else {
			return t.syntax_error("invalid operator '${s.bytestr()}'")
		}
	}
}

fn (mut t Tokenizer) parse_id() !Token {
	start_index := t.push_frame()!
	mut s := []u8{len: 0, cap: 32, init: 0}
	if t.range('a'[0], 'z'[0]) || t.range('A'[0], 'Z'[0]) || t.term('_'[0]) {
		s << t.last() or { return t.syntax_error("invalid identifier") }
	} else {
		return t.syntax_error("invalid identifier")
	}
	for t.range('a'[0], 'z'[0]) || t.range('A'[0], 'Z'[0]) || t.range('0'[0], '9'[0]) || t.term('_'[0]) {
		s << t.last() or { break }
	}
	return match s.bytestr() {
		"true" { Token{ start_index, .true } }
		"false" { Token{ start_index, .false } }
		"nil" { Token{ start_index, .nil } }
		else {
			if s.bytestr() in constants.keywords {
				return Token{ start_index, .reserved_keyword }
			}
			return Token{ start_index, .identifier }
		}
	}
}

fn (mut t Tokenizer) parse_num() !Token {
	start_index := t.push_frame()!
	mut s := []u8{len: 0, cap: 32, init: 0}
	if t.one_of("0123456789") {
		s << t.last()!
	} else {
		return t.syntax_error("invalid number")
	}
	mut is_decimal := false
	loop: for {
		if t.one_of("0123456789") {
			s << t.last() or { break loop }
			continue
		}
		if t.term('.'[0]) {
			if is_decimal {
				return t.syntax_error("invalid number")
			}
			s << '.'[0]
			if !t.one_of("0123456789") {
				return t.syntax_error("invalid number")
			}
			is_decimal = true
			s << t.last() or { break loop }
			continue
		}
		break
	}
	if s.len == 0 {
		return t.syntax_error("invalid number")
	}
	return Token{
		start: start_index
		tag: if is_decimal { .decimal } else { .number }
	}
}

fn (mut t Tokenizer) parse_str() !Token {
	start_index := t.push_frame()!
	mut s := []u8{len: 0, cap: 4096, init: 0}
	if !t.term('"'[0]) {
		return t.syntax_error("invalid string")
	}
	// consume bytes only until the closing quote, not to end of input
	for {
		c := t.next() or { return t.syntax_error("unclosed string") }
		if c == `"` {
			return Token{ start_index, .string }
		}
		s << c // note: escape sequences are not interpreted
	}
}
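
// A minimal driving loop (illustrative sketch; `tokenize_all` is a
// hypothetical helper, not part of the original file): pull tokens with
// parse_all until .eof and collect them.
pub fn tokenize_all(input string) ![]Token {
	mut t := Tokenizer{
		Parser: Parser{
			input: input
		}
	}
	mut tokens := []Token{}
	for {
		tok := t.parse_all()!
		tokens << tok
		if tok.tag == .eof {
			break
		}
	}
	return tokens
}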