434 lines
7.5 KiB
V
434 lines
7.5 KiB
V
module tokenizer
|
|
import constants
|
|
|
|
// Token is a minimal, packed token record: only the starting byte offset
// and the kind tag are stored; the lexeme itself can be recovered from
// the input using the offset.
@[packed]
pub struct Token {
pub:
	start u32 // byte offset in Parser.input where the token begins
	tag VuaToken // kind of token
}
|
|
|
|
// VuaToken enumerates every token kind the tokenizer can emit.
pub enum VuaToken {
	eof // end of input
	nil // `nil` literal
	string // double-quoted string literal
	number // integer literal
	decimal // numeric literal containing a '.'
	true // `true` literal
	false // `false` literal
	table_start // `{`
	table_end // `}`
	identifier // [A-Za-z_][A-Za-z0-9_]* not otherwise reserved
	comma // `,`
	newline // LF (CRLF on windows builds)
	equal // NOTE(review): parse_op emits .operator for '='; confirm this variant's intended producer
	dot // `.`
	angle_bracket_left // `[` (NOTE(review): name says angle bracket, produced for square bracket — confirm)
	angle_bracket_right // `]`
	reserved_keyword // identifier present in constants.keywords
	operator // single-char operator: + - * / % ^ # =
	operator_len2 // two-char operator: == ~= <= >= .. (parse_op also emits this for bare < and >)
	operator_len3 // three-char operator: ...
}
|
|
|
|
// Parser holds the scanning state: the input buffer plus a stack of
// cursor positions ("frames") used for speculative matching and
// backtracking (push_frame / rollback / commit_frame / pop_frame).
pub struct Parser {
pub:
	input string // source text being tokenized
pub mut:
	max_pos u32 // high-water mark: furthest byte offset any frame has reached
	stack []u32 = []u32{len: 256, cap: 256, init: 0} // per-frame cursor positions
	frame u32 // index of the active frame in `stack`
mut:
	lc int = 1 // current line counter (1-based), maintained by skip_ws
	rc int // current column counter within the line, reset on newline
}
|
|
|
|
// cur_pos returns the byte offset of the active frame's cursor.
fn (mut p Parser) cur_pos() u32 {
	return p.stack[p.frame]
}
|
|
|
|
// advance moves the active frame's cursor forward by `delta` bytes and
// keeps the column counter and the global high-water mark in sync.
fn (mut p Parser) advance(delta u32) {
	p.rc += int(delta)
	new_pos := p.stack[p.frame] + delta
	p.stack[p.frame] = new_pos
	if new_pos > p.max_pos {
		p.max_pos = new_pos
	}
}
|
|
|
|
// push_frame opens a new backtracking frame seeded with the parent
// frame's cursor position, growing the frame stack (1.5x) when full.
// Returns the frame's starting position, or an error when more frames
// than input bytes are open (a parse cannot legitimately need that).
@[manualfree]
fn (mut p Parser) push_frame() !u32 {
	p.frame += 1
	if p.frame == p.stack.len {
		new_size := p.stack.len + (p.stack.len >> 1)
		new_stack := []u32{len: new_size, cap: new_size, init: 0}

		// BUG FIX: vmemcpy takes raw data pointers and a BYTE count.
		// The old call passed array slice values and `new_stack.len`
		// (an element count larger than the old data), copying the
		// wrong region with the wrong size. Copy the old element
		// storage byte-for-byte instead.
		unsafe { vmemcpy(new_stack.data, p.stack.data, p.stack.len * int(sizeof(u32))) }
		unsafe { p.stack.free() }

		p.stack = new_stack
	}
	if p.frame > p.input.len {
		return error('Buffer too small')
	}

	// Child frame starts where its parent currently is.
	p.stack[p.frame] = p.stack[p.frame - 1]
	return p.cur_pos()
}
|
|
|
|
// pop_frame discards the active frame, returning to its parent.
// A no-op at the root frame.
fn (mut p Parser) pop_frame() {
	if p.frame == 0 {
		return
	}
	p.frame -= 1
}
|
|
|
|
// commit_frame folds the active frame's progress into its parent:
// the parent adopts the child's cursor position.
fn (mut p Parser) commit_frame() {
	child_pos := p.stack[p.frame]
	p.frame -= 1
	p.stack[p.frame] = child_pos
}
|
|
|
|
// free releases the manually-managed frame stack. Callers must not use
// the Parser afterwards.
@[unsafe]
fn (mut p Parser) free() {
	p.stack.free()
}
|
|
|
|
// SyntaxError is the tokenizer's error type, carrying the line and
// column counters captured at the failure site.
struct SyntaxError {
	Error
	line int // 1-based line number
	row int // column counter within the line
	m string // human-readable description
}
|
|
|
|
// msg renders the error as "SyntaxError: <message> -- at line <line>-<row>".
fn (err SyntaxError) msg() string {
	return 'SyntaxError: ${err.m} -- at line ${err.line}-${err.row}'
}
|
|
|
|
// syntax_error builds a SyntaxError stamped with the current
// line/column counters so the message points at the failure site.
fn (mut p Parser) syntax_error(m string) SyntaxError {
	return SyntaxError{
		line: p.lc
		row: p.rc
		m: m
	}
}
|
|
|
|
// rollback resets the active frame's cursor to its parent's position
// (or to 0 at the root frame), undoing any speculative advances.
fn (mut p Parser) rollback() {
	if p.frame == 0 {
		p.stack[p.frame] = u32(0)
	} else {
		p.stack[p.frame] = p.stack[p.frame - 1]
	}
}
|
|
|
|
// ===== Char matching =====
|
|
|
|
// end_of_input reports whether the active frame's cursor has reached
// (or passed) the end of the input buffer.
pub fn (mut p Parser) end_of_input() bool {
	return p.cur_pos() >= p.input.len
}
|
|
|
|
// term consumes the next byte iff it equals `c`.
// Returns true and advances one byte on a match, false otherwise.
fn (mut p Parser) term(c u8) bool {
	if p.end_of_input() {
		return false
	}
	if p.input[p.cur_pos()] != c {
		return false
	}
	p.advance(1)
	return true
}
|
|
|
|
// one_of consumes the next byte iff it appears in `s`.
// Returns true and advances one byte on a match, false otherwise.
fn (mut p Parser) one_of(s string) bool {
	if p.end_of_input() {
		return false
	}
	if p.input[p.cur_pos()] !in s.bytes() {
		return false
	}
	p.advance(1)
	return true
}
|
|
|
|
// none_of consumes the next byte iff it does NOT appear in `s`.
// Returns true and advances one byte on a match, false otherwise
// (including at end of input).
fn (mut p Parser) none_of(s string) bool {
	if p.end_of_input() {
		return false
	}
	if p.input[p.cur_pos()] in s.bytes() {
		return false
	}
	p.advance(1)
	return true
}
|
|
|
|
// range consumes the next byte iff it lies in the inclusive range
// [low, high]. Returns true and advances one byte on a match.
fn (mut p Parser) range(low u8, high u8) bool {
	if p.end_of_input() {
		return false
	}

	c := p.input[p.cur_pos()]
	if c < low || c > high {
		return false
	}

	p.advance(1)
	return true
}
|
|
|
|
// ===== Token extraction =====
|
|
|
|
// save_stash returns the slice of input consumed between `from`
// (typically a frame's start position) and the current cursor.
fn (mut p Parser) save_stash(from u32) string {
	return p.input[from..p.cur_pos()]
}
|
|
|
|
// skip_ws consumes whitespace (space, tab, CR, LF) ahead of the cursor,
// maintaining the line counter `lc` and resetting the column counter
// `rc` on each newline. On windows builds a CRLF pair counts as a
// single line break.
fn (mut p Parser) skip_ws() {
	for !p.end_of_input() {
		$if windows {
			if p.input[p.cur_pos()] == u8(13) { // CR: start of CRLF (or a lone CR)
				p.lc += 1
				p.rc = 0 // reset rows
				p.advance(1) // skip CR
				// BUG FIX: the old code fell through after CR so the LF
				// of a CRLF pair hit the LF branch below and the line was
				// counted twice. Consume the LF here and restart instead.
				if !p.end_of_input() && p.input[p.cur_pos()] == u8(10) {
					p.advance(1) // skip LF of CRLF
				}
				continue
			}
		}

		if p.input[p.cur_pos()] == u8(10) { // bare LF
			p.lc += 1
			p.rc = 0 // reset rows
		}

		if p.input[p.cur_pos()] in ' \r\n\t'.bytes() {
			p.advance(1)
			continue
		}

		break
	}
}
|
|
|
|
// last returns the byte immediately before the cursor — i.e. the most
// recently consumed character. Errors when the cursor is past the
// buffer or when nothing has been consumed yet.
fn (mut p Parser) last() !u8 {
	pos := p.cur_pos()
	if pos > p.input.len {
		return error('Buffer too small')
	}
	// BUG FIX: `pos - 1` on an unsigned zero underflows to a huge index;
	// guard the nothing-consumed case explicitly.
	if pos == 0 {
		return error('Buffer too small')
	}
	return p.input[pos - 1]
}
|
|
|
|
// pred consumes the next byte iff the predicate `func` accepts it.
// Returns true and advances one byte on a match, false otherwise.
fn (mut p Parser) pred(func fn (u8) bool) bool {
	if p.end_of_input() {
		return false
	}
	if !func(p.input[p.cur_pos()]) {
		return false
	}
	p.advance(1)
	return true
}
|
|
|
|
// many consumes the literal string `s` iff the input at the cursor
// starts with it. Returns true and advances past `s` on a match.
fn (mut p Parser) many(s string) bool {
	start := p.cur_pos()
	end := start + u32(s.len)

	if p.input.len < end {
		return false
	}
	if p.input[start..end] != s {
		return false
	}

	p.advance(u32(s.len))
	return true
}
|
|
|
|
// any consumes and returns the next byte, or none at end of input.
fn (mut p Parser) any() ?u8 {
	if p.end_of_input() {
		return none
	}
	result := p.input[p.cur_pos()]
	p.advance(1)
	return result
}
|
|
|
|
// ===== Tokenizer =====
|
|
|
|
// Tokenizer embeds Parser, layering token-level scanning (parse_all and
// the parse_* helpers) on top of the char-matching primitives.
pub struct Tokenizer {
	Parser
}
|
|
|
|
// next consumes and returns the next byte, or none at end of input.
// Also serves as the iterator method for `for c in t` loops.
fn (mut t Tokenizer) next() ?u8 {
	if t.end_of_input() {
		return none
	}
	pos := t.cur_pos()
	if pos > t.input.len {
		return none
	}
	c := t.input[pos]
	t.advance(1)
	return c
}
|
|
|
|
// parse_all skips leading whitespace and returns the next token.
// Single-char tokens are emitted directly; strings, numbers,
// identifiers and operators are delegated to their parse_* helpers
// after rolling the dispatch character back so the helper re-scans it.
pub fn (mut t Tokenizer) parse_all() !Token {
	t.skip_ws()

	start_index := t.push_frame()!

	c := t.any() or { return Token{ start_index, .eof } }
	// BUG FIX: removed stray debug `print(c.ascii_str())` that echoed
	// every scanned character to stdout.

	match true {
		c == `"` {
			// Undo the lookahead so parse_str sees the opening quote.
			t.rollback()
			t.commit_frame()
			return t.parse_str()
		}
		c in "0123456789".bytes() {
			t.rollback()
			t.commit_frame()
			return t.parse_num()
		}
		c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_".bytes() {
			t.rollback()
			t.commit_frame()
			return t.parse_id()
		}
		c == `{` {
			return Token{ start_index, .table_start }
		}
		c == `}` {
			return Token{ start_index, .table_end }
		}
		c == `[` {
			return Token{ start_index, .angle_bracket_left }
		}
		c == `]` {
			return Token{ start_index, .angle_bracket_right }
		}
		c == 10 { // LF
			return Token{ start_index, .newline }
		}
		else {
			$if windows {
				if c == 13 && t.term(10) { // CRLF
					return Token{ start_index, .newline }
				}
			}

			// Everything else is treated as (part of) an operator.
			t.rollback()
			t.commit_frame()
			return t.parse_op()
		}
	}
}
|
|
|
|
// parse_op scans a maximal run of operator characters and classifies
// it. Unknown runs produce a SyntaxError.
fn (mut t Tokenizer) parse_op() !Token {
	start_index := t.push_frame()!

	mut s := []u8{len: 0, cap: 32, init: 0}
	// BUG FIX: '.' and ',' were missing from the scan set, so the
	// ".", ",", ".." and "..." arms below were unreachable and those
	// inputs failed with "invalid operator ''".
	for t.one_of("+-*/%^<>=~#.,") {
		s << t.last()!
	}

	match s.bytestr() {
		"+", "-", "*", "/", "%", "^", "#", "=" {
			return Token{ start_index, .operator }
		}
		// NOTE(review): bare "<" and ">" are grouped with the two-char
		// operators here — confirm this is intended.
		"==", "~=", "<=", ">=", "<", ">", ".." {
			return Token{ start_index, .operator_len2 }
		}
		"..." {
			return Token{ start_index, .operator_len3 }
		}
		"." {
			return Token{ start_index, .dot }
		}
		"," {
			return Token{ start_index, .comma }
		}
		else {
			return t.syntax_error("invalid operator '${s.bytestr()}'")
		}
	}
}
|
|
|
|
// parse_id scans an identifier ([A-Za-z_][A-Za-z0-9_]*) and classifies
// it as a literal (.true/.false/.nil), a reserved keyword, or a plain
// identifier.
fn (mut t Tokenizer) parse_id() !Token {
	start_index := t.push_frame()!

	mut s := []u8{len: 0, cap: 32, init: 0}

	// First character must be a letter or underscore.
	first_ok := t.range('a'[0], 'z'[0]) || t.range('A'[0], 'Z'[0]) || t.term('_'[0])
	if !first_ok {
		return t.syntax_error("invalid identifier")
	}
	s << t.last() or { return t.syntax_error("invalid identifier") }

	// Subsequent characters may also be digits.
	for t.range('a'[0], 'z'[0]) || t.range('A'[0], 'Z'[0]) || t.range('0'[0], '9'[0]) || t.term('_'[0]) {
		s << t.last() or { break }
	}

	word := s.bytestr()
	match word {
		"true" { return Token{ start_index, .true } }
		"false" { return Token{ start_index, .false } }
		"nil" { return Token{ start_index, .nil } }
		else {}
	}

	if word in constants.keywords {
		return Token{ start_index, .reserved_keyword }
	}
	return Token{ start_index, .identifier }
}
|
|
|
|
// parse_num scans a numeric literal: one or more digits, optionally a
// single '.' followed by at least one digit. Emits .decimal when a
// fractional part was seen, .number otherwise.
fn (mut t Tokenizer) parse_num() !Token {
	start_index := t.push_frame()!

	mut s := []u8{len: 0, cap: 32, init: 0}

	// A number must start with a digit.
	if t.one_of("0123456789") {
		s << t.last()!
	} else {
		return t.syntax_error("invalid number")
	}

	// Set once a '.' + digit sequence has been consumed.
	mut is_decimal := false

	loop:
	for {
		// Consume the integer (or post-dot) digit run.
		if t.one_of("0123456789") {
			s << t.last() or { break loop }
			continue
		}

		if t.term('.'[0]) {
			// A second '.' (e.g. "1.2.3") is malformed.
			if is_decimal {
				return t.syntax_error("invalid number")
			}
			s << '.'[0]

			// A trailing dot with no digit (e.g. "1.") is malformed.
			if !t.one_of("0123456789") {
				return t.syntax_error("invalid number")
			}

			is_decimal = true
			s << t.last() or { break loop }

			continue
		}
		break
	}

	// Defensive check; s always holds at least the leading digit here.
	if s.len == 0 {
		return t.syntax_error("invalid number")
	}

	return Token{
		start: start_index,
		tag: if is_decimal { .decimal } else { .number },
	}
}
|
|
|
|
// parse_str scans a double-quoted string literal. The cursor must be on
// the opening quote; the token ends at the first closing quote.
// NOTE(review): no escape-sequence handling (\" etc.) — confirm intended.
fn (mut t Tokenizer) parse_str() !Token {
	start_index := t.push_frame()!

	mut s := []u8{len: 0, cap: 4096, init: 0}

	if !t.term('"'[0]) {
		return t.syntax_error("invalid string")
	}

	// BUG FIX: the old loop (`for c in t`) consumed ALL remaining input,
	// then inspected only the very last byte of it — swallowing any text
	// after the string and falling through to a bogus .eof token. Stop
	// at the first closing quote instead.
	for !t.end_of_input() {
		c := t.any() or { break }
		if c == `"` {
			return Token{ start_index, .string }
		}
		s << c
	}

	return t.syntax_error("unclosed string")
}
|