module token

import scanner { Scanner }
import constants

// ===== Tokenizer =====

// Token is a compact token record: the byte offset where the token begins
// in the input, plus its classification tag. Packed so a token fits in
// 5 bytes; the token's text is recovered from the input via `start`.
@[packed]
pub struct Token {
pub:
	start u32
	tag   TokenType
}

// TokenType classifies a Token. Operator tokens are split by length
// (`operator`, `operator_len2`, `operator_len3`) so consumers can slice
// the source text without re-scanning.
pub enum TokenType as u8 {
	eof
	nil
	string
	number
	decimal
	true
	false
	table_start
	table_end
	identifier
	comma
	newline
	equal
	dot
	angle_bracket_left
	angle_bracket_right
	keyword
	operator
	operator_len2
	operator_len3
}

// Tokenizer embeds Scanner (cursor, frame stack, matching primitives) and
// layers token recognition on top of it.
pub struct Tokenizer {
	Scanner
}

// next returns the byte at the current position and advances past it,
// or `none` when the input is exhausted.
fn (mut t Tokenizer) next() ?u8 {
	pos := t.cur_pos()
	// FIX: was `pos > t.input.len`, which still admitted pos == input.len and
	// relied solely on end_of_input() to prevent an out-of-bounds read on
	// `t.input[pos]` below. `>=` makes this guard protect the index directly.
	if t.end_of_input() || pos >= t.input.len {
		return none
	}
	c := t.input[pos]
	t.advance(1) // 1 char
	return c
}

// parse_all consumes leading whitespace and returns the next token,
// dispatching to the specialized parser for strings, numbers, identifiers
// and operators. Returns an `.eof` token at end of input.
pub fn (mut t Tokenizer) parse_all() !Token {
	t.skip_ws()
	start_index := t.push_frame()!
	c := t.any() or { return Token{start_index, .eof} }
	match true {
		c == `"` {
			// Put the quote back and let parse_str consume the whole literal.
			t.rollback()
			t.commit_frame()
			return t.parse_str()
		}
		c in '0123456789'.bytes() {
			t.rollback()
			t.commit_frame()
			return t.parse_num()
		}
		c in 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_'.bytes() {
			t.rollback()
			t.commit_frame()
			return t.parse_id()
		}
		c == `{` {
			return Token{start_index, .table_start}
		}
		c == `}` {
			return Token{start_index, .table_end}
		}
		c == `[` {
			return Token{start_index, .angle_bracket_left}
		}
		c == `]` {
			return Token{start_index, .angle_bracket_right}
		}
		c == `\n` { // LF
			return Token{start_index, .newline}
		}
		else {
			// FIX: CRLF handling was gated behind `$if windows`, a
			// compile-time check on the HOST platform. Line endings are a
			// property of the INPUT (a file written on Windows can be read
			// anywhere), and an unhandled `\r` previously fell through to
			// parse_op and raised "invalid operator ''". Handle CRLF
			// unconditionally at runtime instead.
			if c == `\r` && t.term(`\n`) { // CRLF
				return Token{start_index, .newline}
			}
			t.rollback()
			t.commit_frame()
			return t.parse_op()
		}
	}
}

// parse_op greedily consumes a run of operator characters and classifies
// the run by its exact text. Unknown runs are a syntax error.
fn (mut t Tokenizer) parse_op() !Token {
	start_index := t.push_frame()!
	mut s := []u8{len: 0, cap: 32, init: 0}
	for t.one_of('+-*/%^<>=~#.$,?') {
		s << t.last()!
	}
	match s.bytestr() {
		'+', '-', '*', '/', '%', '^', '#', '=' {
			return Token{start_index, .operator}
		}
		'==', '~=', '<=', '>=', '<', '>', '..' {
			return Token{start_index, .operator_len2}
		}
		'...' {
			return Token{start_index, .operator_len3}
		}
		'.' {
			return Token{start_index, .dot}
		}
		',' {
			return Token{start_index, .comma}
		}
		else {
			return t.syntax_error("invalid operator '${s.bytestr()}'")
		}
	}
}

// parse_id consumes an identifier ([A-Za-z_][A-Za-z0-9_]*) and maps the
// reserved words `nil`/`true`/`false` and the entries of constants.keywords
// to their dedicated token types; everything else is `.identifier`.
fn (mut t Tokenizer) parse_id() !Token {
	start_index := t.push_frame()!
	mut s := []u8{len: 0, cap: 32, init: 0}
	// First character must be a letter or underscore.
	if t.range(`a`, `z`) || t.range(`A`, `Z`) || t.term(`_`) {
		s << t.last() or { return t.syntax_error('invalid identifier') }
	} else {
		return t.syntax_error('invalid identifier')
	}
	// Subsequent characters may also be digits.
	for t.range(`a`, `z`) || t.range(`A`, `Z`) || t.range(`0`, `9`) || t.term(`_`) {
		s << t.last() or { break }
	}
	return match s.bytestr() {
		'nil' {
			Token{start_index, .nil}
		}
		'true' {
			Token{start_index, .true}
		}
		'false' {
			Token{start_index, .false}
		}
		else {
			if s.bytestr() in constants.keywords {
				return Token{start_index, .keyword}
			}
			return Token{start_index, .identifier}
		}
	}
}

// parse_num consumes an integer or decimal literal. A literal must start
// with a digit; at most one '.' is allowed and it must be followed by at
// least one digit. The tag distinguishes `.number` from `.decimal`.
fn (mut t Tokenizer) parse_num() !Token {
	start_index := t.push_frame()!
	mut s := []u8{len: 0, cap: 32, init: 0}
	if t.one_of('0123456789') {
		s << t.last()!
	} else {
		return t.syntax_error('invalid number')
	}
	mut is_decimal := false
	loop: for {
		if t.one_of('0123456789') {
			s << t.last() or { break loop }
			continue
		}
		if t.term(`.`) {
			// A second '.' inside one literal is malformed.
			if is_decimal {
				return t.syntax_error('invalid number')
			}
			s << `.`
			// The '.' must be followed by a digit.
			// NOTE(review): this means a number immediately followed by the
			// `..` operator (e.g. `1..2`) is rejected here, since the first
			// `.` is already consumed — confirm this is intended.
			if !t.one_of('0123456789') {
				return t.syntax_error('invalid number')
			}
			is_decimal = true
			s << t.last() or { break loop }
			continue
		}
		break
	}
	// FIX: removed the dead `if s.len == 0` check — the mandatory leading
	// digit above guarantees `s` holds at least one byte by this point.
	return Token{
		start: start_index
		tag: if is_decimal { .decimal } else { .number }
	}
}

// parse_str consumes a double-quoted string literal. Backslash escapes one
// character (so `\"` does not terminate the string); reaching end of input
// before the closing quote is a syntax error.
fn (mut t Tokenizer) parse_str() !Token {
	start_index := t.push_frame()!
	if !t.term(`"`) {
		return t.syntax_error('invalid string')
	}
	for !t.end_of_input() {
		if t.term(`"`) {
			return Token{start_index, .string}
		}
		if t.term(`\\`) {
			// Skip the escaped character so an escaped quote does not end
			// the literal.
			// TODO: validate the escape sequence instead of skipping blindly.
			t.advance(1)
			continue
		}
		t.advance(1) // 1 char
	}
	return t.syntax_error('uncompleted string literal')
}