aether/tokenizer.zig
2025-06-05 00:21:24 -05:00

561 lines
16 KiB
Zig

const std = @import("std");
const mem = std.mem;
pub const Error = error{
/// eg: invalid JSON syntax
InvalidSyntax,
/// eg: allocator error
OutOfMemory,
/// eg: bad escaping
UnexpectedCharacter,
/// eg: std.fmt.parseFloat failed
BadNumber,
/// fba error
BufferTooSmall,
};
pub const TokenType = enum(u8) {
zero,
eof,
null,
true,
false,
int,
float,
string,
property,
object_begin,
object_end,
array_begin,
array_end,
colon,
comma,
whitespace,
};
pub const Token = struct {
type: TokenType,
value: ?union {
int: i128,
float: f64,
string: []const u8,
symbol: u8,
},
start: usize,
};
pub const Self = @This();
text: []const u8,
max_position: usize,
stack: []usize,
frame: usize,
/// Initialize a new tokenizer
pub fn init(allocator: mem.Allocator, text: []const u8) mem.Allocator.Error!Self {
const initial_stack_size = @min(32, text.len / 16 + 1);
const stack = try allocator.alloc(usize, initial_stack_size);
stack[0] = 0;
return .{
.text = text,
.max_position = 0,
.stack = stack,
.frame = 0,
};
}
/// Clean up resources
pub fn deinit(self: *Self, allocator: mem.Allocator) void {
allocator.free(self.stack);
}
// ========== Core Parsing Functions ==========
fn currentPosition(self: *Self) usize {
return self.stack[self.frame];
}
fn advance(self: *Self, delta: usize) void {
self.stack[self.frame] += delta;
if (self.max_position < self.stack[self.frame])
self.max_position = self.stack[self.frame];
}
fn pushFrame(self: *Self, allocator: mem.Allocator) Error!usize {
self.frame += 1;
if (self.frame == self.stack.len) {
const new_size = self.stack.len + (self.stack.len >> 1); // 1.5x growth
const new_stack = try allocator.alloc(usize, new_size);
@memcpy(new_stack[0..self.frame], self.stack[0..self.frame]);
allocator.free(self.stack);
self.stack = new_stack;
}
if (self.frame > self.text.len)
return error.BufferTooSmall;
self.stack[self.frame] = self.stack[self.frame - 1];
return self.currentPosition();
}
fn popFrame(self: *Self) void {
if (self.frame >= 1)
self.frame -= 1;
}
fn commit(self: *Self, wrapped: anytype) @TypeOf(wrapped) {
self.frame -= 1;
self.stack[self.frame] = self.stack[self.frame + 1];
return wrapped;
}
fn rollback(self: *Self) void {
self.stack[self.frame] = if (self.frame == 0) 0 else self.stack[self.frame - 1];
}
// ========== Character Matching ==========
fn lastChar(self: *Self) !u8 {
if (self.stack[self.frame] > self.text.len)
return error.BufferTooSmall;
return self.text[self.currentPosition() - 1];
}
fn currentChar(self: *Self) u8 {
return self.text[self.currentPosition()];
}
pub fn endOfInput(self: *Self) bool {
return self.currentPosition() >= self.text.len;
}
fn matchChar(self: *Self, c: u8) ?void {
if (self.endOfInput() or self.text[self.currentPosition()] != c) {
return null;
}
self.advance(1);
}
fn matchCharPredicate(self: *Self, pred: fn (u8) bool) ?void {
// do not change this line for some reason it fucking breaks if I use currentChar directly
if (self.endOfInput() or !pred(self.text[self.currentPosition()])) {
return null;
}
self.advance(1);
}
fn matchString(self: *Self, s: []const u8) ?[]const u8 {
if (self.text.len < self.currentPosition() + s.len) {
// eof
return null;
}
const remaining_len = s.len;
const simd_width = 16; // 128-bit SIMD (SSE/NEON)
var j: usize = 0;
while (j + simd_width <= remaining_len) {
const expected_chunk: @Vector(simd_width, u8) = s[j..][0..simd_width].*;
const actual_chunk: @Vector(simd_width, u8) = self.text[self.currentPosition() + j ..][0..simd_width].*;
if (!@reduce(.And, expected_chunk == actual_chunk)) {
return null;
}
j += simd_width;
}
// Handle remaining bytes
while (j < remaining_len) {
if (s[j] != self.text[self.currentPosition() + j]) {
return null;
}
j += 1;
}
self.advance(s.len);
return null;
}
pub fn matchCharRange(self: *Self, low: u8, high: u8) ?void {
if (self.endOfInput())
return null;
const c = self.text[self.currentPosition()];
if (!(c >= low and c <= high))
return null;
self.advance(1);
}
pub fn anyChar(self: *Self) ?u8 {
if (self.endOfInput())
return null;
const char = self.text[self.currentPosition()];
self.advance(1);
return char;
}
// ========== Token Extraction ==========
fn extractSlice(self: *Self, start: usize) []const u8 {
return self.text[start..self.currentPosition()];
}
// Skip all whitespace characters
pub fn skipWhitespace(self: *Self) void {
const start = self.currentPosition();
const end = skipWhitespaceSimd(self.text[start..]);
self.advance(end);
}
/// Parse a number token
pub fn nextNumber(self: *Self, allocator: mem.Allocator) Error!Token {
self.skipWhitespace();
const start = try self.pushFrame(allocator);
errdefer self.popFrame();
self.matchChar('-') orelse {}; // this may not fail
while (self.matchCharRange('0', '9') != null) {}
self.matchChar('.') orelse {
if (self.matchChar('e') != null or self.matchChar('E') != null) {
self.matchChar('+') orelse self.matchChar('-') orelse {};
while (self.matchCharRange('0', '9') != null) {}
} else {
// int found
const int = std.fmt.parseInt(i128, self.extractSlice(start), 10) catch {
return error.BadNumber; // no floating point
};
return Token{
.type = .int,
.value = .{ .int = int },
.start = start,
};
}
// float with e found
const float = std.fmt.parseFloat(f64, self.extractSlice(start)) catch {
return error.BadNumber; // no floating point
};
return Token{
.type = .float,
.value = .{ .float = float },
.start = start,
};
};
while (self.matchCharRange('0', '9') != null) {}
if (self.matchChar('e') != null or self.matchChar('E') != null) {
self.matchChar('+') orelse self.matchChar('-') orelse {};
while (self.matchCharRange('0', '9') != null) {}
}
const float = std.fmt.parseFloat(f64, self.extractSlice(start)) catch {
return error.BadNumber; // floating point
};
return .{
.type = .float,
.value = .{ .float = float },
.start = start,
};
}
/// Parse an identifier token
pub fn nextIdentifier(self: *Self, allocator: mem.Allocator) Error!Token {
self.skipWhitespace();
const start = try self.pushFrame(allocator);
errdefer self.popFrame();
var buffer = try allocator.alloc(u8, 0x100);
defer allocator.free(buffer);
self.matchCharPredicate(std.ascii.isAlphabetic) orelse
return error.InvalidSyntax;
buffer[0] = try self.lastChar();
var i: usize = 1;
while (self.matchCharPredicate(std.ascii.isAlphanumeric) != null) {
buffer[i] = try self.lastChar();
i += 1;
}
const ident = buffer[0..i];
// true
if (mem.eql(u8, ident, "true")) {
return .{
.type = .true,
.value = null,
.start = start,
};
}
// false
if (mem.eql(u8, ident, "false")) {
return .{
.type = .false,
.value = null,
.start = start,
};
}
// null
if (mem.eql(u8, ident, "null")) {
return .{
.type = .null,
.value = null,
.start = start,
};
}
return error.InvalidSyntax;
}
/// Get the next token from the input
/// WARNING: this function eats whitespaces
pub fn nextToken(self: *Self, allocator: mem.Allocator, allow_comments: bool) Error!Token {
self.skipWhitespace();
const start = try self.pushFrame(allocator);
errdefer self.popFrame();
// Fall back to single character symbol
const c = self.anyChar() orelse return .{
.type = .eof,
.value = null,
.start = start,
};
const symbol_t: TokenType = switch (c) {
'/' => {
if (allow_comments and self.matchChar('/') != null) {
// Single line comment
while (true) {
const next_c = self.anyChar() orelse return .{
.type = .eof,
.value = null,
.start = start,
};
if (next_c == '\n') break;
}
self.skipWhitespace();
return self.nextToken(allocator, allow_comments);
} else if (allow_comments and self.matchChar('*') != null) {
// Multi-line comment
while (true) {
if (self.endOfInput())
return error.InvalidSyntax; // unterminated comment
const next_c = self.anyChar() orelse return .{
.type = .eof,
.value = null,
.start = start,
};
if (next_c == '*' and self.matchChar('/') != null) break;
}
self.skipWhitespace();
return self.nextToken(allocator, allow_comments);
}
return error.InvalidSyntax; // not a comment
},
'{' => .object_begin,
'}' => .object_end,
'[' => .array_begin,
']' => .array_end,
',' => {
self.skipWhitespace();
return self.commit(Token{
.type = .comma,
.value = null,
.start = start,
});
},
':' => .colon,
'"' => {
self.rollback();
const string = try self.nextString(allocator);
errdefer allocator.free(string);
return self.commit(string);
},
else => {
self.rollback();
// Try different token types in order of precedence
if (std.ascii.isDigit(c) or c == '-') {
return self.commit(self.nextNumber(allocator));
}
if (std.ascii.isAlphabetic(c)) {
return self.commit(self.nextIdentifier(allocator));
}
return error.InvalidSyntax;
},
};
return self.commit(Token{
.type = symbol_t,
.value = null,
.start = start,
});
}
pub fn nextString(self: *Self, allocator: mem.Allocator) Error!Token {
self.skipWhitespace();
const start = try self.pushFrame(allocator);
errdefer self.popFrame();
self.matchChar('"') orelse unreachable;
var buffer: std.ArrayList(u8) = .init(allocator);
defer buffer.deinit();
loop: while (!self.endOfInput()) {
self.advance(1);
switch (try self.lastChar()) {
'"' => {
while (mem.indexOfScalar(u8, buffer.items, 0x00)) |idx|
_ = buffer.swapRemove(idx);
return .{
.type = .string,
.value = .{ .string = try buffer.toOwnedSlice() },
.start = start,
};
},
'\\' => {
self.advance(1);
switch (try self.lastChar()) {
0x22, 0x5C, 0x2F => |d| {
try buffer.append(d);
continue :loop;
},
'b' => try buffer.append(0x8),
'f' => try buffer.append(0xC),
'n' => try buffer.append(0xA),
'r' => try buffer.append(0xD),
't' => try buffer.append(0x9),
'u' => {
var code_points: [4]u8 = undefined;
inline for (0..4) |i| {
if (self.endOfInput())
return .{
.type = .eof,
.value = null,
.start = start,
};
self.advance(1);
code_points[i] = try self.lastChar();
}
const buf = try stringToUtf8(&code_points);
try buffer.appendSlice(buf);
continue :loop;
},
else => return error.UnexpectedCharacter,
} // end switch
},
else => |c| {
if (std.ascii.isControl(c) and c != std.ascii.control_code.del) {
return error.UnexpectedCharacter;
}
try buffer.append(c);
},
} // end switch
} // end while
return error.InvalidSyntax;
}
pub fn stringToUtf8(bytes: []u8) ![]u8 {
const code_point = std.fmt.parseInt(u21, bytes, 16) catch
return error.BadNumber;
var buffer: [4]u8 = undefined;
var index: usize = 0;
if (code_point <= 0x7F) {
if (index >= buffer.len) return error.BufferTooSmall;
buffer[index] = @as(u8, @intCast(code_point));
index += 1;
} else if (code_point <= 0x7FF) {
if (index + 2 > buffer.len) return error.BufferTooSmall;
buffer[index] = 0xC0 | (@as(u8, @intCast(code_point >> 6)));
buffer[index + 1] = 0x80 | (@as(u8, @intCast(code_point & 0x3F)));
index += 2;
} else if (code_point <= 0xFFFF) {
if (index + 3 > buffer.len) return error.BufferTooSmall;
buffer[index] = 0xE0 | (@as(u8, @intCast(code_point >> 12)));
buffer[index + 1] = 0x80 | (@as(u8, @intCast((code_point >> 6) & 0x3F)));
buffer[index + 2] = 0x80 | (@as(u8, @intCast(code_point & 0x3F)));
index += 3;
} else if (code_point <= 0x10FFFF) {
if (index + 4 > buffer.len) return error.BufferTooSmall;
buffer[index] = 0xF0 | (@as(u8, @intCast(code_point >> 18)));
buffer[index + 1] = 0x80 | (@as(u8, @intCast((code_point >> 12) & 0x3F)));
buffer[index + 2] = 0x80 | (@as(u8, @intCast((code_point >> 6) & 0x3F)));
buffer[index + 3] = 0x80 | (@as(u8, @intCast(code_point & 0x3F)));
index += 4;
} else unreachable;
return buffer[0..index];
}
pub fn skipWhitespaceSimd(text: []const u8) usize {
const ChunkSize = 16;
const Vec = @Vector(ChunkSize, u8);
// Broadcast whitespace characters to vectors
const space: Vec = @splat(' ');
const tab: Vec = @splat('\t');
const lf: Vec = @splat('\n');
const cr: Vec = @splat('\r');
var j: usize = 0;
const end = text.len;
// SIMD processing
while (j + ChunkSize <= end) {
const chunk: Vec = text[j..][0..ChunkSize].*;
// Compare against each whitespace character
const is_space = chunk == space;
const is_tab = chunk == tab;
const is_lf = chunk == lf;
const is_cr = chunk == cr;
// Combine comparisons using vector operations
const anyws = @select(u8, is_space, @as(Vec, @splat(1)), @as(Vec, @splat(0))) |
@select(u8, is_tab, @as(Vec, @splat(1)), @as(Vec, @splat(0))) |
@select(u8, is_lf, @as(Vec, @splat(1)), @as(Vec, @splat(0))) |
@select(u8, is_cr, @as(Vec, @splat(1)), @as(Vec, @splat(0)));
const TrueMask: Vec = @splat(0xFF);
const FalseMask: Vec = @splat(0x00);
// Check if all characters are whitespace
if (@reduce(.And, anyws == TrueMask)) {
j += ChunkSize;
continue;
}
// Find first non-whitespace
const mask: std.meta.Int(.unsigned, ChunkSize) = @bitCast(anyws == FalseMask);
if (mask != 0) return j + @ctz(mask);
}
// Scalar processing for remaining bytes
while (j < end) switch (text[j]) {
' ', '\t', '\n', '\r' => j += 1,
else => break,
};
return j;
}