epic JSON parser

yuzu 2025-05-23 18:17:59 -05:00
commit 7bc4973bf4
5 changed files with 1081 additions and 0 deletions

397
2.zig Normal file

@@ -0,0 +1,397 @@
const std = @import("std");
const Tokenizer = @import("tokenizer.zig");
const StringPool = @import("strings.zig");
const StringIndex = StringPool.StringIndex;
const assert = std.debug.assert;
const Self = @This();
pub const Error = error{};
pub const JsonType = enum {
null,
bool,
number,
string,
array,
object,
};
pub const JsonValue = union(JsonType) {
null: void,
bool: bool,
number: f64,
string: StringIndex,
array: ArrayIndex.Slice,
object: ObjectIndex.Entry,
};
pub const JsonInput = union(JsonType) {
null: void,
bool: bool,
number: f64,
string: []const u8,
array: []JsonInput,
object: std.StringArrayHashMapUnmanaged(JsonInput),
};
pub const ArrayIndex = enum(usize) {
_,
pub const Slice = struct {
start: usize,
len: usize,
};
};
pub const ObjectIndex = enum(usize) {
_,
pub const Entry = struct {
len: usize,
property_idx: usize,
value_idx: usize,
};
};
pub const Options = struct {
comptime max_depth: usize = 256,
};
index: std.MultiArrayList(JsonValue) = .{},
string_index: StringPool = .empty,
options: Options = .{},
pub const init: Self = .{};
pub fn deinit(self: *Self, allocator: std.mem.Allocator) void {
self.index.deinit(allocator);
self.string_index.deinit(allocator);
}
fn addNumber(self: *Self, allocator: std.mem.Allocator, number: f64) !usize {
try self.index.ensureUnusedCapacity(allocator, 1);
const idx = self.index.addOneAssumeCapacity();
self.index.set(idx, .{ .number = number });
return idx;
}
fn addString(self: *Self, allocator: std.mem.Allocator, bytes: []const u8) !usize {
const stridx = try self.string_index.add(allocator, bytes);
try self.index.ensureUnusedCapacity(allocator, 1);
const idx = self.index.addOneAssumeCapacity();
self.index.set(idx, .{ .string = stridx });
return idx;
}
fn addObject(self: *Self, allocator: std.mem.Allocator, object: std.StringArrayHashMapUnmanaged(JsonInput)) !usize {
var entry: ?ObjectIndex.Entry = null;
for (object.keys(), object.values(), 0..) |key, value, times| {
const stridx = try self.string_index.add(allocator, key);
// recurse; addValue returns the index of the stored value
const vidx = try self.addValue(allocator, value);
if (times == 0) {
entry = ObjectIndex.Entry{
.len = object.entries.len,
.property_idx = @intFromEnum(stridx),
.value_idx = vidx,
};
}
}
try self.index.ensureUnusedCapacity(allocator, 1);
const idx = self.index.addOneAssumeCapacity();
if (entry) |e| {
self.index.set(idx, .{ .object = e });
return idx;
} else {
self.index.set(idx, .{ .object = ObjectIndex.Entry{
.len = 0,
.property_idx = 0,
.value_idx = 0,
} });
return idx;
}
}
fn addEmptyObject(self: *Self, allocator: std.mem.Allocator) !usize {
try self.index.ensureUnusedCapacity(allocator, 1);
const idx = self.index.addOneAssumeCapacity();
const object: ObjectIndex.Entry = .{
.property_idx = self.string_index.string_bytes.items.len,
.value_idx = self.index.len,
.len = 0,
};
self.index.set(idx, .{ .object = object });
return idx;
}
fn addArray(self: *Self, allocator: std.mem.Allocator, array: []JsonInput) !usize {
var entry: ?ArrayIndex.Slice = null;
for (array, 0..) |value, times| {
// recurse; addValue returns the index of the stored value
const idx = try self.addValue(allocator, value);
if (times == 0) {
entry = ArrayIndex.Slice{
.start = idx,
.len = array.len,
};
}
}
try self.index.ensureUnusedCapacity(allocator, 1);
const idx = self.index.addOneAssumeCapacity();
if (entry) |e| {
self.index.set(idx, .{ .array = e });
return idx;
} else {
self.index.set(idx, .{ .array = ArrayIndex.Slice{
.start = 0,
.len = 0,
} });
return idx;
}
}
fn addBool(self: *Self, allocator: std.mem.Allocator, value: bool) !usize {
try self.index.ensureUnusedCapacity(allocator, 1);
const idx = self.index.addOneAssumeCapacity();
self.index.set(idx, .{ .bool = value });
return idx;
}
// explicit error set so the mutual recursion with addObject/addArray resolves
fn addValue(self: *Self, allocator: std.mem.Allocator, value: JsonInput) std.mem.Allocator.Error!usize {
switch (value) {
.null => {
try self.index.ensureUnusedCapacity(allocator, 1);
const idx = self.index.addOneAssumeCapacity();
self.index.set(idx, .{ .null = {} });
return idx;
},
.bool => |b| return self.addBool(allocator, b),
.number => |n| return self.addNumber(allocator, n),
.string => |s| return self.addString(allocator, s),
.array => |a| return self.addArray(allocator, a),
.object => |o| return self.addObject(allocator, o),
}
}
fn getString(self: *Self, bytes: []const u8) ?StringIndex {
// the table stores offsets only, so lookups must go through the adapter
return self.string_index.string_table.getKeyAdapted(bytes, StringIndex.TableIndexAdapter{ .bytes = self.string_index.string_bytes.items });
}
fn getNumber(self: *Self, index: usize) ?f64 {
// MultiArrayList.get returns the element itself, not an optional
return switch (self.index.get(index)) {
.number => |n| n,
else => null,
};
}
fn getObject(self: *Self, allocator: std.mem.Allocator, index: usize) !?struct {
[]StringIndex,
[]usize,
} {
const entry = self.index.get(index);
if (entry.object.len == 0) {
return .{ &.{}, &.{} };
}
var pidx = entry.object.property_idx;
var vidx = entry.object.value_idx;
const keys = try allocator.alloc(StringIndex, entry.object.len);
const values = try allocator.alloc(usize, entry.object.len);
for (0..entry.object.len) |i| {
const slice = StringIndex.slice(@enumFromInt(pidx), &self.string_index);
keys[i] = @enumFromInt(pidx);
values[i] = vidx;
pidx += slice.len + 1;
vidx += 1;
}
return .{ keys, values };
}
fn getArray(self: *Self, allocator: std.mem.Allocator, index: usize) !?[]usize {
const entry = self.index.get(index);
if (entry.array.len == 0) {
return &.{};
}
var idx = entry.array.start;
const values = try allocator.alloc(usize, entry.array.len);
for (0..entry.array.len) |i| {
values[i] = idx;
idx += 1;
}
return values;
}
fn getBool(self: *Self, index: usize) ?bool {
return switch (self.index.get(index)) {
.bool => |b| b,
else => null,
};
}
fn getNull(self: *Self, index: usize) ?void {
return switch (self.index.get(index)) {
.null => {},
else => null,
};
}
fn getValue(self: *Self, allocator: std.mem.Allocator, index: usize) !?JsonInput {
const entry = self.index.get(index);
switch (entry) {
.null => return .{ .null = {} },
.bool => return .{ .bool = entry.bool },
.number => return .{ .number = entry.number },
.string => {
const str = entry.string.slice(&self.string_index);
return .{ .string = str };
},
.array => {
const res = try allocator.alloc(JsonInput, entry.array.len);
var idx = entry.array.start;
for (0..entry.array.len) |i| {
if (try self.getValue(allocator, idx)) |v| {
res[i] = v;
idx += 1;
} else unreachable;
}
return .{ .array = res };
},
.object => {
var kidx = entry.object.property_idx;
var vidx = entry.object.value_idx;
var obj: std.StringArrayHashMapUnmanaged(JsonInput) = .empty;
try obj.ensureTotalCapacity(allocator, entry.object.len);
for (0..entry.object.len) |_| {
const key = StringIndex.slice(@enumFromInt(kidx), &self.string_index);
const val = (try self.getValue(allocator, vidx)).?;
obj.putAssumeCapacityNoClobber(key, val);
kidx += key.len + 1; // keys are NUL-terminated in the pool; skip past the terminator
vidx += 1;
}
return .{ .object = obj };
},
}
}
pub fn parse(self: *Self, tokenizer: *Tokenizer) !void {
const allocator = tokenizer.allocator;
var it = tokenizer.iterator();
const depth_buf = try allocator.alloc(usize, self.options.max_depth);
defer allocator.free(depth_buf);
var cycles: usize = 0;
while (it.next()) |token| {
switch (token.type) {
.object_begin => {
std.debug.print("{{", .{});
const obj_idx = try self.addEmptyObject(allocator);
depth_buf[cycles] = obj_idx;
if (tokenizer.prev_token) |t| if (t.type == .object_begin) {
// add map to itself
const data = self.index.get(depth_buf[cycles - 1]);
switch (data) {
.object => |valid_entry| {
const new_data = ObjectIndex.Entry{
.len = valid_entry.len + 1,
.property_idx = self.string_index.string_table.size,
.value_idx = obj_idx,
};
self.index.set(depth_buf[cycles - 1], .{ .object = new_data });
tokenizer.prev_token = null; // reset
},
else => unreachable,
}
} else tokenizer.pushBack(token);
cycles += 1;
continue;
},
.object_end => {
const keys, const vals = (try self.getObject(allocator, depth_buf[cycles - 1])).?;
std.debug.print("\nfound {d} keys and {d} values\n", .{ keys.len, vals.len });
for (keys, vals) |k, v| {
const key = k.slice(&self.string_index);
const val = self.index.get(v);
std.debug.print(
\\"{s}": {s},
, .{ key, @tagName(val) });
}
std.debug.print("}}", .{});
},
.string => {
const idx = try self.addString(allocator, token.value.?.string);
if (cycles > 0) {
// only index into depth_buf once we are inside a container
const last_obj = self.index.get(depth_buf[cycles - 1]);
const stridx = self.index.get(idx).string;
self.index.set(depth_buf[cycles - 1], .{ .object = ObjectIndex.Entry{
.len = last_obj.object.len + 1,
.property_idx = if (cycles > 1) @intFromEnum(stridx) else last_obj.object.property_idx,
.value_idx = last_obj.object.value_idx,
} });
continue;
}
},
.number => {
_ = try self.addNumber(allocator, token.value.?.number);
if (cycles > 0) {
const last_obj = self.index.get(depth_buf[cycles - 1]);
self.index.set(depth_buf[cycles - 1], .{ .object = ObjectIndex.Entry{
.len = last_obj.object.len,
.property_idx = last_obj.object.property_idx,
.value_idx = last_obj.object.value_idx,
} });
continue;
}
},
.true, .false => {
_ = try self.addBool(allocator, token.type == .true);
if (cycles > 0) {
const last_obj = self.index.get(depth_buf[cycles - 1]);
self.index.set(depth_buf[cycles - 1], .{ .object = ObjectIndex.Entry{
.len = last_obj.object.len,
.property_idx = last_obj.object.property_idx,
.value_idx = last_obj.object.value_idx,
} });
continue;
}
},
else => {},
}
tokenizer.skipWhitespace();
}
}
test parse {
var arena = std.heap.ArenaAllocator.init(std.testing.allocator);
defer arena.deinit();
const allocator = arena.allocator();
var self = init;
defer deinit(&self, allocator);
var tokenizer = try Tokenizer.init(allocator, blk: {
const json =
\\ {
\\ "key": 123,
\\ "key2": false,
\\ "key3": true,
\\ "key4": null
\\ }
;
break :blk json;
});
try parse(&self, &tokenizer);
}
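// A minimal round-trip sketch for the scalar helpers above; it assumes only
// the addNumber/addBool and getNumber/getBool signatures defined in this file.
test "scalar round-trip" {
var arena = std.heap.ArenaAllocator.init(std.testing.allocator);
defer arena.deinit();
const allocator = arena.allocator();
var self = init;
defer deinit(&self, allocator);
const nidx = try self.addNumber(allocator, 3.5);
try std.testing.expectEqual(@as(?f64, 3.5), self.getNumber(nidx));
const bidx = try self.addBool(allocator, true);
try std.testing.expectEqual(@as(?bool, true), self.getBool(bidx));
}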

34
build.zig Normal file

@@ -0,0 +1,34 @@
const std = @import("std");
pub fn build(b: *std.Build) void {
const target = b.standardTargetOptions(.{});
const optimize = .ReleaseSafe;
const exe_mod = b.createModule(.{
.root_source_file = b.path("src/main.zig"),
.target = target,
.optimize = optimize,
});
const exe = b.addExecutable(.{
.name = "aether",
.root_module = exe_mod,
});
b.installArtifact(exe);
const run_cmd = b.addRunArtifact(exe);
run_cmd.step.dependOn(b.getInstallStep());
const run_step = b.step("run", "Run the app");
run_step.dependOn(&run_cmd.step);
const exe_unit_tests = b.addTest(.{
.root_module = exe_mod,
});
const run_exe_unit_tests = b.addRunArtifact(exe_unit_tests);
const test_step = b.step("test", "Run unit tests");
test_step.dependOn(&run_exe_unit_tests.step);
}
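With these steps wired up, `zig build` installs the `aether` executable, `zig build run` builds and runs it, and `zig build test` runs the unit tests declared in the same root module.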

16
build.zig.zon Normal file

@@ -0,0 +1,16 @@
.{
.name = .aether,
.version = "0.0.0",
.fingerprint = 0x255cfdbd72bde30d,
.minimum_zig_version = "0.15.0-dev.552+bc2f7c754",
.dependencies = .{
},
.paths = .{
"build.zig",
"build.zig.zon",
"src",
},
}

81
strings.zig Normal file

@@ -0,0 +1,81 @@
//! credits to Andrew Kelley
//! strings.zig
const std = @import("std");
const mem = std.mem;
const assert = std.debug.assert;
const Allocator = std.mem.Allocator;
const Self = @This();
const max_load_percent = std.hash_map.default_max_load_percentage;
string_bytes: std.ArrayListUnmanaged(u8) = .empty,
string_table: StringIndex.Table = .empty,
pub const empty = Self{
.string_bytes = .empty,
.string_table = .empty,
};
pub fn deinit(self: *Self, allocator: Allocator) void {
self.string_bytes.deinit(allocator);
self.string_table.deinit(allocator);
}
pub const StringIndex = enum(u32) {
_,
const Table = std.HashMapUnmanaged(StringIndex, void, TableContext, max_load_percent);
const TableContext = struct {
bytes: []const u8,
pub fn eql(_: @This(), a: StringIndex, b: StringIndex) bool {
return a == b;
}
pub fn hash(ctx: @This(), key: StringIndex) u64 {
return std.hash_map.hashString(mem.sliceTo(ctx.bytes[@intFromEnum(key)..], 0));
}
};
pub const TableIndexAdapter = struct {
bytes: []const u8,
pub fn eql(ctx: @This(), a: []const u8, b: StringIndex) bool {
return mem.eql(u8, a, mem.sliceTo(ctx.bytes[@intFromEnum(b)..], 0));
}
pub fn hash(_: @This(), adapted_key: []const u8) u64 {
assert(mem.indexOfScalar(u8, adapted_key, 0) == null);
return std.hash_map.hashString(adapted_key);
}
};
pub fn slice(index: StringIndex, state: *const Self) [:0]const u8 {
const start_slice = state.string_bytes.items[@intFromEnum(index)..];
return start_slice[0..mem.indexOfScalar(u8, start_slice, 0).? :0];
}
};
pub fn add(state: *Self, allocator: Allocator, bytes: []const u8) !StringIndex {
try state.string_bytes.ensureUnusedCapacity(allocator, bytes.len + 1);
const gop = try state.string_table.getOrPutContextAdapted(
allocator,
bytes,
StringIndex.TableIndexAdapter{ .bytes = state.string_bytes.items },
StringIndex.TableContext{ .bytes = state.string_bytes.items },
);
if (gop.found_existing) return gop.key_ptr.*;
const new_off: StringIndex = @enumFromInt(state.string_bytes.items.len);
state.string_bytes.appendSliceAssumeCapacity(bytes);
state.string_bytes.appendAssumeCapacity(0);
gop.key_ptr.* = new_off;
return new_off;
}
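// Interning sketch: adding the same bytes twice returns the same StringIndex,
// and slice() recovers the original bytes. Uses only the add/slice API above.
test add {
var pool: Self = .empty;
defer pool.deinit(std.testing.allocator);
const a = try pool.add(std.testing.allocator, "key");
const b = try pool.add(std.testing.allocator, "key");
try std.testing.expectEqual(a, b);
try std.testing.expectEqualStrings("key", a.slice(&pool));
}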

553
tokenizer.zig Normal file

@@ -0,0 +1,553 @@
const std = @import("std");
const mem = std.mem;
pub const Error = error{
/// eg: invalid JSON syntax
InvalidSyntax,
/// eg: allocator error
OutOfMemory,
/// eg: bad escaping
UnexpectedCharacter,
/// eg: got the wrong token type, check TokenType
UnexpectedToken,
/// eg: std.fmt.parseFloat failed
BadNumber,
/// fba error
BufferTooSmall,
/// eg: missing comma
CommaExpected,
/// eg: missing colon
ColonExpected,
/// eg: missing object key
KeyExpected,
/// eg: error while writing
PrintError,
/// eg: trailing comma in object
TrailingComma,
};
pub const TokenType = enum(u8) {
eof,
null,
true,
false,
number,
string,
property,
object_begin,
object_end,
array_begin,
array_end,
colon,
comma,
whitespace,
};
pub const Token = struct {
type: TokenType,
value: ?union {
number: f64,
string: []const u8,
symbol: u8,
},
start: usize,
end: usize,
};
pub const Self = @This();
text: []const u8,
position: usize,
max_position: usize,
stack: []usize,
frame: usize,
allocator: std.mem.Allocator,
prev_token: ?Token = null,
pub fn pushBack(self: *Self, token: Token) void {
self.prev_token = token;
}
/// Initialize a new tokenizer
pub fn init(allocator: std.mem.Allocator, text: []const u8) std.mem.Allocator.Error!Self {
const stack = try allocator.alloc(usize, 0x100);
errdefer allocator.free(stack);
@memset(stack, 0);
return .{
.text = text,
.position = 0,
.max_position = 0,
.stack = stack,
.frame = 0,
.allocator = allocator,
};
}
/// Clean up resources
pub fn deinit(self: *Self) void {
self.allocator.free(self.stack);
}
// ========== Core Parsing Functions ==========
fn currentPosition(self: *Self) usize {
return self.stack[self.frame];
}
fn advance(self: *Self, delta: usize) void {
self.stack[self.frame] += delta;
if (self.max_position < self.stack[self.frame])
self.max_position = self.stack[self.frame];
}
fn pushFrame(self: *Self) Error!usize {
self.frame += 1;
if (self.frame == self.stack.len) {
const new_stack = try self.allocator.alloc(usize, self.stack.len * 2);
@memset(new_stack, 0);
@memcpy(new_stack[0..self.stack.len], self.stack); // @memcpy requires equal lengths
self.allocator.free(self.stack);
self.stack = new_stack;
}
self.stack[self.frame] = self.stack[self.frame - 1];
return self.currentPosition();
}
fn popFrame(self: *Self) void {
self.frame -= 1;
}
fn commit(self: *Self, wrapped: anytype) @TypeOf(wrapped) {
self.frame -= 1;
self.stack[self.frame] = self.stack[self.frame + 1];
return wrapped;
}
fn rollback(self: *Self) void {
self.stack[self.frame] = if (self.frame == 0) 0 else self.stack[self.frame - 1];
}
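// Frame-stack sketch, assuming the private helpers above: pushFrame snapshots
// the cursor, commit folds the child frame's progress back into the parent,
// rollback discards it.
test "frame stack" {
var t = try init(std.testing.allocator, "abc");
defer t.deinit();
_ = try t.pushFrame();
t.advance(2);
t.commit({});
try std.testing.expectEqual(@as(usize, 2), t.currentPosition());
}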
// ========== Character Matching ==========
fn lastChar(self: *Self) u8 {
return self.text[self.currentPosition() - 1];
}
fn currentChar(self: *Self) u8 {
return self.text[self.currentPosition()];
}
fn endOfInput(self: *Self) bool {
return self.currentPosition() >= self.text.len;
}
fn matchChar(self: *Self, c: u8) ?void {
if (self.endOfInput() or self.text[self.currentPosition()] != c) {
return null;
}
self.advance(1);
}
fn matchCharPredicate(self: *Self, pred: fn (u8) bool) ?void {
// keep the explicit bounds check first: the predicate must not run once input is exhausted
if (self.endOfInput() or !pred(self.text[self.currentPosition()])) {
return null;
}
self.advance(1);
}
fn matchString(self: *Self, s: []const u8) ?[]const u8 {
if (self.text.len < self.currentPosition() + s.len) {
// not enough input left
return null;
}
const start = self.currentPosition();
const simd_width = 16; // 128-bit SIMD (SSE/NEON)
var j: usize = 0;
while (j + simd_width <= s.len) {
const expected_chunk: @Vector(simd_width, u8) = s[j..][0..simd_width].*;
const actual_chunk: @Vector(simd_width, u8) = self.text[start + j ..][0..simd_width].*;
if (!@reduce(.And, expected_chunk == actual_chunk)) {
return null;
}
j += simd_width;
}
// Handle remaining bytes
while (j < s.len) {
if (s[j] != self.text[start + j]) {
return null;
}
j += 1;
}
self.advance(s.len);
return self.extractSlice(start);
}
pub fn matchCharRange(self: *Self, low: u8, high: u8) ?void {
if (self.endOfInput())
return null;
const c = self.text[self.currentPosition()];
if (!(c >= low and c <= high))
return null;
self.advance(1);
}
// ========== Token Extraction ==========
fn extractSlice(self: *Self, start: usize) []const u8 {
return self.text[start..self.currentPosition()];
}
// Skip all whitespace characters
pub fn skipWhitespace(self: *Self) void {
const start = self.currentPosition();
if (self.endOfInput())
return;
const end = skipWhitespaceSimd(self.text[start..]);
self.advance(end);
}
/// Parse a number token
pub fn nextNumber(self: *Self) Error!Token {
const start = try self.pushFrame();
errdefer self.popFrame();
self.skipWhitespace();
self.matchChar('-') orelse {}; // optional leading minus; a mismatch here is fine
while (self.matchCharRange('0', '9') != null) {}
self.matchChar('.') orelse {
// int found
const float = std.fmt.parseFloat(f64, self.extractSlice(start)) catch {
return error.BadNumber; // no floating point
};
return self.commit(Token{
.type = .number,
.value = .{
.number = float,
},
.start = start,
.end = self.currentPosition(),
});
};
while (self.matchCharRange('0', '9') != null) {}
const float = std.fmt.parseFloat(f64, self.extractSlice(start)) catch {
return error.BadNumber; // floating point
};
return self.commit(Token{
.type = .number,
.value = .{
.number = float,
},
.start = start,
.end = self.currentPosition(),
});
}
/// Parse an identifier token
pub fn nextIdentifier(self: *Self) Error!Token {
const start = try self.pushFrame();
errdefer self.popFrame();
self.skipWhitespace();
var buffer = try self.allocator.alloc(u8, 0x100);
defer self.allocator.free(buffer);
self.matchCharPredicate(std.ascii.isAlphabetic) orelse {
return error.UnexpectedToken;
};
buffer[0] = self.lastChar();
var i: usize = 1;
while (self.matchCharPredicate(std.ascii.isAlphanumeric) != null) {
if (i == buffer.len) return error.UnexpectedToken; // identifier longer than the buffer
buffer[i] = self.lastChar();
i += 1;
}
const ident = buffer[0..i];
// true
if (std.mem.eql(u8, ident, "true")) {
return self.commit(Token{
.type = .true,
.value = null,
.start = start,
.end = self.currentPosition(),
});
}
// false
if (std.mem.eql(u8, ident, "false")) {
return self.commit(Token{
.type = .false,
.value = null,
.start = start,
.end = self.currentPosition(),
});
}
// null
if (std.mem.eql(u8, ident, "null")) {
return self.commit(Token{
.type = .null,
.value = null,
.start = start,
.end = self.currentPosition(),
});
}
return error.UnexpectedToken; // not one of the known keywords (true/false/null)
}
/// Get the next token from the input
/// WARNING: this function eats whitespaces
pub fn nextToken(self: *Self) Error!Token {
if (self.prev_token) |tok| {
self.prev_token = null;
return tok;
}
const start = try self.pushFrame();
errdefer self.popFrame();
self.skipWhitespace();
if (self.endOfInput()) {
return Token{
.type = .eof,
.value = null,
.start = start,
.end = start,
};
}
self.advance(1);
// Fall back to single character symbol
const c = self.lastChar();
const symbol_t: TokenType = switch (c) {
'{' => .object_begin,
'}' => .object_end,
'[' => .array_begin,
']' => .array_end,
',' => .comma,
':' => .colon,
'"' => {
self.rollback();
return (self.nextString());
},
else => {
self.rollback();
// Try different token types in order of precedence
if (std.ascii.isDigit(c) or c == '-') {
return (self.nextNumber());
}
if (std.ascii.isAlphabetic(c)) {
return (self.nextIdentifier());
}
return error.InvalidSyntax;
},
};
return self.commit(Token{
.type = symbol_t,
.value = null,
.start = start,
.end = start + 1,
});
}
pub fn nextString(self: *Self) Error!Token {
const start = try self.pushFrame();
errdefer self.popFrame();
self.skipWhitespace();
self.matchChar('"') orelse {
return error.UnexpectedToken;
};
var buffer: std.ArrayList(u8) = .init(self.allocator);
errdefer buffer.deinit();
loop: while (!self.endOfInput()) {
self.advance(1);
switch (self.lastChar()) {
'"' => {
return self.commit(Token{
.type = .string,
.value = .{ .string = try buffer.toOwnedSlice() },
.start = start,
.end = self.currentPosition(),
});
},
'\\' => {
self.advance(1);
switch (self.lastChar()) {
0x22, 0x5C, 0x2F => |d| {
try buffer.append(d);
continue :loop;
},
'b' => try buffer.append(0x8),
'f' => try buffer.append(0xC),
'n' => try buffer.append(0xA),
'r' => try buffer.append(0xD),
't' => try buffer.append(0x9),
'u' => {
var code_points: [4]u8 = undefined;
inline for (0..4) |i| {
if (self.endOfInput())
return self.commit(Token{
.type = .eof,
.value = null,
.start = start,
.end = start + 1,
});
self.advance(1);
code_points[i] = self.lastChar();
}
var utf8_buf: [4]u8 = undefined;
const buf = try stringToUtf8(&utf8_buf, &code_points);
try buffer.appendSlice(buf);
continue :loop;
},
else => return error.UnexpectedCharacter,
} // end switch
},
else => |c| {
if (std.ascii.isControl(c)) {
return error.UnexpectedCharacter;
}
try buffer.append(c);
},
} // end switch
} // end while
return error.InvalidSyntax;
}
pub const Iterator = struct {
tokenizer: *Self,
pub fn next(it: *Iterator) ?Token {
if (it.tokenizer.endOfInput()) return null;
return it.tokenizer.nextToken() catch null;
}
pub fn reset(it: *Iterator) void {
it.tokenizer.position = 0;
it.tokenizer.max_position = 0;
it.tokenizer.frame = 0;
it.tokenizer.stack[0] = 0; // the cursor lives in stack[frame], so clear it too
it.tokenizer.prev_token = null;
}
};
/// iterator
pub fn iterator(self: *Self) Iterator {
return Iterator{
.tokenizer = self,
};
}
/// decode 4 hex digits into a code point and UTF-8-encode it into the
/// caller-provided buffer (returning a slice of a function-local array
/// would dangle); a 4-byte buffer fits any code point up to U+10FFFF
pub fn stringToUtf8(buffer: *[4]u8, bytes: []const u8) ![]u8 {
const code_point = std.fmt.parseInt(u21, bytes, 16) catch {
return error.BadNumber;
};
var index: usize = 0;
if (code_point <= 0x7F) {
buffer[index] = @as(u8, @intCast(code_point));
index += 1;
} else if (code_point <= 0x7FF) {
buffer[index] = 0xC0 | (@as(u8, @intCast(code_point >> 6)));
buffer[index + 1] = 0x80 | (@as(u8, @intCast(code_point & 0x3F)));
index += 2;
} else if (code_point <= 0xFFFF) {
buffer[index] = 0xE0 | (@as(u8, @intCast(code_point >> 12)));
buffer[index + 1] = 0x80 | (@as(u8, @intCast((code_point >> 6) & 0x3F)));
buffer[index + 2] = 0x80 | (@as(u8, @intCast(code_point & 0x3F)));
index += 3;
} else if (code_point <= 0x10FFFF) {
buffer[index] = 0xF0 | (@as(u8, @intCast(code_point >> 18)));
buffer[index + 1] = 0x80 | (@as(u8, @intCast((code_point >> 12) & 0x3F)));
buffer[index + 2] = 0x80 | (@as(u8, @intCast((code_point >> 6) & 0x3F)));
buffer[index + 3] = 0x80 | (@as(u8, @intCast(code_point & 0x3F)));
index += 4;
} else unreachable;
return buffer[0..index];
}
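// Encoding sketch for stringToUtf8, assuming the caller-provided-buffer
// signature above: U+0041 encodes to one byte, U+00E9 to two.
test stringToUtf8 {
var buf: [4]u8 = undefined;
try std.testing.expectEqualSlices(u8, "A", try stringToUtf8(&buf, "0041"));
try std.testing.expectEqualSlices(u8, "\xC3\xA9", try stringToUtf8(&buf, "00E9"));
}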
pub fn skipWhitespaceSimd(text: []const u8) usize {
const ChunkSize = 16;
const Vec = @Vector(ChunkSize, u8);
// Broadcast whitespace characters to vectors
const space: Vec = @splat(' ');
const tab: Vec = @splat('\t');
const lf: Vec = @splat('\n');
const cr: Vec = @splat('\r');
var j: usize = 0;
const end = text.len;
// SIMD processing
while (j + ChunkSize <= end) {
const chunk: Vec = text[j..][0..ChunkSize].*;
// Compare against each whitespace character
const is_space = chunk == space;
const is_tab = chunk == tab;
const is_lf = chunk == lf;
const is_cr = chunk == cr;
// Combine comparisons using vector operations
const anyws = @select(u8, is_space, @as(Vec, @splat(1)), @as(Vec, @splat(0))) |
@select(u8, is_tab, @as(Vec, @splat(1)), @as(Vec, @splat(0))) |
@select(u8, is_lf, @as(Vec, @splat(1)), @as(Vec, @splat(0))) |
@select(u8, is_cr, @as(Vec, @splat(1)), @as(Vec, @splat(0)));
// anyws holds 1 for whitespace bytes and 0 otherwise, so compare against
// 1/0 (comparing against 0xFF never matched and spun on all-whitespace chunks)
const TrueMask: Vec = @splat(1);
const FalseMask: Vec = @splat(0);
// Check if all characters are whitespace
if (@reduce(.And, anyws == TrueMask)) {
j += ChunkSize;
continue;
}
// Find first non-whitespace
const mask: std.meta.Int(.unsigned, ChunkSize) = @bitCast(anyws == FalseMask);
if (mask != 0) {
return j + @ctz(mask);
}
}
// Scalar processing for remaining bytes
while (j < end) switch (text[j]) {
' ', '\t', '\n', '\r' => j += 1,
else => break,
};
return j;
}
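// A quick check for the SIMD whitespace skipper: twenty spaces exercise one
// full 16-byte vector chunk plus the scalar tail.
test skipWhitespaceSimd {
const text = (" " ** 20) ++ "x";
try std.testing.expectEqual(@as(usize, 20), skipWhitespaceSimd(text));
try std.testing.expectEqual(@as(usize, 0), skipWhitespaceSimd("abc"));
}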