epic JSON parser
This commit is contained in:
commit
7bc4973bf4
397
2.zig
Normal file
397
2.zig
Normal file
@ -0,0 +1,397 @@
|
||||
const std = @import("std");
|
||||
const Tokenizer = @import("tokenizer.zig");
|
||||
const StringPool = @import("strings.zig");
|
||||
const StringIndex = StringPool.StringIndex;
|
||||
const assert = std.debug.assert;
|
||||
|
||||
const Self = @This();
|
||||
|
||||
/// Parser error set. Empty for now; declared as `error{}` (not `enum {}`)
/// so it can participate in error unions and be extended later.
pub const Error = error{};
|
||||
|
||||
pub const JsonType = enum {
|
||||
null,
|
||||
bool,
|
||||
number,
|
||||
string,
|
||||
array,
|
||||
object,
|
||||
};
|
||||
|
||||
/// Compact, index-based value representation stored in the parser's
/// MultiArrayList: strings are interned pool offsets, arrays and objects
/// are descriptors pointing back into the same list.
pub const JsonValue = union(JsonType) {
    null: void,
    bool: bool,
    number: f64,
    string: StringIndex,
    array: ArrayIndex.Slice,
    object: ObjectIndex.Entry,
};
|
||||
|
||||
/// Caller-facing JSON value representation using native slices and maps
/// rather than the interned indices of `JsonValue`.
pub const JsonInput = union(JsonType) {
    null: void,
    bool: bool,
    number: f64,
    string: []const u8,
    array: []JsonInput,
    // NOTE(review): `AutoArrayHashMapUnmanaged` auto-hashes the key type,
    // and std's auto context rejects slice keys at the first get/put
    // instantiation — `std.StringArrayHashMapUnmanaged` is presumably
    // intended here. Confirm before first use.
    object: std.AutoArrayHashMapUnmanaged([]const u8, JsonInput),
};
|
||||
|
||||
pub const ArrayIndex = enum(usize) {
|
||||
_,
|
||||
|
||||
pub const Slice = struct {
|
||||
start: usize,
|
||||
len: usize,
|
||||
};
|
||||
};
|
||||
|
||||
pub const ObjectIndex = enum(usize) {
|
||||
_,
|
||||
|
||||
pub const Entry = struct {
|
||||
len: usize,
|
||||
property_idx: usize,
|
||||
value_idx: usize,
|
||||
};
|
||||
};
|
||||
|
||||
pub const Options = struct {
|
||||
comptime max_depth: usize = 256,
|
||||
};
|
||||
|
||||
index: std.MultiArrayList(JsonValue) = .{},
|
||||
string_index: StringPool = .empty,
|
||||
|
||||
options: Options = .{},
|
||||
|
||||
pub const init: Self = .{};
|
||||
|
||||
pub fn deinit(self: *Self, allocator: std.mem.Allocator) void {
|
||||
self.index.deinit(allocator);
|
||||
self.string_index.deinit(allocator);
|
||||
}
|
||||
|
||||
/// Append a number entry to `index` and return its slot.
fn addNumber(self: *Self, allocator: std.mem.Allocator, number: f64) !usize {
    try self.index.append(allocator, .{ .number = number });
    return self.index.len - 1;
}
|
||||
|
||||
/// Intern `bytes` in the string pool, append a string entry referencing
/// the interned offset, and return the entry's slot in `index`.
fn addString(self: *Self, allocator: std.mem.Allocator, bytes: []const u8) !usize {
    const interned = try self.string_index.add(allocator, bytes);
    try self.index.append(allocator, .{ .string = interned });
    return self.index.len - 1;
}
|
||||
|
||||
/// Flatten `object` into the index: every key is interned, every value
/// gets a reserved slot, then a single `.object` header entry describing
/// the run is appended and its slot returned.
fn addObject(self: *Self, allocator: std.mem.Allocator, object: std.AutoArrayHashMapUnmanaged([]const u8, JsonInput)) !usize {
    // Header captured on the first key/value pair (iteration 0).
    var entry: ?ObjectIndex.Entry = null;

    for (object.keys(), object.values(), 0..) |key, value, times| {
        const stridx = try self.string_index.add(allocator, key);
        try self.index.ensureUnusedCapacity(allocator, 1);
        const vidx = self.index.addOneAssumeCapacity();
        // NOTE(review): `@unionInit` requires a comptime field name, but
        // `std.meta.activeTag(value)` is a runtime value, and
        // `self.addValue(...)` returns `!void` (no payload, error not
        // propagated) — this line cannot compile as written. Likely
        // intent: switch on `value` and store its converted payload into
        // slot `vidx`.
        self.index.set(vidx, @unionInit(JsonValue, std.meta.activeTag(value), self.addValue(allocator, value)));
        if (times == 0) {
            entry = ObjectIndex.Entry{
                .len = object.entries.len,
                // NOTE(review): `property_idx` is `usize` but `stridx` is
                // a `StringIndex` enum — `@intFromEnum(stridx)` is
                // presumably intended.
                .property_idx = stridx,
                .value_idx = vidx,
            };
        }
    }

    // Append the object header itself (zeroed header for an empty map).
    try self.index.ensureUnusedCapacity(allocator, 1);
    const idx = self.index.addOneAssumeCapacity();
    if (entry) |e| {
        self.index.set(idx, .{ .object = e });
        return idx;
    } else {
        self.index.set(idx, .{ .object = ObjectIndex.Entry{
            .len = 0,
            .property_idx = 0,
            .value_idx = 0,
        } });
        return idx;
    }
}
|
||||
|
||||
/// Append a placeholder object header whose property/value cursors point
/// at the current end of the string pool and of the index list; returns
/// the header's slot.
fn addEmptyObject(self: *Self, allocator: std.mem.Allocator) !usize {
    try self.index.ensureUnusedCapacity(allocator, 1);
    const slot = self.index.addOneAssumeCapacity();
    self.index.set(slot, .{ .object = .{
        .len = 0,
        .property_idx = self.string_index.string_bytes.items.len,
        .value_idx = self.index.len,
    } });
    return slot;
}
|
||||
|
||||
/// Flatten `array` into the index: each element gets a reserved slot,
/// then a single `.array` header describing the run is appended and its
/// slot returned.
fn addArray(self: *Self, allocator: std.mem.Allocator, array: []JsonInput) !usize {
    // Slice descriptor captured on the first element.
    var entry: ?ArrayIndex.Slice = null;
    for (array, 0..) |value, times| {
        try self.index.ensureUnusedCapacity(allocator, 1);
        const idx = self.index.addOneAssumeCapacity();
        // NOTE(review): `@unionInit` requires a comptime field name, but
        // `std.meta.activeTag(value)` is runtime, and `self.addValue`
        // returns `!void` (error not propagated, no payload) — this line
        // cannot compile as written. Likely intent: switch on `value` and
        // store its converted payload into slot `idx`.
        self.index.set(idx, @unionInit(JsonValue, std.meta.activeTag(value), self.addValue(allocator, value)));
        if (times == 0) {
            entry = ArrayIndex.Slice{
                .start = idx,
                .len = array.len,
            };
        }
    }
    // Append the array header itself (empty descriptor for an empty array).
    try self.index.ensureUnusedCapacity(allocator, 1);
    const idx = self.index.addOneAssumeCapacity();
    if (entry) |e| {
        self.index.set(idx, .{ .array = e });
        return idx;
    } else {
        self.index.set(idx, .{ .array = ArrayIndex.Slice{
            .start = 0,
            .len = 0,
        } });
        return idx;
    }
}
|
||||
|
||||
/// Append a boolean entry to `index` and return its slot.
fn addBool(self: *Self, allocator: std.mem.Allocator, value: bool) !usize {
    try self.index.append(allocator, .{ .bool = value });
    return self.index.len - 1;
}
|
||||
|
||||
/// Append `value` to the index, dispatching on its JSON type.
/// The per-type helpers return the new entry's slot; it is deliberately
/// discarded here (`_ =`) — the original bare `try self.addX(...)` arms
/// produced a non-void switch value, which is a compile error.
fn addValue(self: *Self, allocator: std.mem.Allocator, value: JsonInput) !void {
    switch (value) {
        // NOTE(review): null currently occupies no slot in `index`; if
        // arrays/objects rely on one-slot-per-element, this skews the
        // consecutive-slot assumption — confirm intended behavior.
        .null => {},
        .bool => |b| _ = try self.addBool(allocator, b),
        .number => |n| _ = try self.addNumber(allocator, n),
        .string => |s| _ = try self.addString(allocator, s),
        .array => |a| _ = try self.addArray(allocator, a),
        .object => |o| _ = try self.addObject(allocator, o),
    }
}
|
||||
|
||||
/// Look up an interned string by its byte contents.
/// NOTE(review): `string_table` is keyed by `StringIndex`, not byte
/// slices — passing `[]const u8` to `get` cannot type-check. This needs
/// the pool's adapted lookup (`getKeyAdapted` with the byte-slice
/// adapter context); confirm against strings.zig.
fn getString(self: *Self, index: []const u8) ?StringIndex {
    return self.string_index.string_table.get(index);
}
|
||||
|
||||
/// Return the number stored at `index`, or null when the slot holds a
/// different JSON type. `MultiArrayList.get` returns the element directly
/// (not an optional), so the union tag must be checked explicitly — the
/// previous `if (self.index.get(index)) |n|` unwrapped a non-optional.
fn getNumber(self: *Self, index: usize) ?f64 {
    return switch (self.index.get(index)) {
        .number => |n| n,
        else => null,
    };
}
|
||||
|
||||
/// Collect the interned key offsets and value slots of the object stored
/// at `index` into two caller-owned parallel slices (keys, value slots).
/// Assumes keys were interned back-to-back (each key advances the pool by
/// its length plus the 0 terminator) and values occupy consecutive index
/// slots — TODO confirm against the writer side (`parse`/`addObject`).
fn getObject(self: *Self, allocator: std.mem.Allocator, index: usize) !?struct {
    []StringIndex,
    []usize,
} {
    // NOTE(review): panics if the entry at `index` is not an object
    // (`entry.object` below accesses the union field unconditionally).
    const entry = self.index.get(index);

    if (entry.object.len == 0) {
        return .{ &.{}, &.{} };
    }

    // Walking cursors: key byte offset in the pool, and value slot.
    var pidx = entry.object.property_idx;
    var vidx = entry.object.value_idx;

    const keys = try allocator.alloc(StringIndex, entry.object.len);
    const values = try allocator.alloc(usize, entry.object.len);

    for (0..entry.object.len) |i| {
        const slice = StringIndex.slice(@enumFromInt(pidx), &self.string_index);
        keys[i] = @enumFromInt(pidx);
        values[i] = vidx;
        // Skip past this key's bytes plus its 0 terminator.
        pidx += slice.len + 1;
        vidx += 1;
    }

    return .{ keys, values };
}
|
||||
|
||||
/// Return a caller-owned slice holding the index slots of the array
/// stored at `index`, or null when the slot is not an array. The error
/// union is required because the result is heap-allocated (the original
/// used `try` inside a plain `?[]usize` function, which cannot compile;
/// it also applied `orelse` to the non-optional `MultiArrayList.get`).
fn getArray(self: *Self, allocator: std.mem.Allocator, index: usize) !?[]usize {
    const entry = self.index.get(index);
    if (entry != .array) return null;

    // Elements occupy consecutive slots starting at `start`; an empty
    // array simply yields a zero-length allocation.
    const values = try allocator.alloc(usize, entry.array.len);
    for (values, 0..) |*slot, i| slot.* = entry.array.start + i;
    return values;
}
|
||||
|
||||
/// Return the bool stored at `index`, or null when the slot holds a
/// different JSON type. (The previous version applied `orelse` to the
/// non-optional `MultiArrayList.get` and would panic on a non-bool tag.)
fn getBool(self: *Self, index: usize) ?bool {
    return switch (self.index.get(index)) {
        .bool => |b| b,
        else => null,
    };
}
|
||||
|
||||
/// Return void when the entry at `index` is JSON null, or `null` (the
/// optional) when the slot holds a different type. (The previous version
/// applied `orelse` to the non-optional `MultiArrayList.get`.)
fn getNull(self: *Self, index: usize) ?void {
    return switch (self.index.get(index)) {
        .null => {},
        else => null,
    };
}
|
||||
|
||||
/// Reconstruct the value at `index` as a caller-facing `JsonInput`.
/// Arrays and objects allocate; the caller owns those results. String
/// results alias the internal string pool.
fn getValue(self: *Self, allocator: std.mem.Allocator, index: usize) !?JsonInput {
    const entry = self.index.get(index);
    switch (entry) {
        .null => return .{ .null = {} },
        .bool => return .{ .bool = entry.bool },
        .number => return .{ .number = entry.number },
        .string => {
            // Interned bytes; the returned slice points into the pool.
            const str = entry.string.slice(&self.string_index);
            return .{ .string = str };
        },
        .array => {
            // Assumes elements occupy consecutive slots from `start`.
            const res = try allocator.alloc(JsonInput, entry.array.len);
            var idx = entry.array.start;
            for (0..entry.array.len) |i| {
                if (try self.getValue(allocator, idx)) |v| {
                    res[i] = v;
                    idx += 1;
                } else unreachable;
            }
            return .{ .array = res };
        },
        .object => {
            var kidx = entry.object.property_idx;
            var vidx = entry.object.value_idx;
            var obj: std.AutoArrayHashMapUnmanaged([]const u8, JsonInput) = .empty;

            try obj.ensureTotalCapacity(allocator, entry.object.len);
            for (0..entry.object.len) |_| {
                const key = StringIndex.slice(@enumFromInt(kidx), &self.string_index);
                const val = (try self.getValue(allocator, vidx)).?;

                obj.putAssumeCapacityNoClobber(key, val);
                // NOTE(review): getObject advances the key cursor by
                // key.len + 1 (past the 0 terminator), but here it moves
                // by only 1 byte — for objects with more than one entry
                // these disagree; confirm which is correct.
                kidx += 1;
                vidx += 1;
            }

            return .{ .object = obj };
        },
    }
}
|
||||
|
||||
/// Drive the tokenizer and build the flat value index, printing a debug
/// rendering of each completed object to stderr as it goes.
/// `depth_buf` holds, per nesting level, the index slot of that level's
/// object header; `cycles` is the current nesting depth.
pub fn parse(self: *Self, tokenizer: *Tokenizer) !void {
    const allocator = tokenizer.allocator;

    var it = tokenizer.iterator();

    // One saved header slot per nesting level, bounded by max_depth.
    var depth_buf = try allocator.alloc(usize, self.options.max_depth);
    defer allocator.free(depth_buf);

    var cycles: usize = 0;

    while (it.next()) |token| {
        switch (token.type) {
            .object_begin => {
                std.debug.print("{{", .{});
                const obj_idx = try self.addEmptyObject(allocator);

                depth_buf[cycles] = obj_idx;

                // Nested object directly after `{`: link it into the
                // parent header by bumping the parent's length.
                if (tokenizer.prev_token) |t| if (t.type == .object_begin) {
                    // add map to itself
                    const data = self.index.get(depth_buf[cycles - 1]);

                    switch (data) {
                        .object => |valid_entry| {
                            const new_data = ObjectIndex.Entry{
                                .len = valid_entry.len + 1,
                                .property_idx = self.string_index.string_table.size,
                                .value_idx = obj_idx,
                            };
                            self.index.set(depth_buf[cycles - 1], .{ .object = new_data });
                            tokenizer.prev_token = null; // reset
                        },
                        else => unreachable,
                    }
                } else tokenizer.pushBack(token);
                cycles += 1;
                continue;
            },
            .object_end => {
                // Dump the keys/values recorded for the innermost object.
                const keys, const vals = (try self.getObject(allocator, depth_buf[cycles - 1])).?;
                std.debug.print("\nfound {d} keys and {d} values\n", .{ keys.len, vals.len });
                for (keys, vals) |k, v| {
                    const key = k.slice(&self.string_index);
                    const val = self.index.get(v);
                    std.debug.print(
                        \\"{s}": {s},
                    , .{ key, @tagName(val) });
                }
                std.debug.print("}}", .{});
            },
            .string => {
                const idx = try self.addString(allocator, token.value.?.string);
                // NOTE(review): `depth_buf[cycles - 1]` is evaluated
                // BEFORE the `cycles > 0` guard below — a top-level string
                // underflows here. Same pattern in .number and .true/.false.
                const last_obj = self.index.get(depth_buf[cycles - 1]);
                if (cycles > 0) {
                    const stridx = self.index.get(idx).string;
                    self.index.set(depth_buf[cycles - 1], .{ .object = ObjectIndex.Entry{
                        .len = last_obj.object.len + 1,
                        .property_idx = if (cycles > 1) @intFromEnum(stridx) else last_obj.object.property_idx,
                        .value_idx = last_obj.object.value_idx,
                    } });
                    continue;
                }
            },
            .number => {
                _ = try self.addNumber(allocator, token.value.?.number);
                // NOTE(review): same pre-guard underflow as .string; this
                // arm also rewrites the header with unchanged fields.
                const last_obj = self.index.get(depth_buf[cycles - 1]);
                if (cycles > 0) {
                    self.index.set(depth_buf[cycles - 1], .{ .object = ObjectIndex.Entry{
                        .len = last_obj.object.len,
                        .property_idx = last_obj.object.property_idx,
                        .value_idx = last_obj.object.value_idx,
                    } });
                    continue;
                }
            },
            .true, .false => {
                _ = try self.addBool(allocator, if (token.type == .true) true else false);
                const last_obj = self.index.get(depth_buf[cycles - 1]);
                if (cycles > 0) {
                    self.index.set(depth_buf[cycles - 1], .{ .object = ObjectIndex.Entry{
                        .len = last_obj.object.len,
                        .property_idx = last_obj.object.property_idx,
                        .value_idx = last_obj.object.value_idx,
                    } });
                    continue;
                }
            },
            // NOTE(review): arrays, null, colon, comma are not handled.
            else => {},
        }

        tokenizer.skipWhitespace();
    }
}
|
||||
|
||||
test parse {
    // Arena keeps the test leak-free without tracking every allocation.
    var arena = std.heap.ArenaAllocator.init(std.testing.allocator);
    defer arena.deinit();

    const allocator = arena.allocator();

    var parser = init;
    defer parser.deinit(allocator);

    const json =
        \\ {
        \\ "key": 123,
        \\ "key2": false,
        \\ "key3": true,
        \\ "key4": null
        \\ }
    ;

    var tokenizer = try Tokenizer.init(allocator, json);
    try parser.parse(&tokenizer);
}
|
34
build.zig
Normal file
34
build.zig
Normal file
@ -0,0 +1,34 @@
|
||||
const std = @import("std");
|
||||
|
||||
/// Standard build entry point: builds the `aether` executable and wires
/// up `run` and `test` steps.
pub fn build(b: *std.Build) void {
    const target = b.standardTargetOptions(.{});
    // Make the optimize mode configurable from the command line instead of
    // hard-coding `.ReleaseSafe`; ReleaseSafe stays the preferred release
    // mode when the user asks for a release build.
    const optimize = b.standardOptimizeOption(.{ .preferred_optimize_mode = .ReleaseSafe });

    const exe_mod = b.createModule(.{
        .root_source_file = b.path("src/main.zig"),
        .target = target,
        .optimize = optimize,
    });

    const exe = b.addExecutable(.{
        .name = "aether",
        .root_module = exe_mod,
    });

    b.installArtifact(exe);

    // `zig build run` — run the installed executable.
    const run_cmd = b.addRunArtifact(exe);
    run_cmd.step.dependOn(b.getInstallStep());

    const run_step = b.step("run", "Run the app");
    run_step.dependOn(&run_cmd.step);

    // `zig build test` — run the module's unit tests.
    const exe_unit_tests = b.addTest(.{
        .root_module = exe_mod,
    });

    const run_exe_unit_tests = b.addRunArtifact(exe_unit_tests);

    const test_step = b.step("test", "Run unit tests");
    test_step.dependOn(&run_exe_unit_tests.step);
}
|
16
build.zig.zon
Normal file
16
build.zig.zon
Normal file
@ -0,0 +1,16 @@
|
||||
.{
|
||||
.name = .aether,
|
||||
|
||||
.version = "0.0.0",
|
||||
|
||||
.fingerprint = 0x255cfdbd72bde30d,
|
||||
.minimum_zig_version = "0.15.0-dev.552+bc2f7c754",
|
||||
|
||||
.dependencies = .{
|
||||
},
|
||||
.paths = .{
|
||||
"build.zig",
|
||||
"build.zig.zon",
|
||||
"src",
|
||||
},
|
||||
}
|
81
strings.zig
Normal file
81
strings.zig
Normal file
@ -0,0 +1,81 @@
|
||||
/// credits to Andrew Kelley
|
||||
/// strings.zig
|
||||
|
||||
const std = @import("std");
|
||||
const mem = std.mem;
|
||||
const assert = std.debug.assert;
|
||||
|
||||
const Allocator = std.mem.Allocator;
|
||||
const Self = @This();
|
||||
|
||||
const max_load_percent = std.hash_map.default_max_load_percentage;
|
||||
|
||||
string_bytes: std.ArrayListUnmanaged(u8) = .empty,
|
||||
string_table: StringIndex.Table = .empty,
|
||||
|
||||
pub const empty = Self{
|
||||
.string_bytes = .empty,
|
||||
.string_table = .empty,
|
||||
};
|
||||
|
||||
/// Release the interning table and the backing byte storage.
pub fn deinit(self: *Self, allocator: Allocator) void {
    self.string_table.deinit(allocator);
    self.string_bytes.deinit(allocator);
}
|
||||
|
||||
/// Byte offset of a 0-terminated string inside `string_bytes`.
pub const StringIndex = enum(u32) {
    _,

    /// Set of interned offsets; hashing and equality read the pooled bytes.
    const Table = std.HashMapUnmanaged(StringIndex, void, TableContext, max_load_percent);

    /// Context for keys already in the pool: hash the 0-terminated bytes
    /// at the key's offset; compare by offset identity (offsets are
    /// unique per string, so pointer-free equality is enough).
    const TableContext = struct {
        bytes: []const u8,

        pub fn eql(_: @This(), a: StringIndex, b: StringIndex) bool {
            return a == b;
        }

        pub fn hash(ctx: @This(), key: StringIndex) u64 {
            return std.hash_map.hashString(mem.sliceTo(ctx.bytes[@intFromEnum(key)..], 0));
        }
    };

    /// Adapter for probing the table with a raw byte slice. The probe
    /// slice must not contain 0 (that byte is the pool's terminator).
    const TableIndexAdapter = struct {
        bytes: []const u8,

        pub fn eql(ctx: @This(), a: []const u8, b: StringIndex) bool {
            return mem.eql(u8, a, mem.sliceTo(ctx.bytes[@intFromEnum(b)..], 0));
        }

        pub fn hash(_: @This(), adapted_key: []const u8) u64 {
            assert(mem.indexOfScalar(u8, adapted_key, 0) == null);
            return std.hash_map.hashString(adapted_key);
        }
    };

    /// View the interned bytes at `index` as a 0-terminated slice into
    /// the pool (aliases `string_bytes`; invalidated when the pool grows).
    pub fn slice(index: StringIndex, state: *const Self) [:0]const u8 {
        const start_slice = state.string_bytes.items[@intFromEnum(index)..];
        return start_slice[0..mem.indexOfScalar(u8, start_slice, 0).? :0];
    }
};
|
||||
|
||||
/// Intern `bytes`: return the existing offset when the exact string is
/// already pooled, otherwise append it plus a 0 terminator and record the
/// new offset. `bytes` must not contain 0 (asserted by the adapter).
pub fn add(state: *Self, allocator: Allocator, bytes: []const u8) !StringIndex {
    // Reserve up front so the appends below cannot fail after the table
    // entry has been created.
    try state.string_bytes.ensureUnusedCapacity(allocator, bytes.len + 1);

    // Probe by byte contents (adapter) while stored keys hash via the
    // pool bytes (context).
    const gop = try state.string_table.getOrPutContextAdapted(
        allocator,
        bytes,
        StringIndex.TableIndexAdapter{ .bytes = state.string_bytes.items },
        StringIndex.TableContext{ .bytes = state.string_bytes.items },
    );
    if (gop.found_existing) return gop.key_ptr.*;

    // New string: its offset is the current end of the pool.
    const new_off: StringIndex = @enumFromInt(state.string_bytes.items.len);

    state.string_bytes.appendSliceAssumeCapacity(bytes);
    state.string_bytes.appendAssumeCapacity(0);

    gop.key_ptr.* = new_off;

    return new_off;
}
|
553
tokenizer.zig
Normal file
553
tokenizer.zig
Normal file
@ -0,0 +1,553 @@
|
||||
const std = @import("std");
|
||||
const mem = std.mem;
|
||||
|
||||
pub const Error = error{
|
||||
/// eg: invalid JSON syntax
|
||||
InvalidSyntax,
|
||||
/// eg: allocator error
|
||||
OutOfMemory,
|
||||
/// eg: bad escaping
|
||||
UnexpectedCharacter,
|
||||
/// eg: got the wrong token type, check TokenType
|
||||
UnexpectedToken,
|
||||
/// eg: std.fmt.parseFloat failed
|
||||
BadNumber,
|
||||
/// fba error
|
||||
BufferTooSmall,
|
||||
/// eg: missing comma
|
||||
CommaExpected,
|
||||
/// eg: missing colon
|
||||
ColonExpected,
|
||||
/// eg: missing object key
|
||||
KeyExpected,
|
||||
/// eg: error while writing
|
||||
PrintError,
|
||||
/// eg: trailing comma in object
|
||||
TrailingComma,
|
||||
};
|
||||
|
||||
pub const TokenType = enum(u8) {
|
||||
eof,
|
||||
null,
|
||||
true,
|
||||
false,
|
||||
number,
|
||||
string,
|
||||
property,
|
||||
object_begin,
|
||||
object_end,
|
||||
array_begin,
|
||||
array_end,
|
||||
colon,
|
||||
comma,
|
||||
whitespace,
|
||||
};
|
||||
|
||||
pub const Token = struct {
|
||||
type: TokenType,
|
||||
value: ?union {
|
||||
number: f64,
|
||||
string: []const u8,
|
||||
symbol: u8,
|
||||
},
|
||||
start: usize,
|
||||
end: usize,
|
||||
};
|
||||
|
||||
pub const Self = @This();
|
||||
|
||||
text: []const u8,
|
||||
position: usize,
|
||||
max_position: usize,
|
||||
stack: []usize,
|
||||
frame: usize,
|
||||
allocator: std.mem.Allocator,
|
||||
|
||||
prev_token: ?Token = null,
|
||||
|
||||
pub fn pushBack(self: *Self, token: Token) void {
|
||||
self.prev_token = token;
|
||||
}
|
||||
|
||||
/// Initialize a new tokenizer
|
||||
pub fn init(allocator: std.mem.Allocator, text: []const u8) std.mem.Allocator.Error!Self {
|
||||
const stack = try allocator.alloc(usize, 0x100);
|
||||
errdefer allocator.free(stack);
|
||||
@memset(stack, 0);
|
||||
return .{
|
||||
.text = text,
|
||||
.position = 0,
|
||||
.max_position = 0,
|
||||
.stack = stack,
|
||||
.frame = 0,
|
||||
.allocator = allocator,
|
||||
};
|
||||
}
|
||||
|
||||
/// Clean up resources
|
||||
pub fn deinit(self: *Self) void {
|
||||
self.allocator.free(self.stack);
|
||||
}
|
||||
|
||||
// ========== Core Parsing Functions ==========
|
||||
|
||||
fn currentPosition(self: *Self) usize {
|
||||
return self.stack[self.frame];
|
||||
}
|
||||
|
||||
fn advance(self: *Self, delta: usize) void {
|
||||
self.stack[self.frame] += delta;
|
||||
if (self.max_position < self.stack[self.frame])
|
||||
self.max_position = self.stack[self.frame];
|
||||
}
|
||||
|
||||
/// Open a new backtracking frame whose cursor starts at the parent's
/// position; grows the frame stack on demand. Returns the cursor.
fn pushFrame(self: *Self) Error!usize {
    self.frame += 1;
    if (self.frame == self.stack.len) {
        const new_stack = try self.allocator.alloc(usize, self.stack.len * 2);
        // @memcpy requires equal-length operands; copying into the full
        // (larger) new_stack was a guaranteed safety panic. Copy the old
        // contents, zero the rest.
        @memcpy(new_stack[0..self.stack.len], self.stack);
        @memset(new_stack[self.stack.len..], 0);
        self.allocator.free(self.stack);
        self.stack = new_stack;
    }
    // New frame inherits the parent's cursor.
    self.stack[self.frame] = self.stack[self.frame - 1];
    return self.currentPosition();
}
|
||||
|
||||
fn popFrame(self: *Self) void {
|
||||
self.frame -= 1;
|
||||
}
|
||||
|
||||
/// Pop the current frame and propagate its cursor to the parent frame,
/// i.e. accept the input consumed since the matching pushFrame.
/// `wrapped` is passed through unchanged so call sites can write
/// `return self.commit(token);`.
fn commit(self: *Self, wrapped: anytype) @TypeOf(wrapped) {
    self.frame -= 1;
    self.stack[self.frame] = self.stack[self.frame + 1];
    return wrapped;
}
|
||||
|
||||
fn rollback(self: *Self) void {
|
||||
self.stack[self.frame] = if (self.frame == 0) 0 else self.stack[self.frame - 1];
|
||||
}
|
||||
|
||||
// ========== Character Matching ==========
|
||||
|
||||
fn lastChar(self: *Self) u8 {
|
||||
return self.text[self.currentPosition() - 1];
|
||||
}
|
||||
|
||||
fn currentChar(self: *Self) u8 {
|
||||
return self.text[self.currentPosition()];
|
||||
}
|
||||
|
||||
fn endOfInput(self: *Self) bool {
|
||||
return self.currentPosition() >= self.text.len;
|
||||
}
|
||||
|
||||
fn matchChar(self: *Self, c: u8) ?void {
|
||||
if (self.endOfInput() or self.text[self.currentPosition()] != c) {
|
||||
return null;
|
||||
}
|
||||
self.advance(1);
|
||||
}
|
||||
|
||||
fn matchCharPredicate(self: *Self, pred: fn (u8) bool) ?void {
|
||||
// do not change this line for some reason it fucking breaks if I use currentChar directly
|
||||
if (self.endOfInput() or !pred(self.text[self.currentPosition()])) {
|
||||
return null;
|
||||
}
|
||||
self.advance(1);
|
||||
}
|
||||
|
||||
/// Match the literal `s` at the current position. On success the cursor
/// advances past it and the matched slice of `text` is returned; on any
/// mismatch (including end of input) the cursor is unchanged and null is
/// returned. The previous version returned `error.InvalidSyntax` from a
/// `?[]const u8` function (type error) and fell off the end on success
/// (missing return); `std.mem.eql` is already vectorized, so the
/// hand-rolled SIMD loop is unnecessary.
fn matchString(self: *Self, s: []const u8) ?[]const u8 {
    const pos = self.currentPosition();
    if (self.text.len < pos + s.len) {
        // eof
        return null;
    }

    const candidate = self.text[pos..][0..s.len];
    if (!mem.eql(u8, s, candidate)) return null;

    self.advance(s.len);
    return candidate;
}
|
||||
|
||||
pub fn matchCharRange(self: *Self, low: u8, high: u8) ?void {
|
||||
if (self.endOfInput())
|
||||
return null;
|
||||
|
||||
const c = self.text[self.currentPosition()];
|
||||
|
||||
if (!(c >= low and c <= high))
|
||||
return null;
|
||||
|
||||
self.advance(1);
|
||||
}
|
||||
|
||||
// ========== Token Extraction ==========
|
||||
|
||||
fn extractSlice(self: *Self, start: usize) []const u8 {
|
||||
return self.text[start..self.currentPosition()];
|
||||
}
|
||||
|
||||
// Skip all whitespace characters
|
||||
pub fn skipWhitespace(self: *Self) void {
|
||||
const start = self.currentPosition();
|
||||
if (self.endOfInput())
|
||||
return;
|
||||
const end = skipWhitespaceSimd(self.text[start..]);
|
||||
self.advance(end);
|
||||
}
|
||||
|
||||
/// Parse a number token
|
||||
/// Parse a JSON number token: optional sign, digits, optional fraction,
/// optional exponent. Returns `error.BadNumber` when the consumed span is
/// not a valid float.
pub fn nextNumber(self: *Self) Error!Token {
    _ = try self.pushFrame();
    errdefer self.popFrame();

    self.skipWhitespace();
    // Capture the start AFTER skipping whitespace: std.fmt.parseFloat
    // rejects leading spaces, so the slice must begin at the number itself.
    const start = self.currentPosition();

    self.matchChar('-') orelse {}; // optional sign; this may not fail

    while (self.matchCharRange('0', '9') != null) {}

    // Optional fractional part.
    if (self.matchChar('.') != null) {
        while (self.matchCharRange('0', '9') != null) {}
    }

    // Optional exponent part (JSON: e/E, optional sign, digits) — the
    // previous version did not accept exponents at all.
    if (self.matchChar('e') != null or self.matchChar('E') != null) {
        if (self.matchChar('+') == null) _ = self.matchChar('-');
        while (self.matchCharRange('0', '9') != null) {}
    }

    const float = std.fmt.parseFloat(f64, self.extractSlice(start)) catch {
        return error.BadNumber;
    };

    return self.commit(Token{
        .type = .number,
        .value = .{
            .number = float,
        },
        .start = start,
        .end = self.currentPosition(),
    });
}
|
||||
|
||||
/// Parse an identifier token
|
||||
/// Parse one of the JSON identifiers `true`, `false`, or `null`.
/// Any other word yields `error.UnexpectedToken` — previously this was
/// `unreachable`, i.e. undefined behavior on bad input in release builds.
/// Works directly on `text` by slicing, so the fixed 0x100 temporary
/// buffer (and its overflow risk on long identifiers) is gone.
pub fn nextIdentifier(self: *Self) Error!Token {
    const start = try self.pushFrame();
    errdefer self.popFrame();

    self.skipWhitespace();
    // The identifier's own first byte (after any whitespace).
    const word_start = self.currentPosition();

    self.matchCharPredicate(std.ascii.isAlphabetic) orelse {
        return error.UnexpectedToken;
    };
    while (self.matchCharPredicate(std.ascii.isAlphanumeric) != null) {}

    const ident = self.text[word_start..self.currentPosition()];

    const token_type: TokenType = if (std.mem.eql(u8, ident, "true"))
        .true
    else if (std.mem.eql(u8, ident, "false"))
        .false
    else if (std.mem.eql(u8, ident, "null"))
        .null
    else
        return error.UnexpectedToken;

    return self.commit(Token{
        .type = token_type,
        .value = null,
        .start = start,
        .end = self.currentPosition(),
    });
}
|
||||
|
||||
/// Get the next token from the input
|
||||
/// WARNING: this function eats whitespaces
|
||||
/// Get the next token from the input.
/// WARNING: this function eats whitespaces.
/// A pushed-back token, if any, is returned first.
pub fn nextToken(self: *Self) Error!Token {
    if (self.prev_token) |tok| {
        self.prev_token = null;
        return tok;
    }

    const start = try self.pushFrame();
    errdefer self.popFrame();

    self.skipWhitespace();

    if (self.endOfInput()) {
        // Commit (not a bare return) so the frame pushed above is popped;
        // previously every eof probe leaked one stack frame.
        return self.commit(Token{
            .type = .eof,
            .value = null,
            .start = start,
            .end = start,
        });
    }

    self.advance(1);
    // Fall back to single character symbol
    const c = self.lastChar();

    const symbol_t: TokenType = switch (c) {
        '{' => .object_begin,
        '}' => .object_end,
        '[' => .array_begin,
        ']' => .array_end,
        ',' => .comma,
        ':' => .colon,
        '"' => {
            self.rollback();
            // Commit our frame around the sub-parser's result so it is
            // not leaked on success (the sub-parser commits its own).
            return self.commit(try self.nextString());
        },
        else => {
            self.rollback();
            // Try different token types in order of precedence
            if (std.ascii.isDigit(c) or c == '-') {
                return self.commit(try self.nextNumber());
            }

            if (std.ascii.isAlphabetic(c)) {
                return self.commit(try self.nextIdentifier());
            }

            return error.InvalidSyntax;
        },
    };

    return self.commit(Token{
        .type = symbol_t,
        .value = null,
        .start = start,
        .end = start + 1,
    });
}
|
||||
|
||||
/// Parse a quoted JSON string, resolving escapes (\" \\ \/ \b \f \n \r
/// \t \uXXXX) into a freshly allocated buffer owned by the returned token.
pub fn nextString(self: *Self) Error!Token {
    const start = try self.pushFrame();
    errdefer self.popFrame();

    self.skipWhitespace();

    self.matchChar('"') orelse {
        return error.UnexpectedToken;
    };

    var buffer: std.ArrayList(u8) = .init(self.allocator);
    // Free the scratch buffer on every error path — it was previously
    // leaked whenever the string turned out to be malformed.
    errdefer buffer.deinit();

    loop: while (!self.endOfInput()) {
        self.advance(1);

        switch (self.lastChar()) {
            '"' => {
                // Closing quote: hand the bytes over to the token.
                return self.commit(Token{
                    .type = .string,
                    .value = .{ .string = try buffer.toOwnedSlice() },
                    .start = start,
                    .end = self.currentPosition(),
                });
            },
            '\\' => {
                self.advance(1);
                switch (self.lastChar()) {
                    0x22, 0x5C, 0x2F => |d| {
                        try buffer.append(d);
                        continue :loop;
                    },
                    'b' => try buffer.append(0x8),
                    'f' => try buffer.append(0xC),
                    'n' => try buffer.append(0xA),
                    'r' => try buffer.append(0xD),
                    't' => try buffer.append(0x9),
                    'u' => {
                        var code_points: [4]u8 = undefined;
                        inline for (0..4) |i| {
                            // Truncated \uXXXX escape is malformed input.
                            // (Previously this returned a bogus .eof token
                            // and leaked `buffer`.)
                            if (self.endOfInput())
                                return error.InvalidSyntax;
                            self.advance(1);
                            code_points[i] = self.lastChar();
                        }
                        const buf = try stringToUtf8(&code_points);
                        try buffer.appendSlice(buf);
                        continue :loop;
                    },
                    else => return error.UnexpectedCharacter,
                } // end switch
            },
            else => |c| {
                if (std.ascii.isControl(c)) {
                    return error.UnexpectedCharacter;
                }
                try buffer.append(c);
            },
        } // end switch
    } // end while

    // Ran out of input before the closing quote.
    return error.InvalidSyntax;
}
|
||||
|
||||
/// Token iterator over the tokenizer; stops at end of input or on the
/// first tokenize error.
pub const Iterator = struct {
    tokenizer: *Self,
    /// Next token, or null at end of input.
    /// NOTE(review): `catch null` silently folds tokenize errors into
    /// null — callers cannot distinguish eof from malformed input.
    pub fn next(it: *Iterator) ?Token {
        if (it.tokenizer.endOfInput()) return null;
        return it.tokenizer.nextToken() catch null;
    }
    /// Rewind to the beginning of the input.
    /// NOTE(review): only `position`/`max_position`/`frame` are reset;
    /// the active cursor `stack[0]` keeps its old value, so iteration may
    /// not actually restart from byte 0 — confirm intended behavior.
    pub fn reset(it: *Iterator) void {
        it.tokenizer.position = 0;
        it.tokenizer.max_position = 0;
        it.tokenizer.frame = 0;
        it.tokenizer.prev_token = null;
    }
};
|
||||
|
||||
/// iterator
|
||||
pub fn iterator(self: *Self) Iterator {
|
||||
return Iterator{
|
||||
.tokenizer = self,
|
||||
};
|
||||
}
|
||||
|
||||
/// Decode a hex escape payload (e.g. "00E9") into UTF-8.
/// The encoding is written into `bytes` itself (the hex digits have
/// already been consumed by then) and a slice of it is returned —
/// the previous version returned a slice of a LOCAL buffer, i.e. a
/// dangling pointer into dead stack memory.
pub fn stringToUtf8(bytes: []u8) ![]u8 {
    const code_point = std.fmt.parseInt(u21, bytes, 16) catch {
        return error.BadNumber;
    };

    var len: usize = 0;
    if (code_point <= 0x7F) {
        if (bytes.len < 1) return error.BufferTooSmall;
        bytes[0] = @intCast(code_point);
        len = 1;
    } else if (code_point <= 0x7FF) {
        if (bytes.len < 2) return error.BufferTooSmall;
        bytes[0] = 0xC0 | @as(u8, @intCast(code_point >> 6));
        bytes[1] = 0x80 | @as(u8, @intCast(code_point & 0x3F));
        len = 2;
    } else if (code_point <= 0xFFFF) {
        if (bytes.len < 3) return error.BufferTooSmall;
        bytes[0] = 0xE0 | @as(u8, @intCast(code_point >> 12));
        bytes[1] = 0x80 | @as(u8, @intCast((code_point >> 6) & 0x3F));
        bytes[2] = 0x80 | @as(u8, @intCast(code_point & 0x3F));
        len = 3;
    } else if (code_point <= 0x10FFFF) {
        if (bytes.len < 4) return error.BufferTooSmall;
        bytes[0] = 0xF0 | @as(u8, @intCast(code_point >> 18));
        bytes[1] = 0x80 | @as(u8, @intCast((code_point >> 12) & 0x3F));
        bytes[2] = 0x80 | @as(u8, @intCast((code_point >> 6) & 0x3F));
        bytes[3] = 0x80 | @as(u8, @intCast(code_point & 0x3F));
        len = 4;
    } else unreachable;

    return bytes[0..len];
}
|
||||
|
||||
/// Return the number of leading JSON whitespace bytes (' ', '\t', '\n',
/// '\r') in `text`, scanning 16 bytes at a time with vector compares.
/// The previous version mapped lanes to 0/1 but compared against 0xFF, so
/// an all-whitespace chunk never advanced `j` — an infinite loop.
pub fn skipWhitespaceSimd(text: []const u8) usize {
    const ChunkSize = 16;
    const Vec = @Vector(ChunkSize, u8);
    const Mask = std.meta.Int(.unsigned, ChunkSize);

    // Broadcast whitespace characters to vectors.
    const space: Vec = @splat(' ');
    const tab: Vec = @splat('\t');
    const lf: Vec = @splat('\n');
    const cr: Vec = @splat('\r');

    var j: usize = 0;
    const end = text.len;

    // SIMD processing: one bit per lane, bit set = lane is whitespace.
    while (j + ChunkSize <= end) {
        const chunk: Vec = text[j..][0..ChunkSize].*;

        const ws_mask: Mask = @as(Mask, @bitCast(chunk == space)) |
            @as(Mask, @bitCast(chunk == tab)) |
            @as(Mask, @bitCast(chunk == lf)) |
            @as(Mask, @bitCast(chunk == cr));

        if (ws_mask == std.math.maxInt(Mask)) {
            // Whole chunk is whitespace; keep scanning.
            j += ChunkSize;
            continue;
        }

        // First zero bit = first non-whitespace byte.
        return j + @ctz(~ws_mask);
    }

    // Scalar processing for remaining bytes.
    while (j < end) switch (text[j]) {
        ' ', '\t', '\n', '\r' => j += 1,
        else => break,
    };

    return j;
}
|
Loading…
x
Reference in New Issue
Block a user