commit 7bc4973bf4d1755717e80bb68a1b12b3ac418aff
Author: yuzu
Date:   Fri May 23 18:17:59 2025 -0500

    epic JSON parser

diff --git a/2.zig b/2.zig
new file mode 100644
index 0000000..a9477fe
--- /dev/null
+++ b/2.zig
@@ -0,0 +1,428 @@
+const std = @import("std");
+const Tokenizer = @import("tokenizer.zig");
+const StringPool = @import("strings.zig");
+const StringIndex = StringPool.StringIndex;
+const assert = std.debug.assert;
+
+const Self = @This();
+
+/// Placeholder: no parser-specific errors are declared yet.
+pub const Error = enum {};
+
+/// Discriminant shared by `JsonValue` and `JsonInput`.
+pub const JsonType = enum {
+    null,
+    bool,
+    number,
+    string,
+    array,
+    object,
+};
+
+/// Compact representation of one JSON value as stored in `index`.
+pub const JsonValue = union(JsonType) {
+    null: void,
+    bool: bool,
+    number: f64,
+    string: StringIndex,
+    array: ArrayIndex.Slice,
+    object: ObjectIndex.Entry,
+};
+
+/// Tree representation of a JSON value, as consumed by `addValue` and
+/// produced by `getValue`.
+pub const JsonInput = union(JsonType) {
+    null: void,
+    bool: bool,
+    number: f64,
+    string: []const u8,
+    array: []JsonInput,
+    // Keys are slices, so the map has to hash their contents; the `Auto*`
+    // maps refuse slice keys at compile time.
+    object: std.StringArrayHashMapUnmanaged(JsonInput),
+};
+
+pub const ArrayIndex = enum(usize) {
+    _,
+
+    /// Run of `len` consecutive entries of `index` beginning at `start`.
+    pub const Slice = struct {
+        start: usize,
+        len: usize,
+    };
+};
+
+pub const ObjectIndex = enum(usize) {
+    _,
+
+    /// Object with `len` members: the first key starts at byte offset
+    /// `property_idx` of the string pool, the first value sits at `value_idx`.
+    pub const Entry = struct {
+        len: usize,
+        property_idx: usize,
+        value_idx: usize,
+    };
+};
+
+pub const Options = struct {
+    /// Deepest object nesting `parse` keeps track of.
+    comptime max_depth: usize = 256,
+};
+
+index: std.MultiArrayList(JsonValue) = .{},
+string_index: StringPool = .empty,
+
+options: Options = .{},
+
+pub const init: Self = .{};
+
+/// Release the value index and the string pool.
+pub fn deinit(self: *Self, allocator: std.mem.Allocator) void {
+    self.index.deinit(allocator);
+    self.string_index.deinit(allocator);
+}
+
+/// Append `value` to `index` and return its position.
+fn append(self: *Self, allocator: std.mem.Allocator, value: JsonValue) !usize {
+    try self.index.ensureUnusedCapacity(allocator, 1);
+    const idx = self.index.addOneAssumeCapacity();
+    self.index.set(idx, value);
+    return idx;
+}
+
+fn addNumber(self: *Self, allocator: std.mem.Allocator, number: f64) !usize {
+    return self.append(allocator, .{ .number = number });
+}
+
+/// Intern `bytes` in the string pool and append an entry referring to it.
+fn addString(self: *Self, allocator: std.mem.Allocator, bytes: []const u8) !usize {
+    const stridx = try self.string_index.add(allocator, bytes);
+    return self.append(allocator, .{ .string = stridx });
+}
+
+/// Store every member of `object` followed by the object entry itself and
+/// return the entry's position.
+/// NOTE(review): `getObject` walks keys and values as contiguous runs, which
+/// only holds while keys are not deduplicated by the pool and values are
+/// scalars — confirm the intended layout before feeding nested input.
+fn addObject(self: *Self, allocator: std.mem.Allocator, object: std.StringArrayHashMapUnmanaged(JsonInput)) std.mem.Allocator.Error!usize {
+    var entry: ObjectIndex.Entry = .{ .len = 0, .property_idx = 0, .value_idx = 0 };
+
+    for (object.keys(), object.values(), 0..) |key, value, times| {
+        const stridx = try self.string_index.add(allocator, key);
+        const vidx = try self.addValue(allocator, value);
+        if (times == 0) {
+            entry = .{
+                .len = object.count(),
+                .property_idx = @intFromEnum(stridx),
+                .value_idx = vidx,
+            };
+        }
+    }
+
+    return self.append(allocator, .{ .object = entry });
+}
+
+/// Append an object entry with no members. `property_idx` and `value_idx`
+/// point at where the next key bytes and the next value will be stored.
+fn addEmptyObject(self: *Self, allocator: std.mem.Allocator) !usize {
+    try self.index.ensureUnusedCapacity(allocator, 1);
+    const idx = self.index.addOneAssumeCapacity();
+    const object: ObjectIndex.Entry = .{
+        .property_idx = self.string_index.string_bytes.items.len,
+        .value_idx = self.index.len,
+        .len = 0,
+    };
+    self.index.set(idx, .{ .object = object });
+    return idx;
+}
+
+/// Store every element of `array` followed by the array entry itself and
+/// return the entry's position.
+/// NOTE(review): `Slice` assumes the elements are consecutive in `index`;
+/// nested containers append their own children first, so only flat arrays
+/// are laid out contiguously — confirm the intended layout.
+fn addArray(self: *Self, allocator: std.mem.Allocator, array: []JsonInput) std.mem.Allocator.Error!usize {
+    var entry: ArrayIndex.Slice = .{ .start = 0, .len = 0 };
+
+    for (array, 0..) |value, times| {
+        const idx = try self.addValue(allocator, value);
+        if (times == 0) {
+            entry = .{ .start = idx, .len = array.len };
+        }
+    }
+
+    return self.append(allocator, .{ .array = entry });
+}
+
+fn addBool(self: *Self, allocator: std.mem.Allocator, value: bool) !usize {
+    return self.append(allocator, .{ .bool = value });
+}
+
+fn addNull(self: *Self, allocator: std.mem.Allocator) !usize {
+    return self.append(allocator, .{ .null = {} });
+}
+
+/// Store `value` (recursively for containers) and return the position of the
+/// entry that represents it.
+fn addValue(self: *Self, allocator: std.mem.Allocator, value: JsonInput) std.mem.Allocator.Error!usize {
+    return switch (value) {
+        .null => try self.addNull(allocator),
+        .bool => |b| try self.addBool(allocator, b),
+        .number => |n| try self.addNumber(allocator, n),
+        .string => |s| try self.addString(allocator, s),
+        .array => |a| try self.addArray(allocator, a),
+        .object => |o| try self.addObject(allocator, o),
+    };
+}
+
+/// Look up the pool index of the string whose bytes are `index`, if stored.
+fn getString(self: *Self, index: []const u8) ?StringIndex {
+    return self.string_index.find(index);
+}
+
+/// Return the number stored at `index`, or null when `index` is out of range
+/// or holds another type.
+fn getNumber(self: *Self, index: usize) ?f64 {
+    if (index >= self.index.len) return null;
+    return switch (self.index.get(index)) {
+        .number => |n| n,
+        else => null,
+    };
+}
+
+/// Collect the key indices and value positions of the object at `index`.
+/// Returns null when `index` is out of range or is not an object. The caller
+/// owns both returned slices.
+fn getObject(self: *Self, allocator: std.mem.Allocator, index: usize) !?struct {
+    []StringIndex,
+    []usize,
+} {
+    if (index >= self.index.len) return null;
+    const object = switch (self.index.get(index)) {
+        .object => |o| o,
+        else => return null,
+    };
+
+    const keys = try allocator.alloc(StringIndex, object.len);
+    errdefer allocator.free(keys);
+    const values = try allocator.alloc(usize, object.len);
+
+    var pidx = object.property_idx;
+    var vidx = object.value_idx;
+    for (keys, values) |*key, *value| {
+        const key_index: StringIndex = @enumFromInt(pidx);
+        key.* = key_index;
+        value.* = vidx;
+        // Keys are NUL-terminated in the pool: skip the bytes and the terminator.
+        pidx += key_index.slice(&self.string_index).len + 1;
+        vidx += 1;
+    }
+
+    return .{ keys, values };
+}
+
+/// Collect the positions of the elements of the array at `index`.
+/// Returns null when `index` is out of range or is not an array. The caller
+/// owns the returned slice.
+fn getArray(self: *Self, allocator: std.mem.Allocator, index: usize) std.mem.Allocator.Error!?[]usize {
+    if (index >= self.index.len) return null;
+    const array = switch (self.index.get(index)) {
+        .array => |a| a,
+        else => return null,
+    };
+
+    const values = try allocator.alloc(usize, array.len);
+    for (values, array.start..) |*value, idx| {
+        value.* = idx;
+    }
+    return values;
+}
+
+/// Return the boolean stored at `index`, or null when `index` is out of
+/// range or holds another type.
+fn getBool(self: *Self, index: usize) ?bool {
+    if (index >= self.index.len) return null;
+    return switch (self.index.get(index)) {
+        .bool => |b| b,
+        else => null,
+    };
+}
+
+/// Return `{}` when the entry at `index` is JSON null, otherwise null.
+fn getNull(self: *Self, index: usize) ?void {
+    if (index >= self.index.len) return null;
+    return switch (self.index.get(index)) {
+        .null => {},
+        else => null,
+    };
+}
+
+/// Rebuild the `JsonInput` tree rooted at `index`. Returns null when `index`
+/// is out of range. Arrays and objects are allocated with `allocator`; the
+/// caller owns them. String slices point into the string pool.
+fn getValue(self: *Self, allocator: std.mem.Allocator, index: usize) std.mem.Allocator.Error!?JsonInput {
+    if (index >= self.index.len) return null;
+    switch (self.index.get(index)) {
+        .null => return .{ .null = {} },
+        .bool => |b| return .{ .bool = b },
+        .number => |n| return .{ .number = n },
+        .string => |s| return .{ .string = s.slice(&self.string_index) },
+        .array => |array| {
+            const res = try allocator.alloc(JsonInput, array.len);
+            errdefer allocator.free(res);
+            for (res, array.start..) |*slot, idx| {
+                slot.* = (try self.getValue(allocator, idx)).?;
+            }
+            return .{ .array = res };
+        },
+        .object => |object| {
+            var kidx = object.property_idx;
+            var vidx = object.value_idx;
+            var obj: std.StringArrayHashMapUnmanaged(JsonInput) = .empty;
+            errdefer obj.deinit(allocator);
+
+            try obj.ensureTotalCapacity(allocator, object.len);
+            for (0..object.len) |_| {
+                const key = StringIndex.slice(@enumFromInt(kidx), &self.string_index);
+                const val = (try self.getValue(allocator, vidx)).?;
+
+                obj.putAssumeCapacityNoClobber(key, val);
+                // Advance past this key's bytes and its NUL terminator, the
+                // same way `getObject` walks the pool.
+                kidx += key.len + 1;
+                vidx += 1;
+            }
+
+            return .{ .object = obj };
+        },
+    }
+}
+
+/// Consume every token `tokenizer` yields and record the values in `self`.
+/// Allocations go through `tokenizer.allocator`.
+/// Fails with `error.MaxDepthExceeded` when objects nest deeper than
+/// `options.max_depth` and with `error.UnexpectedToken` on a `}` that closes
+/// nothing.
+/// NOTE(review): array tokens and `null` fall through to the `else` prong and
+/// are not recorded; progress is printed with `std.debug.print`.
+pub fn parse(self: *Self, tokenizer: *Tokenizer) !void {
+    const allocator = tokenizer.allocator;
+
+    var it = tokenizer.iterator();
+
+    const depth_buf = try allocator.alloc(usize, self.options.max_depth);
+    defer allocator.free(depth_buf);
+
+    // Number of objects currently open; `depth_buf[cycles - 1]` is the innermost.
+    var cycles: usize = 0;
+
+    while (it.next()) |token| {
+        switch (token.type) {
+            .object_begin => {
+                if (cycles == depth_buf.len) return error.MaxDepthExceeded;
+
+                std.debug.print("{{", .{});
+                const obj_idx = try self.addEmptyObject(allocator);
+
+                depth_buf[cycles] = obj_idx;
+
+                if (tokenizer.prev_token) |t| {
+                    if (t.type == .object_begin and cycles > 0) {
+                        // add map to itself
+                        const data = self.index.get(depth_buf[cycles - 1]);
+
+                        switch (data) {
+                            .object => |valid_entry| {
+                                const new_data = ObjectIndex.Entry{
+                                    .len = valid_entry.len + 1,
+                                    .property_idx = self.string_index.string_table.size,
+                                    .value_idx = obj_idx,
+                                };
+                                self.index.set(depth_buf[cycles - 1], .{ .object = new_data });
+                                tokenizer.prev_token = null; // reset
+                            },
+                            else => unreachable,
+                        }
+                    } else tokenizer.pushBack(token);
+                }
+                cycles += 1;
+                continue;
+            },
+            .object_end => {
+                if (cycles == 0) return error.UnexpectedToken;
+
+                const keys, const vals = (try self.getObject(allocator, depth_buf[cycles - 1])).?;
+                defer allocator.free(keys);
+                defer allocator.free(vals);
+
+                std.debug.print("\nfound {d} keys and {d} values\n", .{ keys.len, vals.len });
+                for (keys, vals) |k, v| {
+                    const key = k.slice(&self.string_index);
+                    const val = self.index.get(v);
+                    std.debug.print(
+                        \\"{s}": {s},
+                    , .{ key, @tagName(val) });
+                }
+                std.debug.print("}}", .{});
+
+                // The object is closed; its parent becomes the innermost again.
+                cycles -= 1;
+            },
+            .string => {
+                const bytes = token.value.?.string;
+                // The tokenizer allocated `bytes`; the pool stores its own copy.
+                defer allocator.free(bytes);
+
+                const idx = try self.addString(allocator, bytes);
+                if (cycles > 0) {
+                    const last_obj = self.index.get(depth_buf[cycles - 1]);
+                    const stridx = self.index.get(idx).string;
+                    self.index.set(depth_buf[cycles - 1], .{ .object = ObjectIndex.Entry{
+                        .len = last_obj.object.len + 1,
+                        .property_idx = if (cycles > 1) @intFromEnum(stridx) else last_obj.object.property_idx,
+                        .value_idx = last_obj.object.value_idx,
+                    } });
+                    continue;
+                }
+            },
+            .number => {
+                _ = try self.addNumber(allocator, token.value.?.number);
+                // A value leaves the enclosing object's entry untouched.
+                if (cycles > 0) continue;
+            },
+            .true, .false => {
+                _ = try self.addBool(allocator, token.type == .true);
+                if (cycles > 0) continue;
+            },
+            else => {},
+        }
+
+        tokenizer.skipWhitespace();
+    }
+}
+
+test parse {
+    var arena = std.heap.ArenaAllocator.init(std.testing.allocator);
+    defer arena.deinit();
+
+    const allocator = arena.allocator();
+
+    var self = init;
+    defer deinit(&self, allocator);
+
+    var tokenizer = try Tokenizer.init(allocator, blk: {
+        const json =
+            \\ {
+            \\ "key": 123,
+            \\ "key2": false,
+            \\ "key3": true,
+            \\ "key4": null
+            \\ }
+        ;
+        break :blk json;
+    });
+    defer tokenizer.deinit();
+
+    try parse(&self, &tokenizer);
+}
diff --git a/build.zig b/build.zig
new file mode 100644
index 0000000..9db02ac
--- /dev/null
+++ b/build.zig
@@ -0,0 +1,34 @@
+const std = @import("std");
+
+pub fn build(b: *std.Build) void {
+    const target = b.standardTargetOptions(.{});
+    const optimize = .ReleaseSafe;
+
+    const exe_mod = b.createModule(.{
+        .root_source_file = b.path("src/main.zig"),
+        .target = target,
+        .optimize = optimize,
+    });
+
+    const exe = b.addExecutable(.{
+        .name = "aether",
+        .root_module = exe_mod,
+    });
+
+    b.installArtifact(exe);
+
+    const run_cmd = b.addRunArtifact(exe);
+    run_cmd.step.dependOn(b.getInstallStep());
+
+    const run_step = b.step("run", "Run the app");
+    run_step.dependOn(&run_cmd.step);
+
+    const exe_unit_tests = b.addTest(.{
+        .root_module = exe_mod,
+    });
+
+    const run_exe_unit_tests = b.addRunArtifact(exe_unit_tests);
+
+    const test_step = b.step("test", "Run unit tests");
+    test_step.dependOn(&run_exe_unit_tests.step);
+}
diff --git a/build.zig.zon b/build.zig.zon
new file mode 100644
index 0000000..83974c8
--- /dev/null
+++ b/build.zig.zon
@@ -0,0 +1,16 @@
+.{
+    .name = .aether,
+
+    .version = "0.0.0",
+
+    .fingerprint = 0x255cfdbd72bde30d,
+    .minimum_zig_version = "0.15.0-dev.552+bc2f7c754",
+
+    .dependencies = .{
+    },
+    .paths = .{
+        "build.zig",
+        "build.zig.zon",
+        "src",
+    },
+}
diff --git a/strings.zig b/strings.zig
new file mode 100644
index 0000000..8fb49ae
--- /dev/null
+++ b/strings.zig
@@ -0,0 +1,93 @@
+//! credits to Andrew Kelley
+//! strings.zig
+
+const std = @import("std");
+const mem = std.mem;
+const assert = std.debug.assert;
+
+const Allocator = std.mem.Allocator;
+const Self = @This();
+
+const max_load_percent = std.hash_map.default_max_load_percentage;
+
+string_bytes: std.ArrayListUnmanaged(u8) = .empty,
+string_table: StringIndex.Table = .empty,
+
+pub const empty = Self{
+    .string_bytes = .empty,
+    .string_table = .empty,
+};
+
+pub fn deinit(self: *Self, allocator: Allocator) void {
+    self.string_bytes.deinit(allocator);
+    self.string_table.deinit(allocator);
+}
+
+/// Byte offset of a NUL-terminated string inside `string_bytes`.
+pub const StringIndex = enum(u32) {
+    _,
+
+    const Table = std.HashMapUnmanaged(StringIndex, void, TableContext, max_load_percent);
+
+    const TableContext = struct {
+        bytes: []const u8,
+
+        pub fn eql(_: @This(), a: StringIndex, b: StringIndex) bool {
+            return a == b;
+        }
+
+        pub fn hash(ctx: @This(), key: StringIndex) u64 {
+            return std.hash_map.hashString(mem.sliceTo(ctx.bytes[@intFromEnum(key)..], 0));
+        }
+    };
+
+    const TableIndexAdapter = struct {
+        bytes: []const u8,
+
+        pub fn eql(ctx: @This(), a: []const u8, b: StringIndex) bool {
+            return mem.eql(u8, a, mem.sliceTo(ctx.bytes[@intFromEnum(b)..], 0));
+        }
+
+        pub fn hash(_: @This(), adapted_key: []const u8) u64 {
+            assert(mem.indexOfScalar(u8, adapted_key, 0) == null);
+            return std.hash_map.hashString(adapted_key);
+        }
+    };
+
+    /// Return the pooled string that starts at `index`, without its terminator.
+    pub fn slice(index: StringIndex, state: *const Self) [:0]const u8 {
+        const start_slice = state.string_bytes.items[@intFromEnum(index)..];
+        return start_slice[0..mem.indexOfScalar(u8, start_slice, 0).? :0];
+    }
+};
+
+/// Intern `bytes` and return its index; an already pooled string returns the
+/// existing index. `bytes` is copied and must not contain a NUL byte.
+pub fn add(state: *Self, allocator: Allocator, bytes: []const u8) !StringIndex {
+    try state.string_bytes.ensureUnusedCapacity(allocator, bytes.len + 1);
+
+    const gop = try state.string_table.getOrPutContextAdapted(
+        allocator,
+        bytes,
+        StringIndex.TableIndexAdapter{ .bytes = state.string_bytes.items },
+        StringIndex.TableContext{ .bytes = state.string_bytes.items },
+    );
+    if (gop.found_existing) return gop.key_ptr.*;
+
+    const new_off: StringIndex = @enumFromInt(state.string_bytes.items.len);
+
+    state.string_bytes.appendSliceAssumeCapacity(bytes);
+    state.string_bytes.appendAssumeCapacity(0);
+
+    gop.key_ptr.* = new_off;
+
+    return new_off;
+}
+
+/// Look up `bytes` without inserting it; null when it is not pooled.
+pub fn find(state: *const Self, bytes: []const u8) ?StringIndex {
+    return state.string_table.getKeyAdapted(
+        bytes,
+        StringIndex.TableIndexAdapter{ .bytes = state.string_bytes.items },
+    );
+}
diff --git a/tokenizer.zig b/tokenizer.zig
new file mode 100644
index 0000000..57266a9
--- /dev/null
+++ b/tokenizer.zig
@@ -0,0 +1,553 @@
+const std = @import("std");
+const mem = std.mem;
+
+pub const Error = error{
+    /// eg: invalid JSON syntax
+    InvalidSyntax,
+    /// eg: allocator error
+    OutOfMemory,
+    /// eg: bad escaping
+    UnexpectedCharacter,
+    /// eg: got the wrong token type, check TokenType
+    UnexpectedToken,
+    /// eg: std.fmt.parseFloat failed
+    BadNumber,
+    /// fba error
+    BufferTooSmall,
+    /// eg: missing comma
+    CommaExpected,
+    /// eg: missing colon
+    ColonExpected,
+    /// eg: missing object key
+    KeyExpected,
+    /// eg: error while writing
+    PrintError,
+    /// eg: trailing comma in object
+    TrailingComma,
+};
+
+pub const TokenType = enum(u8) {
+    eof,
+    null,
+    true,
+    false,
+    number,
+    string,
+    property,
+    object_begin,
+    object_end,
+    array_begin,
+    array_end,
+    colon,
+    comma,
+    whitespace,
+};
+
+pub const Token = struct {
+    type: TokenType,
+    /// `string` payloads are allocated by `nextString` with the tokenizer's
+    /// allocator; whoever consumes the token frees them.
+    value: ?union {
+        number: f64,
+        string: []const u8,
+        symbol: u8,
+    },
+    start: usize,
+    end: usize,
+};
+
+pub const Self = @This();
+
+text: []const u8,
+position: usize,
+max_position: usize,
+stack: []usize,
+frame: usize,
+allocator: std.mem.Allocator,
+
+prev_token: ?Token = null,
+
+/// Make `token` the result of the next `nextToken` call.
+pub fn pushBack(self: *Self, token: Token) void {
+    self.prev_token = token;
+}
+
+/// Initialize a new tokenizer
+pub fn init(allocator: std.mem.Allocator, text: []const u8) std.mem.Allocator.Error!Self {
+    const stack = try allocator.alloc(usize, 0x100);
+    errdefer allocator.free(stack);
+    @memset(stack, 0);
+    return .{
+        .text = text,
+        .position = 0,
+        .max_position = 0,
+        .stack = stack,
+        .frame = 0,
+        .allocator = allocator,
+    };
+}
+
+/// Clean up resources
+pub fn deinit(self: *Self) void {
+    self.allocator.free(self.stack);
+}
+
+// ========== Core Parsing Functions ==========
+
+fn currentPosition(self: *Self) usize {
+    return self.stack[self.frame];
+}
+
+fn advance(self: *Self, delta: usize) void {
+    self.stack[self.frame] += delta;
+    if (self.max_position < self.stack[self.frame])
+        self.max_position = self.stack[self.frame];
+}
+
+/// Open a new frame that starts at the current position and return that
+/// position. The frame stack doubles when it is full.
+fn pushFrame(self: *Self) Error!usize {
+    const next = self.frame + 1;
+    if (next == self.stack.len) {
+        const new_stack = try self.allocator.alloc(usize, self.stack.len * 2);
+        // `@memcpy` needs equally long operands: copy into the prefix and
+        // zero the rest.
+        @memcpy(new_stack[0..self.stack.len], self.stack);
+        @memset(new_stack[self.stack.len..], 0);
+        self.allocator.free(self.stack);
+        self.stack = new_stack;
+    }
+    // Only commit the new depth once growing can no longer fail.
+    self.frame = next;
+    self.stack[next] = self.stack[next - 1];
+    return self.currentPosition();
+}
+
+fn popFrame(self: *Self) void {
+    self.frame -= 1;
+}
+
+/// Close the current frame, hand its position to the parent frame and pass
+/// `wrapped` through.
+fn commit(self: *Self, wrapped: anytype) @TypeOf(wrapped) {
+    self.frame -= 1;
+    self.stack[self.frame] = self.stack[self.frame + 1];
+    return wrapped;
+}
+
+/// Reset the current frame to the position of its parent frame.
+fn rollback(self: *Self) void {
+    self.stack[self.frame] = if (self.frame == 0) 0 else self.stack[self.frame - 1];
+}
+
+// ========== Character Matching ==========
+
+fn lastChar(self: *Self) u8 {
+    return self.text[self.currentPosition() - 1];
+}
+
+fn currentChar(self: *Self) u8 {
+    return self.text[self.currentPosition()];
+}
+
+fn endOfInput(self: *Self) bool {
+    return self.currentPosition() >= self.text.len;
+}
+
+fn matchChar(self: *Self, c: u8) ?void {
+    if (self.endOfInput() or self.text[self.currentPosition()] != c) {
+        return null;
+    }
+    self.advance(1);
+}
+
+fn matchCharPredicate(self: *Self, pred: fn (u8) bool) ?void {
+    // NOTE: keep the direct `text` access; the original author reports that calling `currentChar` here misbehaves.
+    if (self.endOfInput() or !pred(self.text[self.currentPosition()])) {
+        return null;
+    }
+    self.advance(1);
+}
+
+/// Consume `s` when the input continues with it and return the matched
+/// bytes; otherwise consume nothing and return null.
+fn matchString(self: *Self, s: []const u8) ?[]const u8 {
+    const start = self.currentPosition();
+    // Written as a subtraction so that `start + s.len` cannot overflow.
+    if (start > self.text.len or self.text.len - start < s.len) {
+        // eof
+        return null;
+    }
+
+    const candidate = self.text[start..][0..s.len];
+    if (!mem.eql(u8, candidate, s)) {
+        return null;
+    }
+
+    self.advance(s.len);
+    return candidate;
+}
+
+pub fn matchCharRange(self: *Self, low: u8, high: u8) ?void {
+    if (self.endOfInput())
+        return null;
+
+    const c = self.text[self.currentPosition()];
+
+    if (!(c >= low and c <= high))
+        return null;
+
+    self.advance(1);
+}
+
+// ========== Token Extraction ==========
+
+fn extractSlice(self: *Self, start: usize) []const u8 {
+    return self.text[start..self.currentPosition()];
+}
+
+// Skip all whitespace characters
+pub fn skipWhitespace(self: *Self) void {
+    const start = self.currentPosition();
+    if (self.endOfInput())
+        return;
+    const end = skipWhitespaceSimd(self.text[start..]);
+    self.advance(end);
+}
+
+/// Parse a number token: optional `-`, digits, optional fraction and
+/// optional exponent.
+pub fn nextNumber(self: *Self) Error!Token {
+    _ = try self.pushFrame();
+    errdefer self.popFrame();
+
+    self.skipWhitespace();
+    // Measure from the first significant byte: `parseFloat` rejects leading
+    // whitespace.
+    const start = self.currentPosition();
+
+    _ = self.matchChar('-'); // the sign is optional
+
+    while (self.matchCharRange('0', '9') != null) {}
+
+    if (self.matchChar('.') != null) {
+        while (self.matchCharRange('0', '9') != null) {}
+    }
+
+    if (self.matchChar('e') != null or self.matchChar('E') != null) {
+        if (self.matchChar('+') == null) _ = self.matchChar('-');
+        while (self.matchCharRange('0', '9') != null) {}
+    }
+
+    const float = std.fmt.parseFloat(f64, self.extractSlice(start)) catch {
+        return error.BadNumber;
+    };
+
+    return self.commit(Token{
+        .type = .number,
+        .value = .{
+            .number = float,
+        },
+        .start = start,
+        .end = self.currentPosition(),
+    });
+}
+
+/// Parse one of the keywords `true`, `false` or `null`.
+/// Any other identifier yields `error.UnexpectedToken`.
+pub fn nextIdentifier(self: *Self) Error!Token {
+    _ = try self.pushFrame();
+    errdefer self.popFrame();
+
+    self.skipWhitespace();
+    const start = self.currentPosition();
+
+    self.matchCharPredicate(std.ascii.isAlphabetic) orelse {
+        return error.UnexpectedToken;
+    };
+    while (self.matchCharPredicate(std.ascii.isAlphanumeric) != null) {}
+
+    // The identifier is a slice of the input, so its length is unbounded and
+    // nothing needs to be copied.
+    const ident = self.extractSlice(start);
+
+    const keyword: TokenType = if (mem.eql(u8, ident, "true"))
+        .true
+    else if (mem.eql(u8, ident, "false"))
+        .false
+    else if (mem.eql(u8, ident, "null"))
+        .null
+    else
+        return error.UnexpectedToken;
+
+    return self.commit(Token{
+        .type = keyword,
+        .value = null,
+        .start = start,
+        .end = self.currentPosition(),
+    });
+}
+
+/// Get the next token from the input
+/// WARNING: this function eats whitespaces
+pub fn nextToken(self: *Self) Error!Token {
+    if (self.prev_token) |tok| {
+        self.prev_token = null;
+        return tok;
+    }
+
+    _ = try self.pushFrame();
+    errdefer self.popFrame();
+
+    self.skipWhitespace();
+    const start = self.currentPosition();
+
+    if (self.endOfInput()) {
+        return self.commit(Token{
+            .type = .eof,
+            .value = null,
+            .start = start,
+            .end = start,
+        });
+    }
+
+    const c = self.currentChar();
+
+    const symbol_t: TokenType = switch (c) {
+        '{' => .object_begin,
+        '}' => .object_end,
+        '[' => .array_begin,
+        ']' => .array_end,
+        ',' => .comma,
+        ':' => .colon,
+        // The sub-tokenizers run in a frame of their own; committing this
+        // frame afterwards hands their end position to the caller's frame.
+        '"' => return self.commit(try self.nextString()),
+        else => {
+            // Try different token types in order of precedence
+            if (std.ascii.isDigit(c) or c == '-') {
+                return self.commit(try self.nextNumber());
+            }
+
+            if (std.ascii.isAlphabetic(c)) {
+                return self.commit(try self.nextIdentifier());
+            }
+
+            return error.InvalidSyntax;
+        },
+    };
+
+    // Single character symbol
+    self.advance(1);
+    return self.commit(Token{
+        .type = symbol_t,
+        .value = null,
+        .start = start,
+        .end = start + 1,
+    });
+}
+
+/// Parse a string token and decode its escape sequences.
+/// The decoded bytes are allocated with the tokenizer's allocator and owned
+/// by the returned token.
+/// NOTE(review): `\u` escapes are decoded one at a time, so surrogate pairs
+/// are not combined into a single code point.
+pub fn nextString(self: *Self) Error!Token {
+    _ = try self.pushFrame();
+    errdefer self.popFrame();
+
+    self.skipWhitespace();
+    const start = self.currentPosition();
+
+    self.matchChar('"') orelse {
+        return error.UnexpectedToken;
+    };
+
+    var buffer: std.ArrayList(u8) = .init(self.allocator);
+    // Every error path below would otherwise leak the partial string.
+    errdefer buffer.deinit();
+
+    loop: while (!self.endOfInput()) {
+        self.advance(1);
+
+        switch (self.lastChar()) {
+            '"' => {
+                return self.commit(Token{
+                    .type = .string,
+                    .value = .{ .string = try buffer.toOwnedSlice() },
+                    .start = start,
+                    .end = self.currentPosition(),
+                });
+            },
+            '\\' => {
+                // A backslash must be followed by at least one more byte.
+                if (self.endOfInput()) return error.InvalidSyntax;
+                self.advance(1);
+                switch (self.lastChar()) {
+                    0x22, 0x5C, 0x2F => |d| {
+                        try buffer.append(d);
+                        continue :loop;
+                    },
+                    'b' => try buffer.append(0x8),
+                    'f' => try buffer.append(0xC),
+                    'n' => try buffer.append(0xA),
+                    'r' => try buffer.append(0xD),
+                    't' => try buffer.append(0x9),
+                    'u' => {
+                        var code_points: [4]u8 = undefined;
+                        inline for (0..4) |i| {
+                            if (self.endOfInput()) {
+                                // Truncated escape: report end of input, but
+                                // do not leak what was decoded so far.
+                                buffer.deinit();
+                                return self.commit(Token{
+                                    .type = .eof,
+                                    .value = null,
+                                    .start = start,
+                                    .end = start + 1,
+                                });
+                            }
+                            self.advance(1);
+                            code_points[i] = self.lastChar();
+                        }
+                        const buf = try stringToUtf8(&code_points);
+                        try buffer.appendSlice(buf);
+                        continue :loop;
+                    },
+                    else => return error.UnexpectedCharacter,
+                } // end switch
+            },
+            else => |c| {
+                if (std.ascii.isControl(c)) {
+                    return error.UnexpectedCharacter;
+                }
+                try buffer.append(c);
+            },
+        } // end switch
+    } // end while
+
+    return error.InvalidSyntax;
+}
+
+pub const Iterator = struct {
+    tokenizer: *Self,
+    /// Return the next token, or null at the end of the input or when
+    /// tokenizing fails (the failure is logged).
+    pub fn next(it: *Iterator) ?Token {
+        if (it.tokenizer.endOfInput()) return null;
+        return it.tokenizer.nextToken() catch |err| {
+            std.log.warn("tokenizer stopped at byte {d}: {s}", .{ it.tokenizer.currentPosition(), @errorName(err) });
+            return null;
+        };
+    }
+    /// Rewind the tokenizer to the start of the input.
+    pub fn reset(it: *Iterator) void {
+        it.tokenizer.position = 0;
+        it.tokenizer.max_position = 0;
+        it.tokenizer.frame = 0;
+        // Positions are read from the frame stack, so the base frame has to
+        // be rewound as well.
+        it.tokenizer.stack[0] = 0;
+        it.tokenizer.prev_token = null;
+    }
+};
+
+/// iterator
+pub fn iterator(self: *Self) Iterator {
+    return Iterator{
+        .tokenizer = self,
+    };
+}
+
+/// Decode the hexadecimal digits in `bytes` as a code point and overwrite the
+/// start of `bytes` with its UTF-8 encoding. The returned slice aliases
+/// `bytes`, so it stays valid for as long as the caller's buffer does.
+pub fn stringToUtf8(bytes: []u8) ![]u8 {
+    const code_point = std.fmt.parseInt(u21, bytes, 16) catch {
+        return error.BadNumber;
+    };
+    // Encode into scratch space first; a slice of this local array must never
+    // be returned because it dies with the call.
+    var buffer: [4]u8 = undefined;
+    var index: usize = 0;
+
+    if (code_point <= 0x7F) {
+        buffer[index] = @as(u8, @intCast(code_point));
+        index += 1;
+    } else if (code_point <= 0x7FF) {
+        buffer[index] = 0xC0 | (@as(u8, @intCast(code_point >> 6)));
+        buffer[index + 1] = 0x80 | (@as(u8, @intCast(code_point & 0x3F)));
+        index += 2;
+    } else if (code_point <= 0xFFFF) {
+        buffer[index] = 0xE0 | (@as(u8, @intCast(code_point >> 12)));
+        buffer[index + 1] = 0x80 | (@as(u8, @intCast((code_point >> 6) & 0x3F)));
+        buffer[index + 2] = 0x80 | (@as(u8, @intCast(code_point & 0x3F)));
+        index += 3;
+    } else if (code_point <= 0x10FFFF) {
+        buffer[index] = 0xF0 | (@as(u8, @intCast(code_point >> 18)));
+        buffer[index + 1] = 0x80 | (@as(u8, @intCast((code_point >> 12) & 0x3F)));
+        buffer[index + 2] = 0x80 | (@as(u8, @intCast((code_point >> 6) & 0x3F)));
+        buffer[index + 3] = 0x80 | (@as(u8, @intCast(code_point & 0x3F)));
+        index += 4;
+    } else return error.BadNumber;
+
+    if (index > bytes.len) return error.BufferTooSmall;
+    @memcpy(bytes[0..index], buffer[0..index]);
+    return bytes[0..index];
+}
+
+/// Return the number of leading whitespace bytes (space, tab, LF, CR) in
+/// `text`.
+pub fn skipWhitespaceSimd(text: []const u8) usize {
+    const ChunkSize = 16;
+    const Vec = @Vector(ChunkSize, u8);
+
+    // Broadcast whitespace characters to vectors
+    const space: Vec = @splat(' ');
+    const tab: Vec = @splat('\t');
+    const lf: Vec = @splat('\n');
+    const cr: Vec = @splat('\r');
+
+    // Lane values used to mark "is whitespace" / "is not whitespace".
+    const TrueMask: Vec = @splat(0xFF);
+    const FalseMask: Vec = @splat(0x00);
+
+    var j: usize = 0;
+    const end = text.len;
+
+    // SIMD processing
+    while (j + ChunkSize <= end) {
+        const chunk: Vec = text[j..][0..ChunkSize].*;
+
+        // Compare against each whitespace character
+        const is_space = chunk == space;
+        const is_tab = chunk == tab;
+        const is_lf = chunk == lf;
+        const is_cr = chunk == cr;
+
+        // Combine comparisons using vector operations. Matching lanes hold
+        // 0xFF so that they compare equal to `TrueMask`.
+        const anyws = @select(u8, is_space, TrueMask, FalseMask) |
+            @select(u8, is_tab, TrueMask, FalseMask) |
+            @select(u8, is_lf, TrueMask, FalseMask) |
+            @select(u8, is_cr, TrueMask, FalseMask);
+
+        // One bit per lane that is NOT whitespace; bit 0 is the first byte.
+        const mask: std.meta.Int(.unsigned, ChunkSize) = @bitCast(anyws == FalseMask);
+        if (mask != 0) {
+            return j + @ctz(mask);
+        }
+
+        // The whole chunk is whitespace: always advance, otherwise the loop
+        // would spin on the same chunk forever.
+        j += ChunkSize;
+    }
+
+    // Scalar processing for remaining bytes
+    while (j < end) switch (text[j]) {
+        ' ', '\t', '\n', '\r' => j += 1,
+        else => break,
+    };
+
+    return j;
+}