epic JSON parser

yuzu 2025-05-23 18:17:59 -05:00
commit 7bc4973bf4
5 changed files with 1081 additions and 0 deletions

397
2.zig Normal file

@@ -0,0 +1,397 @@
const std = @import("std");
const Tokenizer = @import("tokenizer.zig");
const StringPool = @import("strings.zig");
const StringIndex = StringPool.StringIndex;
const assert = std.debug.assert;
const Self = @This();
pub const Error = error{};
pub const JsonType = enum {
null,
bool,
number,
string,
array,
object,
};
pub const JsonValue = union(JsonType) {
null: void,
bool: bool,
number: f64,
string: StringIndex,
array: ArrayIndex.Slice,
object: ObjectIndex.Entry,
};
pub const JsonInput = union(JsonType) {
null: void,
bool: bool,
number: f64,
string: []const u8,
array: []JsonInput,
object: std.StringArrayHashMapUnmanaged(JsonInput),
};
pub const ArrayIndex = enum(usize) {
_,
pub const Slice = struct {
start: usize,
len: usize,
};
};
pub const ObjectIndex = enum(usize) {
_,
pub const Entry = struct {
len: usize,
property_idx: usize,
value_idx: usize,
};
};
pub const Options = struct {
comptime max_depth: usize = 256,
};
index: std.MultiArrayList(JsonValue) = .{},
string_index: StringPool = .empty,
options: Options = .{},
pub const init: Self = .{};
pub fn deinit(self: *Self, allocator: std.mem.Allocator) void {
self.index.deinit(allocator);
self.string_index.deinit(allocator);
}
fn addNumber(self: *Self, allocator: std.mem.Allocator, number: f64) !usize {
try self.index.ensureUnusedCapacity(allocator, 1);
const idx = self.index.addOneAssumeCapacity();
self.index.set(idx, .{ .number = number });
return idx;
}
fn addString(self: *Self, allocator: std.mem.Allocator, bytes: []const u8) !usize {
const stridx = try self.string_index.add(allocator, bytes);
try self.index.ensureUnusedCapacity(allocator, 1);
const idx = self.index.addOneAssumeCapacity();
self.index.set(idx, .{ .string = stridx });
return idx;
}
fn addObject(self: *Self, allocator: std.mem.Allocator, object: std.StringArrayHashMapUnmanaged(JsonInput)) !usize {
var entry: ?ObjectIndex.Entry = null;
for (object.keys(), object.values(), 0..) |key, value, times| {
const stridx = try self.string_index.add(allocator, key);
// recurse; addValue returns the index of the stored value
const vidx = try self.addValue(allocator, value);
if (times == 0) {
entry = ObjectIndex.Entry{
.len = object.entries.len,
.property_idx = @intFromEnum(stridx),
.value_idx = vidx,
};
}
}
try self.index.ensureUnusedCapacity(allocator, 1);
const idx = self.index.addOneAssumeCapacity();
if (entry) |e| {
self.index.set(idx, .{ .object = e });
return idx;
} else {
self.index.set(idx, .{ .object = ObjectIndex.Entry{
.len = 0,
.property_idx = 0,
.value_idx = 0,
} });
return idx;
}
}
fn addEmptyObject(self: *Self, allocator: std.mem.Allocator) !usize {
try self.index.ensureUnusedCapacity(allocator, 1);
const idx = self.index.addOneAssumeCapacity();
const object: ObjectIndex.Entry = .{
.property_idx = self.string_index.string_bytes.items.len,
.value_idx = self.index.len,
.len = 0,
};
self.index.set(idx, .{ .object = object });
return idx;
}
fn addArray(self: *Self, allocator: std.mem.Allocator, array: []JsonInput) !usize {
var entry: ?ArrayIndex.Slice = null;
for (array, 0..) |value, times| {
// recurse; addValue returns the index of the stored value
const idx = try self.addValue(allocator, value);
if (times == 0) {
entry = ArrayIndex.Slice{
.start = idx,
.len = array.len,
};
}
}
try self.index.ensureUnusedCapacity(allocator, 1);
const idx = self.index.addOneAssumeCapacity();
if (entry) |e| {
self.index.set(idx, .{ .array = e });
return idx;
} else {
self.index.set(idx, .{ .array = ArrayIndex.Slice{
.start = 0,
.len = 0,
} });
return idx;
}
}
fn addBool(self: *Self, allocator: std.mem.Allocator, value: bool) !usize {
try self.index.ensureUnusedCapacity(allocator, 1);
const idx = self.index.addOneAssumeCapacity();
self.index.set(idx, .{ .bool = value });
return idx;
}
// explicit error set so the mutual recursion with addObject/addArray resolves
fn addValue(self: *Self, allocator: std.mem.Allocator, value: JsonInput) std.mem.Allocator.Error!usize {
switch (value) {
.null => {
try self.index.ensureUnusedCapacity(allocator, 1);
const idx = self.index.addOneAssumeCapacity();
self.index.set(idx, .{ .null = {} });
return idx;
},
.bool => |b| return self.addBool(allocator, b),
.number => |n| return self.addNumber(allocator, n),
.string => |s| return self.addString(allocator, s),
.array => |a| return self.addArray(allocator, a),
.object => |o| return self.addObject(allocator, o),
}
}
fn getString(self: *Self, bytes: []const u8) ?StringIndex {
// the table stores offsets only, so lookups must go through the adapter
return self.string_index.string_table.getKeyAdapted(bytes, StringIndex.TableIndexAdapter{ .bytes = self.string_index.string_bytes.items });
}
fn getNumber(self: *Self, index: usize) ?f64 {
// MultiArrayList.get returns the element itself, not an optional
return switch (self.index.get(index)) {
.number => |n| n,
else => null,
};
}
fn getObject(self: *Self, allocator: std.mem.Allocator, index: usize) !?struct {
[]StringIndex,
[]usize,
} {
const entry = self.index.get(index);
if (entry.object.len == 0) {
return .{ &.{}, &.{} };
}
var pidx = entry.object.property_idx;
var vidx = entry.object.value_idx;
const keys = try allocator.alloc(StringIndex, entry.object.len);
const values = try allocator.alloc(usize, entry.object.len);
for (0..entry.object.len) |i| {
const slice = StringIndex.slice(@enumFromInt(pidx), &self.string_index);
keys[i] = @enumFromInt(pidx);
values[i] = vidx;
pidx += slice.len + 1;
vidx += 1;
}
return .{ keys, values };
}
fn getArray(self: *Self, allocator: std.mem.Allocator, index: usize) !?[]usize {
const entry = self.index.get(index);
if (entry.array.len == 0) {
return &.{};
}
var idx = entry.array.start;
const values = try allocator.alloc(usize, entry.array.len);
for (0..entry.array.len) |i| {
values[i] = idx;
idx += 1;
}
return values;
}
fn getBool(self: *Self, index: usize) ?bool {
return switch (self.index.get(index)) {
.bool => |b| b,
else => null,
};
}
fn getNull(self: *Self, index: usize) ?void {
return switch (self.index.get(index)) {
.null => {},
else => null,
};
}
fn getValue(self: *Self, allocator: std.mem.Allocator, index: usize) !?JsonInput {
const entry = self.index.get(index);
switch (entry) {
.null => return .{ .null = {} },
.bool => return .{ .bool = entry.bool },
.number => return .{ .number = entry.number },
.string => {
const str = entry.string.slice(&self.string_index);
return .{ .string = str };
},
.array => {
const res = try allocator.alloc(JsonInput, entry.array.len);
var idx = entry.array.start;
for (0..entry.array.len) |i| {
if (try self.getValue(allocator, idx)) |v| {
res[i] = v;
idx += 1;
} else unreachable;
}
return .{ .array = res };
},
.object => {
var kidx = entry.object.property_idx;
var vidx = entry.object.value_idx;
var obj: std.StringArrayHashMapUnmanaged(JsonInput) = .empty;
try obj.ensureTotalCapacity(allocator, entry.object.len);
for (0..entry.object.len) |_| {
const key = StringIndex.slice(@enumFromInt(kidx), &self.string_index);
const val = (try self.getValue(allocator, vidx)).?;
obj.putAssumeCapacityNoClobber(key, val);
kidx += key.len + 1; // keys are NUL-terminated in the pool; skip past the terminator
vidx += 1;
}
return .{ .object = obj };
},
}
}
pub fn parse(self: *Self, tokenizer: *Tokenizer) !void {
const allocator = tokenizer.allocator;
var it = tokenizer.iterator();
const depth_buf = try allocator.alloc(usize, self.options.max_depth);
defer allocator.free(depth_buf);
var cycles: usize = 0;
while (it.next()) |token| {
switch (token.type) {
.object_begin => {
std.debug.print("{{", .{});
const obj_idx = try self.addEmptyObject(allocator);
depth_buf[cycles] = obj_idx;
if (tokenizer.prev_token) |t| if (t.type == .object_begin) {
// add map to itself
const data = self.index.get(depth_buf[cycles - 1]);
switch (data) {
.object => |valid_entry| {
const new_data = ObjectIndex.Entry{
.len = valid_entry.len + 1,
.property_idx = self.string_index.string_table.size,
.value_idx = obj_idx,
};
self.index.set(depth_buf[cycles - 1], .{ .object = new_data });
tokenizer.prev_token = null; // reset
},
else => unreachable,
}
} else tokenizer.pushBack(token);
cycles += 1;
continue;
},
.object_end => {
const keys, const vals = (try self.getObject(allocator, depth_buf[cycles - 1])).?;
std.debug.print("\nfound {d} keys and {d} values\n", .{ keys.len, vals.len });
for (keys, vals) |k, v| {
const key = k.slice(&self.string_index);
const val = self.index.get(v);
std.debug.print(
\\"{s}": {s},
, .{ key, @tagName(val) });
}
std.debug.print("}}", .{});
},
.string => {
const idx = try self.addString(allocator, token.value.?.string);
if (cycles > 0) {
// only index into depth_buf once we are inside a container
const last_obj = self.index.get(depth_buf[cycles - 1]);
const stridx = self.index.get(idx).string;
self.index.set(depth_buf[cycles - 1], .{ .object = ObjectIndex.Entry{
.len = last_obj.object.len + 1,
.property_idx = if (cycles > 1) @intFromEnum(stridx) else last_obj.object.property_idx,
.value_idx = last_obj.object.value_idx,
} });
continue;
}
},
.number => {
_ = try self.addNumber(allocator, token.value.?.number);
if (cycles > 0) {
const last_obj = self.index.get(depth_buf[cycles - 1]);
self.index.set(depth_buf[cycles - 1], .{ .object = ObjectIndex.Entry{
.len = last_obj.object.len,
.property_idx = last_obj.object.property_idx,
.value_idx = last_obj.object.value_idx,
} });
continue;
}
},
.true, .false => {
_ = try self.addBool(allocator, token.type == .true);
if (cycles > 0) {
const last_obj = self.index.get(depth_buf[cycles - 1]);
self.index.set(depth_buf[cycles - 1], .{ .object = ObjectIndex.Entry{
.len = last_obj.object.len,
.property_idx = last_obj.object.property_idx,
.value_idx = last_obj.object.value_idx,
} });
continue;
}
},
else => {},
}
tokenizer.skipWhitespace();
}
}
test parse {
var arena = std.heap.ArenaAllocator.init(std.testing.allocator);
defer arena.deinit();
const allocator = arena.allocator();
var self = init;
defer deinit(&self, allocator);
var tokenizer = try Tokenizer.init(allocator, blk: {
const json =
\\ {
\\ "key": 123,
\\ "key2": false,
\\ "key3": true,
\\ "key4": null
\\ }
;
break :blk json;
});
try parse(&self, &tokenizer);
}
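// A minimal round-trip sketch for the scalar helpers above; it assumes only
// the addNumber/addBool and getNumber/getBool signatures defined in this file.
test "scalar round-trip" {
var arena = std.heap.ArenaAllocator.init(std.testing.allocator);
defer arena.deinit();
const allocator = arena.allocator();
var self = init;
defer deinit(&self, allocator);
const nidx = try self.addNumber(allocator, 3.5);
try std.testing.expectEqual(@as(?f64, 3.5), self.getNumber(nidx));
const bidx = try self.addBool(allocator, true);
try std.testing.expectEqual(@as(?bool, true), self.getBool(bidx));
}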

34
build.zig Normal file

@@ -0,0 +1,34 @@
const std = @import("std");
pub fn build(b: *std.Build) void {
const target = b.standardTargetOptions(.{});
const optimize = .ReleaseSafe;
const exe_mod = b.createModule(.{
.root_source_file = b.path("src/main.zig"),
.target = target,
.optimize = optimize,
});
const exe = b.addExecutable(.{
.name = "aether",
.root_module = exe_mod,
});
b.installArtifact(exe);
const run_cmd = b.addRunArtifact(exe);
run_cmd.step.dependOn(b.getInstallStep());
const run_step = b.step("run", "Run the app");
run_step.dependOn(&run_cmd.step);
const exe_unit_tests = b.addTest(.{
.root_module = exe_mod,
});
const run_exe_unit_tests = b.addRunArtifact(exe_unit_tests);
const test_step = b.step("test", "Run unit tests");
test_step.dependOn(&run_exe_unit_tests.step);
}
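With these steps wired up, `zig build` installs the `aether` executable, `zig build run` builds and runs it, and `zig build test` runs the unit tests declared in the same root module.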

16
build.zig.zon Normal file

@@ -0,0 +1,16 @@
.{
.name = .aether,
.version = "0.0.0",
.fingerprint = 0x255cfdbd72bde30d,
.minimum_zig_version = "0.15.0-dev.552+bc2f7c754",
.dependencies = .{
},
.paths = .{
"build.zig",
"build.zig.zon",
"src",
},
}

81
strings.zig Normal file

@@ -0,0 +1,81 @@
//! credits to Andrew Kelley
//! strings.zig
const std = @import("std");
const mem = std.mem;
const assert = std.debug.assert;
const Allocator = std.mem.Allocator;
const Self = @This();
const max_load_percent = std.hash_map.default_max_load_percentage;
string_bytes: std.ArrayListUnmanaged(u8) = .empty,
string_table: StringIndex.Table = .empty,
pub const empty = Self{
.string_bytes = .empty,
.string_table = .empty,
};
pub fn deinit(self: *Self, allocator: Allocator) void {
self.string_bytes.deinit(allocator);
self.string_table.deinit(allocator);
}
pub const StringIndex = enum(u32) {
_,
const Table = std.HashMapUnmanaged(StringIndex, void, TableContext, max_load_percent);
const TableContext = struct {
bytes: []const u8,
pub fn eql(_: @This(), a: StringIndex, b: StringIndex) bool {
return a == b;
}
pub fn hash(ctx: @This(), key: StringIndex) u64 {
return std.hash_map.hashString(mem.sliceTo(ctx.bytes[@intFromEnum(key)..], 0));
}
};
pub const TableIndexAdapter = struct {
bytes: []const u8,
pub fn eql(ctx: @This(), a: []const u8, b: StringIndex) bool {
return mem.eql(u8, a, mem.sliceTo(ctx.bytes[@intFromEnum(b)..], 0));
}
pub fn hash(_: @This(), adapted_key: []const u8) u64 {
assert(mem.indexOfScalar(u8, adapted_key, 0) == null);
return std.hash_map.hashString(adapted_key);
}
};
pub fn slice(index: StringIndex, state: *const Self) [:0]const u8 {
const start_slice = state.string_bytes.items[@intFromEnum(index)..];
return start_slice[0..mem.indexOfScalar(u8, start_slice, 0).? :0];
}
};
pub fn add(state: *Self, allocator: Allocator, bytes: []const u8) !StringIndex {
try state.string_bytes.ensureUnusedCapacity(allocator, bytes.len + 1);
const gop = try state.string_table.getOrPutContextAdapted(
allocator,
bytes,
StringIndex.TableIndexAdapter{ .bytes = state.string_bytes.items },
StringIndex.TableContext{ .bytes = state.string_bytes.items },
);
if (gop.found_existing) return gop.key_ptr.*;
const new_off: StringIndex = @enumFromInt(state.string_bytes.items.len);
state.string_bytes.appendSliceAssumeCapacity(bytes);
state.string_bytes.appendAssumeCapacity(0);
gop.key_ptr.* = new_off;
return new_off;
}
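// Interning sketch: adding the same bytes twice returns the same StringIndex,
// and slice() recovers the original bytes. Uses only the add/slice API above.
test add {
var pool: Self = .empty;
defer pool.deinit(std.testing.allocator);
const a = try pool.add(std.testing.allocator, "key");
const b = try pool.add(std.testing.allocator, "key");
try std.testing.expectEqual(a, b);
try std.testing.expectEqualStrings("key", a.slice(&pool));
}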

553
tokenizer.zig Normal file

@@ -0,0 +1,553 @@
const std = @import("std");
const mem = std.mem;
pub const Error = error{
/// eg: invalid JSON syntax
InvalidSyntax,
/// eg: allocator error
OutOfMemory,
/// eg: bad escaping
UnexpectedCharacter,
/// eg: got the wrong token type, check TokenType
UnexpectedToken,
/// eg: std.fmt.parseFloat failed
BadNumber,
/// fba error
BufferTooSmall,
/// eg: missing comma
CommaExpected,
/// eg: missing colon
ColonExpected,
/// eg: missing object key
KeyExpected,
/// eg: error while writing
PrintError,
/// eg: trailing comma in object
TrailingComma,
};
pub const TokenType = enum(u8) {
eof,
null,
true,
false,
number,
string,
property,
object_begin,
object_end,
array_begin,
array_end,
colon,
comma,
whitespace,
};
pub const Token = struct {
type: TokenType,
value: ?union {
number: f64,
string: []const u8,
symbol: u8,
},
start: usize,
end: usize,
};
pub const Self = @This();
text: []const u8,
position: usize,
max_position: usize,
stack: []usize,
frame: usize,
allocator: std.mem.Allocator,
prev_token: ?Token = null,
pub fn pushBack(self: *Self, token: Token) void {
self.prev_token = token;
}
/// Initialize a new tokenizer
pub fn init(allocator: std.mem.Allocator, text: []const u8) std.mem.Allocator.Error!Self {
const stack = try allocator.alloc(usize, 0x100);
errdefer allocator.free(stack);
@memset(stack, 0);
return .{
.text = text,
.position = 0,
.max_position = 0,
.stack = stack,
.frame = 0,
.allocator = allocator,
};
}
/// Clean up resources
pub fn deinit(self: *Self) void {
self.allocator.free(self.stack);
}
// ========== Core Parsing Functions ==========
fn currentPosition(self: *Self) usize {
return self.stack[self.frame];
}
fn advance(self: *Self, delta: usize) void {
self.stack[self.frame] += delta;
if (self.max_position < self.stack[self.frame])
self.max_position = self.stack[self.frame];
}
fn pushFrame(self: *Self) Error!usize {
self.frame += 1;
if (self.frame == self.stack.len) {
const new_stack = try self.allocator.alloc(usize, self.stack.len * 2);
@memset(new_stack, 0);
@memcpy(new_stack[0..self.stack.len], self.stack); // @memcpy requires equal lengths
self.allocator.free(self.stack);
self.stack = new_stack;
}
self.stack[self.frame] = self.stack[self.frame - 1];
return self.currentPosition();
}
fn popFrame(self: *Self) void {
self.frame -= 1;
}
fn commit(self: *Self, wrapped: anytype) @TypeOf(wrapped) {
self.frame -= 1;
self.stack[self.frame] = self.stack[self.frame + 1];
return wrapped;
}
fn rollback(self: *Self) void {
self.stack[self.frame] = if (self.frame == 0) 0 else self.stack[self.frame - 1];
}
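// Frame-stack sketch, assuming the private helpers above: pushFrame snapshots
// the cursor, commit folds the child frame's progress back into the parent,
// rollback discards it.
test "frame stack" {
var t = try init(std.testing.allocator, "abc");
defer t.deinit();
_ = try t.pushFrame();
t.advance(2);
t.commit({});
try std.testing.expectEqual(@as(usize, 2), t.currentPosition());
}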
// ========== Character Matching ==========
fn lastChar(self: *Self) u8 {
return self.text[self.currentPosition() - 1];
}
fn currentChar(self: *Self) u8 {
return self.text[self.currentPosition()];
}
fn endOfInput(self: *Self) bool {
return self.currentPosition() >= self.text.len;
}
fn matchChar(self: *Self, c: u8) ?void {
if (self.endOfInput() or self.text[self.currentPosition()] != c) {
return null;
}
self.advance(1);
}
fn matchCharPredicate(self: *Self, pred: fn (u8) bool) ?void {
// keep the explicit bounds check first: the predicate must not run once input is exhausted
if (self.endOfInput() or !pred(self.text[self.currentPosition()])) {
return null;
}
self.advance(1);
}
fn matchString(self: *Self, s: []const u8) ?[]const u8 {
if (self.text.len < self.currentPosition() + s.len) {
// not enough input left
return null;
}
const start = self.currentPosition();
const simd_width = 16; // 128-bit SIMD (SSE/NEON)
var j: usize = 0;
while (j + simd_width <= s.len) {
const expected_chunk: @Vector(simd_width, u8) = s[j..][0..simd_width].*;
const actual_chunk: @Vector(simd_width, u8) = self.text[start + j ..][0..simd_width].*;
if (!@reduce(.And, expected_chunk == actual_chunk)) {
return null;
}
j += simd_width;
}
// Handle remaining bytes
while (j < s.len) {
if (s[j] != self.text[start + j]) {
return null;
}
j += 1;
}
self.advance(s.len);
return self.extractSlice(start);
}
pub fn matchCharRange(self: *Self, low: u8, high: u8) ?void {
if (self.endOfInput())
return null;
const c = self.text[self.currentPosition()];
if (!(c >= low and c <= high))
return null;
self.advance(1);
}
// ========== Token Extraction ==========
fn extractSlice(self: *Self, start: usize) []const u8 {
return self.text[start..self.currentPosition()];
}
// Skip all whitespace characters
pub fn skipWhitespace(self: *Self) void {
const start = self.currentPosition();
if (self.endOfInput())
return;
const end = skipWhitespaceSimd(self.text[start..]);
self.advance(end);
}
/// Parse a number token
pub fn nextNumber(self: *Self) Error!Token {
const start = try self.pushFrame();
errdefer self.popFrame();
self.skipWhitespace();
self.matchChar('-') orelse {}; // optional leading minus; a mismatch here is fine
while (self.matchCharRange('0', '9') != null) {}
self.matchChar('.') orelse {
// int found
const float = std.fmt.parseFloat(f64, self.extractSlice(start)) catch {
return error.BadNumber; // no floating point
};
return self.commit(Token{
.type = .number,
.value = .{
.number = float,
},
.start = start,
.end = self.currentPosition(),
});
};
while (self.matchCharRange('0', '9') != null) {}
const float = std.fmt.parseFloat(f64, self.extractSlice(start)) catch {
return error.BadNumber; // floating point
};
return self.commit(Token{
.type = .number,
.value = .{
.number = float,
},
.start = start,
.end = self.currentPosition(),
});
}
/// Parse an identifier token
pub fn nextIdentifier(self: *Self) Error!Token {
const start = try self.pushFrame();
errdefer self.popFrame();
self.skipWhitespace();
var buffer = try self.allocator.alloc(u8, 0x100);
defer self.allocator.free(buffer);
self.matchCharPredicate(std.ascii.isAlphabetic) orelse {
return error.UnexpectedToken;
};
buffer[0] = self.lastChar();
var i: usize = 1;
while (self.matchCharPredicate(std.ascii.isAlphanumeric) != null) {
if (i == buffer.len) return error.UnexpectedToken; // identifier longer than the buffer
buffer[i] = self.lastChar();
i += 1;
}
const ident = buffer[0..i];
// true
if (std.mem.eql(u8, ident, "true")) {
return self.commit(Token{
.type = .true,
.value = null,
.start = start,
.end = self.currentPosition(),
});
}
// false
if (std.mem.eql(u8, ident, "false")) {
return self.commit(Token{
.type = .false,
.value = null,
.start = start,
.end = self.currentPosition(),
});
}
// null
if (std.mem.eql(u8, ident, "null")) {
return self.commit(Token{
.type = .null,
.value = null,
.start = start,
.end = self.currentPosition(),
});
}
return error.UnexpectedToken; // not one of the known keywords (true/false/null)
}
/// Get the next token from the input
/// WARNING: this function eats whitespaces
pub fn nextToken(self: *Self) Error!Token {
if (self.prev_token) |tok| {
self.prev_token = null;
return tok;
}
const start = try self.pushFrame();
errdefer self.popFrame();
self.skipWhitespace();
if (self.endOfInput()) {
return Token{
.type = .eof,
.value = null,
.start = start,
.end = start,
};
}
self.advance(1);
// Fall back to single character symbol
const c = self.lastChar();
const symbol_t: TokenType = switch (c) {
'{' => .object_begin,
'}' => .object_end,
'[' => .array_begin,
']' => .array_end,
',' => .comma,
':' => .colon,
'"' => {
self.rollback();
return (self.nextString());
},
else => {
self.rollback();
// Try different token types in order of precedence
if (std.ascii.isDigit(c) or c == '-') {
return (self.nextNumber());
}
if (std.ascii.isAlphabetic(c)) {
return (self.nextIdentifier());
}
return error.InvalidSyntax;
},
};
return self.commit(Token{
.type = symbol_t,
.value = null,
.start = start,
.end = start + 1,
});
}
pub fn nextString(self: *Self) Error!Token {
const start = try self.pushFrame();
errdefer self.popFrame();
self.skipWhitespace();
self.matchChar('"') orelse {
return error.UnexpectedToken;
};
var buffer: std.ArrayList(u8) = .init(self.allocator);
errdefer buffer.deinit();
loop: while (!self.endOfInput()) {
self.advance(1);
switch (self.lastChar()) {
'"' => {
return self.commit(Token{
.type = .string,
.value = .{ .string = try buffer.toOwnedSlice() },
.start = start,
.end = self.currentPosition(),
});
},
'\\' => {
self.advance(1);
switch (self.lastChar()) {
0x22, 0x5C, 0x2F => |d| {
try buffer.append(d);
continue :loop;
},
'b' => try buffer.append(0x8),
'f' => try buffer.append(0xC),
'n' => try buffer.append(0xA),
'r' => try buffer.append(0xD),
't' => try buffer.append(0x9),
'u' => {
var code_points: [4]u8 = undefined;
inline for (0..4) |i| {
if (self.endOfInput())
return self.commit(Token{
.type = .eof,
.value = null,
.start = start,
.end = start + 1,
});
self.advance(1);
code_points[i] = self.lastChar();
}
var utf8_buf: [4]u8 = undefined;
const buf = try stringToUtf8(&utf8_buf, &code_points);
try buffer.appendSlice(buf);
continue :loop;
},
else => return error.UnexpectedCharacter,
} // end switch
},
else => |c| {
if (std.ascii.isControl(c)) {
return error.UnexpectedCharacter;
}
try buffer.append(c);
},
} // end switch
} // end while
return error.InvalidSyntax;
}
pub const Iterator = struct {
tokenizer: *Self,
pub fn next(it: *Iterator) ?Token {
if (it.tokenizer.endOfInput()) return null;
return it.tokenizer.nextToken() catch null;
}
pub fn reset(it: *Iterator) void {
it.tokenizer.position = 0;
it.tokenizer.max_position = 0;
it.tokenizer.frame = 0;
it.tokenizer.stack[0] = 0; // the cursor lives in stack[frame], so clear it too
it.tokenizer.prev_token = null;
}
};
/// iterator
pub fn iterator(self: *Self) Iterator {
return Iterator{
.tokenizer = self,
};
}
/// decode 4 hex digits into a code point and UTF-8-encode it into the
/// caller-provided buffer (returning a slice of a function-local array
/// would dangle); a 4-byte buffer fits any code point up to U+10FFFF
pub fn stringToUtf8(buffer: *[4]u8, bytes: []const u8) ![]u8 {
const code_point = std.fmt.parseInt(u21, bytes, 16) catch {
return error.BadNumber;
};
var index: usize = 0;
if (code_point <= 0x7F) {
buffer[index] = @as(u8, @intCast(code_point));
index += 1;
} else if (code_point <= 0x7FF) {
buffer[index] = 0xC0 | (@as(u8, @intCast(code_point >> 6)));
buffer[index + 1] = 0x80 | (@as(u8, @intCast(code_point & 0x3F)));
index += 2;
} else if (code_point <= 0xFFFF) {
buffer[index] = 0xE0 | (@as(u8, @intCast(code_point >> 12)));
buffer[index + 1] = 0x80 | (@as(u8, @intCast((code_point >> 6) & 0x3F)));
buffer[index + 2] = 0x80 | (@as(u8, @intCast(code_point & 0x3F)));
index += 3;
} else if (code_point <= 0x10FFFF) {
buffer[index] = 0xF0 | (@as(u8, @intCast(code_point >> 18)));
buffer[index + 1] = 0x80 | (@as(u8, @intCast((code_point >> 12) & 0x3F)));
buffer[index + 2] = 0x80 | (@as(u8, @intCast((code_point >> 6) & 0x3F)));
buffer[index + 3] = 0x80 | (@as(u8, @intCast(code_point & 0x3F)));
index += 4;
} else unreachable;
return buffer[0..index];
}
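// Encoding sketch for stringToUtf8, assuming the caller-provided-buffer
// signature above: U+0041 encodes to one byte, U+00E9 to two.
test stringToUtf8 {
var buf: [4]u8 = undefined;
try std.testing.expectEqualSlices(u8, "A", try stringToUtf8(&buf, "0041"));
try std.testing.expectEqualSlices(u8, "\xC3\xA9", try stringToUtf8(&buf, "00E9"));
}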
pub fn skipWhitespaceSimd(text: []const u8) usize {
const ChunkSize = 16;
const Vec = @Vector(ChunkSize, u8);
// Broadcast whitespace characters to vectors
const space: Vec = @splat(' ');
const tab: Vec = @splat('\t');
const lf: Vec = @splat('\n');
const cr: Vec = @splat('\r');
var j: usize = 0;
const end = text.len;
// SIMD processing
while (j + ChunkSize <= end) {
const chunk: Vec = text[j..][0..ChunkSize].*;
// Compare against each whitespace character
const is_space = chunk == space;
const is_tab = chunk == tab;
const is_lf = chunk == lf;
const is_cr = chunk == cr;
// Combine comparisons using vector operations
const anyws = @select(u8, is_space, @as(Vec, @splat(1)), @as(Vec, @splat(0))) |
@select(u8, is_tab, @as(Vec, @splat(1)), @as(Vec, @splat(0))) |
@select(u8, is_lf, @as(Vec, @splat(1)), @as(Vec, @splat(0))) |
@select(u8, is_cr, @as(Vec, @splat(1)), @as(Vec, @splat(0)));
// anyws holds 1 for whitespace bytes and 0 otherwise, so compare against
// 1/0 (comparing against 0xFF never matched and spun on all-whitespace chunks)
const TrueMask: Vec = @splat(1);
const FalseMask: Vec = @splat(0);
// Check if all characters are whitespace
if (@reduce(.And, anyws == TrueMask)) {
j += ChunkSize;
continue;
}
// Find first non-whitespace
const mask: std.meta.Int(.unsigned, ChunkSize) = @bitCast(anyws == FalseMask);
if (mask != 0) {
return j + @ctz(mask);
}
}
// Scalar processing for remaining bytes
while (j < end) switch (text[j]) {
' ', '\t', '\n', '\r' => j += 1,
else => break,
};
return j;
}
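// A quick check for the SIMD whitespace skipper: twenty spaces exercise one
// full 16-byte vector chunk plus the scalar tail.
test skipWhitespaceSimd {
const text = (" " ** 20) ++ "x";
try std.testing.expectEqual(@as(usize, 20), skipWhitespaceSimd(text));
try std.testing.expectEqual(@as(usize, 0), skipWhitespaceSimd("abc"));
}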