epic JSON parser
This commit is contained in:
commit
7bc4973bf4
397
2.zig
Normal file
397
2.zig
Normal file
@ -0,0 +1,397 @@
|
||||
const std = @import("std");
|
||||
const Tokenizer = @import("tokenizer.zig");
|
||||
const StringPool = @import("strings.zig");
|
||||
const StringIndex = StringPool.StringIndex;
|
||||
const assert = std.debug.assert;
|
||||
|
||||
const Self = @This();
|
||||
|
||||
/// Parser error set. Empty for now; declared as `error{}` (not `enum {}`)
/// so it can participate in error unions and be extended later.
pub const Error = error{};
|
||||
|
||||
pub const JsonType = enum {
|
||||
null,
|
||||
bool,
|
||||
number,
|
||||
string,
|
||||
array,
|
||||
object,
|
||||
};
|
||||
|
||||
/// Compact, index-based value representation stored in the parser's
/// MultiArrayList: strings are interned pool offsets, arrays and objects
/// are descriptors pointing back into the same list.
pub const JsonValue = union(JsonType) {
    null: void,
    bool: bool,
    number: f64,
    string: StringIndex,
    array: ArrayIndex.Slice,
    object: ObjectIndex.Entry,
};
|
||||
|
||||
/// Caller-facing JSON value representation using native slices and maps
/// rather than the interned indices of `JsonValue`.
pub const JsonInput = union(JsonType) {
    null: void,
    bool: bool,
    number: f64,
    string: []const u8,
    array: []JsonInput,
    // NOTE(review): `AutoArrayHashMapUnmanaged` auto-hashes the key type,
    // and std's auto context rejects slice keys at the first get/put
    // instantiation — `std.StringArrayHashMapUnmanaged` is presumably
    // intended here. Confirm before first use.
    object: std.AutoArrayHashMapUnmanaged([]const u8, JsonInput),
};
|
||||
|
||||
pub const ArrayIndex = enum(usize) {
|
||||
_,
|
||||
|
||||
pub const Slice = struct {
|
||||
start: usize,
|
||||
len: usize,
|
||||
};
|
||||
};
|
||||
|
||||
pub const ObjectIndex = enum(usize) {
|
||||
_,
|
||||
|
||||
pub const Entry = struct {
|
||||
len: usize,
|
||||
property_idx: usize,
|
||||
value_idx: usize,
|
||||
};
|
||||
};
|
||||
|
||||
pub const Options = struct {
|
||||
comptime max_depth: usize = 256,
|
||||
};
|
||||
|
||||
index: std.MultiArrayList(JsonValue) = .{},
|
||||
string_index: StringPool = .empty,
|
||||
|
||||
options: Options = .{},
|
||||
|
||||
pub const init: Self = .{};
|
||||
|
||||
pub fn deinit(self: *Self, allocator: std.mem.Allocator) void {
|
||||
self.index.deinit(allocator);
|
||||
self.string_index.deinit(allocator);
|
||||
}
|
||||
|
||||
/// Append a number entry to `index` and return its slot.
fn addNumber(self: *Self, allocator: std.mem.Allocator, number: f64) !usize {
    try self.index.append(allocator, .{ .number = number });
    return self.index.len - 1;
}
|
||||
|
||||
/// Intern `bytes` in the string pool, append a string entry referencing
/// the interned offset, and return the entry's slot in `index`.
fn addString(self: *Self, allocator: std.mem.Allocator, bytes: []const u8) !usize {
    const interned = try self.string_index.add(allocator, bytes);
    try self.index.append(allocator, .{ .string = interned });
    return self.index.len - 1;
}
|
||||
|
||||
/// Flatten `object` into the index: every key is interned, every value
/// gets a reserved slot, then a single `.object` header entry describing
/// the run is appended and its slot returned.
fn addObject(self: *Self, allocator: std.mem.Allocator, object: std.AutoArrayHashMapUnmanaged([]const u8, JsonInput)) !usize {
    // Header captured on the first key/value pair (iteration 0).
    var entry: ?ObjectIndex.Entry = null;

    for (object.keys(), object.values(), 0..) |key, value, times| {
        const stridx = try self.string_index.add(allocator, key);
        try self.index.ensureUnusedCapacity(allocator, 1);
        const vidx = self.index.addOneAssumeCapacity();
        // NOTE(review): `@unionInit` requires a comptime field name, but
        // `std.meta.activeTag(value)` is a runtime value, and
        // `self.addValue(...)` returns `!void` (no payload, error not
        // propagated) — this line cannot compile as written. Likely
        // intent: switch on `value` and store its converted payload into
        // slot `vidx`.
        self.index.set(vidx, @unionInit(JsonValue, std.meta.activeTag(value), self.addValue(allocator, value)));
        if (times == 0) {
            entry = ObjectIndex.Entry{
                .len = object.entries.len,
                // NOTE(review): `property_idx` is `usize` but `stridx` is
                // a `StringIndex` enum — `@intFromEnum(stridx)` is
                // presumably intended.
                .property_idx = stridx,
                .value_idx = vidx,
            };
        }
    }

    // Append the object header itself (zeroed header for an empty map).
    try self.index.ensureUnusedCapacity(allocator, 1);
    const idx = self.index.addOneAssumeCapacity();
    if (entry) |e| {
        self.index.set(idx, .{ .object = e });
        return idx;
    } else {
        self.index.set(idx, .{ .object = ObjectIndex.Entry{
            .len = 0,
            .property_idx = 0,
            .value_idx = 0,
        } });
        return idx;
    }
}
|
||||
|
||||
/// Append a placeholder object header whose property/value cursors point
/// at the current end of the string pool and of the index list; returns
/// the header's slot.
fn addEmptyObject(self: *Self, allocator: std.mem.Allocator) !usize {
    try self.index.ensureUnusedCapacity(allocator, 1);
    const slot = self.index.addOneAssumeCapacity();
    self.index.set(slot, .{ .object = .{
        .len = 0,
        .property_idx = self.string_index.string_bytes.items.len,
        .value_idx = self.index.len,
    } });
    return slot;
}
|
||||
|
||||
/// Flatten `array` into the index: each element gets a reserved slot,
/// then a single `.array` header describing the run is appended and its
/// slot returned.
fn addArray(self: *Self, allocator: std.mem.Allocator, array: []JsonInput) !usize {
    // Slice descriptor captured on the first element.
    var entry: ?ArrayIndex.Slice = null;
    for (array, 0..) |value, times| {
        try self.index.ensureUnusedCapacity(allocator, 1);
        const idx = self.index.addOneAssumeCapacity();
        // NOTE(review): `@unionInit` requires a comptime field name, but
        // `std.meta.activeTag(value)` is runtime, and `self.addValue`
        // returns `!void` (error not propagated, no payload) — this line
        // cannot compile as written. Likely intent: switch on `value` and
        // store its converted payload into slot `idx`.
        self.index.set(idx, @unionInit(JsonValue, std.meta.activeTag(value), self.addValue(allocator, value)));
        if (times == 0) {
            entry = ArrayIndex.Slice{
                .start = idx,
                .len = array.len,
            };
        }
    }
    // Append the array header itself (empty descriptor for an empty array).
    try self.index.ensureUnusedCapacity(allocator, 1);
    const idx = self.index.addOneAssumeCapacity();
    if (entry) |e| {
        self.index.set(idx, .{ .array = e });
        return idx;
    } else {
        self.index.set(idx, .{ .array = ArrayIndex.Slice{
            .start = 0,
            .len = 0,
        } });
        return idx;
    }
}
|
||||
|
||||
/// Append a boolean entry to `index` and return its slot.
fn addBool(self: *Self, allocator: std.mem.Allocator, value: bool) !usize {
    try self.index.append(allocator, .{ .bool = value });
    return self.index.len - 1;
}
|
||||
|
||||
/// Append `value` to the index, dispatching on its JSON type.
/// The per-type helpers return the new entry's slot; it is deliberately
/// discarded here (`_ =`) — the original bare `try self.addX(...)` arms
/// produced a non-void switch value, which is a compile error.
fn addValue(self: *Self, allocator: std.mem.Allocator, value: JsonInput) !void {
    switch (value) {
        // NOTE(review): null currently occupies no slot in `index`; if
        // arrays/objects rely on one-slot-per-element, this skews the
        // consecutive-slot assumption — confirm intended behavior.
        .null => {},
        .bool => |b| _ = try self.addBool(allocator, b),
        .number => |n| _ = try self.addNumber(allocator, n),
        .string => |s| _ = try self.addString(allocator, s),
        .array => |a| _ = try self.addArray(allocator, a),
        .object => |o| _ = try self.addObject(allocator, o),
    }
}
|
||||
|
||||
/// Look up an interned string by its byte contents.
/// NOTE(review): `string_table` is keyed by `StringIndex`, not byte
/// slices — passing `[]const u8` to `get` cannot type-check. This needs
/// the pool's adapted lookup (`getKeyAdapted` with the byte-slice
/// adapter context); confirm against strings.zig.
fn getString(self: *Self, index: []const u8) ?StringIndex {
    return self.string_index.string_table.get(index);
}
|
||||
|
||||
/// Return the number stored at `index`, or null when the slot holds a
/// different JSON type. `MultiArrayList.get` returns the element directly
/// (not an optional), so the union tag must be checked explicitly — the
/// previous `if (self.index.get(index)) |n|` unwrapped a non-optional.
fn getNumber(self: *Self, index: usize) ?f64 {
    return switch (self.index.get(index)) {
        .number => |n| n,
        else => null,
    };
}
|
||||
|
||||
/// Collect the interned key offsets and value slots of the object stored
/// at `index` into two caller-owned parallel slices (keys, value slots).
/// Assumes keys were interned back-to-back (each key advances the pool by
/// its length plus the 0 terminator) and values occupy consecutive index
/// slots — TODO confirm against the writer side (`parse`/`addObject`).
fn getObject(self: *Self, allocator: std.mem.Allocator, index: usize) !?struct {
    []StringIndex,
    []usize,
} {
    // NOTE(review): panics if the entry at `index` is not an object
    // (`entry.object` below accesses the union field unconditionally).
    const entry = self.index.get(index);

    if (entry.object.len == 0) {
        return .{ &.{}, &.{} };
    }

    // Walking cursors: key byte offset in the pool, and value slot.
    var pidx = entry.object.property_idx;
    var vidx = entry.object.value_idx;

    const keys = try allocator.alloc(StringIndex, entry.object.len);
    const values = try allocator.alloc(usize, entry.object.len);

    for (0..entry.object.len) |i| {
        const slice = StringIndex.slice(@enumFromInt(pidx), &self.string_index);
        keys[i] = @enumFromInt(pidx);
        values[i] = vidx;
        // Skip past this key's bytes plus its 0 terminator.
        pidx += slice.len + 1;
        vidx += 1;
    }

    return .{ keys, values };
}
|
||||
|
||||
/// Return a caller-owned slice holding the index slots of the array
/// stored at `index`, or null when the slot is not an array. The error
/// union is required because the result is heap-allocated (the original
/// used `try` inside a plain `?[]usize` function, which cannot compile;
/// it also applied `orelse` to the non-optional `MultiArrayList.get`).
fn getArray(self: *Self, allocator: std.mem.Allocator, index: usize) !?[]usize {
    const entry = self.index.get(index);
    if (entry != .array) return null;

    // Elements occupy consecutive slots starting at `start`; an empty
    // array simply yields a zero-length allocation.
    const values = try allocator.alloc(usize, entry.array.len);
    for (values, 0..) |*slot, i| slot.* = entry.array.start + i;
    return values;
}
|
||||
|
||||
/// Return the bool stored at `index`, or null when the slot holds a
/// different JSON type. (The previous version applied `orelse` to the
/// non-optional `MultiArrayList.get` and would panic on a non-bool tag.)
fn getBool(self: *Self, index: usize) ?bool {
    return switch (self.index.get(index)) {
        .bool => |b| b,
        else => null,
    };
}
|
||||
|
||||
/// Return void when the entry at `index` is JSON null, or `null` (the
/// optional) when the slot holds a different type. (The previous version
/// applied `orelse` to the non-optional `MultiArrayList.get`.)
fn getNull(self: *Self, index: usize) ?void {
    return switch (self.index.get(index)) {
        .null => {},
        else => null,
    };
}
|
||||
|
||||
/// Reconstruct the value at `index` as a caller-facing `JsonInput`.
/// Arrays and objects allocate; the caller owns those results. String
/// results alias the internal string pool.
fn getValue(self: *Self, allocator: std.mem.Allocator, index: usize) !?JsonInput {
    const entry = self.index.get(index);
    switch (entry) {
        .null => return .{ .null = {} },
        .bool => return .{ .bool = entry.bool },
        .number => return .{ .number = entry.number },
        .string => {
            // Interned bytes; the returned slice points into the pool.
            const str = entry.string.slice(&self.string_index);
            return .{ .string = str };
        },
        .array => {
            // Assumes elements occupy consecutive slots from `start`.
            const res = try allocator.alloc(JsonInput, entry.array.len);
            var idx = entry.array.start;
            for (0..entry.array.len) |i| {
                if (try self.getValue(allocator, idx)) |v| {
                    res[i] = v;
                    idx += 1;
                } else unreachable;
            }
            return .{ .array = res };
        },
        .object => {
            var kidx = entry.object.property_idx;
            var vidx = entry.object.value_idx;
            var obj: std.AutoArrayHashMapUnmanaged([]const u8, JsonInput) = .empty;

            try obj.ensureTotalCapacity(allocator, entry.object.len);
            for (0..entry.object.len) |_| {
                const key = StringIndex.slice(@enumFromInt(kidx), &self.string_index);
                const val = (try self.getValue(allocator, vidx)).?;

                obj.putAssumeCapacityNoClobber(key, val);
                // NOTE(review): getObject advances the key cursor by
                // key.len + 1 (past the 0 terminator), but here it moves
                // by only 1 byte — for objects with more than one entry
                // these disagree; confirm which is correct.
                kidx += 1;
                vidx += 1;
            }

            return .{ .object = obj };
        },
    }
}
|
||||
|
||||
/// Drive the tokenizer and build the flat value index, printing a debug
/// rendering of each completed object to stderr as it goes.
/// `depth_buf` holds, per nesting level, the index slot of that level's
/// object header; `cycles` is the current nesting depth.
pub fn parse(self: *Self, tokenizer: *Tokenizer) !void {
    const allocator = tokenizer.allocator;

    var it = tokenizer.iterator();

    // One saved header slot per nesting level, bounded by max_depth.
    var depth_buf = try allocator.alloc(usize, self.options.max_depth);
    defer allocator.free(depth_buf);

    var cycles: usize = 0;

    while (it.next()) |token| {
        switch (token.type) {
            .object_begin => {
                std.debug.print("{{", .{});
                const obj_idx = try self.addEmptyObject(allocator);

                depth_buf[cycles] = obj_idx;

                // Nested object directly after `{`: link it into the
                // parent header by bumping the parent's length.
                if (tokenizer.prev_token) |t| if (t.type == .object_begin) {
                    // add map to itself
                    const data = self.index.get(depth_buf[cycles - 1]);

                    switch (data) {
                        .object => |valid_entry| {
                            const new_data = ObjectIndex.Entry{
                                .len = valid_entry.len + 1,
                                .property_idx = self.string_index.string_table.size,
                                .value_idx = obj_idx,
                            };
                            self.index.set(depth_buf[cycles - 1], .{ .object = new_data });
                            tokenizer.prev_token = null; // reset
                        },
                        else => unreachable,
                    }
                } else tokenizer.pushBack(token);
                cycles += 1;
                continue;
            },
            .object_end => {
                // Dump the keys/values recorded for the innermost object.
                const keys, const vals = (try self.getObject(allocator, depth_buf[cycles - 1])).?;
                std.debug.print("\nfound {d} keys and {d} values\n", .{ keys.len, vals.len });
                for (keys, vals) |k, v| {
                    const key = k.slice(&self.string_index);
                    const val = self.index.get(v);
                    std.debug.print(
                        \\"{s}": {s},
                    , .{ key, @tagName(val) });
                }
                std.debug.print("}}", .{});
            },
            .string => {
                const idx = try self.addString(allocator, token.value.?.string);
                // NOTE(review): `depth_buf[cycles - 1]` is evaluated
                // BEFORE the `cycles > 0` guard below — a top-level string
                // underflows here. Same pattern in .number and .true/.false.
                const last_obj = self.index.get(depth_buf[cycles - 1]);
                if (cycles > 0) {
                    const stridx = self.index.get(idx).string;
                    self.index.set(depth_buf[cycles - 1], .{ .object = ObjectIndex.Entry{
                        .len = last_obj.object.len + 1,
                        .property_idx = if (cycles > 1) @intFromEnum(stridx) else last_obj.object.property_idx,
                        .value_idx = last_obj.object.value_idx,
                    } });
                    continue;
                }
            },
            .number => {
                _ = try self.addNumber(allocator, token.value.?.number);
                // NOTE(review): same pre-guard underflow as .string; this
                // arm also rewrites the header with unchanged fields.
                const last_obj = self.index.get(depth_buf[cycles - 1]);
                if (cycles > 0) {
                    self.index.set(depth_buf[cycles - 1], .{ .object = ObjectIndex.Entry{
                        .len = last_obj.object.len,
                        .property_idx = last_obj.object.property_idx,
                        .value_idx = last_obj.object.value_idx,
                    } });
                    continue;
                }
            },
            .true, .false => {
                _ = try self.addBool(allocator, if (token.type == .true) true else false);
                const last_obj = self.index.get(depth_buf[cycles - 1]);
                if (cycles > 0) {
                    self.index.set(depth_buf[cycles - 1], .{ .object = ObjectIndex.Entry{
                        .len = last_obj.object.len,
                        .property_idx = last_obj.object.property_idx,
                        .value_idx = last_obj.object.value_idx,
                    } });
                    continue;
                }
            },
            // NOTE(review): arrays, null, colon, comma are not handled.
            else => {},
        }

        tokenizer.skipWhitespace();
    }
}
|
||||
|
||||
test parse {
    // Arena keeps the test leak-free without tracking every allocation.
    var arena = std.heap.ArenaAllocator.init(std.testing.allocator);
    defer arena.deinit();

    const allocator = arena.allocator();

    var parser = init;
    defer parser.deinit(allocator);

    const json =
        \\ {
        \\ "key": 123,
        \\ "key2": false,
        \\ "key3": true,
        \\ "key4": null
        \\ }
    ;

    var tokenizer = try Tokenizer.init(allocator, json);
    try parser.parse(&tokenizer);
}
|
34
build.zig
Normal file
34
build.zig
Normal file
@ -0,0 +1,34 @@
|
||||
const std = @import("std");
|
||||
|
||||
/// Standard build entry point: builds the `aether` executable and wires
/// up `run` and `test` steps.
pub fn build(b: *std.Build) void {
    const target = b.standardTargetOptions(.{});
    // Make the optimize mode configurable from the command line instead of
    // hard-coding `.ReleaseSafe`; ReleaseSafe stays the preferred release
    // mode when the user asks for a release build.
    const optimize = b.standardOptimizeOption(.{ .preferred_optimize_mode = .ReleaseSafe });

    const exe_mod = b.createModule(.{
        .root_source_file = b.path("src/main.zig"),
        .target = target,
        .optimize = optimize,
    });

    const exe = b.addExecutable(.{
        .name = "aether",
        .root_module = exe_mod,
    });

    b.installArtifact(exe);

    // `zig build run` — run the installed executable.
    const run_cmd = b.addRunArtifact(exe);
    run_cmd.step.dependOn(b.getInstallStep());

    const run_step = b.step("run", "Run the app");
    run_step.dependOn(&run_cmd.step);

    // `zig build test` — run the module's unit tests.
    const exe_unit_tests = b.addTest(.{
        .root_module = exe_mod,
    });

    const run_exe_unit_tests = b.addRunArtifact(exe_unit_tests);

    const test_step = b.step("test", "Run unit tests");
    test_step.dependOn(&run_exe_unit_tests.step);
}
|
16
build.zig.zon
Normal file
16
build.zig.zon
Normal file
@ -0,0 +1,16 @@
|
||||
.{
|
||||
.name = .aether,
|
||||
|
||||
.version = "0.0.0",
|
||||
|
||||
.fingerprint = 0x255cfdbd72bde30d,
|
||||
.minimum_zig_version = "0.15.0-dev.552+bc2f7c754",
|
||||
|
||||
.dependencies = .{
|
||||
},
|
||||
.paths = .{
|
||||
"build.zig",
|
||||
"build.zig.zon",
|
||||
"src",
|
||||
},
|
||||
}
|
81
strings.zig
Normal file
81
strings.zig
Normal file
@ -0,0 +1,81 @@
|
||||
/// credits to Andrew Kelley
|
||||
/// strings.zig
|
||||
|
||||
const std = @import("std");
|
||||
const mem = std.mem;
|
||||
const assert = std.debug.assert;
|
||||
|
||||
const Allocator = std.mem.Allocator;
|
||||
const Self = @This();
|
||||
|
||||
const max_load_percent = std.hash_map.default_max_load_percentage;
|
||||
|
||||
string_bytes: std.ArrayListUnmanaged(u8) = .empty,
|
||||
string_table: StringIndex.Table = .empty,
|
||||
|
||||
pub const empty = Self{
|
||||
.string_bytes = .empty,
|
||||
.string_table = .empty,
|
||||
};
|
||||
|
||||
/// Release the interning table and the backing byte storage.
pub fn deinit(self: *Self, allocator: Allocator) void {
    self.string_table.deinit(allocator);
    self.string_bytes.deinit(allocator);
}
|
||||
|
||||
/// Byte offset of a 0-terminated string inside `string_bytes`.
pub const StringIndex = enum(u32) {
    _,

    /// Set of interned offsets; hashing and equality read the pooled bytes.
    const Table = std.HashMapUnmanaged(StringIndex, void, TableContext, max_load_percent);

    /// Context for keys already in the pool: hash the 0-terminated bytes
    /// at the key's offset; compare by offset identity (offsets are
    /// unique per string, so pointer-free equality is enough).
    const TableContext = struct {
        bytes: []const u8,

        pub fn eql(_: @This(), a: StringIndex, b: StringIndex) bool {
            return a == b;
        }

        pub fn hash(ctx: @This(), key: StringIndex) u64 {
            return std.hash_map.hashString(mem.sliceTo(ctx.bytes[@intFromEnum(key)..], 0));
        }
    };

    /// Adapter for probing the table with a raw byte slice. The probe
    /// slice must not contain 0 (that byte is the pool's terminator).
    const TableIndexAdapter = struct {
        bytes: []const u8,

        pub fn eql(ctx: @This(), a: []const u8, b: StringIndex) bool {
            return mem.eql(u8, a, mem.sliceTo(ctx.bytes[@intFromEnum(b)..], 0));
        }

        pub fn hash(_: @This(), adapted_key: []const u8) u64 {
            assert(mem.indexOfScalar(u8, adapted_key, 0) == null);
            return std.hash_map.hashString(adapted_key);
        }
    };

    /// View the interned bytes at `index` as a 0-terminated slice into
    /// the pool (aliases `string_bytes`; invalidated when the pool grows).
    pub fn slice(index: StringIndex, state: *const Self) [:0]const u8 {
        const start_slice = state.string_bytes.items[@intFromEnum(index)..];
        return start_slice[0..mem.indexOfScalar(u8, start_slice, 0).? :0];
    }
};
|
||||
|
||||
/// Intern `bytes`: return the existing offset when the exact string is
/// already pooled, otherwise append it plus a 0 terminator and record the
/// new offset. `bytes` must not contain 0 (asserted by the adapter).
pub fn add(state: *Self, allocator: Allocator, bytes: []const u8) !StringIndex {
    // Reserve up front so the appends below cannot fail after the table
    // entry has been created.
    try state.string_bytes.ensureUnusedCapacity(allocator, bytes.len + 1);

    // Probe by byte contents (adapter) while stored keys hash via the
    // pool bytes (context).
    const gop = try state.string_table.getOrPutContextAdapted(
        allocator,
        bytes,
        StringIndex.TableIndexAdapter{ .bytes = state.string_bytes.items },
        StringIndex.TableContext{ .bytes = state.string_bytes.items },
    );
    if (gop.found_existing) return gop.key_ptr.*;

    // New string: its offset is the current end of the pool.
    const new_off: StringIndex = @enumFromInt(state.string_bytes.items.len);

    state.string_bytes.appendSliceAssumeCapacity(bytes);
    state.string_bytes.appendAssumeCapacity(0);

    gop.key_ptr.* = new_off;

    return new_off;
}
|
553
tokenizer.zig
Normal file
553
tokenizer.zig
Normal file
@ -0,0 +1,553 @@
|
||||
const std = @import("std");
|
||||
const mem = std.mem;
|
||||
|
||||
pub const Error = error{
|
||||
/// eg: invalid JSON syntax
|
||||
InvalidSyntax,
|
||||
/// eg: allocator error
|
||||
OutOfMemory,
|
||||
/// eg: bad escaping
|
||||
UnexpectedCharacter,
|
||||
/// eg: got the wrong token type, check TokenType
|
||||
UnexpectedToken,
|
||||
/// eg: std.fmt.parseFloat failed
|
||||
BadNumber,
|
||||
/// fba error
|
||||
BufferTooSmall,
|
||||
/// eg: missing comma
|
||||
CommaExpected,
|
||||
/// eg: missing colon
|
||||
ColonExpected,
|
||||
/// eg: missing object key
|
||||
KeyExpected,
|
||||
/// eg: error while writing
|
||||
PrintError,
|
||||
/// eg: trailing comma in object
|
||||
TrailingComma,
|
||||
};
|
||||
|
||||
pub const TokenType = enum(u8) {
|
||||
eof,
|
||||
null,
|
||||
true,
|
||||
false,
|
||||
number,
|
||||
string,
|
||||
property,
|
||||
object_begin,
|
||||
object_end,
|
||||
array_begin,
|
||||
array_end,
|
||||
colon,
|
||||
comma,
|
||||
whitespace,
|
||||
};
|
||||
|
||||
pub const Token = struct {
|
||||
type: TokenType,
|
||||
value: ?union {
|
||||
number: f64,
|
||||
string: []const u8,
|
||||
symbol: u8,
|
||||
},
|
||||
start: usize,
|
||||
end: usize,
|
||||
};
|
||||
|
||||
pub const Self = @This();
|
||||
|
||||
text: []const u8,
|
||||
position: usize,
|
||||
max_position: usize,
|
||||
stack: []usize,
|
||||
frame: usize,
|
||||
allocator: std.mem.Allocator,
|
||||
|
||||
prev_token: ?Token = null,
|
||||
|
||||
pub fn pushBack(self: *Self, token: Token) void {
|
||||
self.prev_token = token;
|
||||
}
|
||||
|
||||
/// Initialize a new tokenizer
|
||||
pub fn init(allocator: std.mem.Allocator, text: []const u8) std.mem.Allocator.Error!Self {
|
||||
const stack = try allocator.alloc(usize, 0x100);
|
||||
errdefer allocator.free(stack);
|
||||
@memset(stack, 0);
|
||||
return .{
|
||||
.text = text,
|
||||
.position = 0,
|
||||
.max_position = 0,
|
||||
.stack = stack,
|
||||
.frame = 0,
|
||||
.allocator = allocator,
|
||||
};
|
||||
}
|
||||
|
||||
/// Clean up resources
|
||||
pub fn deinit(self: *Self) void {
|
||||
self.allocator.free(self.stack);
|
||||
}
|
||||
|
||||
// ========== Core Parsing Functions ==========
|
||||
|
||||
fn currentPosition(self: *Self) usize {
|
||||
return self.stack[self.frame];
|
||||
}
|
||||
|
||||
fn advance(self: *Self, delta: usize) void {
|
||||
self.stack[self.frame] += delta;
|
||||
if (self.max_position < self.stack[self.frame])
|
||||
self.max_position = self.stack[self.frame];
|
||||
}
|
||||
|
||||
/// Open a new backtracking frame whose cursor starts at the parent's
/// position; grows the frame stack on demand. Returns the cursor.
fn pushFrame(self: *Self) Error!usize {
    self.frame += 1;
    if (self.frame == self.stack.len) {
        const new_stack = try self.allocator.alloc(usize, self.stack.len * 2);
        // @memcpy requires equal-length operands; copying into the full
        // (larger) new_stack was a guaranteed safety panic. Copy the old
        // contents, zero the rest.
        @memcpy(new_stack[0..self.stack.len], self.stack);
        @memset(new_stack[self.stack.len..], 0);
        self.allocator.free(self.stack);
        self.stack = new_stack;
    }
    // New frame inherits the parent's cursor.
    self.stack[self.frame] = self.stack[self.frame - 1];
    return self.currentPosition();
}
|
||||
|
||||
fn popFrame(self: *Self) void {
|
||||
self.frame -= 1;
|
||||
}
|
||||
|
||||
/// Pop the current frame and propagate its cursor to the parent frame,
/// i.e. accept the input consumed since the matching pushFrame.
/// `wrapped` is passed through unchanged so call sites can write
/// `return self.commit(token);`.
fn commit(self: *Self, wrapped: anytype) @TypeOf(wrapped) {
    self.frame -= 1;
    self.stack[self.frame] = self.stack[self.frame + 1];
    return wrapped;
}
|
||||
|
||||
fn rollback(self: *Self) void {
|
||||
self.stack[self.frame] = if (self.frame == 0) 0 else self.stack[self.frame - 1];
|
||||
}
|
||||
|
||||
// ========== Character Matching ==========
|
||||
|
||||
fn lastChar(self: *Self) u8 {
|
||||
return self.text[self.currentPosition() - 1];
|
||||
}
|
||||
|
||||
fn currentChar(self: *Self) u8 {
|
||||
return self.text[self.currentPosition()];
|
||||
}
|
||||
|
||||
fn endOfInput(self: *Self) bool {
|
||||
return self.currentPosition() >= self.text.len;
|
||||
}
|
||||
|
||||
fn matchChar(self: *Self, c: u8) ?void {
|
||||
if (self.endOfInput() or self.text[self.currentPosition()] != c) {
|
||||
return null;
|
||||
}
|
||||
self.advance(1);
|
||||
}
|
||||
|
||||
fn matchCharPredicate(self: *Self, pred: fn (u8) bool) ?void {
|
||||
// do not change this line for some reason it fucking breaks if I use currentChar directly
|
||||
if (self.endOfInput() or !pred(self.text[self.currentPosition()])) {
|
||||
return null;
|
||||
}
|
||||
self.advance(1);
|
||||
}
|
||||
|
||||
/// Match the literal `s` at the current position. On success the cursor
/// advances past it and the matched slice of `text` is returned; on any
/// mismatch (including end of input) the cursor is unchanged and null is
/// returned. The previous version returned `error.InvalidSyntax` from a
/// `?[]const u8` function (type error) and fell off the end on success
/// (missing return); `std.mem.eql` is already vectorized, so the
/// hand-rolled SIMD loop is unnecessary.
fn matchString(self: *Self, s: []const u8) ?[]const u8 {
    const pos = self.currentPosition();
    if (self.text.len < pos + s.len) {
        // eof
        return null;
    }

    const candidate = self.text[pos..][0..s.len];
    if (!mem.eql(u8, s, candidate)) return null;

    self.advance(s.len);
    return candidate;
}
|
||||
|
||||
pub fn matchCharRange(self: *Self, low: u8, high: u8) ?void {
|
||||
if (self.endOfInput())
|
||||
return null;
|
||||
|
||||
const c = self.text[self.currentPosition()];
|
||||
|
||||
if (!(c >= low and c <= high))
|
||||
return null;
|
||||
|
||||
self.advance(1);
|
||||
}
|
||||
|
||||
// ========== Token Extraction ==========
|
||||
|
||||
fn extractSlice(self: *Self, start: usize) []const u8 {
|
||||
return self.text[start..self.currentPosition()];
|
||||
}
|
||||
|
||||
// Skip all whitespace characters
|
||||
pub fn skipWhitespace(self: *Self) void {
|
||||
const start = self.currentPosition();
|
||||
if (self.endOfInput())
|
||||
return;
|
||||
const end = skipWhitespaceSimd(self.text[start..]);
|
||||
self.advance(end);
|
||||
}
|
||||
|
||||
/// Parse a number token
|
||||
/// Parse a JSON number token: optional sign, digits, optional fraction,
/// optional exponent. Returns `error.BadNumber` when the consumed span is
/// not a valid float.
pub fn nextNumber(self: *Self) Error!Token {
    _ = try self.pushFrame();
    errdefer self.popFrame();

    self.skipWhitespace();
    // Capture the start AFTER skipping whitespace: std.fmt.parseFloat
    // rejects leading spaces, so the slice must begin at the number itself.
    const start = self.currentPosition();

    self.matchChar('-') orelse {}; // optional sign; this may not fail

    while (self.matchCharRange('0', '9') != null) {}

    // Optional fractional part.
    if (self.matchChar('.') != null) {
        while (self.matchCharRange('0', '9') != null) {}
    }

    // Optional exponent part (JSON: e/E, optional sign, digits) — the
    // previous version did not accept exponents at all.
    if (self.matchChar('e') != null or self.matchChar('E') != null) {
        if (self.matchChar('+') == null) _ = self.matchChar('-');
        while (self.matchCharRange('0', '9') != null) {}
    }

    const float = std.fmt.parseFloat(f64, self.extractSlice(start)) catch {
        return error.BadNumber;
    };

    return self.commit(Token{
        .type = .number,
        .value = .{
            .number = float,
        },
        .start = start,
        .end = self.currentPosition(),
    });
}
|
||||
|
||||
/// Parse an identifier token
|
||||
/// Parse one of the JSON identifiers `true`, `false`, or `null`.
/// Any other word yields `error.UnexpectedToken` — previously this was
/// `unreachable`, i.e. undefined behavior on bad input in release builds.
/// Works directly on `text` by slicing, so the fixed 0x100 temporary
/// buffer (and its overflow risk on long identifiers) is gone.
pub fn nextIdentifier(self: *Self) Error!Token {
    const start = try self.pushFrame();
    errdefer self.popFrame();

    self.skipWhitespace();
    // The identifier's own first byte (after any whitespace).
    const word_start = self.currentPosition();

    self.matchCharPredicate(std.ascii.isAlphabetic) orelse {
        return error.UnexpectedToken;
    };
    while (self.matchCharPredicate(std.ascii.isAlphanumeric) != null) {}

    const ident = self.text[word_start..self.currentPosition()];

    const token_type: TokenType = if (std.mem.eql(u8, ident, "true"))
        .true
    else if (std.mem.eql(u8, ident, "false"))
        .false
    else if (std.mem.eql(u8, ident, "null"))
        .null
    else
        return error.UnexpectedToken;

    return self.commit(Token{
        .type = token_type,
        .value = null,
        .start = start,
        .end = self.currentPosition(),
    });
}
|
||||
|
||||
/// Get the next token from the input
|
||||
/// WARNING: this function eats whitespaces
|
||||
/// Get the next token from the input.
/// WARNING: this function eats whitespaces.
/// A pushed-back token, if any, is returned first.
pub fn nextToken(self: *Self) Error!Token {
    if (self.prev_token) |tok| {
        self.prev_token = null;
        return tok;
    }

    const start = try self.pushFrame();
    errdefer self.popFrame();

    self.skipWhitespace();

    if (self.endOfInput()) {
        // Commit (not a bare return) so the frame pushed above is popped;
        // previously every eof probe leaked one stack frame.
        return self.commit(Token{
            .type = .eof,
            .value = null,
            .start = start,
            .end = start,
        });
    }

    self.advance(1);
    // Fall back to single character symbol
    const c = self.lastChar();

    const symbol_t: TokenType = switch (c) {
        '{' => .object_begin,
        '}' => .object_end,
        '[' => .array_begin,
        ']' => .array_end,
        ',' => .comma,
        ':' => .colon,
        '"' => {
            self.rollback();
            // Commit our frame around the sub-parser's result so it is
            // not leaked on success (the sub-parser commits its own).
            return self.commit(try self.nextString());
        },
        else => {
            self.rollback();
            // Try different token types in order of precedence
            if (std.ascii.isDigit(c) or c == '-') {
                return self.commit(try self.nextNumber());
            }

            if (std.ascii.isAlphabetic(c)) {
                return self.commit(try self.nextIdentifier());
            }

            return error.InvalidSyntax;
        },
    };

    return self.commit(Token{
        .type = symbol_t,
        .value = null,
        .start = start,
        .end = start + 1,
    });
}
|
||||
|
||||
/// Parse a quoted JSON string, resolving escapes (\" \\ \/ \b \f \n \r
/// \t \uXXXX) into a freshly allocated buffer owned by the returned token.
pub fn nextString(self: *Self) Error!Token {
    const start = try self.pushFrame();
    errdefer self.popFrame();

    self.skipWhitespace();

    self.matchChar('"') orelse {
        return error.UnexpectedToken;
    };

    var buffer: std.ArrayList(u8) = .init(self.allocator);
    // Free the scratch buffer on every error path — it was previously
    // leaked whenever the string turned out to be malformed.
    errdefer buffer.deinit();

    loop: while (!self.endOfInput()) {
        self.advance(1);

        switch (self.lastChar()) {
            '"' => {
                // Closing quote: hand the bytes over to the token.
                return self.commit(Token{
                    .type = .string,
                    .value = .{ .string = try buffer.toOwnedSlice() },
                    .start = start,
                    .end = self.currentPosition(),
                });
            },
            '\\' => {
                self.advance(1);
                switch (self.lastChar()) {
                    0x22, 0x5C, 0x2F => |d| {
                        try buffer.append(d);
                        continue :loop;
                    },
                    'b' => try buffer.append(0x8),
                    'f' => try buffer.append(0xC),
                    'n' => try buffer.append(0xA),
                    'r' => try buffer.append(0xD),
                    't' => try buffer.append(0x9),
                    'u' => {
                        var code_points: [4]u8 = undefined;
                        inline for (0..4) |i| {
                            // Truncated \uXXXX escape is malformed input.
                            // (Previously this returned a bogus .eof token
                            // and leaked `buffer`.)
                            if (self.endOfInput())
                                return error.InvalidSyntax;
                            self.advance(1);
                            code_points[i] = self.lastChar();
                        }
                        const buf = try stringToUtf8(&code_points);
                        try buffer.appendSlice(buf);
                        continue :loop;
                    },
                    else => return error.UnexpectedCharacter,
                } // end switch
            },
            else => |c| {
                if (std.ascii.isControl(c)) {
                    return error.UnexpectedCharacter;
                }
                try buffer.append(c);
            },
        } // end switch
    } // end while

    // Ran out of input before the closing quote.
    return error.InvalidSyntax;
}
|
||||
|
||||
/// Token iterator over the tokenizer; stops at end of input or on the
/// first tokenize error.
pub const Iterator = struct {
    tokenizer: *Self,
    /// Next token, or null at end of input.
    /// NOTE(review): `catch null` silently folds tokenize errors into
    /// null — callers cannot distinguish eof from malformed input.
    pub fn next(it: *Iterator) ?Token {
        if (it.tokenizer.endOfInput()) return null;
        return it.tokenizer.nextToken() catch null;
    }
    /// Rewind to the beginning of the input.
    /// NOTE(review): only `position`/`max_position`/`frame` are reset;
    /// the active cursor `stack[0]` keeps its old value, so iteration may
    /// not actually restart from byte 0 — confirm intended behavior.
    pub fn reset(it: *Iterator) void {
        it.tokenizer.position = 0;
        it.tokenizer.max_position = 0;
        it.tokenizer.frame = 0;
        it.tokenizer.prev_token = null;
    }
};
|
||||
|
||||
/// iterator
|
||||
pub fn iterator(self: *Self) Iterator {
|
||||
return Iterator{
|
||||
.tokenizer = self,
|
||||
};
|
||||
}
|
||||
|
||||
/// Decode a hex escape payload (e.g. "00E9") into UTF-8.
/// The encoding is written into `bytes` itself (the hex digits have
/// already been consumed by then) and a slice of it is returned —
/// the previous version returned a slice of a LOCAL buffer, i.e. a
/// dangling pointer into dead stack memory.
pub fn stringToUtf8(bytes: []u8) ![]u8 {
    const code_point = std.fmt.parseInt(u21, bytes, 16) catch {
        return error.BadNumber;
    };

    var len: usize = 0;
    if (code_point <= 0x7F) {
        if (bytes.len < 1) return error.BufferTooSmall;
        bytes[0] = @intCast(code_point);
        len = 1;
    } else if (code_point <= 0x7FF) {
        if (bytes.len < 2) return error.BufferTooSmall;
        bytes[0] = 0xC0 | @as(u8, @intCast(code_point >> 6));
        bytes[1] = 0x80 | @as(u8, @intCast(code_point & 0x3F));
        len = 2;
    } else if (code_point <= 0xFFFF) {
        if (bytes.len < 3) return error.BufferTooSmall;
        bytes[0] = 0xE0 | @as(u8, @intCast(code_point >> 12));
        bytes[1] = 0x80 | @as(u8, @intCast((code_point >> 6) & 0x3F));
        bytes[2] = 0x80 | @as(u8, @intCast(code_point & 0x3F));
        len = 3;
    } else if (code_point <= 0x10FFFF) {
        if (bytes.len < 4) return error.BufferTooSmall;
        bytes[0] = 0xF0 | @as(u8, @intCast(code_point >> 18));
        bytes[1] = 0x80 | @as(u8, @intCast((code_point >> 12) & 0x3F));
        bytes[2] = 0x80 | @as(u8, @intCast((code_point >> 6) & 0x3F));
        bytes[3] = 0x80 | @as(u8, @intCast(code_point & 0x3F));
        len = 4;
    } else unreachable;

    return bytes[0..len];
}
|
||||
|
||||
/// Return the number of leading JSON whitespace bytes (' ', '\t', '\n',
/// '\r') in `text`, scanning 16 bytes at a time with vector compares.
/// The previous version mapped lanes to 0/1 but compared against 0xFF, so
/// an all-whitespace chunk never advanced `j` — an infinite loop.
pub fn skipWhitespaceSimd(text: []const u8) usize {
    const ChunkSize = 16;
    const Vec = @Vector(ChunkSize, u8);
    const Mask = std.meta.Int(.unsigned, ChunkSize);

    // Broadcast whitespace characters to vectors.
    const space: Vec = @splat(' ');
    const tab: Vec = @splat('\t');
    const lf: Vec = @splat('\n');
    const cr: Vec = @splat('\r');

    var j: usize = 0;
    const end = text.len;

    // SIMD processing: one bit per lane, bit set = lane is whitespace.
    while (j + ChunkSize <= end) {
        const chunk: Vec = text[j..][0..ChunkSize].*;

        const ws_mask: Mask = @as(Mask, @bitCast(chunk == space)) |
            @as(Mask, @bitCast(chunk == tab)) |
            @as(Mask, @bitCast(chunk == lf)) |
            @as(Mask, @bitCast(chunk == cr));

        if (ws_mask == std.math.maxInt(Mask)) {
            // Whole chunk is whitespace; keep scanning.
            j += ChunkSize;
            continue;
        }

        // First zero bit = first non-whitespace byte.
        return j + @ctz(~ws_mask);
    }

    // Scalar processing for remaining bytes.
    while (j < end) switch (text[j]) {
        ' ', '\t', '\n', '\r' => j += 1,
        else => break,
    };

    return j;
}
|
Loading…
x
Reference in New Issue
Block a user