This commit is contained in:
yuzu 2025-05-23 20:05:33 -05:00
parent 7bc4973bf4
commit d6bbd29a93
3 changed files with 122 additions and 62 deletions

1
.gitignore vendored Normal file
View File

@ -0,0 +1 @@
.zig-cache

View File

@ -1,5 +1,6 @@
const std = @import("std"); const std = @import("std");
const Tokenizer = @import("tokenizer.zig"); const Tokenizer = @import("tokenizer.zig");
const TokenType = Tokenizer.TokenType;
const StringPool = @import("strings.zig"); const StringPool = @import("strings.zig");
const StringIndex = StringPool.StringIndex; const StringIndex = StringPool.StringIndex;
const assert = std.debug.assert; const assert = std.debug.assert;
@ -22,8 +23,8 @@ pub const JsonValue = union(JsonType) {
bool: bool, bool: bool,
number: f64, number: f64,
string: StringIndex, string: StringIndex,
array: ArrayIndex.Slice, array: ArraySlice,
object: ObjectIndex.Entry, object: ObjectEntry,
}; };
pub const JsonInput = union(JsonType) { pub const JsonInput = union(JsonType) {
@ -35,31 +36,34 @@ pub const JsonInput = union(JsonType) {
object: std.AutoArrayHashMapUnmanaged([]const u8, JsonInput), object: std.AutoArrayHashMapUnmanaged([]const u8, JsonInput),
}; };
pub const ArrayIndex = enum(usize) { /// same as ObjectEntry but simpler
_, /// start is the offset
pub const ArraySlice = struct {
pub const Slice = struct { start: usize,
start: usize, len: usize,
len: usize,
};
}; };
pub const ObjectIndex = enum(usize) { /// just += the properties and value indexes to get the next item
_, /// property_idx and value_idx are the offset
/// it should be ordered
/// Header stored in the value index for one JSON object.
pub const ObjectEntry = struct {
/// Number of key/value pairs; readers loop `0..len` when walking the object.
len: usize,
/// Offset of the first property name in the property string pool.
/// NOTE(review): `addEmptyObject` seeds this from `string_bytes.items.len`
/// while `parse` uses `string_table.size` — confirm which unit is intended.
property_idx: usize,
/// Offset of the first value in the value index (`self.index`).
value_idx: usize,
};
pub const Entry = struct { pub const Flags = packed struct {
len: usize, allow_trailing_comma: bool = false,
property_idx: usize,
value_idx: usize,
};
}; };
pub const Options = struct { pub const Options = struct {
comptime max_depth: usize = 256, comptime max_depth: usize = 256,
comptime flags: Flags = .{},
}; };
index: std.MultiArrayList(JsonValue) = .{}, index: std.MultiArrayList(JsonValue) = .{},
string_index: StringPool = .empty, string_index: StringPool = .empty,
property_index: StringPool = .empty,
options: Options = .{}, options: Options = .{},
@ -67,7 +71,7 @@ pub const init: Self = .{};
pub fn deinit(self: *Self, allocator: std.mem.Allocator) void { pub fn deinit(self: *Self, allocator: std.mem.Allocator) void {
self.index.deinit(allocator); self.index.deinit(allocator);
self.string_index.deinit(allocator); self.property_index.deinit(allocator);
} }
fn addNumber(self: *Self, allocator: std.mem.Allocator, number: f64) !usize { fn addNumber(self: *Self, allocator: std.mem.Allocator, number: f64) !usize {
@ -77,6 +81,12 @@ fn addNumber(self: *Self, allocator: std.mem.Allocator, number: f64) !usize {
return idx; return idx;
} }
/// Interns `bytes` in the property string pool and returns the resulting
/// string index as a plain integer.
///
/// Fails with whatever error the pool or the value index report while
/// growing (allocation failure).
fn addProperty(self: *Self, allocator: std.mem.Allocator, bytes: []const u8) !usize {
    const property = try self.property_index.add(allocator, bytes);

    // Room for one more entry is reserved in the value index as well, even
    // though nothing is appended to it here.
    try self.index.ensureUnusedCapacity(allocator, 1);

    const raw: usize = @intFromEnum(property);
    return raw;
}
fn addString(self: *Self, allocator: std.mem.Allocator, bytes: []const u8) !usize { fn addString(self: *Self, allocator: std.mem.Allocator, bytes: []const u8) !usize {
const stridx = try self.string_index.add(allocator, bytes); const stridx = try self.string_index.add(allocator, bytes);
try self.index.ensureUnusedCapacity(allocator, 1); try self.index.ensureUnusedCapacity(allocator, 1);
@ -86,15 +96,15 @@ fn addString(self: *Self, allocator: std.mem.Allocator, bytes: []const u8) !usiz
} }
fn addObject(self: *Self, allocator: std.mem.Allocator, object: std.AutoArrayHashMapUnmanaged([]const u8, JsonInput)) !usize { fn addObject(self: *Self, allocator: std.mem.Allocator, object: std.AutoArrayHashMapUnmanaged([]const u8, JsonInput)) !usize {
var entry: ?ObjectIndex.Entry = null; var entry: ?ObjectEntry = null;
for (object.keys(), object.values(), 0..) |key, value, times| { for (object.keys(), object.values(), 0..) |key, value, times| {
const stridx = try self.string_index.add(allocator, key); const stridx = try self.property_index.add(allocator, key);
try self.index.ensureUnusedCapacity(allocator, 1); try self.index.ensureUnusedCapacity(allocator, 1);
const vidx = self.index.addOneAssumeCapacity(); const vidx = self.index.addOneAssumeCapacity();
self.index.set(vidx, @unionInit(JsonValue, std.meta.activeTag(value), self.addValue(allocator, value))); self.index.set(vidx, @unionInit(JsonValue, std.meta.activeTag(value), self.addValue(allocator, value)));
if (times == 0) { if (times == 0) {
entry = ObjectIndex.Entry{ entry = ObjectEntry{
.len = object.entries.len, .len = object.entries.len,
.property_idx = stridx, .property_idx = stridx,
.value_idx = vidx, .value_idx = vidx,
@ -108,7 +118,7 @@ fn addObject(self: *Self, allocator: std.mem.Allocator, object: std.AutoArrayHas
self.index.set(idx, .{ .object = e }); self.index.set(idx, .{ .object = e });
return idx; return idx;
} else { } else {
self.index.set(idx, .{ .object = ObjectIndex.Entry{ self.index.set(idx, .{ .object = ObjectEntry{
.len = 0, .len = 0,
.property_idx = 0, .property_idx = 0,
.value_idx = 0, .value_idx = 0,
@ -120,8 +130,8 @@ fn addObject(self: *Self, allocator: std.mem.Allocator, object: std.AutoArrayHas
fn addEmptyObject(self: *Self, allocator: std.mem.Allocator) !usize { fn addEmptyObject(self: *Self, allocator: std.mem.Allocator) !usize {
try self.index.ensureUnusedCapacity(allocator, 1); try self.index.ensureUnusedCapacity(allocator, 1);
const idx = self.index.addOneAssumeCapacity(); const idx = self.index.addOneAssumeCapacity();
const object: ObjectIndex.Entry = .{ const object: ObjectEntry = .{
.property_idx = self.string_index.string_bytes.items.len, .property_idx = self.property_index.string_bytes.items.len,
.value_idx = self.index.len, .value_idx = self.index.len,
.len = 0, .len = 0,
}; };
@ -130,13 +140,13 @@ fn addEmptyObject(self: *Self, allocator: std.mem.Allocator) !usize {
} }
fn addArray(self: *Self, allocator: std.mem.Allocator, array: []JsonInput) !usize { fn addArray(self: *Self, allocator: std.mem.Allocator, array: []JsonInput) !usize {
var entry: ?ArrayIndex.Slice = null; var entry: ?ArraySlice = null;
for (array, 0..) |value, times| { for (array, 0..) |value, times| {
try self.index.ensureUnusedCapacity(allocator, 1); try self.index.ensureUnusedCapacity(allocator, 1);
const idx = self.index.addOneAssumeCapacity(); const idx = self.index.addOneAssumeCapacity();
self.index.set(idx, @unionInit(JsonValue, std.meta.activeTag(value), self.addValue(allocator, value))); self.index.set(idx, @unionInit(JsonValue, std.meta.activeTag(value), self.addValue(allocator, value)));
if (times == 0) { if (times == 0) {
entry = ArrayIndex.Slice{ entry = ArraySlice{
.start = idx, .start = idx,
.len = array.len, .len = array.len,
}; };
@ -148,7 +158,7 @@ fn addArray(self: *Self, allocator: std.mem.Allocator, array: []JsonInput) !usiz
self.index.set(idx, .{ .array = e }); self.index.set(idx, .{ .array = e });
return idx; return idx;
} else { } else {
self.index.set(idx, .{ .array = ArrayIndex.Slice{ self.index.set(idx, .{ .array = ArraySlice{
.start = 0, .start = 0,
.len = 0, .len = 0,
} }); } });
@ -163,9 +173,16 @@ fn addBool(self: *Self, allocator: std.mem.Allocator, value: bool) !usize {
return idx; return idx;
} }
/// Appends a JSON `null` to the value index and returns the position it was
/// stored at. Fails only if the index cannot grow.
fn addNull(self: *Self, allocator: std.mem.Allocator) !usize {
    // `addOne` reserves the slot and hands back its position in one step.
    const slot = try self.index.addOne(allocator);
    self.index.set(slot, .{ .null = {} });
    return slot;
}
fn addValue(self: *Self, allocator: std.mem.Allocator, value: JsonInput) !void { fn addValue(self: *Self, allocator: std.mem.Allocator, value: JsonInput) !void {
switch (value) { switch (value) {
.null => {}, .null => try self.addNull(allocator),
.bool => try self.addBool(allocator, value.bool), .bool => try self.addBool(allocator, value.bool),
.number => try self.addNumber(allocator, value.number), .number => try self.addNumber(allocator, value.number),
.string => try self.addString(allocator, value.string), .string => try self.addString(allocator, value.string),
@ -174,8 +191,8 @@ fn addValue(self: *Self, allocator: std.mem.Allocator, value: JsonInput) !void {
} }
} }
fn getString(self: *Self, index: []const u8) ?StringIndex { fn getProperty(self: *Self, index: []const u8) ?StringIndex {
return self.string_index.string_table.get(index); return self.property_index.string_table.get(index);
} }
fn getNumber(self: *Self, index: usize) ?f64 { fn getNumber(self: *Self, index: usize) ?f64 {
@ -200,7 +217,7 @@ fn getObject(self: *Self, allocator: std.mem.Allocator, index: usize) !?struct {
const values = try allocator.alloc(usize, entry.object.len); const values = try allocator.alloc(usize, entry.object.len);
for (0..entry.object.len) |i| { for (0..entry.object.len) |i| {
const slice = StringIndex.slice(@enumFromInt(pidx), &self.string_index); const slice = StringIndex.slice(@enumFromInt(pidx), &self.property_index);
keys[i] = @enumFromInt(pidx); keys[i] = @enumFromInt(pidx);
values[i] = vidx; values[i] = vidx;
pidx += slice.len + 1; pidx += slice.len + 1;
@ -265,7 +282,7 @@ fn getValue(self: *Self, allocator: std.mem.Allocator, index: usize) !?JsonInput
try obj.ensureTotalCapacity(allocator, entry.object.len); try obj.ensureTotalCapacity(allocator, entry.object.len);
for (0..entry.object.len) |_| { for (0..entry.object.len) |_| {
const key = StringIndex.slice(@enumFromInt(kidx), &self.string_index); const key = StringIndex.slice(@enumFromInt(kidx), &self.property_index);
const val = (try self.getValue(allocator, vidx)).?; const val = (try self.getValue(allocator, vidx)).?;
obj.putAssumeCapacityNoClobber(key, val); obj.putAssumeCapacityNoClobber(key, val);
@ -289,7 +306,7 @@ pub fn parse(self: *Self, tokenizer: *Tokenizer) !void {
var cycles: usize = 0; var cycles: usize = 0;
while (it.next()) |token| { while (it.next()) |token| {
switch (token.type) { flag: switch (token.type) {
.object_begin => { .object_begin => {
std.debug.print("{{", .{}); std.debug.print("{{", .{});
const obj_idx = try self.addEmptyObject(allocator); const obj_idx = try self.addEmptyObject(allocator);
@ -302,9 +319,9 @@ pub fn parse(self: *Self, tokenizer: *Tokenizer) !void {
switch (data) { switch (data) {
.object => |valid_entry| { .object => |valid_entry| {
const new_data = ObjectIndex.Entry{ const new_data = ObjectEntry{
.len = valid_entry.len + 1, .len = valid_entry.len + 1,
.property_idx = self.string_index.string_table.size, .property_idx = self.property_index.string_table.size,
.value_idx = obj_idx, .value_idx = obj_idx,
}; };
self.index.set(depth_buf[cycles - 1], .{ .object = new_data }); self.index.set(depth_buf[cycles - 1], .{ .object = new_data });
@ -320,7 +337,7 @@ pub fn parse(self: *Self, tokenizer: *Tokenizer) !void {
const keys, const vals = (try self.getObject(allocator, depth_buf[cycles - 1])).?; const keys, const vals = (try self.getObject(allocator, depth_buf[cycles - 1])).?;
std.debug.print("\nfound {d} keys and {d} values\n", .{ keys.len, vals.len }); std.debug.print("\nfound {d} keys and {d} values\n", .{ keys.len, vals.len });
for (keys, vals) |k, v| { for (keys, vals) |k, v| {
const key = k.slice(&self.string_index); const key = k.slice(&self.property_index);
const val = self.index.get(v); const val = self.index.get(v);
std.debug.print( std.debug.print(
\\"{s}": {s}, \\"{s}": {s},
@ -328,14 +345,31 @@ pub fn parse(self: *Self, tokenizer: *Tokenizer) !void {
} }
std.debug.print("}}", .{}); std.debug.print("}}", .{});
}, },
.string => { .property => {
const idx = try self.addString(allocator, token.value.?.string); _ = try self.addProperty(allocator, token.value.?.string);
const last_obj = self.index.get(depth_buf[cycles - 1]); const last_obj = self.index.get(depth_buf[cycles - 1]);
if (cycles > 0) { if (cycles > 0) {
const stridx = self.index.get(idx).string; self.index.set(depth_buf[cycles - 1], .{ .object = ObjectEntry{
self.index.set(depth_buf[cycles - 1], .{ .object = ObjectIndex.Entry{
.len = last_obj.object.len + 1, .len = last_obj.object.len + 1,
.property_idx = if (cycles > 1) @intFromEnum(stridx) else last_obj.object.property_idx, .property_idx = last_obj.object.property_idx,
.value_idx = last_obj.object.value_idx,
} });
continue;
}
},
.string => {
// maybe we could dismiss the while loop altogether and just do this
// the whole time
if (it.peek()) |next| if (next.type == .colon) {
continue :flag TokenType.property;
};
_ = try self.addString(allocator, token.value.?.string);
const last_obj = self.index.get(depth_buf[cycles - 1]);
if (cycles > 0) {
self.index.set(depth_buf[cycles - 1], .{ .object = ObjectEntry{
.len = last_obj.object.len,
.property_idx = last_obj.object.property_idx,
.value_idx = last_obj.object.value_idx, .value_idx = last_obj.object.value_idx,
} }); } });
continue; continue;
@ -345,7 +379,7 @@ pub fn parse(self: *Self, tokenizer: *Tokenizer) !void {
_ = try self.addNumber(allocator, token.value.?.number); _ = try self.addNumber(allocator, token.value.?.number);
const last_obj = self.index.get(depth_buf[cycles - 1]); const last_obj = self.index.get(depth_buf[cycles - 1]);
if (cycles > 0) { if (cycles > 0) {
self.index.set(depth_buf[cycles - 1], .{ .object = ObjectIndex.Entry{ self.index.set(depth_buf[cycles - 1], .{ .object = ObjectEntry{
.len = last_obj.object.len, .len = last_obj.object.len,
.property_idx = last_obj.object.property_idx, .property_idx = last_obj.object.property_idx,
.value_idx = last_obj.object.value_idx, .value_idx = last_obj.object.value_idx,
@ -357,7 +391,7 @@ pub fn parse(self: *Self, tokenizer: *Tokenizer) !void {
_ = try self.addBool(allocator, if (token.type == .true) true else false); _ = try self.addBool(allocator, if (token.type == .true) true else false);
const last_obj = self.index.get(depth_buf[cycles - 1]); const last_obj = self.index.get(depth_buf[cycles - 1]);
if (cycles > 0) { if (cycles > 0) {
self.index.set(depth_buf[cycles - 1], .{ .object = ObjectIndex.Entry{ self.index.set(depth_buf[cycles - 1], .{ .object = ObjectEntry{
.len = last_obj.object.len, .len = last_obj.object.len,
.property_idx = last_obj.object.property_idx, .property_idx = last_obj.object.property_idx,
.value_idx = last_obj.object.value_idx, .value_idx = last_obj.object.value_idx,
@ -365,6 +399,23 @@ pub fn parse(self: *Self, tokenizer: *Tokenizer) !void {
continue; continue;
} }
}, },
.null => {
    _ = try self.addNull(allocator);
    // Look up the enclosing container only when one exists: `cycles` is a
    // usize, so evaluating `cycles - 1` at depth 0 (a bare top-level
    // `null`) would underflow before the guard could reject it.
    if (cycles > 0) {
        const parent = depth_buf[cycles - 1];
        const last_obj = self.index.get(parent);
        self.index.set(parent, .{ .object = ObjectEntry{
            .len = last_obj.object.len,
            .property_idx = last_obj.object.property_idx,
            .value_idx = last_obj.object.value_idx,
        } });
        continue;
    }
},
.comma => {
    // A comma immediately followed by `}` is a trailing comma. It is an
    // error unless the `allow_trailing_comma` flag opts in; the original
    // test was inverted and rejected it only when the flag was set.
    if (it.peek()) |following| {
        if (following.type == .object_end and !self.options.flags.allow_trailing_comma) {
            return error.TrailingComma;
        }
    }
},
else => {}, else => {},
} }
@ -384,10 +435,11 @@ test parse {
var tokenizer = try Tokenizer.init(allocator, blk: { var tokenizer = try Tokenizer.init(allocator, blk: {
const json = const json =
\\ { \\ {
\\ "key": 123, \\ "key": "hello",
\\ "key2": false, \\ "key2": "world",
\\ "key3": true, \\ "key3": true,
\\ "key4": null \\ "key4": null,
\\ "key5": 123
\\ } \\ }
; ;
break :blk json; break :blk json;

View File

@ -232,14 +232,14 @@ pub fn nextNumber(self: *Self) Error!Token {
return error.BadNumber; // no floating point return error.BadNumber; // no floating point
}; };
return self.commit(Token{ return Token{
.type = .number, .type = .number,
.value = .{ .value = .{
.number = float, .number = float,
}, },
.start = start, .start = start,
.end = self.currentPosition(), .end = self.currentPosition(),
}); };
}; };
while (self.matchCharRange('0', '9') != null) {} while (self.matchCharRange('0', '9') != null) {}
@ -248,14 +248,14 @@ pub fn nextNumber(self: *Self) Error!Token {
return error.BadNumber; // floating point return error.BadNumber; // floating point
}; };
return self.commit(Token{ return .{
.type = .number, .type = .number,
.value = .{ .value = .{
.number = float, .number = float,
}, },
.start = start, .start = start,
.end = self.currentPosition(), .end = self.currentPosition(),
}); };
} }
/// Parse an identifier token /// Parse an identifier token
@ -284,32 +284,32 @@ pub fn nextIdentifier(self: *Self) Error!Token {
// true // true
if (std.mem.eql(u8, ident, "true")) { if (std.mem.eql(u8, ident, "true")) {
return self.commit(Token{ return .{
.type = .true, .type = .true,
.value = null, .value = null,
.start = start, .start = start,
.end = self.currentPosition(), .end = self.currentPosition(),
}); };
} }
// false // false
if (std.mem.eql(u8, ident, "false")) { if (std.mem.eql(u8, ident, "false")) {
return self.commit(Token{ return .{
.type = .false, .type = .false,
.value = null, .value = null,
.start = start, .start = start,
.end = self.currentPosition(), .end = self.currentPosition(),
}); };
} }
// null // null
if (std.mem.eql(u8, ident, "null")) { if (std.mem.eql(u8, ident, "null")) {
return self.commit(Token{ return .{
.type = .null, .type = .null,
.value = null, .value = null,
.start = start, .start = start,
.end = self.currentPosition(), .end = self.currentPosition(),
}); };
} }
unreachable; unreachable;
@ -350,17 +350,17 @@ pub fn nextToken(self: *Self) Error!Token {
':' => .colon, ':' => .colon,
'"' => { '"' => {
self.rollback(); self.rollback();
return (self.nextString()); return self.commit(self.nextString());
}, },
else => { else => {
self.rollback(); self.rollback();
// Try different token types in order of precedence // Try different token types in order of precedence
if (std.ascii.isDigit(c) or c == '-') { if (std.ascii.isDigit(c) or c == '-') {
return (self.nextNumber()); return self.commit(self.nextNumber());
} }
if (std.ascii.isAlphabetic(c)) { if (std.ascii.isAlphabetic(c)) {
return (self.nextIdentifier()); return self.commit(self.nextIdentifier());
} }
return error.InvalidSyntax; return error.InvalidSyntax;
@ -392,12 +392,12 @@ pub fn nextString(self: *Self) Error!Token {
switch (self.lastChar()) { switch (self.lastChar()) {
'"' => { '"' => {
return self.commit(Token{ return .{
.type = .string, .type = .string,
.value = .{ .string = try buffer.toOwnedSlice() }, .value = .{ .string = try buffer.toOwnedSlice() },
.start = start, .start = start,
.end = self.currentPosition(), .end = self.currentPosition(),
}); };
}, },
'\\' => { '\\' => {
self.advance(1); self.advance(1);
@ -415,12 +415,12 @@ pub fn nextString(self: *Self) Error!Token {
var code_points: [4]u8 = undefined; var code_points: [4]u8 = undefined;
inline for (0..4) |i| { inline for (0..4) |i| {
if (self.endOfInput()) if (self.endOfInput())
return self.commit(Token{ return .{
.type = .eof, .type = .eof,
.value = null, .value = null,
.start = start, .start = start,
.end = start + 1, .end = start + 1,
}); };
self.advance(1); self.advance(1);
code_points[i] = self.lastChar(); code_points[i] = self.lastChar();
} }
@ -446,6 +446,7 @@ pub fn nextString(self: *Self) Error!Token {
pub const Iterator = struct { pub const Iterator = struct {
tokenizer: *Self, tokenizer: *Self,
pub fn next(it: *Iterator) ?Token { pub fn next(it: *Iterator) ?Token {
defer it.tokenizer.skipWhitespace();
if (it.tokenizer.endOfInput()) return null; if (it.tokenizer.endOfInput()) return null;
return it.tokenizer.nextToken() catch null; return it.tokenizer.nextToken() catch null;
} }
@ -455,6 +456,12 @@ pub const Iterator = struct {
it.tokenizer.frame = 0; it.tokenizer.frame = 0;
it.tokenizer.prev_token = null; it.tokenizer.prev_token = null;
} }
/// Returns the token the next call to `next` would yield, without
/// consuming it. Returns `null` at end of input or when tokenizing fails.
///
/// The tokenizer's `position`, `frame` and `prev_token` are captured up
/// front and put back on exit. Rewinding by a fixed single byte is only
/// right for a one-byte token with no whitespace after it, and it also
/// moved the cursor when nothing was read at end of input.
///
/// NOTE(review): a peeked string token still carries a freshly allocated
/// value; confirm who is expected to free it. Any tokenizer state beyond
/// the three fields restored here is not visible from this view.
pub fn peek(it: *Iterator) ?Token {
    const tokenizer = it.tokenizer;

    const saved_position = tokenizer.position;
    const saved_frame = tokenizer.frame;
    const saved_prev_token = tokenizer.prev_token;
    defer {
        tokenizer.position = saved_position;
        tokenizer.frame = saved_frame;
        tokenizer.prev_token = saved_prev_token;
    }

    if (tokenizer.endOfInput()) return null;
    return tokenizer.nextToken() catch null;
}
}; };
/// iterator /// iterator