This commit is contained in:
yuzu 2025-05-23 20:05:33 -05:00
parent 7bc4973bf4
commit d6bbd29a93
3 changed files with 122 additions and 62 deletions

1
.gitignore vendored Normal file
View File

@ -0,0 +1 @@
.zig-cache

View File

@ -1,5 +1,6 @@
const std = @import("std");
const Tokenizer = @import("tokenizer.zig");
const TokenType = Tokenizer.TokenType;
const StringPool = @import("strings.zig");
const StringIndex = StringPool.StringIndex;
const assert = std.debug.assert;
@ -22,8 +23,8 @@ pub const JsonValue = union(JsonType) {
bool: bool,
number: f64,
string: StringIndex,
array: ArrayIndex.Slice,
object: ObjectIndex.Entry,
array: ArraySlice,
object: ObjectEntry,
};
pub const JsonInput = union(JsonType) {
@ -35,31 +36,34 @@ pub const JsonInput = union(JsonType) {
object: std.AutoArrayHashMapUnmanaged([]const u8, JsonInput),
};
pub const ArrayIndex = enum(usize) {
_,
pub const Slice = struct {
/// A simpler counterpart to ObjectEntry, describing the elements of an array.
/// `start` is the offset of the first element and `len` is the element count.
pub const ArraySlice = struct {
start: usize,
len: usize,
};
};
pub const ObjectIndex = enum(usize) {
_,
pub const Entry = struct {
/// Describes the entries of an object.
/// `property_idx` and `value_idx` are the offsets of the first property and the
/// first value; advance both to reach the next entry.
/// Entries are expected to be stored in order.
pub const ObjectEntry = struct {
len: usize,
property_idx: usize,
value_idx: usize,
};
/// Boolean switches that adjust parser behavior.
pub const Flags = packed struct {
/// Read by `parse` when a comma is immediately followed by the end of an
/// object. Defaults to `false`.
allow_trailing_comma: bool = false,
};
/// Parser configuration. Both fields are `comptime` fields with defaults.
pub const Options = struct {
/// NOTE(review): presumably the maximum container nesting depth tracked
/// while parsing — confirm against the code that uses it.
comptime max_depth: usize = 256,
/// Behavior switches; see `Flags`.
comptime flags: Flags = .{},
};
index: std.MultiArrayList(JsonValue) = .{},
string_index: StringPool = .empty,
property_index: StringPool = .empty,
options: Options = .{},
@ -67,7 +71,7 @@ pub const init: Self = .{};
/// Release the memory held by the value index and by both string pools.
///
/// `allocator` is passed to each container's own `deinit`.
pub fn deinit(self: *Self, allocator: std.mem.Allocator) void {
    // The three containers are separate fields, so the order in which
    // they are torn down does not matter.
    self.property_index.deinit(allocator);
    self.string_index.deinit(allocator);
    self.index.deinit(allocator);
}
fn addNumber(self: *Self, allocator: std.mem.Allocator, number: f64) !usize {
@ -77,6 +81,12 @@ fn addNumber(self: *Self, allocator: std.mem.Allocator, number: f64) !usize {
return idx;
}
/// Store an object key in the property pool and return its pool index as
/// a plain integer.
///
/// Also reserves room for one more entry in the value index; no value is
/// appended here.
fn addProperty(self: *Self, allocator: std.mem.Allocator, bytes: []const u8) !usize {
    const property = try self.property_index.add(allocator, bytes);

    // Reserve a slot in the value index after the key has been stored.
    try self.index.ensureUnusedCapacity(allocator, 1);

    return @intFromEnum(property);
}
fn addString(self: *Self, allocator: std.mem.Allocator, bytes: []const u8) !usize {
const stridx = try self.string_index.add(allocator, bytes);
try self.index.ensureUnusedCapacity(allocator, 1);
@ -86,15 +96,15 @@ fn addString(self: *Self, allocator: std.mem.Allocator, bytes: []const u8) !usiz
}
fn addObject(self: *Self, allocator: std.mem.Allocator, object: std.AutoArrayHashMapUnmanaged([]const u8, JsonInput)) !usize {
var entry: ?ObjectIndex.Entry = null;
var entry: ?ObjectEntry = null;
for (object.keys(), object.values(), 0..) |key, value, times| {
const stridx = try self.string_index.add(allocator, key);
const stridx = try self.property_index.add(allocator, key);
try self.index.ensureUnusedCapacity(allocator, 1);
const vidx = self.index.addOneAssumeCapacity();
self.index.set(vidx, @unionInit(JsonValue, std.meta.activeTag(value), self.addValue(allocator, value)));
if (times == 0) {
entry = ObjectIndex.Entry{
entry = ObjectEntry{
.len = object.entries.len,
.property_idx = stridx,
.value_idx = vidx,
@ -108,7 +118,7 @@ fn addObject(self: *Self, allocator: std.mem.Allocator, object: std.AutoArrayHas
self.index.set(idx, .{ .object = e });
return idx;
} else {
self.index.set(idx, .{ .object = ObjectIndex.Entry{
self.index.set(idx, .{ .object = ObjectEntry{
.len = 0,
.property_idx = 0,
.value_idx = 0,
@ -120,8 +130,8 @@ fn addObject(self: *Self, allocator: std.mem.Allocator, object: std.AutoArrayHas
fn addEmptyObject(self: *Self, allocator: std.mem.Allocator) !usize {
try self.index.ensureUnusedCapacity(allocator, 1);
const idx = self.index.addOneAssumeCapacity();
const object: ObjectIndex.Entry = .{
.property_idx = self.string_index.string_bytes.items.len,
const object: ObjectEntry = .{
.property_idx = self.property_index.string_bytes.items.len,
.value_idx = self.index.len,
.len = 0,
};
@ -130,13 +140,13 @@ fn addEmptyObject(self: *Self, allocator: std.mem.Allocator) !usize {
}
fn addArray(self: *Self, allocator: std.mem.Allocator, array: []JsonInput) !usize {
var entry: ?ArrayIndex.Slice = null;
var entry: ?ArraySlice = null;
for (array, 0..) |value, times| {
try self.index.ensureUnusedCapacity(allocator, 1);
const idx = self.index.addOneAssumeCapacity();
self.index.set(idx, @unionInit(JsonValue, std.meta.activeTag(value), self.addValue(allocator, value)));
if (times == 0) {
entry = ArrayIndex.Slice{
entry = ArraySlice{
.start = idx,
.len = array.len,
};
@ -148,7 +158,7 @@ fn addArray(self: *Self, allocator: std.mem.Allocator, array: []JsonInput) !usiz
self.index.set(idx, .{ .array = e });
return idx;
} else {
self.index.set(idx, .{ .array = ArrayIndex.Slice{
self.index.set(idx, .{ .array = ArraySlice{
.start = 0,
.len = 0,
} });
@ -163,9 +173,16 @@ fn addBool(self: *Self, allocator: std.mem.Allocator, value: bool) !usize {
return idx;
}
/// Append a JSON `null` to the value index and return the position it was
/// stored at.
///
/// Fails only if growing the index fails.
fn addNull(self: *Self, allocator: std.mem.Allocator) !usize {
    // The new element lands at the current length of the index.
    const slot = self.index.len;
    try self.index.append(allocator, .{ .null = {} });
    return slot;
}
fn addValue(self: *Self, allocator: std.mem.Allocator, value: JsonInput) !void {
switch (value) {
.null => {},
.null => try self.addNull(allocator),
.bool => try self.addBool(allocator, value.bool),
.number => try self.addNumber(allocator, value.number),
.string => try self.addString(allocator, value.string),
@ -174,8 +191,8 @@ fn addValue(self: *Self, allocator: std.mem.Allocator, value: JsonInput) !void {
}
}
fn getString(self: *Self, index: []const u8) ?StringIndex {
return self.string_index.string_table.get(index);
fn getProperty(self: *Self, index: []const u8) ?StringIndex {
return self.property_index.string_table.get(index);
}
fn getNumber(self: *Self, index: usize) ?f64 {
@ -200,7 +217,7 @@ fn getObject(self: *Self, allocator: std.mem.Allocator, index: usize) !?struct {
const values = try allocator.alloc(usize, entry.object.len);
for (0..entry.object.len) |i| {
const slice = StringIndex.slice(@enumFromInt(pidx), &self.string_index);
const slice = StringIndex.slice(@enumFromInt(pidx), &self.property_index);
keys[i] = @enumFromInt(pidx);
values[i] = vidx;
pidx += slice.len + 1;
@ -265,7 +282,7 @@ fn getValue(self: *Self, allocator: std.mem.Allocator, index: usize) !?JsonInput
try obj.ensureTotalCapacity(allocator, entry.object.len);
for (0..entry.object.len) |_| {
const key = StringIndex.slice(@enumFromInt(kidx), &self.string_index);
const key = StringIndex.slice(@enumFromInt(kidx), &self.property_index);
const val = (try self.getValue(allocator, vidx)).?;
obj.putAssumeCapacityNoClobber(key, val);
@ -289,7 +306,7 @@ pub fn parse(self: *Self, tokenizer: *Tokenizer) !void {
var cycles: usize = 0;
while (it.next()) |token| {
switch (token.type) {
flag: switch (token.type) {
.object_begin => {
std.debug.print("{{", .{});
const obj_idx = try self.addEmptyObject(allocator);
@ -302,9 +319,9 @@ pub fn parse(self: *Self, tokenizer: *Tokenizer) !void {
switch (data) {
.object => |valid_entry| {
const new_data = ObjectIndex.Entry{
const new_data = ObjectEntry{
.len = valid_entry.len + 1,
.property_idx = self.string_index.string_table.size,
.property_idx = self.property_index.string_table.size,
.value_idx = obj_idx,
};
self.index.set(depth_buf[cycles - 1], .{ .object = new_data });
@ -320,7 +337,7 @@ pub fn parse(self: *Self, tokenizer: *Tokenizer) !void {
const keys, const vals = (try self.getObject(allocator, depth_buf[cycles - 1])).?;
std.debug.print("\nfound {d} keys and {d} values\n", .{ keys.len, vals.len });
for (keys, vals) |k, v| {
const key = k.slice(&self.string_index);
const key = k.slice(&self.property_index);
const val = self.index.get(v);
std.debug.print(
\\"{s}": {s},
@ -328,14 +345,31 @@ pub fn parse(self: *Self, tokenizer: *Tokenizer) !void {
}
std.debug.print("}}", .{});
},
.string => {
const idx = try self.addString(allocator, token.value.?.string);
.property => {
_ = try self.addProperty(allocator, token.value.?.string);
const last_obj = self.index.get(depth_buf[cycles - 1]);
if (cycles > 0) {
const stridx = self.index.get(idx).string;
self.index.set(depth_buf[cycles - 1], .{ .object = ObjectIndex.Entry{
self.index.set(depth_buf[cycles - 1], .{ .object = ObjectEntry{
.len = last_obj.object.len + 1,
.property_idx = if (cycles > 1) @intFromEnum(stridx) else last_obj.object.property_idx,
.property_idx = last_obj.object.property_idx,
.value_idx = last_obj.object.value_idx,
} });
continue;
}
},
.string => {
// maybe we could dismiss the while loop altogether and just do this
// the whole time
if (it.peek()) |next| if (next.type == .colon) {
continue :flag TokenType.property;
};
_ = try self.addString(allocator, token.value.?.string);
const last_obj = self.index.get(depth_buf[cycles - 1]);
if (cycles > 0) {
self.index.set(depth_buf[cycles - 1], .{ .object = ObjectEntry{
.len = last_obj.object.len,
.property_idx = last_obj.object.property_idx,
.value_idx = last_obj.object.value_idx,
} });
continue;
@ -345,7 +379,7 @@ pub fn parse(self: *Self, tokenizer: *Tokenizer) !void {
_ = try self.addNumber(allocator, token.value.?.number);
const last_obj = self.index.get(depth_buf[cycles - 1]);
if (cycles > 0) {
self.index.set(depth_buf[cycles - 1], .{ .object = ObjectIndex.Entry{
self.index.set(depth_buf[cycles - 1], .{ .object = ObjectEntry{
.len = last_obj.object.len,
.property_idx = last_obj.object.property_idx,
.value_idx = last_obj.object.value_idx,
@ -357,7 +391,7 @@ pub fn parse(self: *Self, tokenizer: *Tokenizer) !void {
_ = try self.addBool(allocator, if (token.type == .true) true else false);
const last_obj = self.index.get(depth_buf[cycles - 1]);
if (cycles > 0) {
self.index.set(depth_buf[cycles - 1], .{ .object = ObjectIndex.Entry{
self.index.set(depth_buf[cycles - 1], .{ .object = ObjectEntry{
.len = last_obj.object.len,
.property_idx = last_obj.object.property_idx,
.value_idx = last_obj.object.value_idx,
@ -365,6 +399,23 @@ pub fn parse(self: *Self, tokenizer: *Tokenizer) !void {
continue;
}
},
.null => {
    _ = try self.addNull(allocator);
    // Only look up the enclosing container when one exists: at the top
    // level `cycles` is 0 and `cycles - 1` would underflow `usize`.
    if (cycles > 0) {
        const last_obj = self.index.get(depth_buf[cycles - 1]);
        // Write the enclosing object's entry back with the same fields.
        self.index.set(depth_buf[cycles - 1], .{ .object = ObjectEntry{
            .len = last_obj.object.len,
            .property_idx = last_obj.object.property_idx,
            .value_idx = last_obj.object.value_idx,
        } });
        continue;
    }
},
.comma => {
    // A comma directly followed by the end of the object is a trailing
    // comma. Reject it unless the options explicitly allow it.
    if (it.peek()) |tc| {
        if (tc.type == .object_end and !self.options.flags.allow_trailing_comma) {
            return error.TrailingComma;
        }
    }
},
else => {},
}
@ -384,10 +435,11 @@ test parse {
var tokenizer = try Tokenizer.init(allocator, blk: {
const json =
\\ {
\\ "key": 123,
\\ "key2": false,
\\ "key": "hello",
\\ "key2": "world",
\\ "key3": true,
\\ "key4": null
\\ "key4": null,
\\ "key5": 123
\\ }
;
break :blk json;

View File

@ -232,14 +232,14 @@ pub fn nextNumber(self: *Self) Error!Token {
return error.BadNumber; // no floating point
};
return self.commit(Token{
return Token{
.type = .number,
.value = .{
.number = float,
},
.start = start,
.end = self.currentPosition(),
});
};
};
while (self.matchCharRange('0', '9') != null) {}
@ -248,14 +248,14 @@ pub fn nextNumber(self: *Self) Error!Token {
return error.BadNumber; // floating point
};
return self.commit(Token{
return .{
.type = .number,
.value = .{
.number = float,
},
.start = start,
.end = self.currentPosition(),
});
};
}
/// Parse an identifier token
@ -284,32 +284,32 @@ pub fn nextIdentifier(self: *Self) Error!Token {
// true
if (std.mem.eql(u8, ident, "true")) {
return self.commit(Token{
return .{
.type = .true,
.value = null,
.start = start,
.end = self.currentPosition(),
});
};
}
// false
if (std.mem.eql(u8, ident, "false")) {
return self.commit(Token{
return .{
.type = .false,
.value = null,
.start = start,
.end = self.currentPosition(),
});
};
}
// null
if (std.mem.eql(u8, ident, "null")) {
return self.commit(Token{
return .{
.type = .null,
.value = null,
.start = start,
.end = self.currentPosition(),
});
};
}
unreachable;
@ -350,17 +350,17 @@ pub fn nextToken(self: *Self) Error!Token {
':' => .colon,
'"' => {
self.rollback();
return (self.nextString());
return self.commit(self.nextString());
},
else => {
self.rollback();
// Try different token types in order of precedence
if (std.ascii.isDigit(c) or c == '-') {
return (self.nextNumber());
return self.commit(self.nextNumber());
}
if (std.ascii.isAlphabetic(c)) {
return (self.nextIdentifier());
return self.commit(self.nextIdentifier());
}
return error.InvalidSyntax;
@ -392,12 +392,12 @@ pub fn nextString(self: *Self) Error!Token {
switch (self.lastChar()) {
'"' => {
return self.commit(Token{
return .{
.type = .string,
.value = .{ .string = try buffer.toOwnedSlice() },
.start = start,
.end = self.currentPosition(),
});
};
},
'\\' => {
self.advance(1);
@ -415,12 +415,12 @@ pub fn nextString(self: *Self) Error!Token {
var code_points: [4]u8 = undefined;
inline for (0..4) |i| {
if (self.endOfInput())
return self.commit(Token{
return .{
.type = .eof,
.value = null,
.start = start,
.end = start + 1,
});
};
self.advance(1);
code_points[i] = self.lastChar();
}
@ -446,6 +446,7 @@ pub fn nextString(self: *Self) Error!Token {
pub const Iterator = struct {
tokenizer: *Self,
/// Return the next token, or `null` at end of input.
///
/// A tokenizer error is also reported as `null`. Whitespace is skipped
/// on the way out on every exit path.
pub fn next(it: *Iterator) ?Token {
    const tokenizer = it.tokenizer;
    defer tokenizer.skipWhitespace();

    if (!tokenizer.endOfInput()) {
        return tokenizer.nextToken() catch null;
    }
    return null;
}
@ -455,6 +456,12 @@ pub const Iterator = struct {
it.tokenizer.frame = 0;
it.tokenizer.prev_token = null;
}
/// Return the upcoming token without consuming it, or `null` at end of
/// input or on a tokenizer error.
///
/// The cursor state is captured before scanning and put back afterwards,
/// so the following `next()` call sees the same token. Stepping
/// `position` back by one after the scan is only correct for a one-byte
/// token with no trailing whitespace.
///
/// NOTE(review): `position`, `frame` and `prev_token` are the cursor
/// fields visible in this part of the file; confirm the tokenizer keeps
/// no other scan state that needs restoring.
/// NOTE(review): a peeked string token carries an allocated slice —
/// confirm who frees it.
pub fn peek(it: *Iterator) ?Token {
    const tokenizer = it.tokenizer;

    const saved_position = tokenizer.position;
    const saved_frame = tokenizer.frame;
    const saved_prev_token = tokenizer.prev_token;
    defer {
        tokenizer.position = saved_position;
        tokenizer.frame = saved_frame;
        tokenizer.prev_token = saved_prev_token;
    }

    if (tokenizer.endOfInput()) return null;
    return tokenizer.nextToken() catch null;
}
};
/// iterator