This commit is contained in:
yuzu 2025-05-24 15:28:11 -05:00
parent 7372e0092b
commit 11b101d3e8
2 changed files with 195 additions and 134 deletions

View File

@ -1,6 +1,7 @@
const std = @import("std"); const std = @import("std");
const Tokenizer = @import("tokenizer.zig"); const Tokenizer = @import("tokenizer.zig");
const TokenType = Tokenizer.TokenType; const TokenType = Tokenizer.TokenType;
const Token = Tokenizer.Token;
const StringPool = @import("strings.zig"); const StringPool = @import("strings.zig");
const StringIndex = StringPool.StringIndex; const StringIndex = StringPool.StringIndex;
const assert = std.debug.assert; const assert = std.debug.assert;
@ -156,7 +157,7 @@ fn addObject(self: *Self, allocator: std.mem.Allocator, object: std.AutoArrayHas
self.index.set(idx, .{ .object = ObjectEntry{ self.index.set(idx, .{ .object = ObjectEntry{
.len = 0, .len = 0,
.property_idx = 0, .property_idx = 0,
.value_idx = 0, .value_idx = 1,
} }); } });
return idx; return idx;
} }
@ -167,7 +168,7 @@ fn addEmptyObject(self: *Self, allocator: std.mem.Allocator) !usize {
const idx = self.index.addOneAssumeCapacity(); const idx = self.index.addOneAssumeCapacity();
const object: ObjectEntry = .{ const object: ObjectEntry = .{
.property_idx = self.property_index.string_bytes.items.len, .property_idx = self.property_index.string_bytes.items.len,
.value_idx = self.index.len, .value_idx = self.index.len + 1,
.len = 0, .len = 0,
}; };
self.index.set(idx, .{ .object = object }); self.index.set(idx, .{ .object = object });
@ -299,43 +300,50 @@ fn getNull(self: *Self, index: usize) ?void {
return entry.null; return entry.null;
} }
fn getValue(self: *Self, allocator: std.mem.Allocator, index: usize) !?JsonInput { fn getValue(self: *Self, allocator: std.mem.Allocator, index: usize, offset: usize) !struct { ?JsonInput, usize } {
const entry = self.index.get(index); const entry = self.index.get(index);
switch (entry) { switch (entry) {
.null => return .{ .null = {} }, .null => return .{ .{ .null = {} }, 1 },
.bool => return .{ .bool = entry.bool }, .bool => return .{ .{ .bool = entry.bool }, 1 },
.number => return .{ .number = entry.number }, .number => return .{ .{ .number = entry.number }, 1 },
.string => { .string => {
const str = entry.string.slice(&self.string_index); const str = entry.string.slice(&self.string_index);
return .{ .string = str }; return .{ .{ .string = str }, 1 };
}, },
.array => { .array => {
const res = try allocator.alloc(JsonInput, entry.array.len); const res = try allocator.alloc(JsonInput, entry.array.len);
var idx = entry.array.start; var idx = entry.array.start;
var offset_calc: usize = offset;
for (0..entry.array.len) |i| { for (0..entry.array.len) |i| {
if (try self.getValue(allocator, idx)) |v| { const val, const step = try self.getValue(allocator, idx, offset_calc);
res[i] = v; res[i] = val.?;
idx += 1; idx += step;
} else unreachable;
} }
return .{ .array = res }; offset_calc += entry.array.len;
return .{ .{ .array = res }, offset_calc };
}, },
.object => { .object => {
var kidx = entry.object.property_idx; var kidx = entry.object.property_idx;
var vidx = entry.object.value_idx; var vidx = entry.object.value_idx;
var obj: std.StringArrayHashMapUnmanaged(JsonInput) = .empty; var obj: std.StringArrayHashMapUnmanaged(JsonInput) = .empty;
var offset_calc: usize = offset;
try obj.ensureTotalCapacity(allocator, entry.object.len); try obj.ensureTotalCapacity(allocator, entry.object.len);
for (0..entry.object.len) |_| { for (0..entry.object.len) |_| {
const key = StringIndex.slice(@enumFromInt(kidx), &self.property_index); const slice = StringIndex.slice(@enumFromInt(kidx), &self.property_index);
const val = (try self.getValue(allocator, vidx)).?; const val, const step = try self.getValue(allocator, vidx, offset_calc);
kidx += slice.len + 1;
vidx += step;
obj.putAssumeCapacityNoClobber(key, val); std.debug.print("putting {s} -> {d}\n", .{ slice, vidx });
kidx += key.len + 1; obj.putAssumeCapacityNoClobber(slice, val.?);
vidx += 1;
} }
return .{ .object = obj }; offset_calc += entry.object.len;
return .{ .{ .object = obj }, offset_calc };
}, },
} }
} }
@ -347,81 +355,144 @@ pub fn parse(self: *Self, tokenizer: *Tokenizer) !usize {
var it = tokenizer.iterator(); var it = tokenizer.iterator();
const root = try self.addEmptyObject(allocator); const root = try self.addEmptyObject(allocator);
var work_query = try allocator.alloc(usize, self.options.max_depth); defer std.debug.print("idx: {s}\n", .{
var cycles: usize = 0; @tagName(self.index.get(self.index.get(root).object.value_idx)),
});
//defer assert(cycles == 0); var token = it.next() orelse
return root;
while (it.next()) |token| { var query: std.BoundedArray(usize, self.options.max_depth) = try .init(0);
defer tokenizer.skipWhitespace();
std.debug.print("token: {s}\n", .{@tagName(token.type)}); flag: switch (token.type) {
.eof => {
assert(query.slice().len == 0);
return root;
},
.property => {
defer tokenizer.skipWhitespace();
flag: switch (token.type) { const scope_idx = query.get(query.len - 1);
.array_end => { switch (self.index.get(scope_idx)) {
cycles -= 1; .object => |scope| {
}, const pidx = try self.addProperty(allocator, token.value.?.string);
.object_end => { self.index.set(scope_idx, .{ .object = ObjectEntry{
cycles -= 1; .len = scope.len + 1,
}, .property_idx = if (scope.len == 0) pidx else scope.property_idx,
.array_begin => { .value_idx = scope.value_idx,
const idx = try self.addEmptyArray(allocator);
work_query[cycles] = idx;
cycles += 1;
},
.object_begin => {
if (cycles == 0) {
self.index.set(root, .{ .object = .{
.len = 0,
.property_idx = self.property_index.string_table.size,
.value_idx = self.index.len,
} }); } });
work_query[cycles] = root; },
} else { else => return error.InvalidSyntax,
const obj_idx = try self.addEmptyObject(allocator); }
work_query[cycles] = obj_idx;
} const next = it.next() orelse return error.InvalidSyntax;
cycles += 1; token = next;
}, switch (next.type) {
.property => { .colon => {
const scope_idx = work_query[cycles - 1]; token = it.next() orelse return error.InvalidSyntax;
switch (self.index.get(scope_idx)) { continue :flag token.type;
.object => |scope| { },
//std.debug.print("depth: {d}\n", .{cycles}); else => continue :flag next.type,
_ = try self.addProperty(allocator, token.value.?.string); // else => return error.InvalidSyntax,
self.index.set(scope_idx, .{ .object = ObjectEntry{ }
.len = scope.len + 1, },
.property_idx = scope.property_idx, .object_begin => {
.value_idx = scope.value_idx, defer tokenizer.skipWhitespace();
} });
}, if (query.slice().len == 0) {
else => unreachable, try query.ensureUnusedCapacity(1);
} const ptr = query.addOneAssumeCapacity();
}, ptr.* = root;
.string => { self.index.set(root, .{ .object = ObjectEntry{
if (it.peek()) |next| if (next.type == .colon) { .len = 0,
.property_idx = self.property_index.string_bytes.items.len,
.value_idx = 1,
} });
} else {
const idx_ptr = try query.addOne();
idx_ptr.* = try self.addEmptyObject(allocator);
self.index.set(idx_ptr.*, .{ .object = ObjectEntry{
.len = 0,
.property_idx = self.property_index.string_bytes.items.len,
.value_idx = self.index.len,
} });
}
const next = it.next() orelse return error.InvalidSyntax;
token = next;
switch (next.type) {
.string => continue :flag .property,
else => return error.InvalidSyntax,
}
},
.object_end => {
defer tokenizer.skipWhitespace();
assert(query.pop() != null);
const next = it.next() orelse
return root;
token = next;
switch (next.type) {
.comma => continue :flag .comma,
.object_end, .array_end => |t| continue :flag t,
else => return error.InvalidSyntax,
}
},
.true, .false => {
defer tokenizer.skipWhitespace();
_ = try self.addBool(allocator, if (token.type == .true) true else false);
const next = it.next() orelse return error.InvalidSyntax;
token = next;
switch (next.type) {
.comma => continue :flag .comma,
.object_end => continue :flag .object_end,
else => return error.InvalidSyntax,
}
},
.string => {
defer tokenizer.skipWhitespace();
const next = it.next() orelse return error.InvalidSyntax;
switch (next.type) {
.colon => {
continue :flag .property; continue :flag .property;
}; },
_ = try self.addString(allocator, token.value.?.string); else => |t| {
}, _ = try self.addString(allocator, token.value.?.string);
.number => {
_ = try self.addNumber(allocator, token.value.?.number); token = next;
}, continue :flag t;
.true, .false => { },
_ = try self.addBool(allocator, if (token.type == .true) true else false); }
}, },
.null => { .number => {
_ = try self.addNull(allocator); defer tokenizer.skipWhitespace();
},
.comma => if (it.peek()) |t| { _ = try self.addNumber(allocator, token.value.?.number);
if (t.type == .object_end) {
if (!self.options.flags.allow_trailing_comma) { const next = it.next() orelse return error.InvalidSyntax;
return error.TrailingComma; token = next;
} switch (next.type) {
.comma => continue :flag .comma,
.object_end => continue :flag .object_end,
else => return error.InvalidSyntax,
}
},
.comma => {
if (!self.options.flags.allow_trailing_comma) {
const next = it.next() orelse return error.InvalidSyntax;
token = next;
switch (next.type) {
.object_end, .array_end => return error.TrailingComma,
else => continue :flag token.type,
} }
}, }
else => continue, },
} else => {
std.debug.print("token: {s}\n", .{@tagName(token.type)});
},
} }
return root; return root;
@ -439,10 +510,13 @@ test parse {
var tokenizer = try Tokenizer.init(allocator, blk: { var tokenizer = try Tokenizer.init(allocator, blk: {
const json = const json =
\\ { \\ {
\\ "lazy": true, \\ "bio": "cool",
\\ "age": 15,
\\ "name": "yuzu", \\ "name": "yuzu",
\\ "dislikes": [["Math", 3], ["Sports", 1]], \\ "admin": true,
\\ "age": 15 \\ "address": {
\\ "lorem": "ipsum"
\\ }
\\ } \\ }
; ;
break :blk json; break :blk json;
@ -450,7 +524,7 @@ test parse {
const root = blk: { const root = blk: {
const idx = try parse(&self, &tokenizer); const idx = try parse(&self, &tokenizer);
const val = (try getValue(&self, allocator, idx)).?; const val, _ = try getValue(&self, allocator, idx, 0);
break :blk val; break :blk val;
}; };

View File

@ -58,18 +58,11 @@ pub const Token = struct {
pub const Self = @This(); pub const Self = @This();
text: []const u8, text: []const u8,
position: usize,
max_position: usize, max_position: usize,
stack: []usize, stack: []usize,
frame: usize, frame: usize,
allocator: std.mem.Allocator, allocator: std.mem.Allocator,
prev_token: ?Token = null,
pub fn pushBack(self: *Self, token: Token) void {
self.prev_token = token;
}
/// Initialize a new tokenizer /// Initialize a new tokenizer
pub fn init(allocator: std.mem.Allocator, text: []const u8) std.mem.Allocator.Error!Self { pub fn init(allocator: std.mem.Allocator, text: []const u8) std.mem.Allocator.Error!Self {
const stack = try allocator.alloc(usize, 0x100); const stack = try allocator.alloc(usize, 0x100);
@ -77,7 +70,6 @@ pub fn init(allocator: std.mem.Allocator, text: []const u8) std.mem.Allocator.Er
@memset(stack, 0); @memset(stack, 0);
return .{ return .{
.text = text, .text = text,
.position = 0,
.max_position = 0, .max_position = 0,
.stack = stack, .stack = stack,
.frame = 0, .frame = 0,
@ -201,6 +193,14 @@ pub fn matchCharRange(self: *Self, low: u8, high: u8) ?void {
self.advance(1); self.advance(1);
} }
pub fn anyChar(self: *Self) ?u8 {
if (self.endOfInput())
return null;
const char = self.text[self.currentPosition()];
self.advance(1);
return char;
}
// ========== Token Extraction ========== // ========== Token Extraction ==========
fn extractSlice(self: *Self, start: usize) []const u8 { fn extractSlice(self: *Self, start: usize) []const u8 {
@ -319,35 +319,33 @@ pub fn nextIdentifier(self: *Self) Error!Token {
/// Get the next token from the input /// Get the next token from the input
/// WARNING: this function eats whitespaces /// WARNING: this function eats whitespaces
pub fn nextToken(self: *Self) Error!Token { pub fn nextToken(self: *Self) Error!Token {
if (self.prev_token) |tok| {
self.prev_token = null;
return tok;
}
const start = try self.pushFrame(); const start = try self.pushFrame();
errdefer self.popFrame(); errdefer self.popFrame();
self.skipWhitespace(); self.skipWhitespace();
if (self.endOfInput()) {
return Token{
.type = .eof,
.value = null,
.start = start,
.end = start,
};
}
self.advance(1);
// Fall back to single character symbol // Fall back to single character symbol
const c = self.lastChar(); const c = self.anyChar() orelse return .{
.type = .eof,
.value = null,
.start = start,
.end = start,
};
const symbol_t: TokenType = switch (c) { const symbol_t: TokenType = switch (c) {
'{' => .object_begin, '{' => .object_begin,
'}' => .object_end, '}' => .object_end,
'[' => .array_begin, '[' => .array_begin,
']' => .array_end, ']' => .array_end,
',' => .comma, ',' => {
self.skipWhitespace();
return self.commit(Token{
.type = .comma,
.value = null,
.end = start + 1,
.start = start,
});
},
':' => .colon, ':' => .colon,
'"' => { '"' => {
self.rollback(); self.rollback();
@ -449,31 +447,20 @@ pub const Iterator = struct {
pub fn next(it: *Iterator) ?Token { pub fn next(it: *Iterator) ?Token {
defer it.tokenizer.skipWhitespace(); defer it.tokenizer.skipWhitespace();
if (it.tokenizer.endOfInput()) return null; if (it.tokenizer.endOfInput()) {
return it.tokenizer.nextToken() catch null; std.debug.print("got eof\n", .{});
return null;
}
return it.tokenizer.nextToken() catch |err| {
std.debug.print("got err: {s}\n", .{@errorName(err)});
return null;
};
} }
pub fn reset(it: *Iterator) void { pub fn reset(it: *Iterator) void {
it.tokenizer.position = 0; it.tokenizer.position = 0;
it.tokenizer.max_position = 0; it.tokenizer.max_position = 0;
it.tokenizer.frame = 0; it.tokenizer.frame = 0;
it.tokenizer.prev_token = null;
}
/// nasty trick
pub fn peek(it: *Iterator) ?Token {
const frame = it.tokenizer.frame;
const pos = it.tokenizer.position;
const prev = it.tokenizer.prev_token;
const max_pos = it.tokenizer.max_position;
defer {
it.tokenizer.position = pos;
it.tokenizer.frame = frame;
it.tokenizer.max_position = max_pos;
it.tokenizer.prev_token = prev;
}
if (it.tokenizer.endOfInput()) return null;
return it.tokenizer.nextToken() catch null;
} }
}; };