aether/language.zig
2025-05-25 13:46:15 -05:00

619 lines
19 KiB
Zig

const std = @import("std");
const Tokenizer = @import("tokenizer.zig");
const TokenType = Tokenizer.TokenType;
const Token = Tokenizer.Token;
const StringPool = @import("strings.zig");
const StringIndex = StringPool.StringIndex;
const assert = std.debug.assert;
const Self = @This();
/// Parser diagnostics expressed as a plain enum (not a Zig error set).
/// NOTE(review): nothing in the visible part of this file references this type;
/// `parse` reports the same condition through `error.TrailingComma` instead.
pub const Error = enum {
/// Presumably mirrors `error.TrailingComma` returned by `parse` — confirm.
TrailingComma,
};
/// The six kinds of JSON value.
/// Used as the tag type of both `JsonValue` and `JsonInput`.
pub const JsonType = enum {
null,
bool,
number,
string,
array,
object,
};
/// Flat representation of a JSON value as stored in `index`.
/// Scalars are stored inline; strings are handles into a `StringPool`;
/// arrays and objects are descriptors that point at other slots of `index`.
pub const JsonValue = union(JsonType) {
null: void,
bool: bool,
number: f64,
/// Handle of the string's bytes (resolved against `string_index` by `getValue`).
string: StringIndex,
/// Where the elements start in `index` and how many there are.
array: ArraySlice,
/// Where the keys and values start and how many properties there are.
object: ObjectEntry,
};
/// Tree-shaped JSON value, as produced by `getValue` and consumed by the
/// `add*` functions.
///
/// Ownership: `array` slices and `object` maps are owned by the value and are
/// released by `deinit`. `string` payloads and object keys are NOT freed by
/// `deinit`; they are treated as borrowed bytes.
pub const JsonInput = union(JsonType) {
    null: void,
    bool: bool,
    number: f64,
    string: []const u8,
    array: []JsonInput,
    object: std.StringArrayHashMapUnmanaged(JsonInput),

    /// Recursively releases every array slice and object map reachable from
    /// `self` using `allocator`. Strings and keys are left untouched.
    pub fn deinit(self: JsonInput, allocator: std.mem.Allocator) void {
        switch (self) {
            // Scalars own nothing; strings are borrowed.
            .null, .bool, .number, .string => {},
            .array => |items| {
                for (items) |item| item.deinit(allocator);
                allocator.free(items);
            },
            .object => |object| {
                for (object.values()) |member| member.deinit(allocator);
                // `self` is an immutable parameter, so deinit a mutable copy of
                // the map header instead of casting away constness.
                var map = object;
                map.deinit(allocator);
            },
        }
    }

    /// `std.fmt` hook: writes the value as compact JSON-like text.
    /// Strings and keys are wrapped in double quotes but are not escaped.
    pub fn format(
        self: @This(),
        comptime fmt: []const u8,
        opts: std.fmt.FormatOptions,
        writer: anytype,
    ) !void {
        switch (self) {
            .null => try writer.writeAll("null"),
            .bool => |b| try writer.writeAll(if (b) "true" else "false"),
            .number => |n| try writer.print("{d}", .{n}),
            .string => |s| try writer.print("\"{s}\"", .{s}),
            .array => |items| {
                try writer.writeByte('[');
                for (items, 0..) |item, i| {
                    // Separator goes before every element except the first.
                    if (i != 0) try writer.writeByte(',');
                    try item.format(fmt, opts, writer);
                }
                try writer.writeByte(']');
            },
            .object => |object| {
                try writer.writeByte('{');
                for (object.keys(), object.values(), 0..) |key, member, i| {
                    if (i != 0) try writer.writeByte(',');
                    try writer.print("\"{s}\"", .{key});
                    try writer.writeByte(':');
                    try member.format(fmt, opts, writer);
                }
                try writer.writeByte('}');
            },
        }
    }
};
/// Descriptor of a JSON array inside `index` (the array counterpart of `ObjectEntry`).
/// `start` is the slot of the first element and `len` is the number of elements.
/// An element that is itself an array or object occupies more than one slot,
/// so consecutive elements are not necessarily `1` apart (see `skipSlots`).
pub const ArraySlice = struct {
start: usize,
len: usize,
};
/// Descriptor of a JSON object inside `index`.
/// `len` is the number of properties, `property_idx` is the offset of the first
/// key in `property_index`, and `value_idx` is the slot of the first value in `index`.
/// Properties are stored in order: advance the key offset by the key's length
/// plus one and the value slot by the number of slots the value occupies
/// (see `getValue`, `skipSlots` and `skipNestedProps`).
pub const ObjectEntry = struct {
len: usize,
property_idx: usize,
value_idx: usize,
};
/// Boolean parser switches, packed into a bit field.
pub const Flags = packed struct {
/// Consulted by `parse` when a comma is followed by a closing token;
/// when false, that situation yields `error.TrailingComma`.
allow_trailing_comma: bool = false,
};
/// Parser configuration. Every field is `comptime`, so the values are fixed
/// to the defaults below at compile time.
pub const Options = struct {
// NOTE(review): not used in the visible part of this file — presumably for
// pretty-printing; confirm.
comptime indent_len: usize = 4,
/// Capacity of the scope stack in `parse`, i.e. the maximum object nesting depth.
comptime max_depth: usize = 256,
comptime flags: Flags = .{},
};
/// Flat storage for every value; arrays and objects refer to other slots of this list.
index: std.MultiArrayList(JsonValue) = .{},
/// Pool holding the bytes of string values (filled by `addString`).
string_index: StringPool = .empty,
/// Pool holding the bytes of object property names (filled by `addProperty`).
property_index: StringPool = .empty,
/// Parser configuration.
options: Options = .{},
/// An empty instance with default options. Release it with `deinit`.
pub const init = Self{};
/// Frees the value index and both string pools.
/// `allocator` must be the allocator that was used to populate them.
pub fn deinit(self: *Self, allocator: std.mem.Allocator) void {
    // The three containers are independent, so they are torn down in reverse
    // declaration order.
    self.property_index.deinit(allocator);
    self.string_index.deinit(allocator);
    self.index.deinit(allocator);
}
/// Appends a number value to `index` and returns the slot it was stored in.
fn addNumber(self: *Self, allocator: std.mem.Allocator, number: f64) !usize {
    // The new element lands at the current end of the list.
    const slot = self.index.len;
    try self.index.append(allocator, .{ .number = number });
    return slot;
}
/// Stores a property name in `property_index` and returns its handle as an integer.
/// Also reserves room in `index` for one more element, so the slot for the
/// property's value is already available.
fn addProperty(self: *Self, allocator: std.mem.Allocator, bytes: []const u8) !usize {
    const key_handle = try self.property_index.add(allocator, bytes);
    try self.index.ensureUnusedCapacity(allocator, 1);
    const raw: usize = @intFromEnum(key_handle);
    return raw;
}
/// Copies `bytes` into `string_index`, appends a string value referring to it,
/// and returns the slot of that value in `index`.
fn addString(self: *Self, allocator: std.mem.Allocator, bytes: []const u8) !usize {
    const handle = try self.string_index.add(allocator, bytes);
    const slot = self.index.len;
    try self.index.append(allocator, .{ .string = handle });
    return slot;
}
/// Appends `object` and, recursively, all of its members to the index.
/// Returns the slot of the object descriptor.
///
/// Layout matches what `parse` produces and `getValue` reads: the descriptor
/// comes first, the members' values follow it in order (each occupying as many
/// slots as `skipSlots` reports), and the keys are appended to `property_index`
/// in the same order.
///
/// The parameter type is the map type carried by `JsonInput.object`.
fn addObject(self: *Self, allocator: std.mem.Allocator, object: std.StringArrayHashMapUnmanaged(JsonInput)) !usize {
    try self.index.ensureUnusedCapacity(allocator, 1);
    const idx = self.index.addOneAssumeCapacity();

    // Start empty and grow `len` as members are added, so the descriptor never
    // claims members that were not written if an allocation fails part-way.
    var entry: ObjectEntry = .{
        .len = 0,
        .property_idx = self.property_index.string_bytes.items.len,
        .value_idx = idx + 1,
    };
    self.index.set(idx, .{ .object = entry });

    for (object.keys(), object.values()) |key, value| {
        const stridx = try self.property_index.add(allocator, key);
        // The descriptor records where the first key lives.
        if (entry.len == 0) entry.property_idx = @intFromEnum(stridx);
        try self.addValue(allocator, value);
        entry.len += 1;
        self.index.set(idx, .{ .object = entry });
    }
    return idx;
}
/// Appends an object descriptor with no properties and returns its slot.
/// `property_idx` is set to the current end of the property bytes and
/// `value_idx` to the slot right after the descriptor, which is where the
/// first value will be stored if properties are added later.
fn addEmptyObject(self: *Self, allocator: std.mem.Allocator) !usize {
    try self.index.ensureUnusedCapacity(allocator, 1);
    const idx = self.index.addOneAssumeCapacity();
    const object: ObjectEntry = .{
        .property_idx = self.property_index.string_bytes.items.len,
        // The list has already grown by one here, so the next free slot is
        // `idx + 1` (the same slot `addEmptyArray` records as `start`).
        .value_idx = idx + 1,
        .len = 0,
    };
    self.index.set(idx, .{ .object = object });
    return idx;
}
/// Appends `array` and, recursively, all of its elements to the index.
/// Returns the slot of the array descriptor.
///
/// The descriptor comes first and the elements follow it in order, which is
/// the layout `getValue` and `skipSlots` walk.
fn addArray(self: *Self, allocator: std.mem.Allocator, array: []JsonInput) !usize {
    try self.index.ensureUnusedCapacity(allocator, 1);
    const idx = self.index.addOneAssumeCapacity();

    // Grow `len` element by element so a failed allocation never leaves the
    // descriptor pointing at slots that were not written.
    var entry: ArraySlice = .{ .start = idx + 1, .len = 0 };
    self.index.set(idx, .{ .array = entry });

    for (array) |value| {
        try self.addValue(allocator, value);
        entry.len += 1;
        self.index.set(idx, .{ .array = entry });
    }
    return idx;
}
/// Appends an array descriptor with no elements and returns its slot.
/// `start` refers to the slot immediately after the descriptor.
fn addEmptyArray(self: *Self, allocator: std.mem.Allocator) !usize {
    const slot = self.index.len;
    const empty: ArraySlice = .{ .start = slot + 1, .len = 0 };
    try self.index.append(allocator, .{ .array = empty });
    return slot;
}
/// Appends a boolean value to `index` and returns the slot it was stored in.
fn addBool(self: *Self, allocator: std.mem.Allocator, value: bool) !usize {
    const slot = self.index.len;
    try self.index.append(allocator, .{ .bool = value });
    return slot;
}
/// Appends a `null` value to `index` and returns the slot it was stored in.
fn addNull(self: *Self, allocator: std.mem.Allocator) !usize {
    const slot = self.index.len;
    try self.index.append(allocator, .{ .null = {} });
    return slot;
}
/// Appends `value` to the index by dispatching on its kind to the matching
/// `add*` function. Arrays and objects are added recursively.
/// The slot returned by the specific function is not reported to the caller.
fn addValue(self: *Self, allocator: std.mem.Allocator, value: JsonInput) !void {
    // Every `add*` function returns the slot it wrote; this function returns
    // `void`, so the slot is discarded explicitly.
    _ = switch (value) {
        .null => try self.addNull(allocator),
        .bool => |b| try self.addBool(allocator, b),
        .number => |n| try self.addNumber(allocator, n),
        .string => |s| try self.addString(allocator, s),
        .array => |items| try self.addArray(allocator, items),
        .object => |members| try self.addObject(allocator, members),
    };
}
/// Looks `index` (a property name) up in the property pool's string table
/// and returns whatever that lookup yields.
fn getProperty(self: *Self, index: []const u8) ?StringIndex {
    const found = self.property_index.string_table.get(index);
    return found;
}
/// Returns the number stored at slot `index`, or `null` when the slot is out
/// of range or holds a value of another kind.
fn getNumber(self: *Self, index: usize) ?f64 {
    // `MultiArrayList.get` is not optional and asserts the bound, so check first.
    if (index >= self.index.len) return null;
    return switch (self.index.get(index)) {
        .number => |n| n,
        else => null,
    };
}
/// Lists the direct properties of the object stored at slot `index`.
/// Returns two parallel slices: the key handles (into `property_index`) and
/// the slots of the corresponding values (into `index`).
///
/// Both slices are allocated with `allocator` and owned by the caller. For an
/// empty object both are zero-length and nothing is allocated.
/// The slot at `index` must hold an object.
fn getObject(self: *Self, allocator: std.mem.Allocator, index: usize) !struct {
    []StringIndex,
    []usize,
} {
    const entry = self.index.get(index);
    if (entry.object.len == 0) {
        return .{ &.{}, &.{} };
    }
    const keys = try allocator.alloc(StringIndex, entry.object.len);
    // Do not leak `keys` if the second allocation fails.
    errdefer allocator.free(keys);
    const values = try allocator.alloc(usize, entry.object.len);

    var pidx = entry.object.property_idx;
    var vidx = entry.object.value_idx;
    for (keys, values) |*key, *value| {
        const name: StringIndex = @enumFromInt(pidx);
        key.* = name;
        value.* = vidx;
        // Step over this key (its bytes plus one separator) ...
        pidx += name.slice(&self.property_index).len + 1;
        // ... and over the keys of anything nested inside the value, which are
        // stored between this key and the next sibling key.
        self.skipNestedProps(&pidx, vidx);
        // A nested array/object occupies more than one slot.
        vidx += self.skipSlots(vidx);
    }
    return .{ keys, values };
}
/// Lists the slots of the direct elements of the array stored at slot `index`.
/// The returned slice is allocated with `allocator` and owned by the caller;
/// for an empty array it is zero-length and nothing is allocated.
/// The slot at `index` must hold an array.
fn getArray(self: *Self, allocator: std.mem.Allocator, index: usize) ![]usize {
    const entry = self.index.get(index);
    if (entry.array.len == 0) {
        return &.{};
    }
    const values = try allocator.alloc(usize, entry.array.len);
    var idx = entry.array.start;
    for (values) |*value| {
        value.* = idx;
        // An element that is itself an array/object spans several slots, so
        // the next sibling is not simply `idx + 1`.
        idx += self.skipSlots(idx);
    }
    return values;
}
/// Returns the boolean stored at slot `index`, or `null` when the slot is out
/// of range or holds a value of another kind.
fn getBool(self: *Self, index: usize) ?bool {
    // `MultiArrayList.get` is not optional and asserts the bound, so check first.
    if (index >= self.index.len) return null;
    return switch (self.index.get(index)) {
        .bool => |b| b,
        else => null,
    };
}
/// Returns `{}` when slot `index` holds a JSON `null`, and the optional
/// `null` when the slot is out of range or holds a value of another kind.
fn getNull(self: *Self, index: usize) ?void {
    // `MultiArrayList.get` is not optional and asserts the bound, so check first.
    if (index >= self.index.len) return null;
    return switch (self.index.get(index)) {
        .null => {},
        else => null,
    };
}
// Number of `index` slots taken by the node at `slot`: one for the node itself
// plus, for arrays and objects, the slots of every child (recursively).
fn skipSlots(self: *Self, slot: usize) usize {
    // Arrays and objects only differ in where the child range is recorded.
    const children: struct { first: usize, count: usize } = switch (self.index.get(slot)) {
        .object => |obj| .{ .first = obj.value_idx, .count = obj.len },
        .array => |arr| .{ .first = arr.start, .count = arr.len },
        else => return 1,
    };

    // Walk the children; the distance travelled is the sum of their sizes.
    var cursor = children.first;
    var remaining = children.count;
    while (remaining != 0) : (remaining -= 1) {
        cursor += self.skipSlots(cursor);
    }
    return 1 + (cursor - children.first);
}
// Total length of `count` consecutive property names starting at offset `pidx`,
// counting one extra unit after each name.
fn skipProps(self: *Self, pidx: usize, count: usize) usize {
    var cursor = pidx;
    var remaining = count;
    while (remaining != 0) : (remaining -= 1) {
        const key: StringIndex = @enumFromInt(cursor);
        cursor += key.slice(&self.property_index).len + 1;
    }
    // Everything the cursor moved over is the requested total.
    return cursor - pidx;
}
/// Advances the property offset `pptr.*` past every property name that belongs
/// to the value stored at `slot`, including names of objects nested inside it.
///
/// On entry `pptr.*` must be the offset of the first name belonging to that
/// value. Scalars own no names, so the offset is left unchanged for them.
/// Objects contribute their own keys plus those of their members; arrays
/// contribute the keys of any objects among their elements.
fn skipNestedProps(self: *Self, pptr: *usize, slot: usize) void {
    switch (self.index.get(slot)) {
        .object => |obj| {
            var v = obj.value_idx;
            for (0..obj.len) |_| {
                // Skip this key. Convert the offset by value rather than
                // reinterpreting the `usize` pointer, which would depend on
                // the width and byte order of `StringIndex`.
                const key: StringIndex = @enumFromInt(pptr.*);
                pptr.* += key.slice(&self.property_index).len + 1;
                // Skip the names nested inside this property's value.
                self.skipNestedProps(pptr, v);
                // Move on to the next sibling value.
                v += self.skipSlots(v);
            }
        },
        .array => |arr| {
            // Objects inside an array store their keys in the same pool, in
            // element order.
            var c = arr.start;
            for (0..arr.len) |_| {
                self.skipNestedProps(pptr, c);
                c += self.skipSlots(c);
            }
        },
        else => {},
    }
}
/// Rebuilds the value stored at slot `idx` as a `JsonInput` tree.
///
/// Arrays and object maps are allocated with `allocator`; the caller owns the
/// result and releases it with `JsonInput.deinit`. String payloads and object
/// keys are slices into `string_index` / `property_index` and stay valid only
/// as long as those pools are not modified or freed.
///
/// If an allocation fails, everything built so far is released before the
/// error is returned.
fn getValue(
    self: *Self,
    allocator: std.mem.Allocator,
    idx: usize,
) !JsonInput {
    const entry = self.index.get(idx);
    switch (entry) {
        .null => return .null,
        .bool => |b| return .{ .bool = b },
        .number => |n| return .{ .number = n },
        .string => |string| {
            const sl = string.slice(&self.string_index);
            return .{ .string = sl };
        },
        .array => |arr| {
            const out = try allocator.alloc(JsonInput, arr.len);
            // `filled` counts the elements that are initialized, so the error
            // path only tears down what actually exists.
            var filled: usize = 0;
            errdefer {
                for (out[0..filled]) |item| item.deinit(allocator);
                allocator.free(out);
            }
            var c = arr.start;
            while (filled < arr.len) : (filled += 1) {
                out[filled] = try self.getValue(allocator, c);
                c += self.skipSlots(c);
            }
            return .{ .array = out };
        },
        .object => |obj| {
            var map: std.StringArrayHashMapUnmanaged(JsonInput) = .empty;
            errdefer {
                for (map.values()) |member| member.deinit(allocator);
                map.deinit(allocator);
            }
            var p = obj.property_idx;
            var v = obj.value_idx;
            for (0..obj.len) |_| {
                // Extract key
                const k: StringIndex = @enumFromInt(p);
                const key_slice = k.slice(&self.property_index);
                // Extract and assign value
                const val = try self.getValue(allocator, v);
                const previous = map.fetchPut(allocator, key_slice, val) catch |err| {
                    // `val` is not in the map yet, so the errdefer above
                    // would not see it.
                    val.deinit(allocator);
                    return err;
                };
                // A repeated key replaces the earlier value; release the old one.
                if (previous) |kv| kv.value.deinit(allocator);
                // Advance past this key
                p += key_slice.len + 1;
                // Skip nested property names of this value
                self.skipNestedProps(&p, v);
                // Advance past the value slots
                v += self.skipSlots(v);
            }
            return .{ .object = map };
        },
    }
}
// Round trip: tokenize and parse a nested document, rebuild it with
// `getValue`, and check that the root comes back as an object.
test getValue {
    const allocator = std.testing.allocator;
    const json =
        \\ {
        \\   "name": "Yuzu",
        \\   "author": true,
        \\   "age": 15,
        \\   "address": {
        \\     "street": 1,
        \\     "deeply_nested": {
        \\       "k": 5,
        \\       "socialist": "expansion",
        \\       "idk": {"a":"b"}
        \\     }
        \\   },
        \\   "offset": "yes"
        \\ }
    ;
    var tokenizer: Tokenizer = try .init(allocator, json);
    defer tokenizer.deinit();
    var self = init;
    defer self.deinit(allocator);
    const idx: usize = try parse(&self, &tokenizer);
    // `JsonInput.deinit` takes its receiver by value, so `root` is never
    // mutated and must be `const`.
    const root = try getValue(&self, allocator, idx);
    defer root.deinit(allocator);
    try std.testing.expect(root == .object);
    std.debug.print("{}\n", .{root});
}
/// Consumes the tokens of `tokenizer` and stores the document in `self`.
///
/// Returns the slot of the root object, which is the slot appended first
/// (0 when `self.index` was empty on entry). Allocations use
/// `tokenizer.allocator`. String tokens are copied into the pools and their
/// token buffers are freed here once copied.
///
/// Errors:
/// - `error.InvalidSyntax` for an unexpected token, a property or closing
///   brace outside of any object, or input that ends inside an object.
/// - `error.TrailingComma` when a comma is followed by a closing token and
///   `options.flags.allow_trailing_comma` is false.
/// - `error.Overflow` when objects nest deeper than `options.max_depth`.
/// - any error of the allocator.
///
/// NOTE(review): token kinds without a case below (the `else` arm) end
/// parsing and return what has been stored so far; that behavior is kept as is.
pub fn parse(self: *Self, tokenizer: *Tokenizer) !usize {
    const allocator = tokenizer.allocator;
    var it = tokenizer.iterator();
    const root = try self.addEmptyObject(allocator);
    var token = it.next() orelse
        return root;
    // Stack of the object slots that are currently open; the last item is the
    // object that receives the next property.
    var query: std.BoundedArray(usize, self.options.max_depth) = try .init(0);
    flag: switch (token.type) {
        .eof => {
            // Reaching the end with an object still open is a syntax error,
            // not a programming error.
            if (query.len != 0) return error.InvalidSyntax;
            return root;
        },
        .property => {
            defer tokenizer.skipWhitespace();
            // A property needs an enclosing object.
            if (query.len == 0) return error.InvalidSyntax;
            const scope_idx = query.get(query.len - 1);
            switch (self.index.get(scope_idx)) {
                .object => |scope| {
                    const pidx = try self.addProperty(allocator, token.value.?.string);
                    allocator.free(token.value.?.string);
                    self.index.set(scope_idx, .{ .object = ObjectEntry{
                        .len = scope.len + 1,
                        // The first property fixes where the object's keys start.
                        .property_idx = if (scope.len == 0) pidx else scope.property_idx,
                        .value_idx = scope.value_idx,
                    } });
                },
                else => return error.InvalidSyntax,
            }
            const next = it.next() orelse return error.InvalidSyntax;
            token = next;
            switch (next.type) {
                .colon => {
                    token = it.next() orelse return error.InvalidSyntax;
                    continue :flag token.type;
                },
                // When arriving from the `.string` case the colon has already
                // been consumed, so `next` is the value itself.
                else => continue :flag next.type,
            }
        },
        .object_begin => {
            defer tokenizer.skipWhitespace();
            if (query.len == 0) {
                // The outermost object reuses the root slot created above.
                try query.ensureUnusedCapacity(1);
                query.appendAssumeCapacity(root);
                self.index.set(root, .{ .object = ObjectEntry{
                    .len = 0,
                    .property_idx = 0,
                    // Values follow the root slot (this is 1 when root is 0).
                    .value_idx = root + 1,
                } });
            } else {
                // Reserve the stack entry before touching the index so that
                // exceeding `max_depth` leaves the index unchanged.
                try query.ensureUnusedCapacity(1);
                const child = try self.addEmptyObject(allocator);
                query.appendAssumeCapacity(child);
                self.index.set(child, .{
                    .object = ObjectEntry{
                        .len = 0,
                        // Placeholder; replaced when the first property is added.
                        .property_idx = self.index.len,
                        .value_idx = self.index.len,
                    },
                });
            }
            const next = it.next() orelse return error.InvalidSyntax;
            token = next;
            switch (next.type) {
                .string => continue :flag .property,
                // `{}` is a valid, empty object.
                .object_end => continue :flag .object_end,
                else => return error.InvalidSyntax,
            }
        },
        .object_end => {
            defer tokenizer.skipWhitespace();
            // A closing brace without a matching open object is a syntax error.
            _ = query.pop() orelse return error.InvalidSyntax;
            const next = it.next() orelse {
                // Input ended: fine only if every object has been closed.
                if (query.len != 0) return error.InvalidSyntax;
                return root;
            };
            token = next;
            switch (next.type) {
                .comma => continue :flag .comma,
                .object_end, .array_end => |t| continue :flag t,
                .eof => continue :flag .eof,
                else => return error.InvalidSyntax,
            }
        },
        .true, .false => {
            defer tokenizer.skipWhitespace();
            _ = try self.addBool(allocator, token.type == .true);
            const next = it.next() orelse return error.InvalidSyntax;
            token = next;
            switch (next.type) {
                .comma => continue :flag .comma,
                .object_end => continue :flag .object_end,
                else => return error.InvalidSyntax,
            }
        },
        .string => {
            defer tokenizer.skipWhitespace();
            const next = it.next() orelse return error.InvalidSyntax;
            switch (next.type) {
                .colon => {
                    // A string followed by a colon is a property name; `token`
                    // still refers to the string.
                    continue :flag .property;
                },
                else => |t| {
                    _ = try self.addString(allocator, token.value.?.string);
                    allocator.free(token.value.?.string);
                    token = next;
                    continue :flag t;
                },
            }
        },
        .number => {
            defer tokenizer.skipWhitespace();
            _ = try self.addNumber(allocator, token.value.?.number);
            const next = it.next() orelse return error.InvalidSyntax;
            token = next;
            switch (next.type) {
                .comma => continue :flag .comma,
                .object_end => continue :flag .object_end,
                else => return error.InvalidSyntax,
            }
        },
        .comma => {
            const next = it.next() orelse return error.InvalidSyntax;
            token = next;
            switch (next.type) {
                .object_end, .array_end => |t| {
                    if (!self.options.flags.allow_trailing_comma) return error.TrailingComma;
                    // Trailing commas are allowed: carry on with the closer
                    // instead of ending the parse at the comma.
                    continue :flag t;
                },
                else => continue :flag token.type,
            }
        },
        else => {
            // std.debug.print("token: {s}\n", .{@tagName(token.type)});
        },
    }
    return root;
}