diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..19892e4 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +.zig-cache diff --git a/2.zig b/language.zig similarity index 73% rename from 2.zig rename to language.zig index a9477fe..e872253 100644 --- a/2.zig +++ b/language.zig @@ -1,5 +1,6 @@ const std = @import("std"); const Tokenizer = @import("tokenizer.zig"); +const TokenType = Tokenizer.TokenType; const StringPool = @import("strings.zig"); const StringIndex = StringPool.StringIndex; const assert = std.debug.assert; @@ -22,8 +23,8 @@ pub const JsonValue = union(JsonType) { bool: bool, number: f64, string: StringIndex, - array: ArrayIndex.Slice, - object: ObjectIndex.Entry, + array: ArraySlice, + object: ObjectEntry, }; pub const JsonInput = union(JsonType) { @@ -35,31 +36,34 @@ pub const JsonInput = union(JsonType) { object: std.AutoArrayHashMapUnmanaged([]const u8, JsonInput), }; -pub const ArrayIndex = enum(usize) { - _, - - pub const Slice = struct { - start: usize, - len: usize, - }; +/// same as ObjectEntry but simpler +/// start is the offset +pub const ArraySlice = struct { + start: usize, + len: usize, }; -pub const ObjectIndex = enum(usize) { - _, +/// just += the properties and value indexes to get the next item +/// property_idx and value_idx are the offset +/// it should be ordered +pub const ObjectEntry = struct { + len: usize, + property_idx: usize, + value_idx: usize, +}; - pub const Entry = struct { - len: usize, - property_idx: usize, - value_idx: usize, - }; +pub const Flags = packed struct { + allow_trailing_comma: bool = false, }; pub const Options = struct { comptime max_depth: usize = 256, + comptime flags: Flags = .{}, }; index: std.MultiArrayList(JsonValue) = .{}, string_index: StringPool = .empty, +property_index: StringPool = .empty, options: Options = .{}, @@ -67,7 +71,7 @@ pub const init: Self = .{}; pub fn deinit(self: *Self, allocator: std.mem.Allocator) void { self.index.deinit(allocator); - self.string_index.deinit(allocator); + self.property_index.deinit(allocator); } fn addNumber(self: *Self, allocator: std.mem.Allocator, number: f64) !usize { @@ -77,6 +81,12 @@ fn addNumber(self: *Self, allocator: std.mem.Allocator, number: f64) !usize { return idx; } +fn addProperty(self: *Self, allocator: std.mem.Allocator, bytes: []const u8) !usize { + const stridx = try self.property_index.add(allocator, bytes); + try self.index.ensureUnusedCapacity(allocator, 1); + return @intFromEnum(stridx); +} + fn addString(self: *Self, allocator: std.mem.Allocator, bytes: []const u8) !usize { const stridx = try self.string_index.add(allocator, bytes); try self.index.ensureUnusedCapacity(allocator, 1); @@ -86,15 +96,15 @@ fn addString(self: *Self, allocator: std.mem.Allocator, bytes: []const u8) !usiz } fn addObject(self: *Self, allocator: std.mem.Allocator, object: std.AutoArrayHashMapUnmanaged([]const u8, JsonInput)) !usize { - var entry: ?ObjectIndex.Entry = null; + var entry: ?ObjectEntry = null; for (object.keys(), object.values(), 0..) |key, value, times| { - const stridx = try self.string_index.add(allocator, key); + const stridx = try self.property_index.add(allocator, key); try self.index.ensureUnusedCapacity(allocator, 1); const vidx = self.index.addOneAssumeCapacity(); self.index.set(vidx, @unionInit(JsonValue, std.meta.activeTag(value), self.addValue(allocator, value))); if (times == 0) { - entry = ObjectIndex.Entry{ + entry = ObjectEntry{ .len = object.entries.len, .property_idx = stridx, .value_idx = vidx, @@ -108,7 +118,7 @@ fn addObject(self: *Self, allocator: std.mem.Allocator, object: std.AutoArrayHas self.index.set(idx, .{ .object = e }); return idx; } else { - self.index.set(idx, .{ .object = ObjectIndex.Entry{ + self.index.set(idx, .{ .object = ObjectEntry{ .len = 0, .property_idx = 0, .value_idx = 0, @@ -120,8 +130,8 @@ fn addObject(self: *Self, allocator: std.mem.Allocator, object: std.AutoArrayHas fn addEmptyObject(self: *Self, allocator: std.mem.Allocator) !usize { try self.index.ensureUnusedCapacity(allocator, 1); const idx = self.index.addOneAssumeCapacity(); - const object: ObjectIndex.Entry = .{ - .property_idx = self.string_index.string_bytes.items.len, + const object: ObjectEntry = .{ + .property_idx = self.property_index.string_bytes.items.len, .value_idx = self.index.len, .len = 0, }; @@ -130,13 +140,13 @@ fn addEmptyObject(self: *Self, allocator: std.mem.Allocator) !usize { } fn addArray(self: *Self, allocator: std.mem.Allocator, array: []JsonInput) !usize { - var entry: ?ArrayIndex.Slice = null; + var entry: ?ArraySlice = null; for (array, 0..) |value, times| { try self.index.ensureUnusedCapacity(allocator, 1); const idx = self.index.addOneAssumeCapacity(); self.index.set(idx, @unionInit(JsonValue, std.meta.activeTag(value), self.addValue(allocator, value))); if (times == 0) { - entry = ArrayIndex.Slice{ + entry = ArraySlice{ .start = idx, .len = array.len, }; @@ -148,7 +158,7 @@ fn addArray(self: *Self, allocator: std.mem.Allocator, array: []JsonInput) !usiz self.index.set(idx, .{ .array = e }); return idx; } else { - self.index.set(idx, .{ .array = ArrayIndex.Slice{ + self.index.set(idx, .{ .array = ArraySlice{ .start = 0, .len = 0, } }); @@ -163,9 +173,16 @@ fn addBool(self: *Self, allocator: std.mem.Allocator, value: bool) !usize { return idx; } +fn addNull(self: *Self, allocator: std.mem.Allocator) !usize { + try self.index.ensureUnusedCapacity(allocator, 1); + const idx = self.index.addOneAssumeCapacity(); + self.index.set(idx, .{ .null = {} }); + return idx; +} + fn addValue(self: *Self, allocator: std.mem.Allocator, value: JsonInput) !void { switch (value) { - .null => {}, + .null => try self.addNull(allocator), .bool => try self.addBool(allocator, value.bool), .number => try self.addNumber(allocator, value.number), .string => try self.addString(allocator, value.string), @@ -174,8 +191,8 @@ fn addValue(self: *Self, allocator: std.mem.Allocator, value: JsonInput) !void { } } -fn getString(self: *Self, index: []const u8) ?StringIndex { - return self.string_index.string_table.get(index); +fn getProperty(self: *Self, index: []const u8) ?StringIndex { + return self.property_index.string_table.get(index); } fn getNumber(self: *Self, index: usize) ?f64 { @@ -200,7 +217,7 @@ fn getObject(self: *Self, allocator: std.mem.Allocator, index: usize) !?struct { const values = try allocator.alloc(usize, entry.object.len); for (0..entry.object.len) |i| { - const slice = StringIndex.slice(@enumFromInt(pidx), &self.string_index); + const slice = StringIndex.slice(@enumFromInt(pidx), &self.property_index); keys[i] = @enumFromInt(pidx); values[i] = vidx; pidx += slice.len + 1; @@ -265,7 +282,7 @@ fn getValue(self: *Self, allocator: std.mem.Allocator, index: usize) !?JsonInput try obj.ensureTotalCapacity(allocator, entry.object.len); for (0..entry.object.len) |_| { - const key = StringIndex.slice(@enumFromInt(kidx), &self.string_index); + const key = StringIndex.slice(@enumFromInt(kidx), &self.property_index); const val = (try self.getValue(allocator, vidx)).?; obj.putAssumeCapacityNoClobber(key, val); @@ -289,7 +306,7 @@ pub fn parse(self: *Self, tokenizer: *Tokenizer) !void { var cycles: usize = 0; while (it.next()) |token| { - switch (token.type) { + flag: switch (token.type) { .object_begin => { std.debug.print("{{", .{}); const obj_idx = try self.addEmptyObject(allocator); @@ -302,9 +319,9 @@ pub fn parse(self: *Self, tokenizer: *Tokenizer) !void { switch (data) { .object => |valid_entry| { - const new_data = ObjectIndex.Entry{ + const new_data = ObjectEntry{ .len = valid_entry.len + 1, - .property_idx = self.string_index.string_table.size, + .property_idx = self.property_index.string_table.size, .value_idx = obj_idx, }; self.index.set(depth_buf[cycles - 1], .{ .object = new_data }); @@ -320,7 +337,7 @@ pub fn parse(self: *Self, tokenizer: *Tokenizer) !void { const keys, const vals = (try self.getObject(allocator, depth_buf[cycles - 1])).?; std.debug.print("\nfound {d} keys and {d} values\n", .{ keys.len, vals.len }); for (keys, vals) |k, v| { - const key = k.slice(&self.string_index); + const key = k.slice(&self.property_index); const val = self.index.get(v); std.debug.print( \\"{s}": {s}, @@ -328,14 +345,31 @@ pub fn parse(self: *Self, tokenizer: *Tokenizer) !void { } std.debug.print("}}", .{}); }, - .string => { - const idx = try self.addString(allocator, token.value.?.string); + .property => { + _ = try self.addProperty(allocator, token.value.?.string); const last_obj = self.index.get(depth_buf[cycles - 1]); if (cycles > 0) { - const stridx = self.index.get(idx).string; - self.index.set(depth_buf[cycles - 1], .{ .object = ObjectIndex.Entry{ + self.index.set(depth_buf[cycles - 1], .{ .object = ObjectEntry{ .len = last_obj.object.len + 1, - .property_idx = if (cycles > 1) @intFromEnum(stridx) else last_obj.object.property_idx, + .property_idx = last_obj.object.property_idx, + .value_idx = last_obj.object.value_idx, + } }); + continue; + } + }, + .string => { + // maybe we could dismiss the while loop altogether and just do this + // the whole time + if (it.peek()) |next| if (next.type == .colon) { + continue :flag TokenType.property; + }; + + _ = try self.addString(allocator, token.value.?.string); + const last_obj = self.index.get(depth_buf[cycles - 1]); + if (cycles > 0) { + self.index.set(depth_buf[cycles - 1], .{ .object = ObjectEntry{ + .len = last_obj.object.len, + .property_idx = last_obj.object.property_idx, .value_idx = last_obj.object.value_idx, } }); continue; @@ -345,7 +379,7 @@ pub fn parse(self: *Self, tokenizer: *Tokenizer) !void { _ = try self.addNumber(allocator, token.value.?.number); const last_obj = self.index.get(depth_buf[cycles - 1]); if (cycles > 0) { - self.index.set(depth_buf[cycles - 1], .{ .object = ObjectIndex.Entry{ + self.index.set(depth_buf[cycles - 1], .{ .object = ObjectEntry{ .len = last_obj.object.len, .property_idx = last_obj.object.property_idx, .value_idx = last_obj.object.value_idx, @@ -357,7 +391,7 @@ pub fn parse(self: *Self, tokenizer: *Tokenizer) !void { _ = try self.addBool(allocator, if (token.type == .true) true else false); const last_obj = self.index.get(depth_buf[cycles - 1]); if (cycles > 0) { - self.index.set(depth_buf[cycles - 1], .{ .object = ObjectIndex.Entry{ + self.index.set(depth_buf[cycles - 1], .{ .object = ObjectEntry{ .len = last_obj.object.len, .property_idx = last_obj.object.property_idx, .value_idx = last_obj.object.value_idx, @@ -365,6 +399,23 @@ pub fn parse(self: *Self, tokenizer: *Tokenizer) !void { continue; } }, + .null => { + _ = try self.addNull(allocator); + const last_obj = self.index.get(depth_buf[cycles - 1]); + if (cycles > 0) { + self.index.set(depth_buf[cycles - 1], .{ .object = ObjectEntry{ + .len = last_obj.object.len, + .property_idx = last_obj.object.property_idx, + .value_idx = last_obj.object.value_idx, + } }); + continue; + } + }, + .comma => { + if (it.peek()) |tc| if (tc.type == .object_end and self.options.flags.allow_trailing_comma) { + return error.TrailingComma; + }; + }, else => {}, } @@ -384,10 +435,11 @@ test parse { var tokenizer = try Tokenizer.init(allocator, blk: { const json = \\ { - \\ "key": 123, - \\ "key2": false, + \\ "key": "hello", + \\ "key2": "world", \\ "key3": true, - \\ "key4": null + \\ "key4": null, + \\ "key5": 123 \\ } ; break :blk json; diff --git a/tokenizer.zig b/tokenizer.zig index 57266a9..4fd450f 100644 --- a/tokenizer.zig +++ b/tokenizer.zig @@ -232,14 +232,14 @@ pub fn nextNumber(self: *Self) Error!Token { return error.BadNumber; // no floating point }; - return self.commit(Token{ + return Token{ .type = .number, .value = .{ .number = float, }, .start = start, .end = self.currentPosition(), - }); + }; }; while (self.matchCharRange('0', '9') != null) {} @@ -248,14 +248,14 @@ pub fn nextNumber(self: *Self) Error!Token { return error.BadNumber; // floating point }; - return self.commit(Token{ + return .{ .type = .number, .value = .{ .number = float, }, .start = start, .end = self.currentPosition(), - }); + }; } /// Parse an identifier token @@ -284,32 +284,32 @@ pub fn nextIdentifier(self: *Self) Error!Token { // true if (std.mem.eql(u8, ident, "true")) { - return self.commit(Token{ + return .{ .type = .true, .value = null, .start = start, .end = self.currentPosition(), - }); + }; } // false if (std.mem.eql(u8, ident, "false")) { - return self.commit(Token{ + return .{ .type = .false, .value = null, .start = start, .end = self.currentPosition(), - }); + }; } // null if (std.mem.eql(u8, ident, "null")) { - return self.commit(Token{ + return .{ .type = .null, .value = null, .start = start, .end = self.currentPosition(), - }); + }; } unreachable; @@ -350,17 +350,17 @@ pub fn nextToken(self: *Self) Error!Token { ':' => .colon, '"' => { self.rollback(); - return (self.nextString()); + return self.commit(self.nextString()); }, else => { self.rollback(); // Try different token types in order of precedence if (std.ascii.isDigit(c) or c == '-') { - return (self.nextNumber()); + return self.commit(self.nextNumber()); } if (std.ascii.isAlphabetic(c)) { - return (self.nextIdentifier()); + return self.commit(self.nextIdentifier()); } return error.InvalidSyntax; @@ -392,12 +392,12 @@ pub fn nextString(self: *Self) Error!Token { switch (self.lastChar()) { '"' => { - return self.commit(Token{ + return .{ .type = .string, .value = .{ .string = try buffer.toOwnedSlice() }, .start = start, .end = self.currentPosition(), - }); + }; }, '\\' => { self.advance(1); @@ -415,12 +415,12 @@ pub fn nextString(self: *Self) Error!Token { var code_points: [4]u8 = undefined; inline for (0..4) |i| { if (self.endOfInput()) - return self.commit(Token{ + return .{ .type = .eof, .value = null, .start = start, .end = start + 1, - }); + }; self.advance(1); code_points[i] = self.lastChar(); } @@ -446,6 +446,7 @@ pub fn nextString(self: *Self) Error!Token { pub const Iterator = struct { tokenizer: *Self, pub fn next(it: *Iterator) ?Token { + defer it.tokenizer.skipWhitespace(); if (it.tokenizer.endOfInput()) return null; return it.tokenizer.nextToken() catch null; } @@ -455,6 +456,12 @@ pub const Iterator = struct { it.tokenizer.frame = 0; it.tokenizer.prev_token = null; } + pub fn peek(it: *Iterator) ?Token { + defer it.tokenizer.position -%= 1; + defer it.tokenizer.skipWhitespace(); + if (it.tokenizer.endOfInput()) return null; + return it.tokenizer.nextToken() catch null; + } }; /// iterator