diff --git a/language.zig b/language.zig index 7635a98..ab77103 100644 --- a/language.zig +++ b/language.zig @@ -9,22 +9,9 @@ const assert = std.debug.assert; const Self = @This(); -pub const Error = enum { - Eof, - TrailingComma, - MissingKey, - MissingValue, - UnexpectedToken, -}; +pub const Error = enum { Eof, TrailingComma, MissingKey, MissingValue, UnexpectedToken }; -pub const JsonType = enum { - null, - bool, - number, - string, - array, - object, -}; +pub const JsonType = enum { null, bool, number, string, array, object }; pub const JsonValue = union(JsonType) { null: void, @@ -36,16 +23,12 @@ pub const JsonValue = union(JsonType) { }; pub const JsonInput = union(JsonType) { - - // data structures - const Object = std.StringArrayHashMapUnmanaged(JsonInput); - null: void, bool: bool, number: f64, string: []const u8, array: []JsonInput, - object: Object, + object: std.StringArrayHashMapUnmanaged(JsonInput), pub fn deinit(self: JsonInput, allocator: mem.Allocator) void { switch (self) { @@ -99,15 +82,13 @@ pub const JsonInput = union(JsonType) { }; /// same as ObjectEntry but simpler -/// start is the offset +/// .tip is the offset pub const ArraySlice = struct { len: usize, - start: usize, + tip: usize, }; -/// just += the properties and value indexes to get the next item -/// property_idx and value_idx are the offset -/// it should be ordered +/// just += the value indexes to get the next item pub const ObjectEntry = struct { len: usize, tip: usize, @@ -120,19 +101,14 @@ pub const PropertyEntry = struct { pub const Flags = packed struct { /// Make the tokenizer omit comments, TBD allow_comments: bool = false, - /// Not to error on trailing comma, default is `false` for obvious reasons allow_trailing_comma: bool = false, - /// Allows parsing `packed struct` as an `int`, size is the backing int bitfields: bool = false, - /// Allows parsing `enum` as an `int`, size is the backing int real_enums: bool = false, - /// Allows parsing unions, default behaviour is yet 
to be concluded unions: bool = false, - /// To cast numbers always as f64, as the name says numbersf64: bool = false, }; @@ -168,7 +144,6 @@ fn addNumber(self: *Self, allocator: mem.Allocator, number: f64) !usize { fn addProperty(self: *Self, allocator: mem.Allocator, bytes: []const u8) !usize { const stridx = try self.properties.add(allocator, bytes); - try self.index.ensureUnusedCapacity(allocator, 1); try self.property_map.ensureUnusedCapacity(allocator, 1); return @intFromEnum(stridx); } @@ -203,8 +178,7 @@ fn addNull(self: *Self, allocator: mem.Allocator) !usize { // Recursively compute how many index slots a node occupies (including nested) fn skipSlots(self: *Self, slot: usize) usize { - const e = self.index.get(slot); - switch (e) { + switch (self.index.get(slot)) { .object => |obj| { var total: usize = 1; var v = obj.tip; @@ -217,7 +191,7 @@ fn skipSlots(self: *Self, slot: usize) usize { }, .array => |arr| { var total: usize = 1; - var c = arr.start; + var c = arr.tip; for (0..arr.len) |_| { const s = skipSlots(self, c); total += s; @@ -237,9 +211,7 @@ pub fn getValue( if (self.index.len == 0) return error.InvalidSyntax; - const entry = self.index.get(idx); - - switch (entry) { + switch (self.index.get(idx)) { .null => return .{ .null = {} }, .bool => |b| return .{ .bool = b }, .number => |number| return .{ .number = number }, @@ -250,7 +222,7 @@ pub fn getValue( .array => |arr| { var out = try allocator.alloc(JsonInput, arr.len); errdefer allocator.free(out); - var c = arr.start; + var c = arr.tip; for (0..arr.len) |i| { const v = try self.getValue(allocator, c); out[i] = v; @@ -259,58 +231,42 @@ pub fn getValue( return .{ .array = out[0..arr.len] }; }, .object => |obj| { - var map: JsonInput.Object = .empty; + var map: std.StringArrayHashMapUnmanaged(JsonInput) = .empty; errdefer map.deinit(allocator); var tip = obj.tip; - - for (0..obj.len) |_| - if (self.property_map.get(tip)) |pen| { - const key_slice = pen.tip.slice(&self.properties); - - const val = 
try self.getValue(allocator, tip); - try map.put(allocator, key_slice, val); - const s = self.skipSlots(tip); - tip += s; - } else { - // for (self.property_map.keys(), self.property_map.values()) |k, v| { - // std.debug.print("{}: {s}\n", .{ v.tip, @tagName(self.index.get(k)) }); - // std.debug.print("tip: {d}\n", .{k}); - // } - return error.MissingKey; - }; + for (0..obj.len) |_| if (self.property_map.get(tip)) |pen| { + try map.put( + allocator, + pen.tip.slice(&self.properties), + try self.getValue(allocator, tip), + ); + tip += self.skipSlots(tip); + } else return error.MissingKey; return .{ .object = map }; }, } } /// always returns 0 (root) -pub fn parse(self: *Self, tokenizer: *Tokenizer) !usize { +pub fn parse(self: *Self, allocator: mem.Allocator, tokenizer: *Tokenizer) !usize { tokenizer.skipWhitespace(); if (tokenizer.endOfInput()) return error.Eof; - const allocator = tokenizer.allocator; - const root = try self.addEmpty(allocator); - var token = try tokenizer.nextToken(); + var token = try tokenizer.nextToken(allocator); var query: std.BoundedArray(usize, self.options.max_depth) = try .init(0); flag: switch (token.type) { .eof => { - if (root != 0) { - return error.InvalidSyntax; - } - if (query.slice().len != 0) { - return error.InvalidSyntax; - } + if (root != 0) return error.InvalidSyntax; + if (query.slice().len != 0) return error.InvalidSyntax; return root; }, .property => { - defer tokenizer.skipWhitespace(); - const scope_idx = query.get(query.len - 1); switch (self.index.get(scope_idx)) { .object => |scope| { @@ -325,26 +281,20 @@ pub fn parse(self: *Self, tokenizer: *Tokenizer) !usize { .tip = scope.tip, } }); }, - .array => { - return error.InvalidSyntax; - }, else => return error.InvalidSyntax, } - const next = try tokenizer.nextToken(); + const next = try tokenizer.nextToken(allocator); token = next; switch (next.type) { .colon => { - token = try tokenizer.nextToken(); + token = try tokenizer.nextToken(allocator); continue :flag 
token.type; }, else => continue :flag next.type, - // else => return error.InvalidSyntax, } }, .object_begin => { - defer tokenizer.skipWhitespace(); - if (query.slice().len < 1) { const ptr = try query.addOne(); ptr.* = root; @@ -368,14 +318,14 @@ pub fn parse(self: *Self, tokenizer: *Tokenizer) !usize { .array => |slice| { self.index.set(parent_idx, .{ .array = ArraySlice{ .len = slice.len + 1, - .start = if (slice.len == 0) idx_ptr.* else slice.start, + .tip = if (slice.len == 0) idx_ptr.* else slice.tip, } }); }, else => {}, } } - const next = try tokenizer.nextToken(); + const next = try tokenizer.nextToken(allocator); token = next; switch (next.type) { .string => continue :flag .property, @@ -384,14 +334,13 @@ pub fn parse(self: *Self, tokenizer: *Tokenizer) !usize { } }, .object_end, .array_end => { - tokenizer.skipWhitespace(); if (query.pop() == null) return error.InvalidSyntax; // double close if (query.slice().len == 0) return root; - const next = try tokenizer.nextToken(); + const next = try tokenizer.nextToken(allocator); token = next; switch (next.type) { @@ -408,7 +357,7 @@ pub fn parse(self: *Self, tokenizer: *Tokenizer) !usize { ptr.* = root; self.index.set(root, .{ .array = ArraySlice{ .len = 0, - .start = 1, + .tip = 1, } }); } else { // order matters @@ -419,21 +368,21 @@ pub fn parse(self: *Self, tokenizer: *Tokenizer) !usize { idx_ptr.* = try self.addEmpty(allocator); self.index.set(idx_ptr.*, .{ .array = ArraySlice{ .len = 0, - .start = idx_ptr.* + 1, + .tip = idx_ptr.* + 1, } }); switch (self.index.get(parent_idx)) { .array => |slice| { self.index.set(parent_idx, .{ .array = ArraySlice{ .len = slice.len + 1, - .start = if (slice.len == 0) idx_ptr.* else slice.start, + .tip = if (slice.len == 0) idx_ptr.* else slice.tip, } }); }, else => {}, } } - const next = try tokenizer.nextToken(); + const next = try tokenizer.nextToken(allocator); token = next; switch (next.type) { .property => return error.InvalidSyntax, @@ -455,16 +404,13 @@ pub fn 
parse(self: *Self, tokenizer: *Tokenizer) !usize { .array => |slice| { self.index.set(parent_idx, .{ .array = ArraySlice{ .len = slice.len + 1, - .start = if (slice.len == 0) idx else slice.start, + .tip = if (slice.len == 0) idx else slice.tip, } }); }, else => {}, } - const next = tokenizer.nextToken() catch |err| switch (err) { - error.InvalidSyntax => return err, - else => return root, - }; + const next = try tokenizer.nextToken(allocator); token = next; switch (next.type) { .comma => continue :flag .comma, @@ -473,9 +419,6 @@ pub fn parse(self: *Self, tokenizer: *Tokenizer) !usize { } }, .string => { - defer tokenizer.skipWhitespace(); - errdefer allocator.free(token.value.?.string); - if (query.len == 0) { // root _ = try self.addString(allocator, token.value.?.string); @@ -487,7 +430,7 @@ pub fn parse(self: *Self, tokenizer: *Tokenizer) !usize { const parent_idx = query.get(query.len - 1); - const next = try tokenizer.nextToken(); + const next = try tokenizer.nextToken(allocator); switch (next.type) { .colon => { continue :flag .property; @@ -499,7 +442,7 @@ pub fn parse(self: *Self, tokenizer: *Tokenizer) !usize { .array => |slice| { self.index.set(parent_idx, .{ .array = ArraySlice{ .len = slice.len + 1, - .start = if (slice.len == 0) idx else slice.start, + .tip = if (slice.len == 0) idx else slice.tip, } }); }, else => {}, @@ -511,8 +454,6 @@ pub fn parse(self: *Self, tokenizer: *Tokenizer) !usize { } }, .number => { - defer tokenizer.skipWhitespace(); - if (query.len == 0) { // root _ = try self.addNumber(allocator, token.value.?.number); @@ -526,13 +467,13 @@ pub fn parse(self: *Self, tokenizer: *Tokenizer) !usize { .array => |slice| { self.index.set(parent_idx, .{ .array = ArraySlice{ .len = slice.len + 1, - .start = if (slice.len == 0) idx else slice.start, + .tip = if (slice.len == 0) idx else slice.tip, } }); }, else => {}, } - const next = try tokenizer.nextToken(); + const next = try tokenizer.nextToken(allocator); token = next; switch 
(next.type) { .comma => continue :flag .comma, @@ -542,7 +483,7 @@ pub fn parse(self: *Self, tokenizer: *Tokenizer) !usize { }, .comma => { if (!self.options.flags.allow_trailing_comma) { - const next = try tokenizer.nextToken(); + const next = try tokenizer.nextToken(allocator); token = next; switch (next.type) { .object_end, .array_end => return error.TrailingComma, @@ -552,7 +493,6 @@ pub fn parse(self: *Self, tokenizer: *Tokenizer) !usize { } }, .null => { - defer tokenizer.skipWhitespace(); const idx = try self.addNull(allocator); if (query.len == 0) { @@ -565,12 +505,12 @@ pub fn parse(self: *Self, tokenizer: *Tokenizer) !usize { .array => |slice| { self.index.set(parent_idx, .{ .array = ArraySlice{ .len = slice.len + 1, - .start = if (slice.len == 0) idx else slice.start, + .tip = if (slice.len == 0) idx else slice.tip, } }); }, else => {}, } - const next = tokenizer.nextToken() catch |err| switch (err) { + const next = tokenizer.nextToken(allocator) catch |err| switch (err) { error.InvalidSyntax => return err, else => return root, }; @@ -603,14 +543,14 @@ test getValue { ; // 1: a, 2: b, 3: c, 4: d, 5: e, 6: f var tokenizer: Tokenizer = try .init(allocator, text); - defer tokenizer.deinit(); + defer tokenizer.deinit(allocator); var self = try allocator.create(Self); self.* = Self.init; defer allocator.destroy(self); defer self.deinit(allocator); - const idx: usize = try self.parse(&tokenizer); + const idx: usize = try self.parse(allocator, &tokenizer); var root = try self.getValue(allocator, idx); defer root.deinit(allocator); diff --git a/test.zig b/test.zig index bcd9506..ad5dcb2 100644 --- a/test.zig +++ b/test.zig @@ -5,9 +5,9 @@ const testing = std.testing; const Language = @import("language.zig"); const Tokenizer = @import("tokenizer.zig"); -test Language { - const allocator = std.testing.allocator; +const allocator = std.testing.allocator; +test Language { const text = \\ { \\ "cute": true, @@ -25,14 +25,14 @@ test Language { ; var tokenizer: 
Tokenizer = try .init(allocator, text); - defer tokenizer.deinit(); + defer tokenizer.deinit(allocator); var self = try allocator.create(Language); defer allocator.destroy(self); self.* = Language.init; defer self.deinit(allocator); - const idx: usize = try self.parse(&tokenizer); + const idx: usize = try self.parse(allocator, &tokenizer); var root = try self.getValue(allocator, idx); defer root.deinit(allocator); @@ -50,18 +50,18 @@ test { fn expectPass(comptime path: []const u8) !void { const file = @embedFile("tests" ++ path); - var tokenizer: Tokenizer = try .init(std.testing.allocator, file); - defer tokenizer.deinit(); + var tokenizer: Tokenizer = try .init(allocator, file); + defer tokenizer.deinit(allocator); - var self = try std.testing.allocator.create(Language); + var self = try allocator.create(Language); self.* = Language.init; - defer std.testing.allocator.destroy(self); - defer self.deinit(std.testing.allocator); + defer allocator.destroy(self); + defer self.deinit(allocator); - const idx: usize = try self.parse(&tokenizer); + const idx: usize = try self.parse(allocator, &tokenizer); - var root = try self.getValue(std.testing.allocator, idx); - defer root.deinit(std.testing.allocator); + var root = try self.getValue(allocator, idx); + defer root.deinit(allocator); std.debug.print("{}\n", .{root}); } @@ -69,22 +69,19 @@ fn expectPass(comptime path: []const u8) !void { fn expectFail(comptime path: []const u8) !void { const file = @embedFile("tests" ++ path); - var tokenizer: Tokenizer = try .init(std.testing.allocator, file); - defer tokenizer.deinit(); + var tokenizer: Tokenizer = try .init(allocator, file); + defer tokenizer.deinit(allocator); - var self = try std.testing.allocator.create(Language); + var self = try allocator.create(Language); self.* = Language.init; - defer std.testing.allocator.destroy(self); - defer self.deinit(std.testing.allocator); + defer allocator.destroy(self); + defer self.deinit(allocator); - const idx: usize = 
self.parse(&tokenizer) catch { + const idx: usize = self.parse(allocator, &tokenizer) catch return; - }; - - var root = self.getValue(std.testing.allocator, idx) catch { + var root = self.getValue(allocator, idx) catch return; - }; - defer root.deinit(std.testing.allocator); + defer root.deinit(allocator); } // zig fmt: off diff --git a/tokenizer.zig b/tokenizer.zig index 4ec2d3e..156ae2f 100644 --- a/tokenizer.zig +++ b/tokenizer.zig @@ -34,11 +34,7 @@ pub const TokenType = enum(u8) { pub const Token = struct { type: TokenType, - value: ?union { - number: f64, - string: []const u8, - symbol: u8, - }, + value: ?union { number: f64, string: []const u8, symbol: u8 }, start: usize, end: usize, }; @@ -49,10 +45,9 @@ text: []const u8, max_position: usize, stack: []usize, frame: usize, -allocator: std.mem.Allocator, /// Initialize a new tokenizer -pub fn init(allocator: std.mem.Allocator, text: []const u8) std.mem.Allocator.Error!Self { +pub fn init(allocator: std.mem.Allocator, text: []const u8) mem.Allocator.Error!Self { const stack = try allocator.alloc(usize, 0x100); errdefer allocator.free(stack); @memset(stack, 0); @@ -61,13 +56,12 @@ pub fn init(allocator: std.mem.Allocator, text: []const u8) std.mem.Allocator.Er .max_position = 0, .stack = stack, .frame = 0, - .allocator = allocator, }; } /// Clean up resources -pub fn deinit(self: *Self) void { - self.allocator.free(self.stack); +pub fn deinit(self: *Self, allocator: mem.Allocator) void { + allocator.free(self.stack); } // ========== Core Parsing Functions ========== @@ -82,13 +76,13 @@ fn advance(self: *Self, delta: usize) void { self.max_position = self.stack[self.frame]; } -fn pushFrame(self: *Self) Error!usize { +fn pushFrame(self: *Self, allocator: mem.Allocator) Error!usize { self.frame += 1; if (self.frame == self.stack.len) { - const new_stack = try self.allocator.alloc(usize, self.stack.len * 2); + const new_stack = try allocator.alloc(usize, self.stack.len * 2); @memset(new_stack, 0); 
@memcpy(new_stack, self.stack); - self.allocator.free(self.stack); + allocator.free(self.stack); self.stack = new_stack; } if (self.frame > self.text.len) @@ -209,10 +203,10 @@ pub fn skipWhitespace(self: *Self) void { } /// Parse a number token -pub fn nextNumber(self: *Self) Error!Token { +pub fn nextNumber(self: *Self, allocator: mem.Allocator) Error!Token { self.skipWhitespace(); - const start = try self.pushFrame(); + const start = try self.pushFrame(allocator); errdefer self.popFrame(); self.matchChar('-') orelse {}; // this may not fail @@ -262,14 +256,14 @@ pub fn nextNumber(self: *Self) Error!Token { } /// Parse an identifier token -pub fn nextIdentifier(self: *Self) Error!Token { +pub fn nextIdentifier(self: *Self, allocator: mem.Allocator) Error!Token { self.skipWhitespace(); - const start = try self.pushFrame(); + const start = try self.pushFrame(allocator); errdefer self.popFrame(); - var buffer = try self.allocator.alloc(u8, 0x100); - defer self.allocator.free(buffer); + var buffer = try allocator.alloc(u8, 0x100); + defer allocator.free(buffer); self.matchCharPredicate(std.ascii.isAlphabetic) orelse return error.InvalidSyntax; @@ -319,10 +313,10 @@ pub fn nextIdentifier(self: *Self) Error!Token { /// Get the next token from the input /// WARNING: this function eats whitespaces -pub fn nextToken(self: *Self) Error!Token { +pub fn nextToken(self: *Self, allocator: mem.Allocator) Error!Token { self.skipWhitespace(); - const start = try self.pushFrame(); + const start = try self.pushFrame(allocator); errdefer self.popFrame(); // Fall back to single character symbol @@ -350,19 +344,19 @@ pub fn nextToken(self: *Self) Error!Token { ':' => .colon, '"' => { self.rollback(); - const string = try self.nextString(); - errdefer self.allocator.free(string); + const string = try self.nextString(allocator); + errdefer allocator.free(string); return self.commit(string); }, else => { self.rollback(); // Try different token types in order of precedence if 
(std.ascii.isDigit(c) or c == '-') { - return self.commit(self.nextNumber()); + return self.commit(self.nextNumber(allocator)); } if (std.ascii.isAlphabetic(c)) { - return self.commit(self.nextIdentifier()); + return self.commit(self.nextIdentifier(allocator)); } return error.InvalidSyntax; @@ -377,15 +371,15 @@ pub fn nextToken(self: *Self) Error!Token { }); } -pub fn nextString(self: *Self) Error!Token { +pub fn nextString(self: *Self, allocator: mem.Allocator) Error!Token { self.skipWhitespace(); - const start = try self.pushFrame(); + const start = try self.pushFrame(allocator); errdefer self.popFrame(); self.matchChar('"') orelse unreachable; - var buffer: std.ArrayList(u8) = .init(self.allocator); + var buffer: std.ArrayList(u8) = .init(allocator); defer buffer.deinit(); loop: while (!self.endOfInput()) { @@ -449,6 +443,7 @@ pub fn nextString(self: *Self) Error!Token { pub const Iterator = struct { tokenizer: *Self, + allocator: mem.Allocator, pub fn next(it: *Iterator) ?Token { defer it.tokenizer.skipWhitespace(); @@ -457,7 +452,7 @@ pub const Iterator = struct { if (it.tokenizer.endOfInput()) { return null; } - return it.tokenizer.nextToken() catch |err| switch (err) { + return it.tokenizer.nextToken(it.allocator) catch |err| switch (err) { error.InvalidSyntax => unreachable, else => { return null; @@ -473,16 +468,13 @@ pub const Iterator = struct { }; /// iterator -pub fn iterator(self: *Self) Iterator { - return Iterator{ - .tokenizer = self, - }; +pub fn iterator(self: *Self, allocator: mem.Allocator) Iterator { + return .{ .tokenizer = self, .allocator = allocator }; } pub fn stringToUtf8(bytes: []u8) ![]u8 { - const code_point = std.fmt.parseInt(u21, bytes, 16) catch { + const code_point = std.fmt.parseInt(u21, bytes, 16) catch return error.BadNumber; - }; var buffer: [4]u8 = undefined; var index: usize = 0; @@ -553,9 +545,7 @@ pub fn skipWhitespaceSimd(text: []const u8) usize { // Find first non-whitespace const mask: std.meta.Int(.unsigned, 
ChunkSize) = @bitCast(anyws == FalseMask); - if (mask != 0) { - return j + @ctz(mask); - } + if (mask != 0) return j + @ctz(mask); } // Scalar processing for remaining bytes