From 3952f49d66ae4f7645458546e20f4ff1afad540b Mon Sep 17 00:00:00 2001
From: yuzu
Date: Thu, 29 May 2025 21:25:31 -0500
Subject: [PATCH] all tests pass

---
 README.md     |  7 +++---
 language.zig  | 59 +++++++++++++++++++++++++++++++++++----------------
 test.zig      | 51 +++++++++++++++++++++-----------------------
 tokenizer.zig | 29 ++++++++++++-------------
 4 files changed, 83 insertions(+), 63 deletions(-)

diff --git a/README.md b/README.md
index 88ffca5..7548143 100644
--- a/README.md
+++ b/README.md
@@ -23,11 +23,12 @@
 * (a) Ok but all of your reasons are dumb - I'll answer this later
 
 ## Behaviour
-- Null characters, eg: U+0000 are forbidden and will be ignored by the parser
 - All control characters except DEL are forbidden
-- It passes most of the JSON test suite, 253 tests as of 2025-05-29 when I'm writing this
+- Null control characters, e.g. U+0000, are forbidden and will be ignored by the parser
+- Of course, it uses null-terminated strings; clearly this is not the best approach, but it's memory efficient and fast as fuck!
+- It passes most unit tests of the [JSON test suite](https://github.com/nst/JSONTestSuite), 286 tests in total as of 2025-05-29, when I'm writing this.
 - It has no reflection as of right now; you must implement your own `parse` function, with reflection
-- It uses null terminated strings, clearly this is not the best approach, but it's efficient
+- All defaults are configurable via the `Flags` bitfield
 
 ## Sic respondeo:
 The Zig Discord server is plagued with modern scum; of course, modern scum will dismiss all of my claims or label them as "dumb" or "you're using it wrong!". Have any of these individuals not considered that Zig is overcomplicated? There is a reason why Andrew Kelley himself detached from the communities [and has spoken in multiple instances](https://andrewkelley.me/post/goodbye-twitter-reddit.html) about the "shitification" of software communities; it's like turning a good community of like-minded programmers into a soydev shill. One good thing that he did was shutting down the r/zig subreddit.
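For a quick sense of how the renamed `Flags` fields are meant to be used, here is a minimal sketch, not part of the patch; it assumes `Options` exposes a `flags: Flags` field with defaults, which the `self.options.flags.*` accesses in the `language.zig` hunks below suggest:

```zig
// Sketch only: override a couple of Flags defaults. Field names are taken
// from the Flags struct in this patch; the Options initialization shape is
// an assumption inferred from `self.options.flags.*` uses in parse().
const options: Options = .{
    .flags = .{
        .allow_trailing_comma = true, // tolerate [1, 2, 3,]
        .numbersf64 = true, // always parse numbers as f64
    },
};
```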
diff --git a/language.zig b/language.zig
index b8f93ce..7635a98 100644
--- a/language.zig
+++ b/language.zig
@@ -10,6 +10,7 @@ const assert = std.debug.assert;
 const Self = @This();
 
 pub const Error = enum {
+    Eof,
     TrailingComma,
     MissingKey,
     MissingValue,
@@ -117,11 +118,23 @@ pub const PropertyEntry = struct {
 };
 
 pub const Flags = packed struct {
+    /// Make the tokenizer omit comments, TBD
     allow_comments: bool = false,
+
+    /// Do not error on a trailing comma; the default is `false` for obvious reasons
    allow_trailing_comma: bool = false,
-    enums_are_strings: bool = false,
-    unions_are_strings: bool = false,
-    packed_structs_are_ints: bool = false,
+
+    /// Allows parsing a `packed struct` as an `int`; the size is that of the backing int
+    bitfields: bool = false,
+
+    /// Allows parsing an `enum` as an `int`; the size is that of the backing int
+    real_enums: bool = false,
+
+    /// Allows parsing unions; the default behaviour is yet to be decided
+    unions: bool = false,
+
+    /// Always cast numbers to `f64`, as the name says
+    numbersf64: bool = false,
 };
 
 pub const Options = struct {
@@ -221,6 +234,9 @@ pub fn getValue(
     allocator: mem.Allocator,
     idx: usize,
 ) !JsonInput {
+    if (self.index.len == 0)
+        return error.InvalidSyntax;
+
     const entry = self.index.get(idx);
 
     switch (entry) {
@@ -269,14 +285,16 @@ pub fn getValue(
 
 /// always returns 0 (root)
 pub fn parse(self: *Self, tokenizer: *Tokenizer) !usize {
-    const allocator = tokenizer.allocator;
+    tokenizer.skipWhitespace();
 
-    var it = tokenizer.iterator();
+    if (tokenizer.endOfInput())
+        return error.Eof;
+
+    const allocator = tokenizer.allocator;
 
     const root = try self.addEmpty(allocator);
 
-    var token = it.next() orelse
-        return root;
+    var token = try tokenizer.nextToken();
 
     var query: std.BoundedArray(usize, self.options.max_depth) = try .init(0);
@@ -313,11 +331,11 @@ pub fn parse(self: *Self, tokenizer: *Tokenizer) !usize {
                 else => return error.InvalidSyntax,
             }
 
-            const next = it.next() orelse return error.InvalidSyntax;
+            const next = try tokenizer.nextToken();
             token = next;
             switch (next.type) {
                 .colon => {
-                    token = it.next() orelse return error.InvalidSyntax;
+                    token = try tokenizer.nextToken();
                     continue :flag token.type;
                 },
                 else => continue :flag next.type,
@@ -357,7 +375,7 @@ pub fn parse(self: *Self, tokenizer: *Tokenizer) !usize {
                 }
             }
 
-            const next = it.next() orelse return error.InvalidSyntax;
+            const next = try tokenizer.nextToken();
             token = next;
             switch (next.type) {
                 .string => continue :flag .property,
@@ -373,8 +391,7 @@ pub fn parse(self: *Self, tokenizer: *Tokenizer) !usize {
             if (query.slice().len == 0)
                 return root;
 
-            const next = it.next() orelse
-                return root;
+            const next = try tokenizer.nextToken();
             token = next;
 
             switch (next.type) {
@@ -416,7 +433,7 @@ pub fn parse(self: *Self, tokenizer: *Tokenizer) !usize {
                 }
             }
 
-            const next = it.next() orelse return error.InvalidSyntax;
+            const next = try tokenizer.nextToken();
             token = next;
             switch (next.type) {
                 .property => return error.InvalidSyntax,
@@ -444,7 +461,10 @@ pub fn parse(self: *Self, tokenizer: *Tokenizer) !usize {
                 else => {},
             }
 
-            const next = it.next() orelse return error.InvalidSyntax;
+            const next = tokenizer.nextToken() catch |err| switch (err) {
+                error.InvalidSyntax => return err,
+                else => return root,
+            };
             token = next;
             switch (next.type) {
                 .comma => continue :flag .comma,
@@ -467,7 +487,7 @@ pub fn parse(self: *Self, tokenizer: *Tokenizer) !usize {
 
             const parent_idx = query.get(query.len - 1);
 
-            const next = it.next() orelse return error.InvalidSyntax;
+            const next = try tokenizer.nextToken();
             switch (next.type) {
                 .colon => {
                     continue :flag .property;
@@ -512,7 +532,7 @@ pub fn parse(self: *Self, tokenizer: *Tokenizer) !usize {
                 else => {},
             }
 
-            const next = it.next() orelse return error.InvalidSyntax;
+            const next = try tokenizer.nextToken();
             token = next;
             switch (next.type) {
                 .comma => continue :flag .comma,
@@ -522,7 +542,7 @@ pub fn parse(self: *Self, tokenizer: *Tokenizer) !usize {
         },
         .comma => {
             if (!self.options.flags.allow_trailing_comma) {
-                const next = it.next() orelse return error.InvalidSyntax;
+                const next = try tokenizer.nextToken();
                 token = next;
                 switch (next.type) {
                     .object_end, .array_end => return error.TrailingComma,
@@ -550,7 +570,10 @@ pub fn parse(self: *Self, tokenizer: *Tokenizer) !usize {
                 },
                 else => {},
             }
-            const next = it.next() orelse return error.InvalidSyntax;
+            const next = tokenizer.nextToken() catch |err| switch (err) {
+                error.InvalidSyntax => return err,
+                else => return root,
+            };
             token = next;
             switch (next.type) {
                 .comma => continue :flag .comma,
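The `continue :flag` lines above use Zig's labeled switch: `continue :label operand` re-enters the switch with a new operand, which is what lets `parse` run as one token-driven state machine instead of a web of nested loops. A self-contained illustration of the pattern (the enum below is hypothetical and only mimics a few of the parser's token types):

```zig
// Hypothetical miniature of the dispatch style used by parse().
const Tok = enum { object_begin, property, colon, value, object_end };

fn dispatch(first: Tok) void {
    flag: switch (first) {
        .object_begin => continue :flag .property, // "{" expects a key next
        .property => continue :flag .colon, // a key must be followed by ":"
        .colon => continue :flag .value, // then comes a value
        .value => continue :flag .object_end, // then (here) the closing "}"
        .object_end => {}, // leave the state machine
    }
}
```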
diff --git a/test.zig b/test.zig
index 05a12b1..bcd9506 100644
--- a/test.zig
+++ b/test.zig
@@ -123,7 +123,6 @@ test { try expectPass("/y_object_duplicated_key_and_value.json"); }
 test { try expectPass("/y_object_duplicated_key.json"); }
 test { try expectPass("/y_object_empty.json"); }
 test { try expectPass("/y_object_empty_key.json"); }
-// BIG ISSUE
 test { try expectPass("/y_object_escaped_null_in_key.json"); }
 test { try expectPass("/y_object_extreme_numbers.json"); }
 test { try expectPass("/y_object.json"); }
@@ -295,55 +294,53 @@ test { try expectFail("/n_object_two_commas_in_a_row.json"); }
 test { try expectFail("/n_object_unquoted_key.json"); }
 test { try expectFail("/n_object_unterminated-value.json"); }
 test { try expectFail("/n_object_with_single_string.json"); }
-// !!!
 test { try expectFail("/n_object_with_trailing_garbage.json"); }
-// test { try expectFail("/n_single_space.json"); }
+test { try expectFail("/n_single_space.json"); }
 test { try expectFail("/n_string_1_surrogate_then_escape.json"); }
-// test { try expectFail("/n_string_1_surrogate_then_escape_u1.json"); }
-// test { try expectFail("/n_string_1_surrogate_then_escape_u1x.json"); }
-// test { try expectFail("/n_string_1_surrogate_then_escape_u.json"); }
+test { try expectFail("/n_string_1_surrogate_then_escape_u1.json"); }
+test { try expectFail("/n_string_1_surrogate_then_escape_u1x.json"); }
+test { try expectFail("/n_string_1_surrogate_then_escape_u.json"); }
 test { try expectFail("/n_string_accentuated_char_no_quotes.json"); }
 test { try expectFail("/n_string_backslash_00.json"); }
 test { try expectFail("/n_string_escaped_backslash_bad.json"); }
 test { try expectFail("/n_string_escaped_ctrl_char_tab.json"); }
-// test { try expectFail("/n_string_escaped_emoji.json"); }
-// test { try expectFail("/n_string_escape_x.json"); }
-// test { try expectFail("/n_string_incomplete_escaped_character.json"); }
+test { try expectFail("/n_string_escaped_emoji.json"); }
+test { try expectFail("/n_string_escape_x.json"); }
+test { try expectFail("/n_string_incomplete_escaped_character.json"); }
 test { try expectFail("/n_string_incomplete_escape.json"); }
-// test { try expectFail("/n_string_incomplete_surrogate_escape_invalid.json"); }
-// test { try expectFail("/n_string_incomplete_surrogate.json"); }
-// test { try expectFail("/n_string_invalid_backslash_esc.json"); }
+test { try expectFail("/n_string_incomplete_surrogate_escape_invalid.json"); }
+test { try expectFail("/n_string_incomplete_surrogate.json"); }
+test { try expectFail("/n_string_invalid_backslash_esc.json"); }
 test { try expectFail("/n_string_invalid_unicode_escape.json"); }
-// test { try expectFail("/n_string_invalid_utf8_after_escape.json"); }
-// test { try expectFail("/n_string_invalid-utf-8-in-escape.json"); }
+test { try expectFail("/n_string_invalid_utf8_after_escape.json"); }
+test { try expectFail("/n_string_invalid-utf-8-in-escape.json"); }
 test { try expectFail("/n_string_leading_uescaped_thinspace.json"); }
 test { try expectFail("/n_string_no_quotes_with_bad_escape.json"); }
-//test { try expectFail("/n_string_single_doublequote.json"); }
+test { try expectFail("/n_string_single_doublequote.json"); }
 test { try expectFail("/n_string_single_quote.json"); }
-//test { try expectFail("/n_string_single_string_no_double_quotes.json"); }
+test { try expectFail("/n_string_single_string_no_double_quotes.json"); }
 test { try expectFail("/n_string_start_escape_unclosed.json"); }
 test { try expectFail("/n_string_unescaped_ctrl_char.json"); }
 test { try expectFail("/n_string_unescaped_newline.json"); }
 test { try expectFail("/n_string_unescaped_tab.json"); }
-// test { try expectFail("/n_string_unicode_CapitalU.json"); }
-// possibly stack overflow
+test { try expectFail("/n_string_unicode_CapitalU.json"); }
 test { try expectFail("/n_string_with_trailing_garbage.json"); }
 test { try expectFail("/n_structure_100000_opening_arrays.json"); }
-//test { try expectFail("/n_structure_angle_bracket_..json"); }
+test { try expectFail("/n_structure_angle_bracket_..json"); }
 test { try expectFail("/n_structure_angle_bracket_null.json"); }
 test { try expectFail("/n_structure_array_trailing_garbage.json"); }
 test { try expectFail("/n_structure_array_with_extra_array_close.json"); }
 test { try expectFail("/n_structure_array_with_unclosed_string.json"); }
-//test { try expectFail("/n_structure_ascii-unicode-identifier.json"); }
+test { try expectFail("/n_structure_ascii-unicode-identifier.json"); }
 test { try expectFail("/n_structure_capitalized_True.json"); }
 test { try expectFail("/n_structure_close_unopened_array.json"); }
 test { try expectFail("/n_structure_comma_instead_of_closing_brace.json"); }
 test { try expectFail("/n_structure_double_array.json"); }
 test { try expectFail("/n_structure_end_array.json"); }
-//test { try expectFail("/n_structure_incomplete_UTF8_BOM.json"); }
-//test { try expectFail("/n_structure_lone-invalid-utf-8.json"); }
+test { try expectFail("/n_structure_incomplete_UTF8_BOM.json"); }
+test { try expectFail("/n_structure_lone-invalid-utf-8.json"); }
 test { try expectFail("/n_structure_lone-open-bracket.json"); }
-//test { try expectFail("/n_structure_no_data.json"); }
+test { try expectFail("/n_structure_no_data.json"); }
 test { try expectFail("/n_structure_null-byte-outside-string.json"); }
 test { try expectFail("/n_structure_number_with_trailing_garbage.json"); }
 test { try expectFail("/n_structure_object_followed_by_closing_object.json"); }
@@ -363,8 +360,8 @@ test { try expectFail("/n_structure_open_object_open_array.json"); }
 test { try expectFail("/n_structure_open_object_open_string.json"); }
 test { try expectFail("/n_structure_open_object_string_with_apostrophes.json"); }
 test { try expectFail("/n_structure_open_open.json"); }
-//test { try expectFail("/n_structure_single_eacute.json"); }
-//test { try expectFail("/n_structure_single_star.json"); }
+test { try expectFail("/n_structure_single_eacute.json"); }
+test { try expectFail("/n_structure_single_star.json"); }
 test { try expectFail("/n_structure_trailing_#.json"); }
 test { try expectFail("/n_structure_U+2060_word_joined.json"); }
 test { try expectFail("/n_structure_uescaped_LF_before_string.json"); }
@@ -373,8 +370,8 @@ test { try expectFail("/n_structure_unclosed_array_partial_null.json"); }
 test { try expectFail("/n_structure_unclosed_array_unfinished_false.json"); }
 test { try expectFail("/n_structure_unclosed_array_unfinished_true.json"); }
 test { try expectFail("/n_structure_unclosed_object.json"); }
-//test { try expectFail("/n_structure_unicode-identifier.json"); }
-//test { try expectFail("/n_structure_UTF8_BOM_no_data.json"); }
+test { try expectFail("/n_structure_unicode-identifier.json"); }
+test { try expectFail("/n_structure_UTF8_BOM_no_data.json"); }
 test { try expectFail("/n_structure_whitespace_formfeed.json"); }
 test { try expectFail("/n_structure_whitespace_U+2060_word_joiner.json"); }
 // zig fmt: off
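The naming convention driving this file: JSONTestSuite's `y_*.json` documents must parse and `n_*.json` documents must be rejected, which is what the `expectPass`/`expectFail` helpers assert (their definitions live outside this hunk). A hedged sketch of the failure-side assertion, with a hypothetical `parseDocument` standing in for the real tokenizer-plus-`parse` pipeline:

```zig
const std = @import("std");

// Hypothetical stand-in for the real pipeline, which builds a Tokenizer
// over the file contents and feeds it to parse().
fn parseDocument(doc: []const u8) !void {
    if (doc.len == 0) return error.Eof;
    if (doc[0] == '<') return error.InvalidSyntax; // toy rejection rule
}

test "n_* documents must be rejected" {
    // This mirrors what expectFail asserts for an n_*.json case.
    try std.testing.expectError(error.InvalidSyntax, parseDocument("<html>"));
}
```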
     while (self.matchCharRange('0', '9') != null) {}
 
@@ -265,11 +263,11 @@ pub fn nextNumber(self: *Self) Error!Token {
 
 /// Parse an identifier token
 pub fn nextIdentifier(self: *Self) Error!Token {
+    self.skipWhitespace();
+
     const start = try self.pushFrame();
     errdefer self.popFrame();
 
-    self.skipWhitespace();
-
     var buffer = try self.allocator.alloc(u8, 0x100);
     defer self.allocator.free(buffer);
 
@@ -322,11 +320,11 @@ pub fn nextIdentifier(self: *Self) Error!Token {
 
 /// Get the next token from the input
 /// WARNING: this function eats whitespaces
 pub fn nextToken(self: *Self) Error!Token {
+    self.skipWhitespace();
+
     const start = try self.pushFrame();
     errdefer self.popFrame();
 
-    self.skipWhitespace();
-
     // Fall back to single character symbol
     const c = self.anyChar() orelse return .{
         .type = .eof,
@@ -380,11 +378,11 @@ }
 
 pub fn nextString(self: *Self) Error!Token {
+    self.skipWhitespace();
+
     const start = try self.pushFrame();
     errdefer self.popFrame();
 
-    self.skipWhitespace();
-
     self.matchChar('"') orelse unreachable;
 
     var buffer: std.ArrayList(u8) = .init(self.allocator);
@@ -457,12 +455,13 @@ pub const Iterator = struct {
         errdefer it.tokenizer.deinit();
 
         if (it.tokenizer.endOfInput()) {
-            // std.debug.print("got eof\n", .{});
             return null;
         }
-        return it.tokenizer.nextToken() catch {
-            // std.debug.print("got err: {s}\n", .{@errorName(err)});
-            return null;
+        return it.tokenizer.nextToken() catch |err| switch (err) {
+            error.InvalidSyntax => unreachable,
+            else => {
+                return null;
+            },
         };
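Net effect of the tokenizer changes: each `next*` entry point now skips leading whitespace itself before pushing a frame, `endOfInput` is public so `parse` can bail out early, and `nextToken` reports end of input as an `.eof` token while real syntax problems surface as errors. A sketch of a consuming loop under that contract (`Tokenizer` initialization and cleanup are not shown in this patch, so they are elided here too):

```zig
// Sketch only: drain tokens under the post-patch contract.
fn drain(tok: *Tokenizer) !void {
    while (true) {
        const token = tok.nextToken() catch |err| switch (err) {
            error.InvalidSyntax => return err, // malformed input propagates
            else => break, // e.g. allocation failure: stop iterating
        };
        if (token.type == .eof) break; // nextToken signals end of input
        // ... dispatch on token.type, as parse() does with `continue :flag`
    }
}
```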