all tests pass

2025-05-29 21:25:31 -05:00 · 2025-05-29 21:25:31 -05:00 · 3952f49d66
commit 3952f49d66
parent 5bca03ea6b
4 changed files with 83 additions and 63 deletions
--- a/README.md
+++ b/README.md
@ -23,11 +23,12 @@
 * (a) Ok but all of your reasons are dumb
    - I'll answer this later
 ## Behaviour
- Null characters, eg: U+0000 are forbidden and will be ignored by the parser
 - All control characters except DEL are forbidden
- It passes most of the JSON test suite, 253 tests as of 2025-05-29 when I'm writing this
+- Null control characters, e.g. U+0000, are forbidden and will be ignored by the parser
+- Of course, it uses null-terminated strings; clearly this is not the best approach, but it's memory efficient and fast as fuck!
+- It passes most unit tests of the [JSON test suite](https://github.com/nst/JSONTestSuite), totalling 286 tests as of 2025-05-29 when I'm writing this.
 - It has no reflection as of right now, you must implement your own `parse` function, with reflection
- It uses null terminated strings, clearly this is not the best approach, but it's efficient
+- All defaults are configurable via the `Flags` bitfield

 ## Sic respondeo:
 The Zig Discord server is plagued with modern scum, of course, modern scum will dismiss all of my claims or label them as "dumb" or "you're using it wrong!", has any of these individuals not considered that Zig is over complicated? There is a reason why Andrew Kelley himself detached from the communities [and has spoken in multiple instances](https://andrewkelley.me/post/goodbye-twitter-reddit.html) about the "shitification" of software communities, it's like turning a good community of like-minded programmers into a soydev shill. One good thing that he did was shutting down the r/zig subreddit.
--- a/language.zig
+++ b/language.zig
@ -10,6 +10,7 @@ const assert = std.debug.assert;
 const Self = @This();

 pub const Error = enum {
+    Eof,
    TrailingComma,
    MissingKey,
    MissingValue,
@ -117,11 +118,23 @@ pub const PropertyEntry = struct {
 };

 pub const Flags = packed struct {
+    /// Make the tokenizer omit comments, TBD
    allow_comments: bool = false,
+
+    /// Do not error on a trailing comma; default is `false` for obvious reasons
    allow_trailing_comma: bool = false,
-    enums_are_strings: bool = false,
-    unions_are_strings: bool = false,
-    packed_structs_are_ints: bool = false,
+
+    /// Allows parsing a `packed struct` as an `int`; the size is that of the backing int
+    bitfields: bool = false,
+
+    /// Allows parsing an `enum` as an `int`; the size is that of the backing int
+    real_enums: bool = false,
+
+    /// Allows parsing unions, default behaviour is yet to be concluded
+    unions: bool = false,
+
+    /// Always cast numbers to f64, as the name says
+    numbersf64: bool = false,
 };

 pub const Options = struct {
@ -221,6 +234,9 @@ pub fn getValue(
    allocator: mem.Allocator,
    idx: usize,
 ) !JsonInput {
+    if (self.index.len == 0)
+        return error.InvalidSyntax;
+
    const entry = self.index.get(idx);

    switch (entry) {
@ -269,14 +285,16 @@ pub fn getValue(

 /// always returns 0 (root)
 pub fn parse(self: *Self, tokenizer: *Tokenizer) !usize {
-    const allocator = tokenizer.allocator;
+    tokenizer.skipWhitespace();

-    var it = tokenizer.iterator();
+    if (tokenizer.endOfInput())
+        return error.Eof;
+
+    const allocator = tokenizer.allocator;

    const root = try self.addEmpty(allocator);

-    var token = it.next() orelse
-        return root;
+    var token = try tokenizer.nextToken();

    var query: std.BoundedArray(usize, self.options.max_depth) = try .init(0);

@ -313,11 +331,11 @@ pub fn parse(self: *Self, tokenizer: *Tokenizer) !usize {
                else => return error.InvalidSyntax,
            }

-            const next = it.next() orelse return error.InvalidSyntax;
+            const next = try tokenizer.nextToken();
            token = next;
            switch (next.type) {
                .colon => {
-                    token = it.next() orelse return error.InvalidSyntax;
+                    token = try tokenizer.nextToken();
                    continue :flag token.type;
                },
                else => continue :flag next.type,
@ -357,7 +375,7 @@ pub fn parse(self: *Self, tokenizer: *Tokenizer) !usize {
                }
            }

-            const next = it.next() orelse return error.InvalidSyntax;
+            const next = try tokenizer.nextToken();
            token = next;
            switch (next.type) {
                .string => continue :flag .property,
@ -373,8 +391,7 @@ pub fn parse(self: *Self, tokenizer: *Tokenizer) !usize {
            if (query.slice().len == 0)
                return root;

-            const next = it.next() orelse
-                return root;
+            const next = try tokenizer.nextToken();

            token = next;
            switch (next.type) {
@ -416,7 +433,7 @@ pub fn parse(self: *Self, tokenizer: *Tokenizer) !usize {
                }
            }

-            const next = it.next() orelse return error.InvalidSyntax;
+            const next = try tokenizer.nextToken();
            token = next;
            switch (next.type) {
                .property => return error.InvalidSyntax,
@ -444,7 +461,10 @@ pub fn parse(self: *Self, tokenizer: *Tokenizer) !usize {
                else => {},
            }

-            const next = it.next() orelse return error.InvalidSyntax;
+            const next = tokenizer.nextToken() catch |err| switch (err) {
+                error.InvalidSyntax => return err,
+                else => return root,
+            };
            token = next;
            switch (next.type) {
                .comma => continue :flag .comma,
@ -467,7 +487,7 @@ pub fn parse(self: *Self, tokenizer: *Tokenizer) !usize {

            const parent_idx = query.get(query.len - 1);

-            const next = it.next() orelse return error.InvalidSyntax;
+            const next = try tokenizer.nextToken();
            switch (next.type) {
                .colon => {
                    continue :flag .property;
@ -512,7 +532,7 @@ pub fn parse(self: *Self, tokenizer: *Tokenizer) !usize {
                else => {},
            }

-            const next = it.next() orelse return error.InvalidSyntax;
+            const next = try tokenizer.nextToken();
            token = next;
            switch (next.type) {
                .comma => continue :flag .comma,
@ -522,7 +542,7 @@ pub fn parse(self: *Self, tokenizer: *Tokenizer) !usize {
        },
        .comma => {
            if (!self.options.flags.allow_trailing_comma) {
-                const next = it.next() orelse return error.InvalidSyntax;
+                const next = try tokenizer.nextToken();
                token = next;
                switch (next.type) {
                    .object_end, .array_end => return error.TrailingComma,
@ -550,7 +570,10 @@ pub fn parse(self: *Self, tokenizer: *Tokenizer) !usize {
                },
                else => {},
            }
-            const next = it.next() orelse return error.InvalidSyntax;
+            const next = tokenizer.nextToken() catch |err| switch (err) {
+                error.InvalidSyntax => return err,
+                else => return root,
+            };
            token = next;
            switch (next.type) {
                .comma => continue :flag .comma,
--- a/test.zig
+++ b/test.zig
@ -123,7 +123,6 @@ test { try expectPass("/y_object_duplicated_key_and_value.json"); }
 test { try expectPass("/y_object_duplicated_key.json"); }
 test { try expectPass("/y_object_empty.json"); }
 test { try expectPass("/y_object_empty_key.json"); }
-// BIG ISSUE
 test { try expectPass("/y_object_escaped_null_in_key.json"); }
 test { try expectPass("/y_object_extreme_numbers.json"); }
 test { try expectPass("/y_object.json"); }
@ -295,55 +294,53 @@ test { try expectFail("/n_object_two_commas_in_a_row.json"); }
 test { try expectFail("/n_object_unquoted_key.json"); }
 test { try expectFail("/n_object_unterminated-value.json"); }
 test { try expectFail("/n_object_with_single_string.json"); }
-// !!!
 test { try expectFail("/n_object_with_trailing_garbage.json"); }
-// test { try expectFail("/n_single_space.json"); }
+test { try expectFail("/n_single_space.json"); }
 test { try expectFail("/n_string_1_surrogate_then_escape.json"); }
-// test { try expectFail("/n_string_1_surrogate_then_escape_u1.json"); }
-// test { try expectFail("/n_string_1_surrogate_then_escape_u1x.json"); }
-// test { try expectFail("/n_string_1_surrogate_then_escape_u.json"); }
+test { try expectFail("/n_string_1_surrogate_then_escape_u1.json"); }
+test { try expectFail("/n_string_1_surrogate_then_escape_u1x.json"); }
+test { try expectFail("/n_string_1_surrogate_then_escape_u.json"); }
 test { try expectFail("/n_string_accentuated_char_no_quotes.json"); }
 test { try expectFail("/n_string_backslash_00.json"); }
 test { try expectFail("/n_string_escaped_backslash_bad.json"); }
 test { try expectFail("/n_string_escaped_ctrl_char_tab.json"); }
-// test { try expectFail("/n_string_escaped_emoji.json"); }
-// test { try expectFail("/n_string_escape_x.json"); }
-// test { try expectFail("/n_string_incomplete_escaped_character.json"); }
+test { try expectFail("/n_string_escaped_emoji.json"); }
+test { try expectFail("/n_string_escape_x.json"); }
+test { try expectFail("/n_string_incomplete_escaped_character.json"); }
 test { try expectFail("/n_string_incomplete_escape.json"); }
-// test { try expectFail("/n_string_incomplete_surrogate_escape_invalid.json"); }
-// test { try expectFail("/n_string_incomplete_surrogate.json"); }
-// test { try expectFail("/n_string_invalid_backslash_esc.json"); }
+test { try expectFail("/n_string_incomplete_surrogate_escape_invalid.json"); }
+test { try expectFail("/n_string_incomplete_surrogate.json"); }
+test { try expectFail("/n_string_invalid_backslash_esc.json"); }
 test { try expectFail("/n_string_invalid_unicode_escape.json"); }
-// test { try expectFail("/n_string_invalid_utf8_after_escape.json"); }
-// test { try expectFail("/n_string_invalid-utf-8-in-escape.json"); }
+test { try expectFail("/n_string_invalid_utf8_after_escape.json"); }
+test { try expectFail("/n_string_invalid-utf-8-in-escape.json"); }
 test { try expectFail("/n_string_leading_uescaped_thinspace.json"); }
 test { try expectFail("/n_string_no_quotes_with_bad_escape.json"); }
-//test { try expectFail("/n_string_single_doublequote.json"); }
+test { try expectFail("/n_string_single_doublequote.json"); }
 test { try expectFail("/n_string_single_quote.json"); }
-//test { try expectFail("/n_string_single_string_no_double_quotes.json"); }
+test { try expectFail("/n_string_single_string_no_double_quotes.json"); }
 test { try expectFail("/n_string_start_escape_unclosed.json"); }
 test { try expectFail("/n_string_unescaped_ctrl_char.json"); }
 test { try expectFail("/n_string_unescaped_newline.json"); }
 test { try expectFail("/n_string_unescaped_tab.json"); }
-// test { try expectFail("/n_string_unicode_CapitalU.json"); }
-// possibly stack overflow
+test { try expectFail("/n_string_unicode_CapitalU.json"); }
 test { try expectFail("/n_string_with_trailing_garbage.json"); }
 test { try expectFail("/n_structure_100000_opening_arrays.json"); }
-//test { try expectFail("/n_structure_angle_bracket_..json"); }
+test { try expectFail("/n_structure_angle_bracket_..json"); }
 test { try expectFail("/n_structure_angle_bracket_null.json"); }
 test { try expectFail("/n_structure_array_trailing_garbage.json"); }
 test { try expectFail("/n_structure_array_with_extra_array_close.json"); }
 test { try expectFail("/n_structure_array_with_unclosed_string.json"); }
-//test { try expectFail("/n_structure_ascii-unicode-identifier.json"); }
+test { try expectFail("/n_structure_ascii-unicode-identifier.json"); }
 test { try expectFail("/n_structure_capitalized_True.json"); }
 test { try expectFail("/n_structure_close_unopened_array.json"); }
 test { try expectFail("/n_structure_comma_instead_of_closing_brace.json"); }
 test { try expectFail("/n_structure_double_array.json"); }
 test { try expectFail("/n_structure_end_array.json"); }
-//test { try expectFail("/n_structure_incomplete_UTF8_BOM.json"); }
-//test { try expectFail("/n_structure_lone-invalid-utf-8.json"); }
+test { try expectFail("/n_structure_incomplete_UTF8_BOM.json"); }
+test { try expectFail("/n_structure_lone-invalid-utf-8.json"); }
 test { try expectFail("/n_structure_lone-open-bracket.json"); }
-//test { try expectFail("/n_structure_no_data.json"); }
+test { try expectFail("/n_structure_no_data.json"); }
 test { try expectFail("/n_structure_null-byte-outside-string.json"); }
 test { try expectFail("/n_structure_number_with_trailing_garbage.json"); }
 test { try expectFail("/n_structure_object_followed_by_closing_object.json"); }
@ -363,8 +360,8 @@ test { try expectFail("/n_structure_open_object_open_array.json"); }
 test { try expectFail("/n_structure_open_object_open_string.json"); }
 test { try expectFail("/n_structure_open_object_string_with_apostrophes.json"); }
 test { try expectFail("/n_structure_open_open.json"); }
-//test { try expectFail("/n_structure_single_eacute.json"); }
-//test { try expectFail("/n_structure_single_star.json"); }
+test { try expectFail("/n_structure_single_eacute.json"); }
+test { try expectFail("/n_structure_single_star.json"); }
 test { try expectFail("/n_structure_trailing_#.json"); }
 test { try expectFail("/n_structure_U+2060_word_joined.json"); }
 test { try expectFail("/n_structure_uescaped_LF_before_string.json"); }
@ -373,8 +370,8 @@ test { try expectFail("/n_structure_unclosed_array_partial_null.json"); }
 test { try expectFail("/n_structure_unclosed_array_unfinished_false.json"); }
 test { try expectFail("/n_structure_unclosed_array_unfinished_true.json"); }
 test { try expectFail("/n_structure_unclosed_object.json"); }
-//test { try expectFail("/n_structure_unicode-identifier.json"); }
-//test { try expectFail("/n_structure_UTF8_BOM_no_data.json"); }
+test { try expectFail("/n_structure_unicode-identifier.json"); }
+test { try expectFail("/n_structure_UTF8_BOM_no_data.json"); }
 test { try expectFail("/n_structure_whitespace_formfeed.json"); }
 test { try expectFail("/n_structure_whitespace_U+2060_word_joiner.json"); }
 // zig fmt: off
--- a/tokenizer.zig
+++ b/tokenizer.zig
@ -124,7 +124,7 @@ fn currentChar(self: *Self) u8 {
    return self.text[self.currentPosition()];
 }

-fn endOfInput(self: *Self) bool {
+pub fn endOfInput(self: *Self) bool {
    return self.currentPosition() >= self.text.len;
 }

@ -204,19 +204,17 @@ fn extractSlice(self: *Self, start: usize) []const u8 {
 // Skip all whitespace characters
 pub fn skipWhitespace(self: *Self) void {
    const start = self.currentPosition();
-    if (self.endOfInput())
-        return;
    const end = skipWhitespaceSimd(self.text[start..]);
    self.advance(end);
 }

 /// Parse a number token
 pub fn nextNumber(self: *Self) Error!Token {
+    self.skipWhitespace();
+
    const start = try self.pushFrame();
    errdefer self.popFrame();

-    self.skipWhitespace();
-
    self.matchChar('-') orelse {}; // this may not fail

    while (self.matchCharRange('0', '9') != null) {}
@ -265,11 +263,11 @@ pub fn nextNumber(self: *Self) Error!Token {

 /// Parse an identifier token
 pub fn nextIdentifier(self: *Self) Error!Token {
+    self.skipWhitespace();
+
    const start = try self.pushFrame();
    errdefer self.popFrame();

-    self.skipWhitespace();
-
    var buffer = try self.allocator.alloc(u8, 0x100);
    defer self.allocator.free(buffer);

@ -322,11 +320,11 @@ pub fn nextIdentifier(self: *Self) Error!Token {
 /// Get the next token from the input
 /// WARNING: this function eats whitespaces
 pub fn nextToken(self: *Self) Error!Token {
+    self.skipWhitespace();
+
    const start = try self.pushFrame();
    errdefer self.popFrame();

-    self.skipWhitespace();
-
    // Fall back to single character symbol
    const c = self.anyChar() orelse return .{
        .type = .eof,
@ -380,11 +378,11 @@ pub fn nextToken(self: *Self) Error!Token {
 }

 pub fn nextString(self: *Self) Error!Token {
+    self.skipWhitespace();
+
    const start = try self.pushFrame();
    errdefer self.popFrame();

-    self.skipWhitespace();
-
    self.matchChar('"') orelse unreachable;

    var buffer: std.ArrayList(u8) = .init(self.allocator);
@ -457,12 +455,13 @@ pub const Iterator = struct {
        errdefer it.tokenizer.deinit();

        if (it.tokenizer.endOfInput()) {
-            // std.debug.print("got eof\n", .{});
            return null;
        }
-        return it.tokenizer.nextToken() catch {
-            // std.debug.print("got err: {s}\n", .{@errorName(err)});
-            return null;
+        return it.tokenizer.nextToken() catch |err| switch (err) {
+            error.InvalidSyntax => unreachable,
+            else => {
+                return null;
+            },
        };
    }