all tests pass

This commit is contained in:
yuzu 2025-05-29 21:25:31 -05:00
parent 5bca03ea6b
commit 3952f49d66
4 changed files with 83 additions and 63 deletions

View File

@ -23,11 +23,12 @@
* (a) Ok but all of your reasons are dumb
- I'll answer this later
## Behaviour
- Null characters (e.g. U+0000) are forbidden and will be ignored by the parser
- All control characters except DEL are forbidden
- It passes most of the JSON test suite, 253 tests as of 2025-05-29 when I'm writing this
- Null control characters (e.g. U+0000) are forbidden and will be ignored by the parser
- Of course, it uses null-terminated strings; clearly this is not the best approach, but it's memory efficient and fast as fuck!
- It passes most unit tests of the [JSON test suite](https://github.com/nst/JSONTestSuite), totalling 286 tests as of 2025-05-29, when I'm writing this.
- It has no reflection as of right now; you must implement your own `parse` function, with reflection
- It uses null-terminated strings; clearly this is not the best approach, but it's efficient
- All defaults are configurable via the `Flags` bitfield
## Sic respondeo:
The Zig Discord server is plagued with modern scum. Of course, modern scum will dismiss all of my claims or label them as "dumb" or "you're using it wrong!". Have any of these individuals not considered that Zig is overcomplicated? There is a reason why Andrew Kelley himself detached from the communities [and has spoken in multiple instances](https://andrewkelley.me/post/goodbye-twitter-reddit.html) about the "shitification" of software communities: it's like turning a good community of like-minded programmers into a soydev shill. One good thing that he did was shutting down the r/zig subreddit.

View File

@ -10,6 +10,7 @@ const assert = std.debug.assert;
const Self = @This();
pub const Error = enum {
Eof,
TrailingComma,
MissingKey,
MissingValue,
@ -117,11 +118,23 @@ pub const PropertyEntry = struct {
};
/// Parser behaviour switches, declared as a `packed struct` of `bool` fields.
/// Every field shown here defaults to `false`.
pub const Flags = packed struct {
/// Intended to make the tokenizer omit comments (TBD).
allow_comments: bool = false,
/// Do not error on a trailing comma. While this is `false`, `parse` inspects
/// the token after a comma and returns `error.TrailingComma` when it is
/// `.object_end` or `.array_end`.
allow_trailing_comma: bool = false,
// NOTE(review): the next three flags are undocumented. Their names suggest
// enums/unions would be read from strings and packed structs from ints —
// confirm against the reflection code before relying on them.
enums_are_strings: bool = false,
unions_are_strings: bool = false,
packed_structs_are_ints: bool = false,
/// Allows parsing a `packed struct` as an `int`; the size is that of the backing int.
bitfields: bool = false,
/// Allows parsing an `enum` as an `int`; the size is that of the backing int.
real_enums: bool = false,
/// Allows parsing unions; the default behaviour is yet to be decided.
unions: bool = false,
/// Always cast numbers to `f64`.
numbersf64: bool = false,
};
pub const Options = struct {
@ -221,6 +234,9 @@ pub fn getValue(
allocator: mem.Allocator,
idx: usize,
) !JsonInput {
if (self.index.len == 0)
return error.InvalidSyntax;
const entry = self.index.get(idx);
switch (entry) {
@ -269,14 +285,16 @@ pub fn getValue(
/// always returns 0 (root)
pub fn parse(self: *Self, tokenizer: *Tokenizer) !usize {
const allocator = tokenizer.allocator;
tokenizer.skipWhitespace();
var it = tokenizer.iterator();
if (tokenizer.endOfInput())
return error.Eof;
const allocator = tokenizer.allocator;
const root = try self.addEmpty(allocator);
var token = it.next() orelse
return root;
var token = try tokenizer.nextToken();
var query: std.BoundedArray(usize, self.options.max_depth) = try .init(0);
@ -313,11 +331,11 @@ pub fn parse(self: *Self, tokenizer: *Tokenizer) !usize {
else => return error.InvalidSyntax,
}
const next = it.next() orelse return error.InvalidSyntax;
const next = try tokenizer.nextToken();
token = next;
switch (next.type) {
.colon => {
token = it.next() orelse return error.InvalidSyntax;
token = try tokenizer.nextToken();
continue :flag token.type;
},
else => continue :flag next.type,
@ -357,7 +375,7 @@ pub fn parse(self: *Self, tokenizer: *Tokenizer) !usize {
}
}
const next = it.next() orelse return error.InvalidSyntax;
const next = try tokenizer.nextToken();
token = next;
switch (next.type) {
.string => continue :flag .property,
@ -373,8 +391,7 @@ pub fn parse(self: *Self, tokenizer: *Tokenizer) !usize {
if (query.slice().len == 0)
return root;
const next = it.next() orelse
return root;
const next = try tokenizer.nextToken();
token = next;
switch (next.type) {
@ -416,7 +433,7 @@ pub fn parse(self: *Self, tokenizer: *Tokenizer) !usize {
}
}
const next = it.next() orelse return error.InvalidSyntax;
const next = try tokenizer.nextToken();
token = next;
switch (next.type) {
.property => return error.InvalidSyntax,
@ -444,7 +461,10 @@ pub fn parse(self: *Self, tokenizer: *Tokenizer) !usize {
else => {},
}
const next = it.next() orelse return error.InvalidSyntax;
const next = tokenizer.nextToken() catch |err| switch (err) {
error.InvalidSyntax => return err,
else => return root,
};
token = next;
switch (next.type) {
.comma => continue :flag .comma,
@ -467,7 +487,7 @@ pub fn parse(self: *Self, tokenizer: *Tokenizer) !usize {
const parent_idx = query.get(query.len - 1);
const next = it.next() orelse return error.InvalidSyntax;
const next = try tokenizer.nextToken();
switch (next.type) {
.colon => {
continue :flag .property;
@ -512,7 +532,7 @@ pub fn parse(self: *Self, tokenizer: *Tokenizer) !usize {
else => {},
}
const next = it.next() orelse return error.InvalidSyntax;
const next = try tokenizer.nextToken();
token = next;
switch (next.type) {
.comma => continue :flag .comma,
@ -522,7 +542,7 @@ pub fn parse(self: *Self, tokenizer: *Tokenizer) !usize {
},
.comma => {
if (!self.options.flags.allow_trailing_comma) {
const next = it.next() orelse return error.InvalidSyntax;
const next = try tokenizer.nextToken();
token = next;
switch (next.type) {
.object_end, .array_end => return error.TrailingComma,
@ -550,7 +570,10 @@ pub fn parse(self: *Self, tokenizer: *Tokenizer) !usize {
},
else => {},
}
const next = it.next() orelse return error.InvalidSyntax;
const next = tokenizer.nextToken() catch |err| switch (err) {
error.InvalidSyntax => return err,
else => return root,
};
token = next;
switch (next.type) {
.comma => continue :flag .comma,

View File

@ -123,7 +123,6 @@ test { try expectPass("/y_object_duplicated_key_and_value.json"); }
test { try expectPass("/y_object_duplicated_key.json"); }
test { try expectPass("/y_object_empty.json"); }
test { try expectPass("/y_object_empty_key.json"); }
// BIG ISSUE
test { try expectPass("/y_object_escaped_null_in_key.json"); }
test { try expectPass("/y_object_extreme_numbers.json"); }
test { try expectPass("/y_object.json"); }
@ -295,55 +294,53 @@ test { try expectFail("/n_object_two_commas_in_a_row.json"); }
test { try expectFail("/n_object_unquoted_key.json"); }
test { try expectFail("/n_object_unterminated-value.json"); }
test { try expectFail("/n_object_with_single_string.json"); }
// !!!
test { try expectFail("/n_object_with_trailing_garbage.json"); }
// test { try expectFail("/n_single_space.json"); }
test { try expectFail("/n_single_space.json"); }
test { try expectFail("/n_string_1_surrogate_then_escape.json"); }
// test { try expectFail("/n_string_1_surrogate_then_escape_u1.json"); }
// test { try expectFail("/n_string_1_surrogate_then_escape_u1x.json"); }
// test { try expectFail("/n_string_1_surrogate_then_escape_u.json"); }
test { try expectFail("/n_string_1_surrogate_then_escape_u1.json"); }
test { try expectFail("/n_string_1_surrogate_then_escape_u1x.json"); }
test { try expectFail("/n_string_1_surrogate_then_escape_u.json"); }
test { try expectFail("/n_string_accentuated_char_no_quotes.json"); }
test { try expectFail("/n_string_backslash_00.json"); }
test { try expectFail("/n_string_escaped_backslash_bad.json"); }
test { try expectFail("/n_string_escaped_ctrl_char_tab.json"); }
// test { try expectFail("/n_string_escaped_emoji.json"); }
// test { try expectFail("/n_string_escape_x.json"); }
// test { try expectFail("/n_string_incomplete_escaped_character.json"); }
test { try expectFail("/n_string_escaped_emoji.json"); }
test { try expectFail("/n_string_escape_x.json"); }
test { try expectFail("/n_string_incomplete_escaped_character.json"); }
test { try expectFail("/n_string_incomplete_escape.json"); }
// test { try expectFail("/n_string_incomplete_surrogate_escape_invalid.json"); }
// test { try expectFail("/n_string_incomplete_surrogate.json"); }
// test { try expectFail("/n_string_invalid_backslash_esc.json"); }
test { try expectFail("/n_string_incomplete_surrogate_escape_invalid.json"); }
test { try expectFail("/n_string_incomplete_surrogate.json"); }
test { try expectFail("/n_string_invalid_backslash_esc.json"); }
test { try expectFail("/n_string_invalid_unicode_escape.json"); }
// test { try expectFail("/n_string_invalid_utf8_after_escape.json"); }
// test { try expectFail("/n_string_invalid-utf-8-in-escape.json"); }
test { try expectFail("/n_string_invalid_utf8_after_escape.json"); }
test { try expectFail("/n_string_invalid-utf-8-in-escape.json"); }
test { try expectFail("/n_string_leading_uescaped_thinspace.json"); }
test { try expectFail("/n_string_no_quotes_with_bad_escape.json"); }
//test { try expectFail("/n_string_single_doublequote.json"); }
test { try expectFail("/n_string_single_doublequote.json"); }
test { try expectFail("/n_string_single_quote.json"); }
//test { try expectFail("/n_string_single_string_no_double_quotes.json"); }
test { try expectFail("/n_string_single_string_no_double_quotes.json"); }
test { try expectFail("/n_string_start_escape_unclosed.json"); }
test { try expectFail("/n_string_unescaped_ctrl_char.json"); }
test { try expectFail("/n_string_unescaped_newline.json"); }
test { try expectFail("/n_string_unescaped_tab.json"); }
// test { try expectFail("/n_string_unicode_CapitalU.json"); }
// possibly stack overflow
test { try expectFail("/n_string_unicode_CapitalU.json"); }
test { try expectFail("/n_string_with_trailing_garbage.json"); }
test { try expectFail("/n_structure_100000_opening_arrays.json"); }
//test { try expectFail("/n_structure_angle_bracket_..json"); }
test { try expectFail("/n_structure_angle_bracket_..json"); }
test { try expectFail("/n_structure_angle_bracket_null.json"); }
test { try expectFail("/n_structure_array_trailing_garbage.json"); }
test { try expectFail("/n_structure_array_with_extra_array_close.json"); }
test { try expectFail("/n_structure_array_with_unclosed_string.json"); }
//test { try expectFail("/n_structure_ascii-unicode-identifier.json"); }
test { try expectFail("/n_structure_ascii-unicode-identifier.json"); }
test { try expectFail("/n_structure_capitalized_True.json"); }
test { try expectFail("/n_structure_close_unopened_array.json"); }
test { try expectFail("/n_structure_comma_instead_of_closing_brace.json"); }
test { try expectFail("/n_structure_double_array.json"); }
test { try expectFail("/n_structure_end_array.json"); }
//test { try expectFail("/n_structure_incomplete_UTF8_BOM.json"); }
//test { try expectFail("/n_structure_lone-invalid-utf-8.json"); }
test { try expectFail("/n_structure_incomplete_UTF8_BOM.json"); }
test { try expectFail("/n_structure_lone-invalid-utf-8.json"); }
test { try expectFail("/n_structure_lone-open-bracket.json"); }
//test { try expectFail("/n_structure_no_data.json"); }
test { try expectFail("/n_structure_no_data.json"); }
test { try expectFail("/n_structure_null-byte-outside-string.json"); }
test { try expectFail("/n_structure_number_with_trailing_garbage.json"); }
test { try expectFail("/n_structure_object_followed_by_closing_object.json"); }
@ -363,8 +360,8 @@ test { try expectFail("/n_structure_open_object_open_array.json"); }
test { try expectFail("/n_structure_open_object_open_string.json"); }
test { try expectFail("/n_structure_open_object_string_with_apostrophes.json"); }
test { try expectFail("/n_structure_open_open.json"); }
//test { try expectFail("/n_structure_single_eacute.json"); }
//test { try expectFail("/n_structure_single_star.json"); }
test { try expectFail("/n_structure_single_eacute.json"); }
test { try expectFail("/n_structure_single_star.json"); }
test { try expectFail("/n_structure_trailing_#.json"); }
test { try expectFail("/n_structure_U+2060_word_joined.json"); }
test { try expectFail("/n_structure_uescaped_LF_before_string.json"); }
@ -373,8 +370,8 @@ test { try expectFail("/n_structure_unclosed_array_partial_null.json"); }
test { try expectFail("/n_structure_unclosed_array_unfinished_false.json"); }
test { try expectFail("/n_structure_unclosed_array_unfinished_true.json"); }
test { try expectFail("/n_structure_unclosed_object.json"); }
//test { try expectFail("/n_structure_unicode-identifier.json"); }
//test { try expectFail("/n_structure_UTF8_BOM_no_data.json"); }
test { try expectFail("/n_structure_unicode-identifier.json"); }
test { try expectFail("/n_structure_UTF8_BOM_no_data.json"); }
test { try expectFail("/n_structure_whitespace_formfeed.json"); }
test { try expectFail("/n_structure_whitespace_U+2060_word_joiner.json"); }
// zig fmt: off

View File

@ -124,7 +124,7 @@ fn currentChar(self: *Self) u8 {
return self.text[self.currentPosition()];
}
fn endOfInput(self: *Self) bool {
pub fn endOfInput(self: *Self) bool {
return self.currentPosition() >= self.text.len;
}
@ -204,19 +204,17 @@ fn extractSlice(self: *Self, start: usize) []const u8 {
/// Skip all whitespace characters, starting at the current position.
/// Returns immediately, without advancing, when the tokenizer is already at
/// the end of the input.
pub fn skipWhitespace(self: *Self) void {
const start = self.currentPosition();
// Nothing left to scan.
if (self.endOfInput())
return;
// NOTE(review): presumably `skipWhitespaceSimd` returns how many leading
// bytes of the given slice are whitespace — confirm in its definition.
const end = skipWhitespaceSimd(self.text[start..]);
// Move the cursor forward by the amount reported above.
self.advance(end);
}
/// Parse a number token
pub fn nextNumber(self: *Self) Error!Token {
self.skipWhitespace();
const start = try self.pushFrame();
errdefer self.popFrame();
self.skipWhitespace();
self.matchChar('-') orelse {}; // this may not fail
while (self.matchCharRange('0', '9') != null) {}
@ -265,11 +263,11 @@ pub fn nextNumber(self: *Self) Error!Token {
/// Parse an identifier token
pub fn nextIdentifier(self: *Self) Error!Token {
self.skipWhitespace();
const start = try self.pushFrame();
errdefer self.popFrame();
self.skipWhitespace();
var buffer = try self.allocator.alloc(u8, 0x100);
defer self.allocator.free(buffer);
@ -322,11 +320,11 @@ pub fn nextIdentifier(self: *Self) Error!Token {
/// Get the next token from the input
/// WARNING: this function eats whitespaces
pub fn nextToken(self: *Self) Error!Token {
self.skipWhitespace();
const start = try self.pushFrame();
errdefer self.popFrame();
self.skipWhitespace();
// Fall back to single character symbol
const c = self.anyChar() orelse return .{
.type = .eof,
@ -380,11 +378,11 @@ pub fn nextToken(self: *Self) Error!Token {
}
pub fn nextString(self: *Self) Error!Token {
self.skipWhitespace();
const start = try self.pushFrame();
errdefer self.popFrame();
self.skipWhitespace();
self.matchChar('"') orelse unreachable;
var buffer: std.ArrayList(u8) = .init(self.allocator);
@ -457,12 +455,13 @@ pub const Iterator = struct {
errdefer it.tokenizer.deinit();
if (it.tokenizer.endOfInput()) {
// std.debug.print("got eof\n", .{});
return null;
}
return it.tokenizer.nextToken() catch {
// std.debug.print("got err: {s}\n", .{@errorName(err)});
return null;
return it.tokenizer.nextToken() catch |err| switch (err) {
error.InvalidSyntax => unreachable,
else => {
return null;
},
};
}