Compare commits

...

2 Commits

Author SHA1 Message Date
da94cc608a added stuff 2025-05-29 18:40:08 -05:00
ad766bc1e8 . 2025-05-27 22:46:07 -05:00
5 changed files with 111 additions and 106 deletions

View File

@ -21,6 +21,14 @@
- This parser is based off of a [string pool](https://en.wikipedia.org/wiki/String_interning) written by Andrew Kelley, so it must be good, [watch a video](https://www.hytradboi.com/2025/05c72e39-c07e-41bc-ac40-85e8308f2917-programming-without-pointers) - This parser is based off of a [string pool](https://en.wikipedia.org/wiki/String_interning) written by Andrew Kelley, so it must be good, [watch a video](https://www.hytradboi.com/2025/05c72e39-c07e-41bc-ac40-85e8308f2917-programming-without-pointers)
- Probably no, but less featureful software (or less worse) is usually better than over-engineered mess - Probably no, but less featureful software (or less worse) is usually better than over-engineered mess
* (a) Ok but all of your reasons are dumb * (a) Ok but all of your reasons are dumb
### Sic respondeo: - I'll answer this later
## Behaviour
- Null characters, e.g. U+0000, are forbidden and will be ignored by the parser
- All control characters except DEL are forbidden
- It passes most of the JSON test suite, 253 tests as of 2025-05-29 when I'm writing this
- It has no reflection as of right now; you must implement your own `parse` function with reflection
- It uses null terminated strings, clearly this is not the best approach, but it's efficient
## Sic respondeo:
The Zig Discord server is plagued with modern scum, of course, modern scum will dismiss all of my claims or label them as "dumb" or "you're using it wrong!", has any of these individuals not considered that Zig is over complicated? There is a reason why Andrew Kelley himself detached from the communities [and has spoken in multiple instances](https://andrewkelley.me/post/goodbye-twitter-reddit.html) about the "shitification" of software communities, it's like turning a good community of like-minded programmers into a soydev shill. One good thing that he did was shutting down the r/zig subreddit. The Zig Discord server is plagued with modern scum, of course, modern scum will dismiss all of my claims or label them as "dumb" or "you're using it wrong!", has any of these individuals not considered that Zig is over complicated? There is a reason why Andrew Kelley himself detached from the communities [and has spoken in multiple instances](https://andrewkelley.me/post/goodbye-twitter-reddit.html) about the "shitification" of software communities, it's like turning a good community of like-minded programmers into a soydev shill. One good thing that he did was shutting down the r/zig subreddit.
On a different note, I defend simplicity and minimalism, so I am not saying that *every* Zig developer who thinks differently is scum, I just say that if you cannot think beyond your viewpoint you will reach nowhere. Lastly, if your software is not straightforward and simple to use, then why criticise someone for not knowing how to use it? The only reasonable way to make software complicated is as long as user friendliness is not a tradeoff for performance or minimalism. On a different note, I defend simplicity and minimalism, so I am not saying that *every* Zig developer who thinks differently is scum, I just say that if you cannot think beyond your viewpoint you will reach nowhere. Lastly, if your software is not straightforward and simple to use, then why criticise someone for not knowing how to use it? The only reasonable way to make software complicated is as long as user friendliness is not a tradeoff for performance or minimalism.

View File

@ -48,14 +48,16 @@ pub const JsonInput = union(JsonType) {
pub fn deinit(self: JsonInput, allocator: mem.Allocator) void { pub fn deinit(self: JsonInput, allocator: mem.Allocator) void {
switch (self) { switch (self) {
JsonInput.array => |array| { .array => |array| {
for (array) |json_input| for (array) |json_input|
json_input.deinit(allocator); json_input.deinit(allocator);
allocator.free(array); allocator.free(array);
}, },
.object => |*object| { .object => |*object| {
var it = object.iterator(); var it = object.iterator();
while (it.next()) |entry| entry.value_ptr.deinit(allocator); while (it.next()) |entry| {
entry.value_ptr.deinit(allocator);
}
@constCast(object).deinit(allocator); @constCast(object).deinit(allocator);
}, },
else => {}, else => {},
@ -98,8 +100,8 @@ pub const JsonInput = union(JsonType) {
/// same as ObjectEntry but simpler /// same as ObjectEntry but simpler
/// start is the offset /// start is the offset
pub const ArraySlice = struct { pub const ArraySlice = struct {
start: usize,
len: usize, len: usize,
start: usize,
}; };
/// just += the properties and value indexes to get the next item /// just += the properties and value indexes to get the next item
@ -107,12 +109,19 @@ pub const ArraySlice = struct {
/// it should be ordered /// it should be ordered
pub const ObjectEntry = struct { pub const ObjectEntry = struct {
len: usize, len: usize,
property_idx: usize, tip: usize,
value_idx: usize, };
pub const PropertyEntry = struct {
tip: StringIndex,
}; };
pub const Flags = packed struct { pub const Flags = packed struct {
allow_comments: bool = false,
allow_trailing_comma: bool = false, allow_trailing_comma: bool = false,
enums_are_strings: bool = false,
unions_are_strings: bool = false,
packed_structs_are_ints: bool = false,
}; };
pub const Options = struct { pub const Options = struct {
@ -122,8 +131,9 @@ pub const Options = struct {
}; };
index: std.MultiArrayList(JsonValue) = .{}, index: std.MultiArrayList(JsonValue) = .{},
string_index: StringPool = .empty, strings: StringPool = .empty,
property_index: StringPool = .empty, properties: StringPool = .empty,
property_map: std.AutoArrayHashMapUnmanaged(usize, PropertyEntry) = .empty,
options: Options = .{}, options: Options = .{},
@ -131,8 +141,9 @@ pub const init = Self{};
pub fn deinit(self: *Self, allocator: mem.Allocator) void { pub fn deinit(self: *Self, allocator: mem.Allocator) void {
self.index.deinit(allocator); self.index.deinit(allocator);
self.property_index.deinit(allocator); self.properties.deinit(allocator);
self.string_index.deinit(allocator); self.strings.deinit(allocator);
self.property_map.deinit(allocator);
} }
fn addNumber(self: *Self, allocator: mem.Allocator, number: f64) !usize { fn addNumber(self: *Self, allocator: mem.Allocator, number: f64) !usize {
@ -143,13 +154,14 @@ fn addNumber(self: *Self, allocator: mem.Allocator, number: f64) !usize {
} }
fn addProperty(self: *Self, allocator: mem.Allocator, bytes: []const u8) !usize { fn addProperty(self: *Self, allocator: mem.Allocator, bytes: []const u8) !usize {
const stridx = try self.property_index.add(allocator, bytes); const stridx = try self.properties.add(allocator, bytes);
try self.index.ensureUnusedCapacity(allocator, 1); try self.index.ensureUnusedCapacity(allocator, 1);
try self.property_map.ensureUnusedCapacity(allocator, 1);
return @intFromEnum(stridx); return @intFromEnum(stridx);
} }
fn addString(self: *Self, allocator: mem.Allocator, bytes: []const u8) !usize { fn addString(self: *Self, allocator: mem.Allocator, bytes: []const u8) !usize {
const stridx = try self.string_index.add(allocator, bytes); const stridx = try self.strings.add(allocator, bytes);
try self.index.ensureUnusedCapacity(allocator, 1); try self.index.ensureUnusedCapacity(allocator, 1);
const idx = self.index.addOneAssumeCapacity(); const idx = self.index.addOneAssumeCapacity();
self.index.set(idx, .{ .string = stridx }); self.index.set(idx, .{ .string = stridx });
@ -182,7 +194,7 @@ fn skipSlots(self: *Self, slot: usize) usize {
switch (e) { switch (e) {
.object => |obj| { .object => |obj| {
var total: usize = 1; var total: usize = 1;
var v = obj.value_idx; var v = obj.tip;
for (0..obj.len) |_| { for (0..obj.len) |_| {
const s = skipSlots(self, v); const s = skipSlots(self, v);
total += s; total += s;
@ -204,25 +216,6 @@ fn skipSlots(self: *Self, slot: usize) usize {
} }
} }
fn skipNestedProps(self: *Self, pptr: *usize, slot: usize) void {
const e = self.index.get(slot);
if (e == .object) {
var v = e.object.value_idx;
// Skip each nested key and its deeper nested props
for (0..e.object.len) |_| {
// Skip this key
const k: *StringIndex = @ptrCast(pptr);
const slice = k.slice(&self.property_index);
pptr.* += slice.len + 1;
// Recurse into this property's value
skipNestedProps(self, pptr, v);
// Skip slots of the value in index array
const s = skipSlots(self, v);
v += s;
}
}
}
pub fn getValue( pub fn getValue(
self: *Self, self: *Self,
allocator: mem.Allocator, allocator: mem.Allocator,
@ -235,7 +228,7 @@ pub fn getValue(
.bool => |b| return .{ .bool = b }, .bool => |b| return .{ .bool = b },
.number => |number| return .{ .number = number }, .number => |number| return .{ .number = number },
.string => |string| { .string => |string| {
const sl = string.slice(&self.string_index); const sl = string.slice(&self.strings);
return .{ .string = sl }; return .{ .string = sl };
}, },
.array => |arr| { .array => |arr| {
@ -250,23 +243,20 @@ pub fn getValue(
}, },
.object => |obj| { .object => |obj| {
var map: JsonInput.Object = .empty; var map: JsonInput.Object = .empty;
var p = obj.property_idx; errdefer map.deinit(allocator);
var v = obj.value_idx; var tip = obj.tip;
for (0..obj.len) |_| {
// Extract key for (0..obj.len) |_|
const k: StringIndex = @enumFromInt(p); if (self.property_map.get(tip)) |pen| {
const key_slice = k.slice(&self.property_index); const key_slice = pen.tip.slice(&self.properties);
// Extract and assign value
const val = try self.getValue(allocator, v); const val = try self.getValue(allocator, tip);
try map.put(allocator, key_slice, val); try map.put(allocator, key_slice, val);
// Advance past this key const s = self.skipSlots(tip);
p += key_slice.len + 1; tip += s;
// Skip nested property names of this value } else {
self.skipNestedProps(&p, v); return error.MissingKey;
// Advance past the value slots };
const s = self.skipSlots(v);
v += s;
}
return .{ .object = map }; return .{ .object = map };
}, },
} }
@ -299,11 +289,13 @@ pub fn parse(self: *Self, tokenizer: *Tokenizer) !usize {
.object => |scope| { .object => |scope| {
//std.debug.print("prop: {s} \n", .{token.value.?.string}); //std.debug.print("prop: {s} \n", .{token.value.?.string});
const pidx = try self.addProperty(allocator, token.value.?.string); const pidx = try self.addProperty(allocator, token.value.?.string);
self.property_map.putAssumeCapacity(scope.len + 1, .{ .tip = @enumFromInt(pidx) });
allocator.free(token.value.?.string); allocator.free(token.value.?.string);
self.index.set(scope_idx, .{ .object = ObjectEntry{ self.index.set(scope_idx, .{ .object = ObjectEntry{
.len = scope.len + 1, .len = scope.len + 1,
.property_idx = if (scope.len == 0) pidx else scope.property_idx, .tip = scope.tip,
.value_idx = scope.value_idx,
} }); } });
}, },
else => return error.InvalidSyntax, else => return error.InvalidSyntax,
@ -328,8 +320,7 @@ pub fn parse(self: *Self, tokenizer: *Tokenizer) !usize {
ptr.* = root; ptr.* = root;
self.index.set(root, .{ .object = ObjectEntry{ self.index.set(root, .{ .object = ObjectEntry{
.len = 0, .len = 0,
.property_idx = 0, .tip = 1,
.value_idx = 1,
} }); } });
} else { } else {
//order //order
@ -340,8 +331,7 @@ pub fn parse(self: *Self, tokenizer: *Tokenizer) !usize {
self.index.set(idx_ptr.*, .{ self.index.set(idx_ptr.*, .{
.object = ObjectEntry{ .object = ObjectEntry{
.len = 0, .len = 0,
.property_idx = self.index.len, //self.property_index.string_bytes.items.len, .tip = self.index.len,
.value_idx = self.index.len,
}, },
}); });
switch (self.index.get(parent_idx)) { switch (self.index.get(parent_idx)) {
@ -451,6 +441,7 @@ pub fn parse(self: *Self, tokenizer: *Tokenizer) !usize {
if (query.len == 0) { if (query.len == 0) {
// root // root
const idx = try self.addString(allocator, token.value.?.string); const idx = try self.addString(allocator, token.value.?.string);
allocator.free(token.value.?.string);
self.index.set(root, .{ .string = @enumFromInt(idx) }); self.index.set(root, .{ .string = @enumFromInt(idx) });
return root; return root;
} }
@ -516,6 +507,7 @@ pub fn parse(self: *Self, tokenizer: *Tokenizer) !usize {
token = next; token = next;
switch (next.type) { switch (next.type) {
.object_end, .array_end => return error.TrailingComma, .object_end, .array_end => return error.TrailingComma,
.comma => return error.InvalidSyntax,
else => continue :flag token.type, else => continue :flag token.type,
} }
} }

3
root.zig Normal file
View File

@ -0,0 +1,3 @@
pub const Language = @import("language.zig");
pub const Tokenizer = @import("tokenizer.zig");
pub const StringPool = @import("strings.zig");

View File

@ -16,7 +16,7 @@ test Language {
\\ "a": 2, \\ "a": 2,
\\ "c": { \\ "c": {
\\ "d": 4, \\ "d": 4,
\\ "uwu": [[[[[1], [2]]]]], \\ "uwua": [[[[[1], [2]]]]],
\\ "x": true \\ "x": true
\\ } \\ }
\\ }, \\ },
@ -119,60 +119,59 @@ test { try expectPass("/y_number_real_pos_exponent.json"); }
test { try expectPass("/y_number_simple_int.json"); } test { try expectPass("/y_number_simple_int.json"); }
test { try expectPass("/y_number_simple_real.json"); } test { try expectPass("/y_number_simple_real.json"); }
test { try expectPass("/y_object_basic.json"); } test { try expectPass("/y_object_basic.json"); }
// maybe there is a better way to handle these test { try expectPass("/y_object_duplicated_key_and_value.json"); }
// test { try expectPass("/y_object_duplicated_key_and_value.json"); } test { try expectPass("/y_object_duplicated_key.json"); }
// test { try expectPass("/y_object_duplicated_key.json"); }
test { try expectPass("/y_object_empty.json"); } test { try expectPass("/y_object_empty.json"); }
test { try expectPass("/y_object_empty_key.json"); } test { try expectPass("/y_object_empty_key.json"); }
// BIG ISSUE // BIG ISSUE
// test { try expectPass("/y_object_escaped_null_in_key.json"); } test { try expectPass("/y_object_escaped_null_in_key.json"); }
test { try expectPass("/y_object_extreme_numbers.json"); } test { try expectPass("/y_object_extreme_numbers.json"); }
test { try expectPass("/y_object.json"); } test { try expectPass("/y_object.json"); }
//test { try expectPass("/y_object_long_strings.json"); } test { try expectPass("/y_object_long_strings.json"); }
test { try expectPass("/y_object_simple.json"); } test { try expectPass("/y_object_simple.json"); }
//test { try expectPass("/y_object_string_unicode.json"); } test { try expectPass("/y_object_string_unicode.json"); }
test { try expectPass("/y_object_with_newlines.json"); } test { try expectPass("/y_object_with_newlines.json"); }
//test { try expectPass("/y_string_1_2_3_bytes_UTF-8_sequences.json"); } test { try expectPass("/y_string_1_2_3_bytes_UTF-8_sequences.json"); }
//test { try expectPass("/y_string_accepted_surrogate_pair.json"); } test { try expectPass("/y_string_accepted_surrogate_pair.json"); }
//test { try expectPass("/y_string_accepted_surrogate_pairs.json"); } test { try expectPass("/y_string_accepted_surrogate_pairs.json"); }
//test { try expectPass("/y_string_allowed_escapes.json"); } test { try expectPass("/y_string_allowed_escapes.json"); }
//test { try expectPass("/y_string_backslash_and_u_escaped_zero.json"); } test { try expectPass("/y_string_backslash_and_u_escaped_zero.json"); }
//test { try expectPass("/y_string_backslash_doublequotes.json"); } test { try expectPass("/y_string_backslash_doublequotes.json"); }
//test { try expectPass("/y_string_comments.json"); } test { try expectPass("/y_string_comments.json"); }
//test { try expectPass("/y_string_double_escape_a.json"); } test { try expectPass("/y_string_double_escape_a.json"); }
//test { try expectPass("/y_string_double_escape_n.json"); } test { try expectPass("/y_string_double_escape_n.json"); }
//test { try expectPass("/y_string_escaped_control_character.json"); } test { try expectPass("/y_string_escaped_control_character.json"); }
//test { try expectPass("/y_string_escaped_noncharacter.json"); } test { try expectPass("/y_string_escaped_noncharacter.json"); }
//test { try expectPass("/y_string_in_array.json"); } test { try expectPass("/y_string_in_array.json"); }
//test { try expectPass("/y_string_in_array_with_leading_space.json"); } test { try expectPass("/y_string_in_array_with_leading_space.json"); }
//test { try expectPass("/y_string_last_surrogates_1_and_2.json"); } test { try expectPass("/y_string_last_surrogates_1_and_2.json"); }
//test { try expectPass("/y_string_nbsp_uescaped.json"); } test { try expectPass("/y_string_nbsp_uescaped.json"); }
//test { try expectPass("/y_string_nonCharacterInUTF-8_U+10FFFF.json"); } test { try expectPass("/y_string_nonCharacterInUTF-8_U+10FFFF.json"); }
//test { try expectPass("/y_string_nonCharacterInUTF-8_U+FFFF.json"); } test { try expectPass("/y_string_nonCharacterInUTF-8_U+FFFF.json"); }
//test { try expectPass("/y_string_null_escape.json"); } test { try expectPass("/y_string_null_escape.json"); }
//test { try expectPass("/y_string_one-byte-utf-8.json"); } test { try expectPass("/y_string_one-byte-utf-8.json"); }
//test { try expectPass("/y_string_pi.json"); } test { try expectPass("/y_string_pi.json"); }
//test { try expectPass("/y_string_reservedCharacterInUTF-8_U+1BFFF.json"); } test { try expectPass("/y_string_reservedCharacterInUTF-8_U+1BFFF.json"); }
//test { try expectPass("/y_string_simple_ascii.json"); } test { try expectPass("/y_string_simple_ascii.json"); }
//test { try expectPass("/y_string_space.json"); } test { try expectPass("/y_string_space.json"); }
//test { try expectPass("/y_string_surrogates_U+1D11E_MUSICAL_SYMBOL_G_CLEF.json"); } test { try expectPass("/y_string_surrogates_U+1D11E_MUSICAL_SYMBOL_G_CLEF.json"); }
//test { try expectPass("/y_string_three-byte-utf-8.json"); } test { try expectPass("/y_string_three-byte-utf-8.json"); }
//test { try expectPass("/y_string_two-byte-utf-8.json"); } test { try expectPass("/y_string_two-byte-utf-8.json"); }
//test { try expectPass("/y_string_u+2028_line_sep.json"); } test { try expectPass("/y_string_u+2028_line_sep.json"); }
//test { try expectPass("/y_string_u+2029_par_sep.json"); } test { try expectPass("/y_string_u+2029_par_sep.json"); }
//test { try expectPass("/y_string_uescaped_newline.json"); } test { try expectPass("/y_string_uescaped_newline.json"); }
//test { try expectPass("/y_string_uEscape.json"); } test { try expectPass("/y_string_uEscape.json"); }
//test { try expectPass("/y_string_unescaped_char_delete.json"); } test { try expectPass("/y_string_unescaped_char_delete.json"); }
//test { try expectPass("/y_string_unicode_2.json"); } test { try expectPass("/y_string_unicode_2.json"); }
//test { try expectPass("/y_string_unicodeEscapedBackslash.json"); } test { try expectPass("/y_string_unicodeEscapedBackslash.json"); }
//test { try expectPass("/y_string_unicode_escaped_double_quote.json"); } test { try expectPass("/y_string_unicode_escaped_double_quote.json"); }
//test { try expectPass("/y_string_unicode.json"); } test { try expectPass("/y_string_unicode.json"); }
//test { try expectPass("/y_string_unicode_U+10FFFE_nonchar.json"); } test { try expectPass("/y_string_unicode_U+10FFFE_nonchar.json"); }
//test { try expectPass("/y_string_unicode_U+1FFFE_nonchar.json"); } test { try expectPass("/y_string_unicode_U+1FFFE_nonchar.json"); }
//test { try expectPass("/y_string_unicode_U+200B_ZERO_WIDTH_SPACE.json"); } test { try expectPass("/y_string_unicode_U+200B_ZERO_WIDTH_SPACE.json"); }
//test { try expectPass("/y_string_unicode_U+2064_invisible_plus.json"); } test { try expectPass("/y_string_unicode_U+2064_invisible_plus.json"); }
//test { try expectPass("/y_string_unicode_U+FDD0_nonchar.json"); } test { try expectPass("/y_string_unicode_U+FDD0_nonchar.json"); }
//test { try expectPass("/y_string_unicode_U+FFFE_nonchar.json"); } test { try expectPass("/y_string_unicode_U+FFFE_nonchar.json"); }
test { try expectPass("/y_string_utf8.json"); } test { try expectPass("/y_string_utf8.json"); }
test { try expectPass("/y_string_with_del_character.json"); } test { try expectPass("/y_string_with_del_character.json"); }
test { try expectPass("/y_structure_lonely_false.json"); } test { try expectPass("/y_structure_lonely_false.json"); }
@ -297,7 +296,7 @@ test { try expectFail("/n_object_unquoted_key.json"); }
test { try expectFail("/n_object_unterminated-value.json"); } test { try expectFail("/n_object_unterminated-value.json"); }
test { try expectFail("/n_object_with_single_string.json"); } test { try expectFail("/n_object_with_single_string.json"); }
// !!! // !!!
// test { try expectFail("/n_object_with_trailing_garbage.json"); } test { try expectFail("/n_object_with_trailing_garbage.json"); }
// test { try expectFail("/n_single_space.json"); } // test { try expectFail("/n_single_space.json"); }
test { try expectFail("/n_string_1_surrogate_then_escape.json"); } test { try expectFail("/n_string_1_surrogate_then_escape.json"); }
// test { try expectFail("/n_string_1_surrogate_then_escape_u1.json"); } // test { try expectFail("/n_string_1_surrogate_then_escape_u1.json"); }

View File

@ -395,6 +395,9 @@ pub fn nextString(self: *Self) Error!Token {
switch (try self.lastChar()) { switch (try self.lastChar()) {
'"' => { '"' => {
while (std.mem.indexOfScalar(u8, buffer.items, 0x00)) |idx|
_ = buffer.swapRemove(idx);
return .{ return .{
.type = .string, .type = .string,
.value = .{ .string = try buffer.toOwnedSlice() }, .value = .{ .string = try buffer.toOwnedSlice() },
@ -435,7 +438,7 @@ pub fn nextString(self: *Self) Error!Token {
} // end switch } // end switch
}, },
else => |c| { else => |c| {
if (std.ascii.isControl(c)) { if (std.ascii.isControl(c) and c != std.ascii.control_code.del) {
return error.UnexpectedCharacter; return error.UnexpectedCharacter;
} }
try buffer.append(c); try buffer.append(c);