added stuff

This commit is contained in:
yuzu 2025-05-29 18:40:08 -05:00
parent ad766bc1e8
commit da94cc608a
6 changed files with 107 additions and 176 deletions

View File

@ -21,6 +21,14 @@
- This parser is based off of a [string pool](https://en.wikipedia.org/wiki/String_interning) written by Andrew Kelley, so it must be good, [watch a video](https://www.hytradboi.com/2025/05c72e39-c07e-41bc-ac40-85e8308f2917-programming-without-pointers)
- Probably not, but less featureful (or at least less bad) software is usually better than an over-engineered mess
* (a) Ok but all of your reasons are dumb
### Sic respondeo:
- I'll answer this later
## Behaviour
- Null characters (U+0000) are forbidden and will be ignored by the parser
- All control characters except DEL are forbidden
- It passes most of the JSON test suite, 253 tests as of 2025-05-29 when I'm writing this
- It has no reflection as of right now; you must implement your own `parse` function with reflection
- It uses null-terminated strings; this is clearly not the best approach, but it's efficient
## Sic respondeo:
The Zig Discord server is plagued with modern scum, of course, modern scum will dismiss all of my claims or label them as "dumb" or "you're using it wrong!", has any of these individuals not considered that Zig is over complicated? There is a reason why Andrew Kelley himself detached from the communities [and has spoken in multiple instances](https://andrewkelley.me/post/goodbye-twitter-reddit.html) about the "shitification" of software communities, it's like turning a good community of like-minded programmers into a soydev shill. One good thing that he did was shutting down the r/zig subreddit.
On a different note, I defend simplicity and minimalism, so I am not saying that *every* Zig developer who thinks differently is scum; I am just saying that if you cannot think beyond your own viewpoint you will get nowhere. Lastly, if your software is not straightforward and simple to use, then why criticise someone for not knowing how to use it? Making software more complicated is only reasonable as long as user friendliness is not traded away for performance or minimalism.

View File

@ -1,70 +0,0 @@
//! Reference counted, owned, and interned strings
const std = @import("std");
const Allocator = std.mem.Allocator;
/// Backing storage: every interned string is appended here back-to-back.
bytes: std.ArrayListUnmanaged(u8),
/// Set of interned strings. Every key is a slice into `bytes.items`.
map: std.StringHashMapUnmanaged(void),

/// An empty intern pool that owns no memory.
pub const empty: @This() = .{ .bytes = .empty, .map = .empty };

/// Free all memory owned by the pool.
/// Every slice previously returned by `add()` is invalid afterwards, and the
/// pool itself is left `undefined`.
pub fn deinit(self: *@This(), gpa: Allocator) void {
    self.bytes.deinit(gpa);
    self.map.deinit(gpa);
    self.* = undefined;
}

/// Intern `str` and return the pool-owned copy of it.
///
/// If an equal string is already interned, the existing copy is returned and
/// nothing is allocated. Otherwise the bytes are appended to `bytes` and
/// registered in `map`.
///
/// The returned slice points into `bytes.items`. It is valid until a later
/// `add()` has to grow that buffer (or until `deinit()`); callers that keep
/// slices across insertions must copy them or re-query with `add()`.
///
/// On allocation failure the pool's contents are unchanged.
pub fn add(self: *@This(), gpa: Allocator, str: []const u8) ![]const u8 {
    if (self.map.getKey(str)) |existing| return existing;

    // Reserve room for the bytes first so that a buffer move is detected
    // *before* the map is touched again.
    const old_base = self.bytes.items.ptr;
    try self.bytes.ensureUnusedCapacity(gpa, str.len);
    const new_base = self.bytes.items.ptr;

    if (new_base != old_base) {
        // The buffer moved: every key still points into the freed allocation.
        // Re-point each key at the same offset in the new buffer. The key
        // contents (and therefore their hashes) are unchanged, so editing the
        // keys in place keeps the map consistent.
        var keys = self.map.keyIterator();
        while (keys.next()) |key| {
            const offset = @intFromPtr(key.*.ptr) - @intFromPtr(old_base);
            key.* = new_base[offset..][0..key.*.len];
        }
    }

    // Growing the map rehashes the keys, so this must come after the rebase.
    try self.map.ensureUnusedCapacity(gpa, 1);

    // Everything fallible is done; the remaining steps cannot fail.
    const start = self.bytes.items.len;
    self.bytes.appendSliceAssumeCapacity(str);
    const interned = self.bytes.items[start..];
    self.map.putAssumeCapacityNoClobber(interned, {});
    return interned;
}
test "Intern Pool" {
    const allocator = std.testing.allocator;
    // Run the workload once normally, then once per allocation site with
    // that allocation forced to fail.
    try generalWorkload(allocator);
    try std.testing.checkAllAllocationFailures(allocator, generalWorkload, .{});
}

// Interns a few strings (with repeats) and checks contents, de-duplication
// and the resulting pool layout.
fn generalWorkload(gpa: Allocator) !void {
    const testing = std.testing;

    var pool: @This() = .empty;
    defer pool.deinit(gpa);

    // Insert in this exact order; the last two entries repeat earlier ones.
    const inputs = [_][]const u8{ "x", "y", "z", "z", "y" };
    var interned: [inputs.len][]const u8 = undefined;
    for (inputs, &interned) |input, *slot| {
        slot.* = try pool.add(gpa, input);
    }

    // Every returned slice holds the text that was passed in.
    for (inputs, interned) |input, result| {
        try testing.expectEqualSlices(u8, input, result);
    }

    // Repeated strings resolve to the same storage.
    try testing.expectEqual(interned[1].ptr, interned[4].ptr);
    try testing.expectEqual(interned[2].ptr, interned[3].ptr);

    // Only the three distinct strings were stored, back-to-back.
    try testing.expectEqual(pool.map.size, 3);
    try testing.expectEqualSlices(u8, "xyz", pool.bytes.items);
}

View File

@ -48,14 +48,16 @@ pub const JsonInput = union(JsonType) {
pub fn deinit(self: JsonInput, allocator: mem.Allocator) void {
switch (self) {
JsonInput.array => |array| {
.array => |array| {
for (array) |json_input|
json_input.deinit(allocator);
allocator.free(array);
},
.object => |*object| {
var it = object.iterator();
while (it.next()) |entry| entry.value_ptr.deinit(allocator);
while (it.next()) |entry| {
entry.value_ptr.deinit(allocator);
}
@constCast(object).deinit(allocator);
},
else => {},
@ -98,8 +100,8 @@ pub const JsonInput = union(JsonType) {
/// same as ObjectEntry but simpler
/// start is the offset
pub const ArraySlice = struct {
start: usize,
len: usize,
start: usize,
};
/// just += the properties and value indexes to get the next item
@ -107,8 +109,11 @@ pub const ArraySlice = struct {
/// it should be ordered
pub const ObjectEntry = struct {
len: usize,
property_idx: usize,
value_idx: usize,
tip: usize,
};
/// Value type stored in `property_map`; identifies the name of one object property.
pub const PropertyEntry = struct {
/// Index of the property name inside the `properties` string pool
/// (`getValue` resolves it with `tip.slice(&self.properties)`).
tip: StringIndex,
};
pub const Flags = packed struct {
@ -126,8 +131,9 @@ pub const Options = struct {
};
index: std.MultiArrayList(JsonValue) = .{},
string_index: StringPool = .empty,
property_index: StringPool = .empty,
strings: StringPool = .empty,
properties: StringPool = .empty,
property_map: std.AutoArrayHashMapUnmanaged(usize, PropertyEntry) = .empty,
options: Options = .{},
@ -135,8 +141,9 @@ pub const init = Self{};
pub fn deinit(self: *Self, allocator: mem.Allocator) void {
self.index.deinit(allocator);
self.property_index.deinit(allocator);
self.string_index.deinit(allocator);
self.properties.deinit(allocator);
self.strings.deinit(allocator);
self.property_map.deinit(allocator);
}
fn addNumber(self: *Self, allocator: mem.Allocator, number: f64) !usize {
@ -147,13 +154,14 @@ fn addNumber(self: *Self, allocator: mem.Allocator, number: f64) !usize {
}
fn addProperty(self: *Self, allocator: mem.Allocator, bytes: []const u8) !usize {
const stridx = try self.property_index.add(allocator, bytes);
const stridx = try self.properties.add(allocator, bytes);
try self.index.ensureUnusedCapacity(allocator, 1);
try self.property_map.ensureUnusedCapacity(allocator, 1);
return @intFromEnum(stridx);
}
fn addString(self: *Self, allocator: mem.Allocator, bytes: []const u8) !usize {
const stridx = try self.string_index.add(allocator, bytes);
const stridx = try self.strings.add(allocator, bytes);
try self.index.ensureUnusedCapacity(allocator, 1);
const idx = self.index.addOneAssumeCapacity();
self.index.set(idx, .{ .string = stridx });
@ -186,7 +194,7 @@ fn skipSlots(self: *Self, slot: usize) usize {
switch (e) {
.object => |obj| {
var total: usize = 1;
var v = obj.value_idx;
var v = obj.tip;
for (0..obj.len) |_| {
const s = skipSlots(self, v);
total += s;
@ -208,25 +216,6 @@ fn skipSlots(self: *Self, slot: usize) usize {
}
}
/// Advance `pptr` past the property names that belong to the object stored at
/// index slot `slot`, including the names of any objects nested inside it.
/// Does nothing when the entry at `slot` is not an object.
fn skipNestedProps(self: *Self, pptr: *usize, slot: usize) void {
const e = self.index.get(slot);
if (e == .object) {
// `v` walks the index slots of this object's values, starting at the first one.
var v = e.object.value_idx;
// Skip each nested key and its deeper nested props
for (0..e.object.len) |_| {
// Skip this key
// NOTE(review): reinterprets the `usize` behind `pptr` as a `StringIndex`;
// only sound if both types have the same size and layout — confirm.
const k: *StringIndex = @ptrCast(pptr);
const slice = k.slice(&self.property_index);
// The extra `+ 1` presumably steps over the name's null terminator — confirm against StringPool.
pptr.* += slice.len + 1;
// Recurse into this property's value
skipNestedProps(self, pptr, v);
// Skip slots of the value in index array
const s = skipSlots(self, v);
v += s;
}
}
}
pub fn getValue(
self: *Self,
allocator: mem.Allocator,
@ -239,7 +228,7 @@ pub fn getValue(
.bool => |b| return .{ .bool = b },
.number => |number| return .{ .number = number },
.string => |string| {
const sl = string.slice(&self.string_index);
const sl = string.slice(&self.strings);
return .{ .string = sl };
},
.array => |arr| {
@ -254,23 +243,20 @@ pub fn getValue(
},
.object => |obj| {
var map: JsonInput.Object = .empty;
var p = obj.property_idx;
var v = obj.value_idx;
for (0..obj.len) |_| {
// Extract key
const k: StringIndex = @enumFromInt(p);
const key_slice = k.slice(&self.property_index);
// Extract and assign value
const val = try self.getValue(allocator, v);
try map.put(allocator, key_slice, val);
// Advance past this key
p += key_slice.len + 1;
// Skip nested property names of this value
self.skipNestedProps(&p, v);
// Advance past the value slots
const s = self.skipSlots(v);
v += s;
}
errdefer map.deinit(allocator);
var tip = obj.tip;
for (0..obj.len) |_|
if (self.property_map.get(tip)) |pen| {
const key_slice = pen.tip.slice(&self.properties);
const val = try self.getValue(allocator, tip);
try map.put(allocator, key_slice, val);
const s = self.skipSlots(tip);
tip += s;
} else {
return error.MissingKey;
};
return .{ .object = map };
},
}
@ -303,11 +289,13 @@ pub fn parse(self: *Self, tokenizer: *Tokenizer) !usize {
.object => |scope| {
//std.debug.print("prop: {s} \n", .{token.value.?.string});
const pidx = try self.addProperty(allocator, token.value.?.string);
self.property_map.putAssumeCapacity(scope.len + 1, .{ .tip = @enumFromInt(pidx) });
allocator.free(token.value.?.string);
self.index.set(scope_idx, .{ .object = ObjectEntry{
.len = scope.len + 1,
.property_idx = if (scope.len == 0) pidx else scope.property_idx,
.value_idx = scope.value_idx,
.tip = scope.tip,
} });
},
else => return error.InvalidSyntax,
@ -332,8 +320,7 @@ pub fn parse(self: *Self, tokenizer: *Tokenizer) !usize {
ptr.* = root;
self.index.set(root, .{ .object = ObjectEntry{
.len = 0,
.property_idx = 0,
.value_idx = 1,
.tip = 1,
} });
} else {
//order
@ -344,8 +331,7 @@ pub fn parse(self: *Self, tokenizer: *Tokenizer) !usize {
self.index.set(idx_ptr.*, .{
.object = ObjectEntry{
.len = 0,
.property_idx = self.index.len, //self.property_index.string_bytes.items.len,
.value_idx = self.index.len,
.tip = self.index.len,
},
});
switch (self.index.get(parent_idx)) {
@ -455,6 +441,7 @@ pub fn parse(self: *Self, tokenizer: *Tokenizer) !usize {
if (query.len == 0) {
// root
const idx = try self.addString(allocator, token.value.?.string);
allocator.free(token.value.?.string);
self.index.set(root, .{ .string = @enumFromInt(idx) });
return root;
}
@ -520,6 +507,7 @@ pub fn parse(self: *Self, tokenizer: *Tokenizer) !usize {
token = next;
switch (next.type) {
.object_end, .array_end => return error.TrailingComma,
.comma => return error.InvalidSyntax,
else => continue :flag token.type,
}
}

3
root.zig Normal file
View File

@ -0,0 +1,3 @@
pub const Language = @import("language.zig");
pub const Tokenizer = @import("tokenizer.zig");
pub const StringPool = @import("strings.zig");

View File

@ -16,7 +16,7 @@ test Language {
\\ "a": 2,
\\ "c": {
\\ "d": 4,
\\ "uwu": [[[[[1], [2]]]]],
\\ "uwua": [[[[[1], [2]]]]],
\\ "x": true
\\ }
\\ },
@ -119,60 +119,59 @@ test { try expectPass("/y_number_real_pos_exponent.json"); }
test { try expectPass("/y_number_simple_int.json"); }
test { try expectPass("/y_number_simple_real.json"); }
test { try expectPass("/y_object_basic.json"); }
// maybe there is a better way to handle these
// test { try expectPass("/y_object_duplicated_key_and_value.json"); }
// test { try expectPass("/y_object_duplicated_key.json"); }
test { try expectPass("/y_object_duplicated_key_and_value.json"); }
test { try expectPass("/y_object_duplicated_key.json"); }
test { try expectPass("/y_object_empty.json"); }
test { try expectPass("/y_object_empty_key.json"); }
// BIG ISSUE
// test { try expectPass("/y_object_escaped_null_in_key.json"); }
test { try expectPass("/y_object_escaped_null_in_key.json"); }
test { try expectPass("/y_object_extreme_numbers.json"); }
test { try expectPass("/y_object.json"); }
//test { try expectPass("/y_object_long_strings.json"); }
test { try expectPass("/y_object_long_strings.json"); }
test { try expectPass("/y_object_simple.json"); }
//test { try expectPass("/y_object_string_unicode.json"); }
test { try expectPass("/y_object_string_unicode.json"); }
test { try expectPass("/y_object_with_newlines.json"); }
//test { try expectPass("/y_string_1_2_3_bytes_UTF-8_sequences.json"); }
//test { try expectPass("/y_string_accepted_surrogate_pair.json"); }
//test { try expectPass("/y_string_accepted_surrogate_pairs.json"); }
//test { try expectPass("/y_string_allowed_escapes.json"); }
//test { try expectPass("/y_string_backslash_and_u_escaped_zero.json"); }
//test { try expectPass("/y_string_backslash_doublequotes.json"); }
//test { try expectPass("/y_string_comments.json"); }
//test { try expectPass("/y_string_double_escape_a.json"); }
//test { try expectPass("/y_string_double_escape_n.json"); }
//test { try expectPass("/y_string_escaped_control_character.json"); }
//test { try expectPass("/y_string_escaped_noncharacter.json"); }
//test { try expectPass("/y_string_in_array.json"); }
//test { try expectPass("/y_string_in_array_with_leading_space.json"); }
//test { try expectPass("/y_string_last_surrogates_1_and_2.json"); }
//test { try expectPass("/y_string_nbsp_uescaped.json"); }
//test { try expectPass("/y_string_nonCharacterInUTF-8_U+10FFFF.json"); }
//test { try expectPass("/y_string_nonCharacterInUTF-8_U+FFFF.json"); }
//test { try expectPass("/y_string_null_escape.json"); }
//test { try expectPass("/y_string_one-byte-utf-8.json"); }
//test { try expectPass("/y_string_pi.json"); }
//test { try expectPass("/y_string_reservedCharacterInUTF-8_U+1BFFF.json"); }
//test { try expectPass("/y_string_simple_ascii.json"); }
//test { try expectPass("/y_string_space.json"); }
//test { try expectPass("/y_string_surrogates_U+1D11E_MUSICAL_SYMBOL_G_CLEF.json"); }
//test { try expectPass("/y_string_three-byte-utf-8.json"); }
//test { try expectPass("/y_string_two-byte-utf-8.json"); }
//test { try expectPass("/y_string_u+2028_line_sep.json"); }
//test { try expectPass("/y_string_u+2029_par_sep.json"); }
//test { try expectPass("/y_string_uescaped_newline.json"); }
//test { try expectPass("/y_string_uEscape.json"); }
//test { try expectPass("/y_string_unescaped_char_delete.json"); }
//test { try expectPass("/y_string_unicode_2.json"); }
//test { try expectPass("/y_string_unicodeEscapedBackslash.json"); }
//test { try expectPass("/y_string_unicode_escaped_double_quote.json"); }
//test { try expectPass("/y_string_unicode.json"); }
//test { try expectPass("/y_string_unicode_U+10FFFE_nonchar.json"); }
//test { try expectPass("/y_string_unicode_U+1FFFE_nonchar.json"); }
//test { try expectPass("/y_string_unicode_U+200B_ZERO_WIDTH_SPACE.json"); }
//test { try expectPass("/y_string_unicode_U+2064_invisible_plus.json"); }
//test { try expectPass("/y_string_unicode_U+FDD0_nonchar.json"); }
//test { try expectPass("/y_string_unicode_U+FFFE_nonchar.json"); }
test { try expectPass("/y_string_1_2_3_bytes_UTF-8_sequences.json"); }
test { try expectPass("/y_string_accepted_surrogate_pair.json"); }
test { try expectPass("/y_string_accepted_surrogate_pairs.json"); }
test { try expectPass("/y_string_allowed_escapes.json"); }
test { try expectPass("/y_string_backslash_and_u_escaped_zero.json"); }
test { try expectPass("/y_string_backslash_doublequotes.json"); }
test { try expectPass("/y_string_comments.json"); }
test { try expectPass("/y_string_double_escape_a.json"); }
test { try expectPass("/y_string_double_escape_n.json"); }
test { try expectPass("/y_string_escaped_control_character.json"); }
test { try expectPass("/y_string_escaped_noncharacter.json"); }
test { try expectPass("/y_string_in_array.json"); }
test { try expectPass("/y_string_in_array_with_leading_space.json"); }
test { try expectPass("/y_string_last_surrogates_1_and_2.json"); }
test { try expectPass("/y_string_nbsp_uescaped.json"); }
test { try expectPass("/y_string_nonCharacterInUTF-8_U+10FFFF.json"); }
test { try expectPass("/y_string_nonCharacterInUTF-8_U+FFFF.json"); }
test { try expectPass("/y_string_null_escape.json"); }
test { try expectPass("/y_string_one-byte-utf-8.json"); }
test { try expectPass("/y_string_pi.json"); }
test { try expectPass("/y_string_reservedCharacterInUTF-8_U+1BFFF.json"); }
test { try expectPass("/y_string_simple_ascii.json"); }
test { try expectPass("/y_string_space.json"); }
test { try expectPass("/y_string_surrogates_U+1D11E_MUSICAL_SYMBOL_G_CLEF.json"); }
test { try expectPass("/y_string_three-byte-utf-8.json"); }
test { try expectPass("/y_string_two-byte-utf-8.json"); }
test { try expectPass("/y_string_u+2028_line_sep.json"); }
test { try expectPass("/y_string_u+2029_par_sep.json"); }
test { try expectPass("/y_string_uescaped_newline.json"); }
test { try expectPass("/y_string_uEscape.json"); }
test { try expectPass("/y_string_unescaped_char_delete.json"); }
test { try expectPass("/y_string_unicode_2.json"); }
test { try expectPass("/y_string_unicodeEscapedBackslash.json"); }
test { try expectPass("/y_string_unicode_escaped_double_quote.json"); }
test { try expectPass("/y_string_unicode.json"); }
test { try expectPass("/y_string_unicode_U+10FFFE_nonchar.json"); }
test { try expectPass("/y_string_unicode_U+1FFFE_nonchar.json"); }
test { try expectPass("/y_string_unicode_U+200B_ZERO_WIDTH_SPACE.json"); }
test { try expectPass("/y_string_unicode_U+2064_invisible_plus.json"); }
test { try expectPass("/y_string_unicode_U+FDD0_nonchar.json"); }
test { try expectPass("/y_string_unicode_U+FFFE_nonchar.json"); }
test { try expectPass("/y_string_utf8.json"); }
test { try expectPass("/y_string_with_del_character.json"); }
test { try expectPass("/y_structure_lonely_false.json"); }
@ -297,7 +296,7 @@ test { try expectFail("/n_object_unquoted_key.json"); }
test { try expectFail("/n_object_unterminated-value.json"); }
test { try expectFail("/n_object_with_single_string.json"); }
// !!!
// test { try expectFail("/n_object_with_trailing_garbage.json"); }
test { try expectFail("/n_object_with_trailing_garbage.json"); }
// test { try expectFail("/n_single_space.json"); }
test { try expectFail("/n_string_1_surrogate_then_escape.json"); }
// test { try expectFail("/n_string_1_surrogate_then_escape_u1.json"); }

View File

@ -395,6 +395,9 @@ pub fn nextString(self: *Self) Error!Token {
switch (try self.lastChar()) {
'"' => {
// Strip NUL bytes from the finished string while keeping the remaining
// bytes in their original order. (`swapRemove` would move the last byte
// into the hole and scramble the string contents.)
while (std.mem.indexOfScalar(u8, buffer.items, 0x00)) |idx|
_ = buffer.orderedRemove(idx);
return .{
.type = .string,
.value = .{ .string = try buffer.toOwnedSlice() },
@ -435,7 +438,7 @@ pub fn nextString(self: *Self) Error!Token {
} // end switch
},
else => |c| {
if (std.ascii.isControl(c)) {
if (std.ascii.isControl(c) and c != std.ascii.control_code.del) {
return error.UnexpectedCharacter;
}
try buffer.append(c);