diff --git a/language.zig b/language.zig
index 1b29552..deda419 100644
--- a/language.zig
+++ b/language.zig
@@ -1,6 +1,7 @@
 const std = @import("std");
 const Tokenizer = @import("tokenizer.zig");
 const TokenType = Tokenizer.TokenType;
+const Token = Tokenizer.Token;
 const StringPool = @import("strings.zig");
 const StringIndex = StringPool.StringIndex;
 const assert = std.debug.assert;
@@ -156,7 +157,7 @@ fn addObject(self: *Self, allocator: std.mem.Allocator, object: std.AutoArrayHas
     self.index.set(idx, .{ .object = ObjectEntry{
         .len = 0,
         .property_idx = 0,
-        .value_idx = 0,
+        .value_idx = 1,
     } });
     return idx;
 }
@@ -167,7 +168,7 @@ fn addEmptyObject(self: *Self, allocator: std.mem.Allocator) !usize {
     const idx = self.index.addOneAssumeCapacity();
     const object: ObjectEntry = .{
         .property_idx = self.property_index.string_bytes.items.len,
-        .value_idx = self.index.len,
+        .value_idx = self.index.len + 1,
         .len = 0,
     };
     self.index.set(idx, .{ .object = object });
@@ -299,43 +300,50 @@ fn getNull(self: *Self, index: usize) ?void {
     return entry.null;
 }
 
-fn getValue(self: *Self, allocator: std.mem.Allocator, index: usize) !?JsonInput {
+fn getValue(self: *Self, allocator: std.mem.Allocator, index: usize, offset: usize) !struct { ?JsonInput, usize } {
     const entry = self.index.get(index);
 
     switch (entry) {
-        .null => return .{ .null = {} },
-        .bool => return .{ .bool = entry.bool },
-        .number => return .{ .number = entry.number },
+        .null => return .{ .{ .null = {} }, 1 },
+        .bool => return .{ .{ .bool = entry.bool }, 1 },
+        .number => return .{ .{ .number = entry.number }, 1 },
         .string => {
             const str = entry.string.slice(&self.string_index);
-            return .{ .string = str };
+            return .{ .{ .string = str }, 1 };
        },
        .array => {
            const res = try allocator.alloc(JsonInput, entry.array.len);
            var idx = entry.array.start;
+           var offset_calc: usize = offset;
+
            for (0..entry.array.len) |i| {
-               if (try self.getValue(allocator, idx)) |v| {
-                   res[i] = v;
-                   idx += 1;
-               } else unreachable;
+               const val, const step = try self.getValue(allocator, idx, offset_calc);
+               res[i] = val.?;
+               idx += step;
            }
-           return .{ .array = res };
+           offset_calc += entry.array.len;
+
+           return .{ .{ .array = res }, offset_calc };
        },
        .object => {
            var kidx = entry.object.property_idx;
            var vidx = entry.object.value_idx;
            var obj: std.StringArrayHashMapUnmanaged(JsonInput) = .empty;
+           var offset_calc: usize = offset;
 
            try obj.ensureTotalCapacity(allocator, entry.object.len);
            for (0..entry.object.len) |_| {
-               const key = StringIndex.slice(@enumFromInt(kidx), &self.property_index);
-               const val = (try self.getValue(allocator, vidx)).?;
+               const slice = StringIndex.slice(@enumFromInt(kidx), &self.property_index);
+               const val, const step = try self.getValue(allocator, vidx, offset_calc);
+               kidx += slice.len + 1;
+               vidx += step;
 
-               obj.putAssumeCapacityNoClobber(key, val);
-               kidx += key.len + 1;
-               vidx += 1;
+               std.debug.print("putting {s} -> {d}\n", .{ slice, vidx });
+               obj.putAssumeCapacityNoClobber(slice, val.?);
            }
-           return .{ .object = obj };
+           offset_calc += entry.object.len;
+
+           return .{ .{ .object = obj }, offset_calc };
        },
    }
 }
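
Note on the getValue change: it now returns a struct { ?JsonInput, usize } tuple, where the second field is the number of index slots the value consumed, so containers can step their cursor past nested children instead of assuming one slot per element. A minimal sketch of the new calling convention (using `self`, `allocator`, and `root` the way the file's own test does; the key-count print is purely illustrative):

    // Destructure the tuple: `value` is the decoded JsonInput,
    // `consumed` is how many index entries the subtree occupied.
    const value, const consumed = try self.getValue(allocator, root, 0);
    _ = consumed;
    if (value) |v| switch (v) {
        .object => |obj| std.debug.print("{d} keys\n", .{obj.count()}),
        else => {},
    };
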
@@ -347,81 +355,144 @@ pub fn parse(self: *Self, tokenizer: *Tokenizer) !usize {
     var it = tokenizer.iterator();
 
     const root = try self.addEmptyObject(allocator);
-    var work_query = try allocator.alloc(usize, self.options.max_depth);
-    var cycles: usize = 0;
+    defer std.debug.print("idx: {s}\n", .{
+        @tagName(self.index.get(self.index.get(root).object.value_idx)),
+    });
 
-    //defer assert(cycles == 0);
+    var token = it.next() orelse
+        return root;
 
-    while (it.next()) |token| {
-        defer tokenizer.skipWhitespace();
+    var query: std.BoundedArray(usize, self.options.max_depth) = try .init(0);
 
-        std.debug.print("token: {s}\n", .{@tagName(token.type)});
+    flag: switch (token.type) {
+        .eof => {
+            assert(query.slice().len == 0);
+            return root;
+        },
+        .property => {
+            defer tokenizer.skipWhitespace();
 
-        flag: switch (token.type) {
-            .array_end => {
-                cycles -= 1;
-            },
-            .object_end => {
-                cycles -= 1;
-            },
-            .array_begin => {
-                const idx = try self.addEmptyArray(allocator);
-                work_query[cycles] = idx;
-                cycles += 1;
-            },
-            .object_begin => {
-                if (cycles == 0) {
-                    self.index.set(root, .{ .object = .{
-                        .len = 0,
-                        .property_idx = self.property_index.string_table.size,
-                        .value_idx = self.index.len,
+            const scope_idx = query.get(query.len - 1);
+            switch (self.index.get(scope_idx)) {
+                .object => |scope| {
+                    const pidx = try self.addProperty(allocator, token.value.?.string);
+                    self.index.set(scope_idx, .{ .object = ObjectEntry{
+                        .len = scope.len + 1,
+                        .property_idx = if (scope.len == 0) pidx else scope.property_idx,
+                        .value_idx = scope.value_idx,
                     } });
-                    work_query[cycles] = root;
-                } else {
-                    const obj_idx = try self.addEmptyObject(allocator);
-                    work_query[cycles] = obj_idx;
-                }
-                cycles += 1;
-            },
-            .property => {
-                const scope_idx = work_query[cycles - 1];
-                switch (self.index.get(scope_idx)) {
-                    .object => |scope| {
-                        //std.debug.print("depth: {d}\n", .{cycles});
-                        _ = try self.addProperty(allocator, token.value.?.string);
-                        self.index.set(scope_idx, .{ .object = ObjectEntry{
-                            .len = scope.len + 1,
-                            .property_idx = scope.property_idx,
-                            .value_idx = scope.value_idx,
-                        } });
-                    },
-                    else => unreachable,
-                }
-            },
-            .string => {
-                if (it.peek()) |next| if (next.type == .colon) {
+                },
+                else => return error.InvalidSyntax,
+            }
+
+            const next = it.next() orelse return error.InvalidSyntax;
+            token = next;
+            switch (next.type) {
+                .colon => {
+                    token = it.next() orelse return error.InvalidSyntax;
+                    continue :flag token.type;
+                },
+                else => continue :flag next.type,
+                // else => return error.InvalidSyntax,
+            }
+        },
+        .object_begin => {
+            defer tokenizer.skipWhitespace();
+
+            if (query.slice().len == 0) {
+                try query.ensureUnusedCapacity(1);
+                const ptr = query.addOneAssumeCapacity();
+                ptr.* = root;
+                self.index.set(root, .{ .object = ObjectEntry{
+                    .len = 0,
+                    .property_idx = self.property_index.string_bytes.items.len,
+                    .value_idx = 1,
+                } });
+            } else {
+                const idx_ptr = try query.addOne();
+                idx_ptr.* = try self.addEmptyObject(allocator);
+                self.index.set(idx_ptr.*, .{ .object = ObjectEntry{
+                    .len = 0,
+                    .property_idx = self.property_index.string_bytes.items.len,
+                    .value_idx = self.index.len,
+                } });
+            }
+
+            const next = it.next() orelse return error.InvalidSyntax;
+            token = next;
+            switch (next.type) {
+                .string => continue :flag .property,
+                else => return error.InvalidSyntax,
+            }
+        },
+        .object_end => {
+            defer tokenizer.skipWhitespace();
+            assert(query.pop() != null);
+
+            const next = it.next() orelse
+                return root;
+            token = next;
+            switch (next.type) {
+                .comma => continue :flag .comma,
+                .object_end, .array_end => |t| continue :flag t,
+                else => return error.InvalidSyntax,
+            }
+        },
+        .true, .false => {
+            defer tokenizer.skipWhitespace();
+
+            _ = try self.addBool(allocator, if (token.type == .true) true else false);
+
+            const next = it.next() orelse return error.InvalidSyntax;
+            token = next;
+            switch (next.type) {
+                .comma => continue :flag .comma,
+                .object_end => continue :flag .object_end,
+                else => return error.InvalidSyntax,
+            }
+        },
+        .string => {
+            defer tokenizer.skipWhitespace();
+
+            const next = it.next() orelse return error.InvalidSyntax;
+            switch (next.type) {
+                .colon => {
                     continue :flag .property;
-                };
-                _ = try self.addString(allocator, token.value.?.string);
-            },
-            .number => {
-                _ = try self.addNumber(allocator, token.value.?.number);
-            },
-            .true, .false => {
-                _ = try self.addBool(allocator, if (token.type == .true) true else false);
-            },
-            .null => {
-                _ = try self.addNull(allocator);
-            },
-            .comma => if (it.peek()) |t| {
-                if (t.type == .object_end) {
-                    if (!self.options.flags.allow_trailing_comma) {
-                        return error.TrailingComma;
-                    }
+                },
+                else => |t| {
+                    _ = try self.addString(allocator, token.value.?.string);
+
+                    token = next;
+                    continue :flag t;
+                },
+            }
+        },
+        .number => {
+            defer tokenizer.skipWhitespace();
+
+            _ = try self.addNumber(allocator, token.value.?.number);
+
+            const next = it.next() orelse return error.InvalidSyntax;
+            token = next;
+            switch (next.type) {
+                .comma => continue :flag .comma,
+                .object_end => continue :flag .object_end,
+                else => return error.InvalidSyntax,
+            }
+        },
+        .comma => {
+            if (!self.options.flags.allow_trailing_comma) {
+                const next = it.next() orelse return error.InvalidSyntax;
+                token = next;
+                switch (next.type) {
+                    .object_end, .array_end => return error.TrailingComma,
+                    else => continue :flag token.type,
                 }
-            },
-            else => continue,
-        }
+            }
+        },
+        else => {
+            std.debug.print("token: {s}\n", .{@tagName(token.type)});
+        },
     }
 
     return root;
@@ -439,10 +510,13 @@ test parse {
     var tokenizer = try Tokenizer.init(allocator, blk: {
         const json =
             \\ {
-            \\ "lazy": true,
+            \\ "bio": "cool",
+            \\ "age": 15,
             \\ "name": "yuzu",
-            \\ "dislikes": [["Math", 3], ["Sports", 1]],
-            \\ "age": 15
+            \\ "admin": true,
+            \\ "address": {
+            \\ "lorem": "ipsum"
+            \\ }
             \\ }
         ;
         break :blk json;
@@ -450,7 +524,7 @@ test parse {
 
     const root = blk: {
         const idx = try parse(&self, &tokenizer);
-        const val = (try getValue(&self, allocator, idx)).?;
+        const val, _ = try getValue(&self, allocator, idx, 0);
         break :blk val;
     };
 
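
The rewritten parse drives the token stream as a state machine built on Zig 0.14's labeled switch: each arm handles one token type, pulls the next token from the iterator, and re-enters the switch with continue :flag, so transitions between states are explicit and the .eof arm is the only clean exit. A stripped-down sketch of the pattern, independent of this parser:

    const std = @import("std");

    const State = enum { start, body, done };

    fn run() void {
        // `continue :state x` re-enters the labeled switch with `x` as
        // the new operand, acting like a goto between named states.
        state: switch (State.start) {
            .start => continue :state .body,
            .body => {
                std.debug.print("in body\n", .{});
                continue :state .done;
            },
            .done => {}, // falling out of the switch halts the machine
        }
    }

    test run {
        run();
    }
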
diff --git a/tokenizer.zig b/tokenizer.zig
index b7a6cd1..a6de90c 100644
--- a/tokenizer.zig
+++ b/tokenizer.zig
@@ -58,18 +58,11 @@ pub const Token = struct {
 pub const Self = @This();
 
 text: []const u8,
-position: usize,
 max_position: usize,
 stack: []usize,
 frame: usize,
 allocator: std.mem.Allocator,
 
-prev_token: ?Token = null,
-
-pub fn pushBack(self: *Self, token: Token) void {
-    self.prev_token = token;
-}
-
 /// Initialize a new tokenizer
 pub fn init(allocator: std.mem.Allocator, text: []const u8) std.mem.Allocator.Error!Self {
     const stack = try allocator.alloc(usize, 0x100);
@@ -77,7 +70,6 @@ pub fn init(allocator: std.mem.Allocator, text: []const u8) std.mem.Allocator.Er
     @memset(stack, 0);
     return .{
         .text = text,
-        .position = 0,
         .max_position = 0,
         .stack = stack,
         .frame = 0,
@@ -201,6 +193,14 @@ pub fn matchCharRange(self: *Self, low: u8, high: u8) ?void {
     self.advance(1);
 }
 
+pub fn anyChar(self: *Self) ?u8 {
+    if (self.endOfInput())
+        return null;
+    const char = self.text[self.currentPosition()];
+    self.advance(1);
+    return char;
+}
+
 // ========== Token Extraction ==========
 
 fn extractSlice(self: *Self, start: usize) []const u8 {
@@ -319,35 +319,33 @@ pub fn nextIdentifier(self: *Self) Error!Token {
 /// Get the next token from the input
 /// WARNING: this function eats whitespaces
 pub fn nextToken(self: *Self) Error!Token {
-    if (self.prev_token) |tok| {
-        self.prev_token = null;
-        return tok;
-    }
-
     const start = try self.pushFrame();
     errdefer self.popFrame();
 
     self.skipWhitespace();
 
-    if (self.endOfInput()) {
-        return Token{
-            .type = .eof,
-            .value = null,
-            .start = start,
-            .end = start,
-        };
-    }
-
-    self.advance(1);
-
     // Fall back to single character symbol
-    const c = self.lastChar();
+    const c = self.anyChar() orelse return .{
+        .type = .eof,
+        .value = null,
+        .start = start,
+        .end = start,
+    };
+
     const symbol_t: TokenType = switch (c) {
         '{' => .object_begin,
         '}' => .object_end,
         '[' => .array_begin,
         ']' => .array_end,
-        ',' => .comma,
+        ',' => {
+            self.skipWhitespace();
+            return self.commit(Token{
+                .type = .comma,
+                .value = null,
+                .end = start + 1,
+                .start = start,
+            });
+        },
         ':' => .colon,
         '"' => {
             self.rollback();
@@ -449,31 +447,19 @@ pub const Iterator = struct {
 
     pub fn next(it: *Iterator) ?Token {
         defer it.tokenizer.skipWhitespace();
-        if (it.tokenizer.endOfInput()) return null;
-        return it.tokenizer.nextToken() catch null;
+        if (it.tokenizer.endOfInput()) {
+            std.debug.print("got eof\n", .{});
+            return null;
+        }
+        return it.tokenizer.nextToken() catch |err| {
+            std.debug.print("got err: {s}\n", .{@errorName(err)});
+            return null;
+        };
     }
 
     pub fn reset(it: *Iterator) void {
-        it.tokenizer.position = 0;
         it.tokenizer.max_position = 0;
         it.tokenizer.frame = 0;
-        it.tokenizer.prev_token = null;
-    }
-
-    /// nasty trick
-    pub fn peek(it: *Iterator) ?Token {
-        const frame = it.tokenizer.frame;
-        const pos = it.tokenizer.position;
-        const prev = it.tokenizer.prev_token;
-        const max_pos = it.tokenizer.max_position;
-        defer {
-            it.tokenizer.position = pos;
-            it.tokenizer.frame = frame;
-            it.tokenizer.max_position = max_pos;
-            it.tokenizer.prev_token = prev;
-        }
-        if (it.tokenizer.endOfInput()) return null;
-        return it.tokenizer.nextToken() catch null;
     }
 };
 
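
On the tokenizer side, the pushBack/peek machinery and the stored position are gone; nextToken now leans on the new anyChar, which either consumes one byte or signals end of input with null, so the eof token falls out of a single orelse. The same consume-or-null shape in isolation (Cursor is a hypothetical stand-in for the tokenizer's frame-stack cursor, not part of the patch):

    const std = @import("std");

    const Cursor = struct {
        text: []const u8,
        pos: usize = 0,

        // Like anyChar: return the next byte and advance, or null at EOF.
        fn anyChar(self: *Cursor) ?u8 {
            if (self.pos >= self.text.len) return null;
            defer self.pos += 1;
            return self.text[self.pos];
        }
    };

    test "drain the cursor" {
        var cur: Cursor = .{ .text = "{}" };
        while (cur.anyChar()) |c| std.debug.print("{c} ", .{c});
    }
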