546 lines
14 KiB
Zig
546 lines
14 KiB
Zig
const std = @import("std");
|
|
const mem = std.mem;
|
|
|
|
pub const Error = error{
|
|
/// eg: invalid JSON syntax
|
|
InvalidSyntax,
|
|
/// eg: allocator error
|
|
OutOfMemory,
|
|
/// eg: bad escaping
|
|
UnexpectedCharacter,
|
|
/// eg: std.fmt.parseFloat failed
|
|
BadNumber,
|
|
/// fba error
|
|
BufferTooSmall,
|
|
};
|
|
|
|
pub const TokenType = enum(u8) {
|
|
zero,
|
|
eof,
|
|
null,
|
|
true,
|
|
false,
|
|
number,
|
|
string,
|
|
property,
|
|
object_begin,
|
|
object_end,
|
|
array_begin,
|
|
array_end,
|
|
colon,
|
|
comma,
|
|
whitespace,
|
|
};
|
|
|
|
pub const Token = struct {
|
|
type: TokenType,
|
|
value: ?union {
|
|
number: f64,
|
|
string: []const u8,
|
|
symbol: u8,
|
|
},
|
|
start: usize,
|
|
end: usize,
|
|
};
|
|
|
|
pub const Self = @This();
|
|
|
|
text: []const u8,
|
|
max_position: usize,
|
|
stack: []usize,
|
|
frame: usize,
|
|
allocator: std.mem.Allocator,
|
|
|
|
/// Initialize a new tokenizer
|
|
pub fn init(allocator: std.mem.Allocator, text: []const u8) std.mem.Allocator.Error!Self {
|
|
const stack = try allocator.alloc(usize, 0x100);
|
|
errdefer allocator.free(stack);
|
|
@memset(stack, 0);
|
|
return .{
|
|
.text = text,
|
|
.max_position = 0,
|
|
.stack = stack,
|
|
.frame = 0,
|
|
.allocator = allocator,
|
|
};
|
|
}
|
|
|
|
/// Clean up resources
|
|
pub fn deinit(self: *Self) void {
|
|
self.allocator.free(self.stack);
|
|
}
|
|
|
|
// ========== Core Parsing Functions ==========
|
|
|
|
fn currentPosition(self: *Self) usize {
|
|
return self.stack[self.frame];
|
|
}
|
|
|
|
fn advance(self: *Self, delta: usize) void {
|
|
self.stack[self.frame] += delta;
|
|
if (self.max_position < self.stack[self.frame])
|
|
self.max_position = self.stack[self.frame];
|
|
}
|
|
|
|
fn pushFrame(self: *Self) Error!usize {
|
|
self.frame += 1;
|
|
if (self.frame == self.stack.len) {
|
|
const new_stack = try self.allocator.alloc(usize, self.stack.len * 2);
|
|
@memset(new_stack, 0);
|
|
@memcpy(new_stack, self.stack);
|
|
self.allocator.free(self.stack);
|
|
self.stack = new_stack;
|
|
}
|
|
self.stack[self.frame] = self.stack[self.frame - 1];
|
|
return self.currentPosition();
|
|
}
|
|
|
|
fn popFrame(self: *Self) void {
|
|
self.frame -= 1;
|
|
}
|
|
|
|
fn commit(self: *Self, wrapped: anytype) @TypeOf(wrapped) {
|
|
self.frame -= 1;
|
|
self.stack[self.frame] = self.stack[self.frame + 1];
|
|
return wrapped;
|
|
}
|
|
|
|
fn rollback(self: *Self) void {
|
|
self.stack[self.frame] = if (self.frame == 0) 0 else self.stack[self.frame - 1];
|
|
}
|
|
|
|
// ========== Character Matching ==========
|
|
|
|
fn lastChar(self: *Self) u8 {
|
|
return self.text[self.currentPosition() - 1];
|
|
}
|
|
|
|
fn currentChar(self: *Self) u8 {
|
|
return self.text[self.currentPosition()];
|
|
}
|
|
|
|
fn endOfInput(self: *Self) bool {
|
|
return self.currentPosition() >= self.text.len;
|
|
}
|
|
|
|
fn matchChar(self: *Self, c: u8) ?void {
|
|
if (self.endOfInput() or self.text[self.currentPosition()] != c) {
|
|
return null;
|
|
}
|
|
self.advance(1);
|
|
}
|
|
|
|
fn matchCharPredicate(self: *Self, pred: fn (u8) bool) ?void {
|
|
// do not change this line for some reason it fucking breaks if I use currentChar directly
|
|
if (self.endOfInput() or !pred(self.text[self.currentPosition()])) {
|
|
return null;
|
|
}
|
|
self.advance(1);
|
|
}
|
|
|
|
fn matchString(self: *Self, s: []const u8) ?[]const u8 {
|
|
if (self.text.len < self.currentPosition() + s.len) {
|
|
// eof
|
|
return null;
|
|
}
|
|
|
|
const remaining_len = s.len;
|
|
const simd_width = 16; // 128-bit SIMD (SSE/NEON)
|
|
|
|
var j: usize = 0;
|
|
while (j + simd_width <= remaining_len) {
|
|
const expected_chunk: @Vector(simd_width, u8) = s[j..][0..simd_width].*;
|
|
const actual_chunk: @Vector(simd_width, u8) = self.text[self.currentPosition() + j ..][0..simd_width].*;
|
|
|
|
if (!@reduce(.And, expected_chunk == actual_chunk)) {
|
|
return error.InvalidSyntax;
|
|
}
|
|
j += simd_width;
|
|
}
|
|
|
|
// Handle remaining bytes
|
|
while (j < remaining_len) {
|
|
if (s[j] != self.text[self.currentPosition() + j]) {
|
|
return error.InvalidSyntax;
|
|
}
|
|
j += 1;
|
|
}
|
|
|
|
self.advance(s.len);
|
|
}
|
|
|
|
pub fn matchCharRange(self: *Self, low: u8, high: u8) ?void {
|
|
if (self.endOfInput())
|
|
return null;
|
|
|
|
const c = self.text[self.currentPosition()];
|
|
|
|
if (!(c >= low and c <= high))
|
|
return null;
|
|
|
|
self.advance(1);
|
|
}
|
|
|
|
pub fn anyChar(self: *Self) ?u8 {
|
|
if (self.endOfInput())
|
|
return null;
|
|
const char = self.text[self.currentPosition()];
|
|
self.advance(1);
|
|
return char;
|
|
}
|
|
|
|
// ========== Token Extraction ==========
|
|
|
|
fn extractSlice(self: *Self, start: usize) []const u8 {
|
|
return self.text[start..self.currentPosition()];
|
|
}
|
|
|
|
// Skip all whitespace characters
|
|
pub fn skipWhitespace(self: *Self) void {
|
|
const start = self.currentPosition();
|
|
if (self.endOfInput())
|
|
return;
|
|
const end = skipWhitespaceSimd(self.text[start..]);
|
|
self.advance(end);
|
|
}
|
|
|
|
/// Parse a number token
|
|
pub fn nextNumber(self: *Self) Error!Token {
|
|
const start = try self.pushFrame();
|
|
errdefer self.popFrame();
|
|
|
|
self.skipWhitespace();
|
|
|
|
self.matchChar('-') orelse {}; // this may not fail
|
|
|
|
while (self.matchCharRange('0', '9') != null) {}
|
|
|
|
self.matchChar('.') orelse {
|
|
// int found
|
|
const float = std.fmt.parseFloat(f64, self.extractSlice(start)) catch {
|
|
return error.BadNumber; // no floating point
|
|
};
|
|
|
|
return Token{
|
|
.type = .number,
|
|
.value = .{
|
|
.number = float,
|
|
},
|
|
.start = start,
|
|
.end = self.currentPosition(),
|
|
};
|
|
};
|
|
|
|
while (self.matchCharRange('0', '9') != null) {}
|
|
|
|
const float = std.fmt.parseFloat(f64, self.extractSlice(start)) catch {
|
|
return error.BadNumber; // floating point
|
|
};
|
|
|
|
return .{
|
|
.type = .number,
|
|
.value = .{
|
|
.number = float,
|
|
},
|
|
.start = start,
|
|
.end = self.currentPosition(),
|
|
};
|
|
}
|
|
|
|
/// Parse an identifier token
|
|
pub fn nextIdentifier(self: *Self) Error!Token {
|
|
const start = try self.pushFrame();
|
|
errdefer self.popFrame();
|
|
|
|
self.skipWhitespace();
|
|
|
|
var buffer = try self.allocator.alloc(u8, 0x100);
|
|
defer self.allocator.free(buffer);
|
|
|
|
self.matchCharPredicate(std.ascii.isAlphabetic) orelse
|
|
return error.InvalidSyntax;
|
|
|
|
buffer[0] = self.lastChar();
|
|
|
|
var i: usize = 1;
|
|
while (self.matchCharPredicate(std.ascii.isAlphanumeric) != null) {
|
|
buffer[i] = self.lastChar();
|
|
i += 1;
|
|
}
|
|
|
|
const ident = buffer[0..i];
|
|
|
|
// true
|
|
if (std.mem.eql(u8, ident, "true")) {
|
|
return .{
|
|
.type = .true,
|
|
.value = null,
|
|
.start = start,
|
|
.end = self.currentPosition(),
|
|
};
|
|
}
|
|
|
|
// false
|
|
if (std.mem.eql(u8, ident, "false")) {
|
|
return .{
|
|
.type = .false,
|
|
.value = null,
|
|
.start = start,
|
|
.end = self.currentPosition(),
|
|
};
|
|
}
|
|
|
|
// null
|
|
if (std.mem.eql(u8, ident, "null")) {
|
|
return .{
|
|
.type = .null,
|
|
.value = null,
|
|
.start = start,
|
|
.end = self.currentPosition(),
|
|
};
|
|
}
|
|
|
|
unreachable;
|
|
}
|
|
|
|
/// Get the next token from the input
|
|
/// WARNING: this function eats whitespaces
|
|
pub fn nextToken(self: *Self) Error!Token {
|
|
const start = try self.pushFrame();
|
|
errdefer self.popFrame();
|
|
|
|
self.skipWhitespace();
|
|
|
|
// Fall back to single character symbol
|
|
const c = self.anyChar() orelse return .{
|
|
.type = .eof,
|
|
.value = null,
|
|
.start = start,
|
|
.end = start,
|
|
};
|
|
|
|
const symbol_t: TokenType = switch (c) {
|
|
'{' => .object_begin,
|
|
'}' => .object_end,
|
|
'[' => .array_begin,
|
|
']' => .array_end,
|
|
',' => {
|
|
self.skipWhitespace();
|
|
return self.commit(Token{
|
|
.type = .comma,
|
|
.value = null,
|
|
.end = start + 1,
|
|
.start = start,
|
|
});
|
|
},
|
|
':' => .colon,
|
|
'"' => {
|
|
self.rollback();
|
|
return self.commit(self.nextString());
|
|
},
|
|
else => {
|
|
self.rollback();
|
|
// Try different token types in order of precedence
|
|
if (std.ascii.isDigit(c) or c == '-') {
|
|
return self.commit(self.nextNumber());
|
|
}
|
|
|
|
if (std.ascii.isAlphabetic(c)) {
|
|
return self.commit(self.nextIdentifier());
|
|
}
|
|
|
|
return error.InvalidSyntax;
|
|
},
|
|
};
|
|
|
|
return self.commit(Token{
|
|
.type = symbol_t,
|
|
.value = null,
|
|
.start = start,
|
|
.end = start + 1,
|
|
});
|
|
}
|
|
|
|
pub fn nextString(self: *Self) Error!Token {
|
|
const start = try self.pushFrame();
|
|
errdefer self.popFrame();
|
|
|
|
self.skipWhitespace();
|
|
|
|
self.matchChar('"') orelse unreachable;
|
|
|
|
var buffer: std.ArrayList(u8) = .init(self.allocator);
|
|
|
|
loop: while (!self.endOfInput()) {
|
|
self.advance(1);
|
|
|
|
switch (self.lastChar()) {
|
|
'"' => {
|
|
return .{
|
|
.type = .string,
|
|
.value = .{ .string = try buffer.toOwnedSlice() },
|
|
.start = start,
|
|
.end = self.currentPosition(),
|
|
};
|
|
},
|
|
'\\' => {
|
|
self.advance(1);
|
|
switch (self.lastChar()) {
|
|
0x22, 0x5C, 0x2F => |d| {
|
|
try buffer.append(d);
|
|
continue :loop;
|
|
},
|
|
'b' => try buffer.append(0x8),
|
|
'f' => try buffer.append(0xC),
|
|
'n' => try buffer.append(0xA),
|
|
'r' => try buffer.append(0xD),
|
|
't' => try buffer.append(0x9),
|
|
'u' => {
|
|
var code_points: [4]u8 = undefined;
|
|
inline for (0..4) |i| {
|
|
if (self.endOfInput())
|
|
return .{
|
|
.type = .eof,
|
|
.value = null,
|
|
.start = start,
|
|
.end = start + 1,
|
|
};
|
|
self.advance(1);
|
|
code_points[i] = self.lastChar();
|
|
}
|
|
const buf = try stringToUtf8(&code_points);
|
|
try buffer.appendSlice(buf);
|
|
continue :loop;
|
|
},
|
|
else => return error.UnexpectedCharacter,
|
|
} // end switch
|
|
},
|
|
else => |c| {
|
|
if (std.ascii.isControl(c)) {
|
|
return error.UnexpectedCharacter;
|
|
}
|
|
try buffer.append(c);
|
|
},
|
|
} // end switch
|
|
} // end while
|
|
|
|
return error.InvalidSyntax;
|
|
}
|
|
|
|
pub const Iterator = struct {
|
|
tokenizer: *Self,
|
|
|
|
pub fn next(it: *Iterator) ?Token {
|
|
defer it.tokenizer.skipWhitespace();
|
|
if (it.tokenizer.endOfInput()) {
|
|
// std.debug.print("got eof\n", .{});
|
|
return null;
|
|
}
|
|
return it.tokenizer.nextToken() catch {
|
|
// std.debug.print("got err: {s}\n", .{@errorName(err)});
|
|
return null;
|
|
};
|
|
}
|
|
|
|
pub fn reset(it: *Iterator) void {
|
|
it.tokenizer.position = 0;
|
|
it.tokenizer.max_position = 0;
|
|
it.tokenizer.frame = 0;
|
|
}
|
|
};
|
|
|
|
/// iterator
|
|
pub fn iterator(self: *Self) Iterator {
|
|
return Iterator{
|
|
.tokenizer = self,
|
|
};
|
|
}
|
|
|
|
pub fn stringToUtf8(bytes: []u8) ![]u8 {
|
|
const code_point = std.fmt.parseInt(u21, bytes, 16) catch {
|
|
return error.BadNumber;
|
|
};
|
|
var buffer: [4]u8 = undefined;
|
|
var index: usize = 0;
|
|
|
|
if (code_point <= 0x7F) {
|
|
if (index >= buffer.len) return error.BufferTooSmall;
|
|
buffer[index] = @as(u8, @intCast(code_point));
|
|
index += 1;
|
|
} else if (code_point <= 0x7FF) {
|
|
if (index + 2 > buffer.len) return error.BufferTooSmall;
|
|
buffer[index] = 0xC0 | (@as(u8, @intCast(code_point >> 6)));
|
|
buffer[index + 1] = 0x80 | (@as(u8, @intCast(code_point & 0x3F)));
|
|
index += 2;
|
|
} else if (code_point <= 0xFFFF) {
|
|
if (index + 3 > buffer.len) return error.BufferTooSmall;
|
|
buffer[index] = 0xE0 | (@as(u8, @intCast(code_point >> 12)));
|
|
buffer[index + 1] = 0x80 | (@as(u8, @intCast((code_point >> 6) & 0x3F)));
|
|
buffer[index + 2] = 0x80 | (@as(u8, @intCast(code_point & 0x3F)));
|
|
index += 3;
|
|
} else if (code_point <= 0x10FFFF) {
|
|
if (index + 4 > buffer.len) return error.BufferTooSmall;
|
|
buffer[index] = 0xF0 | (@as(u8, @intCast(code_point >> 18)));
|
|
buffer[index + 1] = 0x80 | (@as(u8, @intCast((code_point >> 12) & 0x3F)));
|
|
buffer[index + 2] = 0x80 | (@as(u8, @intCast((code_point >> 6) & 0x3F)));
|
|
buffer[index + 3] = 0x80 | (@as(u8, @intCast(code_point & 0x3F)));
|
|
index += 4;
|
|
} else unreachable;
|
|
|
|
return buffer[0..index];
|
|
}
|
|
|
|
pub fn skipWhitespaceSimd(text: []const u8) usize {
|
|
const ChunkSize = 16;
|
|
const Vec = @Vector(ChunkSize, u8);
|
|
|
|
// Broadcast whitespace characters to vectors
|
|
const space: Vec = @splat(' ');
|
|
const tab: Vec = @splat('\t');
|
|
const lf: Vec = @splat('\n');
|
|
const cr: Vec = @splat('\r');
|
|
|
|
var j: usize = 0;
|
|
const end = text.len;
|
|
|
|
// SIMD processing
|
|
while (j + ChunkSize <= end) {
|
|
const chunk: Vec = text[j..][0..ChunkSize].*;
|
|
|
|
// Compare against each whitespace character
|
|
const is_space = chunk == space;
|
|
const is_tab = chunk == tab;
|
|
const is_lf = chunk == lf;
|
|
const is_cr = chunk == cr;
|
|
|
|
// Combine comparisons using vector operations
|
|
const anyws = @select(u8, is_space, @as(Vec, @splat(1)), @as(Vec, @splat(0))) |
|
|
@select(u8, is_tab, @as(Vec, @splat(1)), @as(Vec, @splat(0))) |
|
|
@select(u8, is_lf, @as(Vec, @splat(1)), @as(Vec, @splat(0))) |
|
|
@select(u8, is_cr, @as(Vec, @splat(1)), @as(Vec, @splat(0)));
|
|
|
|
const TrueMask: Vec = @splat(0xFF);
|
|
const FalseMask: Vec = @splat(0x00);
|
|
|
|
// Check if all characters are whitespace
|
|
if (@reduce(.And, anyws == TrueMask)) {
|
|
j += ChunkSize;
|
|
continue;
|
|
}
|
|
|
|
// Find first non-whitespace
|
|
const mask: std.meta.Int(.unsigned, ChunkSize) = @bitCast(anyws == FalseMask);
|
|
if (mask != 0) {
|
|
return j + @ctz(mask);
|
|
}
|
|
}
|
|
|
|
// Scalar processing for remaining bytes
|
|
while (j < end) switch (text[j]) {
|
|
' ', '\t', '\n', '\r' => j += 1,
|
|
else => break,
|
|
};
|
|
|
|
return j;
|
|
}
|