const std = @import("std"); const mem = std.mem; pub const Error = error{ /// eg: invalid JSON syntax InvalidSyntax, /// eg: allocator error OutOfMemory, /// eg: bad escaping UnexpectedCharacter, /// eg: std.fmt.parseFloat failed BadNumber, /// fba error BufferTooSmall, }; pub const TokenType = enum(u8) { zero, eof, null, true, false, int, float, string, property, object_begin, object_end, array_begin, array_end, colon, comma, whitespace, }; pub const Token = struct { type: TokenType, value: ?union { int: i128, float: f64, string: []const u8, symbol: u8, }, start: usize, }; pub const Self = @This(); text: []const u8, max_position: usize, stack: []usize, frame: usize, /// Initialize a new tokenizer pub fn init(allocator: mem.Allocator, text: []const u8) mem.Allocator.Error!Self { const stack = try allocator.alloc(usize, 0x100); errdefer allocator.free(stack); @memset(stack, 0); return .{ .text = text, .max_position = 0, .stack = stack, .frame = 0, }; } /// Clean up resources pub fn deinit(self: *Self, allocator: mem.Allocator) void { allocator.free(self.stack); } // ========== Core Parsing Functions ========== fn currentPosition(self: *Self) usize { return self.stack[self.frame]; } fn advance(self: *Self, delta: usize) void { self.stack[self.frame] += delta; if (self.max_position < self.stack[self.frame]) self.max_position = self.stack[self.frame]; } fn pushFrame(self: *Self, allocator: mem.Allocator) Error!usize { self.frame += 1; if (self.frame == self.stack.len) { const new_stack = try allocator.alloc(usize, self.stack.len * 2); @memset(new_stack, 0); @memcpy(new_stack, self.stack); allocator.free(self.stack); self.stack = new_stack; } if (self.frame > self.text.len) return error.BufferTooSmall; self.stack[self.frame] = self.stack[self.frame - 1]; return self.currentPosition(); } fn popFrame(self: *Self) void { if (self.frame >= 1) self.frame -= 1; } fn commit(self: *Self, wrapped: anytype) @TypeOf(wrapped) { self.frame -= 1; self.stack[self.frame] = self.stack[self.frame + 1]; return wrapped; } fn rollback(self: *Self) void { self.stack[self.frame] = if (self.frame == 0) 0 else self.stack[self.frame - 1]; } // ========== Character Matching ========== fn lastChar(self: *Self) !u8 { if (self.stack[self.frame] > self.text.len) return error.BufferTooSmall; return self.text[self.currentPosition() - 1]; } fn currentChar(self: *Self) u8 { return self.text[self.currentPosition()]; } pub fn endOfInput(self: *Self) bool { return self.currentPosition() >= self.text.len; } fn matchChar(self: *Self, c: u8) ?void { if (self.endOfInput() or self.text[self.currentPosition()] != c) { return null; } self.advance(1); } fn matchCharPredicate(self: *Self, pred: fn (u8) bool) ?void { // do not change this line for some reason it fucking breaks if I use currentChar directly if (self.endOfInput() or !pred(self.text[self.currentPosition()])) { return null; } self.advance(1); } fn matchString(self: *Self, s: []const u8) ?[]const u8 { if (self.text.len < self.currentPosition() + s.len) { // eof return null; } const remaining_len = s.len; const simd_width = 16; // 128-bit SIMD (SSE/NEON) var j: usize = 0; while (j + simd_width <= remaining_len) { const expected_chunk: @Vector(simd_width, u8) = s[j..][0..simd_width].*; const actual_chunk: @Vector(simd_width, u8) = self.text[self.currentPosition() + j ..][0..simd_width].*; if (!@reduce(.And, expected_chunk == actual_chunk)) { return null; } j += simd_width; } // Handle remaining bytes while (j < remaining_len) { if (s[j] != self.text[self.currentPosition() + j]) { return null; } j += 1; } self.advance(s.len); return null; } pub fn matchCharRange(self: *Self, low: u8, high: u8) ?void { if (self.endOfInput()) return null; const c = self.text[self.currentPosition()]; if (!(c >= low and c <= high)) return null; self.advance(1); } pub fn anyChar(self: *Self) ?u8 { if (self.endOfInput()) return null; const char = self.text[self.currentPosition()]; self.advance(1); return char; } // ========== Token Extraction ========== fn extractSlice(self: *Self, start: usize) []const u8 { return self.text[start..self.currentPosition()]; } // Skip all whitespace characters pub fn skipWhitespace(self: *Self) void { const start = self.currentPosition(); const end = skipWhitespaceSimd(self.text[start..]); self.advance(end); } /// Parse a number token pub fn nextNumber(self: *Self, allocator: mem.Allocator) Error!Token { self.skipWhitespace(); const start = try self.pushFrame(allocator); errdefer self.popFrame(); self.matchChar('-') orelse {}; // this may not fail while (self.matchCharRange('0', '9') != null) {} self.matchChar('.') orelse { if (self.matchChar('e') != null or self.matchChar('E') != null) { self.matchChar('+') orelse self.matchChar('-') orelse {}; while (self.matchCharRange('0', '9') != null) {} } else { // int found const int = std.fmt.parseInt(i128, self.extractSlice(start), 10) catch { return error.BadNumber; // no floating point }; return Token{ .type = .int, .value = .{ .int = int }, .start = start, }; } // float with e found const float = std.fmt.parseFloat(f64, self.extractSlice(start)) catch { return error.BadNumber; // no floating point }; return Token{ .type = .float, .value = .{ .float = float }, .start = start, }; }; while (self.matchCharRange('0', '9') != null) {} if (self.matchChar('e') != null or self.matchChar('E') != null) { self.matchChar('+') orelse self.matchChar('-') orelse {}; while (self.matchCharRange('0', '9') != null) {} } const float = std.fmt.parseFloat(f64, self.extractSlice(start)) catch { return error.BadNumber; // floating point }; return .{ .type = .float, .value = .{ .float = float }, .start = start, }; } /// Parse an identifier token pub fn nextIdentifier(self: *Self, allocator: mem.Allocator) Error!Token { self.skipWhitespace(); const start = try self.pushFrame(allocator); errdefer self.popFrame(); var buffer = try allocator.alloc(u8, 0x100); defer allocator.free(buffer); self.matchCharPredicate(std.ascii.isAlphabetic) orelse return error.InvalidSyntax; buffer[0] = try self.lastChar(); var i: usize = 1; while (self.matchCharPredicate(std.ascii.isAlphanumeric) != null) { buffer[i] = try self.lastChar(); i += 1; } const ident = buffer[0..i]; // true if (mem.eql(u8, ident, "true")) { return .{ .type = .true, .value = null, .start = start, }; } // false if (mem.eql(u8, ident, "false")) { return .{ .type = .false, .value = null, .start = start, }; } // null if (mem.eql(u8, ident, "null")) { return .{ .type = .null, .value = null, .start = start, }; } return error.InvalidSyntax; } /// Get the next token from the input /// WARNING: this function eats whitespaces pub fn nextToken(self: *Self, allocator: mem.Allocator, allow_comments: bool) Error!Token { self.skipWhitespace(); const start = try self.pushFrame(allocator); errdefer self.popFrame(); // Fall back to single character symbol const c = self.anyChar() orelse return .{ .type = .eof, .value = null, .start = start, }; const symbol_t: TokenType = switch (c) { '/' => { if (allow_comments and self.matchChar('/') != null) { // Single line comment while (true) { const next_c = self.anyChar() orelse return .{ .type = .eof, .value = null, .start = start, }; if (next_c == '\n') break; } self.skipWhitespace(); return self.nextToken(allocator, allow_comments); } else if (allow_comments and self.matchChar('*') != null) { // Multi-line comment while (true) { if (self.endOfInput()) return error.InvalidSyntax; // unterminated comment const next_c = self.anyChar() orelse return .{ .type = .eof, .value = null, .start = start, }; if (next_c == '*' and self.matchChar('/') != null) break; } self.skipWhitespace(); return self.nextToken(allocator, allow_comments); } return error.InvalidSyntax; // not a comment }, '{' => .object_begin, '}' => .object_end, '[' => .array_begin, ']' => .array_end, ',' => { self.skipWhitespace(); return self.commit(Token{ .type = .comma, .value = null, .start = start, }); }, ':' => .colon, '"' => { self.rollback(); const string = try self.nextString(allocator); errdefer allocator.free(string); return self.commit(string); }, else => { self.rollback(); // Try different token types in order of precedence if (std.ascii.isDigit(c) or c == '-') { return self.commit(self.nextNumber(allocator)); } if (std.ascii.isAlphabetic(c)) { return self.commit(self.nextIdentifier(allocator)); } return error.InvalidSyntax; }, }; return self.commit(Token{ .type = symbol_t, .value = null, .start = start, }); } pub fn nextString(self: *Self, allocator: mem.Allocator) Error!Token { self.skipWhitespace(); const start = try self.pushFrame(allocator); errdefer self.popFrame(); self.matchChar('"') orelse unreachable; var buffer: std.ArrayList(u8) = .init(allocator); defer buffer.deinit(); loop: while (!self.endOfInput()) { self.advance(1); switch (try self.lastChar()) { '"' => { while (mem.indexOfScalar(u8, buffer.items, 0x00)) |idx| _ = buffer.swapRemove(idx); return .{ .type = .string, .value = .{ .string = try buffer.toOwnedSlice() }, .start = start, }; }, '\\' => { self.advance(1); switch (try self.lastChar()) { 0x22, 0x5C, 0x2F => |d| { try buffer.append(d); continue :loop; }, 'b' => try buffer.append(0x8), 'f' => try buffer.append(0xC), 'n' => try buffer.append(0xA), 'r' => try buffer.append(0xD), 't' => try buffer.append(0x9), 'u' => { var code_points: [4]u8 = undefined; inline for (0..4) |i| { if (self.endOfInput()) return .{ .type = .eof, .value = null, .start = start, }; self.advance(1); code_points[i] = try self.lastChar(); } const buf = try stringToUtf8(&code_points); try buffer.appendSlice(buf); continue :loop; }, else => return error.UnexpectedCharacter, } // end switch }, else => |c| { if (std.ascii.isControl(c) and c != std.ascii.control_code.del) { return error.UnexpectedCharacter; } try buffer.append(c); }, } // end switch } // end while return error.InvalidSyntax; } pub fn stringToUtf8(bytes: []u8) ![]u8 { const code_point = std.fmt.parseInt(u21, bytes, 16) catch return error.BadNumber; var buffer: [4]u8 = undefined; var index: usize = 0; if (code_point <= 0x7F) { if (index >= buffer.len) return error.BufferTooSmall; buffer[index] = @as(u8, @intCast(code_point)); index += 1; } else if (code_point <= 0x7FF) { if (index + 2 > buffer.len) return error.BufferTooSmall; buffer[index] = 0xC0 | (@as(u8, @intCast(code_point >> 6))); buffer[index + 1] = 0x80 | (@as(u8, @intCast(code_point & 0x3F))); index += 2; } else if (code_point <= 0xFFFF) { if (index + 3 > buffer.len) return error.BufferTooSmall; buffer[index] = 0xE0 | (@as(u8, @intCast(code_point >> 12))); buffer[index + 1] = 0x80 | (@as(u8, @intCast((code_point >> 6) & 0x3F))); buffer[index + 2] = 0x80 | (@as(u8, @intCast(code_point & 0x3F))); index += 3; } else if (code_point <= 0x10FFFF) { if (index + 4 > buffer.len) return error.BufferTooSmall; buffer[index] = 0xF0 | (@as(u8, @intCast(code_point >> 18))); buffer[index + 1] = 0x80 | (@as(u8, @intCast((code_point >> 12) & 0x3F))); buffer[index + 2] = 0x80 | (@as(u8, @intCast((code_point >> 6) & 0x3F))); buffer[index + 3] = 0x80 | (@as(u8, @intCast(code_point & 0x3F))); index += 4; } else unreachable; return buffer[0..index]; } pub fn skipWhitespaceSimd(text: []const u8) usize { const ChunkSize = 16; const Vec = @Vector(ChunkSize, u8); // Broadcast whitespace characters to vectors const space: Vec = @splat(' '); const tab: Vec = @splat('\t'); const lf: Vec = @splat('\n'); const cr: Vec = @splat('\r'); var j: usize = 0; const end = text.len; // SIMD processing while (j + ChunkSize <= end) { const chunk: Vec = text[j..][0..ChunkSize].*; // Compare against each whitespace character const is_space = chunk == space; const is_tab = chunk == tab; const is_lf = chunk == lf; const is_cr = chunk == cr; // Combine comparisons using vector operations const anyws = @select(u8, is_space, @as(Vec, @splat(1)), @as(Vec, @splat(0))) | @select(u8, is_tab, @as(Vec, @splat(1)), @as(Vec, @splat(0))) | @select(u8, is_lf, @as(Vec, @splat(1)), @as(Vec, @splat(0))) | @select(u8, is_cr, @as(Vec, @splat(1)), @as(Vec, @splat(0))); const TrueMask: Vec = @splat(0xFF); const FalseMask: Vec = @splat(0x00); // Check if all characters are whitespace if (@reduce(.And, anyws == TrueMask)) { j += ChunkSize; continue; } // Find first non-whitespace const mask: std.meta.Int(.unsigned, ChunkSize) = @bitCast(anyws == FalseMask); if (mask != 0) return j + @ctz(mask); } // Scalar processing for remaining bytes while (j < end) switch (text[j]) { ' ', '\t', '\n', '\r' => j += 1, else => break, }; return j; }