aether/tokenizer.zig

const std = @import("std");
const mem = std.mem;

/// Errors reported by the tokenizer functions in this file.
pub const Error = error{
    /// Malformed input, e.g. an unterminated string or a character that
    /// starts no known token.
    InvalidSyntax,
    /// An allocation through the tokenizer's allocator failed.
    OutOfMemory,
    /// A character that is not allowed where it appears, e.g. an unknown
    /// escape sequence or a control character inside a string.
    UnexpectedCharacter,
    /// A numeric literal could not be converted, e.g. `std.fmt.parseFloat`
    /// or `std.fmt.parseInt` failed.
    BadNumber,
    /// An encoded result did not fit the buffer it was written to
    /// (see `stringToUtf8`).
    BufferTooSmall,
};

/// Kind of a lexed token, backed by a `u8`.
/// Declaration order fixes the integer tag values, so do not reorder.
pub const TokenType = enum(u8) {
    /// Tag value 0.
    /// NOTE(review): not produced by the code visible here — presumably a
    /// "no token" placeholder; confirm against callers.
    zero,
    /// End of input.
    eof,
    /// The literal `null`.
    null,
    /// The literal `true`.
    true,
    /// The literal `false`.
    false,
    /// A numeric literal; the token carries the parsed `f64`.
    number,
    /// A quoted string; the token carries the decoded bytes.
    string,
    /// NOTE(review): not produced by the code visible here — presumably an
    /// object key assigned by a later parsing stage; confirm.
    property,
    /// `{`
    object_begin,
    /// `}`
    object_end,
    /// `[`
    array_begin,
    /// `]`
    array_end,
    /// `:`
    colon,
    /// `,`
    comma,
    /// NOTE(review): not produced by the code visible here; whitespace is
    /// skipped rather than emitted.
    whitespace,
};

/// A single lexed token together with its span in the input.
pub const Token = struct {
    /// Which kind of token this is.
    type: TokenType,
    /// Payload, or `null` for tokens that carry none.
    /// The union is untagged: consult `type` to know which field is valid.
    value: ?union {
        /// Parsed value of a `.number` token.
        number: f64,
        /// Decoded contents of a `.string` token. `nextString` allocates it
        /// with the tokenizer's allocator; nothing visible here frees it.
        string: []const u8,
        /// NOTE(review): never set by the code visible here.
        symbol: u8,
    },
    /// Byte offset into the input where the token starts, as recorded by the
    /// function that produced it.
    start: usize,
    /// Byte offset into the input just past the token (exclusive), as
    /// recorded by the function that produced it.
    end: usize,
};

/// The tokenizer type: this file itself is the struct.
pub const Self = @This();

/// Input being tokenized; stored as given by `init`, not copied.
text: []const u8,
/// Highest position any frame has reached through `advance`.
max_position: usize,
/// Saved positions, one slot per frame; `stack[frame]` is the current one.
stack: []usize,
/// Index of the active slot in `stack`.
frame: usize,
/// Allocator used for `stack` and for buffers created while tokenizing.
allocator: std.mem.Allocator,

/// Creates a tokenizer over `text`.
///
/// Allocates a zero-filled position stack of 0x100 slots with `allocator`;
/// pair every successful call with `deinit`. `text` is stored as given and
/// is not copied.
pub fn init(allocator: std.mem.Allocator, text: []const u8) std.mem.Allocator.Error!Self {
    const frames = try allocator.alloc(usize, 0x100);
    errdefer allocator.free(frames);
    @memset(frames, 0);

    return Self{
        .allocator = allocator,
        .text = text,
        .stack = frames,
        .frame = 0,
        .max_position = 0,
    };
}

/// Frees the position stack that `init` allocated.
pub fn deinit(self: *Self) void {
    const owner = self.allocator;
    owner.free(self.stack);
}

// ========== Core Parsing Functions ==========

/// Returns the position stored in the active frame.
fn currentPosition(self: *Self) usize {
    const active = self.frame;
    return self.stack[active];
}

/// Moves the active frame forward by `delta` bytes and raises
/// `max_position` if the new position exceeds it.
fn advance(self: *Self, delta: usize) void {
    const slot = &self.stack[self.frame];
    slot.* += delta;
    self.max_position = @max(self.max_position, slot.*);
}

/// Opens a new frame that starts at the current position and returns that
/// position.
///
/// The frame stack doubles when it is full. Growth is done before `frame`
/// is changed, so a failed allocation (`error.OutOfMemory`) leaves the
/// tokenizer exactly as it was.
fn pushFrame(self: *Self) Error!usize {
    const next_frame = self.frame + 1;

    if (next_frame == self.stack.len) {
        const old_len = self.stack.len;
        const grown = try self.allocator.alloc(usize, old_len * 2);
        // `@memcpy` requires both operands to have the same length, so copy
        // into the live prefix only and zero the fresh tail separately.
        @memcpy(grown[0..old_len], self.stack);
        @memset(grown[old_len..], 0);
        self.allocator.free(self.stack);
        self.stack = grown;
    }

    // The new frame inherits the position of the frame it was opened from.
    self.stack[next_frame] = self.stack[self.frame];
    self.frame = next_frame;
    return self.stack[next_frame];
}

/// Abandons the active frame; its position is discarded and the previous
/// frame becomes active again.
fn popFrame(self: *Self) void {
    self.frame = self.frame - 1;
}

/// Closes the active frame and keeps its progress: the previous frame takes
/// over the active frame's position. `wrapped` is handed back unchanged so
/// the call can wrap a return value.
fn commit(self: *Self, wrapped: anytype) @TypeOf(wrapped) {
    const child = self.frame;
    const parent = child - 1;
    self.stack[parent] = self.stack[child];
    self.frame = parent;
    return wrapped;
}

/// Resets the active frame to the position of the frame below it, or to 0
/// when the active frame is the bottom one. The frame itself stays open.
fn rollback(self: *Self) void {
    const active = self.frame;
    self.stack[active] = switch (active) {
        0 => 0,
        else => self.stack[active - 1],
    };
}

// ========== Character Matching ==========

/// Returns the byte immediately before the current position.
/// The current position must be at least 1.
fn lastChar(self: *Self) u8 {
    const previous = self.currentPosition() - 1;
    return self.text[previous];
}

/// Returns the byte at the current position without consuming it.
/// The current position must be inside `text`.
fn currentChar(self: *Self) u8 {
    const here = self.currentPosition();
    return self.text[here];
}

/// Reports whether the current position is at or beyond the end of `text`.
fn endOfInput(self: *Self) bool {
    const here = self.currentPosition();
    return !(here < self.text.len);
}

/// Consumes one byte if it equals `c`.
/// Returns `null`, consuming nothing, at end of input or on a mismatch.
fn matchChar(self: *Self, c: u8) ?void {
    if (self.endOfInput()) return null;
    if (self.text[self.currentPosition()] != c) return null;
    self.advance(1);
}

/// Consumes one byte if `pred` accepts it.
/// Returns `null`, consuming nothing, at end of input or when `pred`
/// rejects the byte.
fn matchCharPredicate(self: *Self, pred: fn (u8) bool) ?void {
    if (self.endOfInput()) return null;
    // Keep indexing `self.text` directly here: the original author reports
    // that routing this read through `currentChar()` breaks the function.
    if (!pred(self.text[self.currentPosition()])) return null;
    self.advance(1);
}

/// Consumes `s` if the input at the current position starts with it.
///
/// Returns the matched slice of `text` on success. Returns `null`, consuming
/// nothing, when fewer than `s.len` bytes remain or the bytes differ.
fn matchString(self: *Self, s: []const u8) ?[]const u8 {
    const here = self.currentPosition();

    // Compare lengths by subtraction so a position past the end cannot
    // overflow the addition.
    if (here > self.text.len or self.text.len - here < s.len) return null;

    const candidate = self.text[here..][0..s.len];
    // `std.mem.eql` replaces the hand-rolled chunked comparison.
    if (!std.mem.eql(u8, candidate, s)) return null;

    self.advance(s.len);
    return candidate;
}

/// Consumes one byte if it lies in the inclusive range `low`..`high`.
/// Returns `null`, consuming nothing, at end of input or when the byte is
/// outside the range.
pub fn matchCharRange(self: *Self, low: u8, high: u8) ?void {
    if (self.endOfInput()) return null;

    const byte = self.text[self.currentPosition()];
    if (byte < low or byte > high) return null;

    self.advance(1);
}

/// Consumes and returns the next byte, or returns `null` at end of input.
pub fn anyChar(self: *Self) ?u8 {
    if (!self.endOfInput()) {
        // The byte is read first; the deferred advance runs afterwards.
        defer self.advance(1);
        return self.text[self.currentPosition()];
    }
    return null;
}

// ========== Token Extraction ==========

/// Returns the input from `start` up to, but not including, the current
/// position.
fn extractSlice(self: *Self, start: usize) []const u8 {
    const stop = self.currentPosition();
    return self.text[start..stop];
}

/// Advances past the whitespace run at the current position.
/// Does nothing at end of input.
pub fn skipWhitespace(self: *Self) void {
    if (self.endOfInput()) return;

    const rest = self.text[self.currentPosition()..];
    self.advance(skipWhitespaceSimd(rest));
}

/// Parses a number token: optional `-`, digits, optional `.` fraction and
/// optional `e`/`E` exponent with an optional sign.
///
/// Leading whitespace is skipped. On success the frame opened here is left
/// open for the caller to `commit`; on error it is popped.
/// `Token.start` is the position at which the frame was opened.
///
/// Errors: `error.BadNumber` when the consumed text is not a valid float,
/// `error.OutOfMemory` when the frame stack cannot grow.
pub fn nextNumber(self: *Self) Error!Token {
    const frame_start = try self.pushFrame();
    errdefer self.popFrame();

    self.skipWhitespace();

    // The literal begins after the whitespace; slicing from `frame_start`
    // would hand the whitespace to `parseFloat`, which rejects it.
    const literal_start = self.currentPosition();

    _ = self.matchChar('-'); // the sign is optional

    while (self.matchCharRange('0', '9') != null) {}

    if (self.matchChar('.') != null) {
        while (self.matchCharRange('0', '9') != null) {}
    }

    if (self.matchChar('e') != null or self.matchChar('E') != null) {
        if (self.matchChar('+') == null) _ = self.matchChar('-');
        while (self.matchCharRange('0', '9') != null) {}
    }

    // `parseFloat` is the final validator: input such as "-", "" or "1e"
    // is rejected here.
    const value = std.fmt.parseFloat(f64, self.extractSlice(literal_start)) catch {
        return error.BadNumber;
    };

    return .{
        .type = .number,
        .value = .{
            .number = value,
        },
        .start = frame_start,
        .end = self.currentPosition(),
    };
}

/// Parses one of the keyword tokens `true`, `false` or `null`.
///
/// Leading whitespace is skipped, then one alphabetic byte followed by any
/// number of alphanumeric bytes is consumed. On success the frame opened
/// here is left open for the caller to `commit`; on error it is popped.
/// `Token.start` is the position at which the frame was opened.
///
/// Errors: `error.InvalidSyntax` when no identifier is present or the
/// identifier is not a known keyword, `error.OutOfMemory` when the frame
/// stack cannot grow.
pub fn nextIdentifier(self: *Self) Error!Token {
    const frame_start = try self.pushFrame();
    errdefer self.popFrame();

    self.skipWhitespace();

    const word_start = self.currentPosition();

    self.matchCharPredicate(std.ascii.isAlphabetic) orelse
        return error.InvalidSyntax;

    while (self.matchCharPredicate(std.ascii.isAlphanumeric) != null) {}

    // The identifier is a view into `text`, so it needs no scratch buffer
    // and has no length limit.
    const ident = self.extractSlice(word_start);

    const keyword: TokenType = if (std.mem.eql(u8, ident, "true"))
        .true
    else if (std.mem.eql(u8, ident, "false"))
        .false
    else if (std.mem.eql(u8, ident, "null"))
        .null
    else
        // Any other word is ordinary bad input, not an impossible state.
        return error.InvalidSyntax;

    return .{
        .type = keyword,
        .value = null,
        .start = frame_start,
        .end = self.currentPosition(),
    };
}

/// Returns the next token from the input.
/// WARNING: this function consumes whitespace before the token, and also
/// after a comma.
///
/// The frame opened here is always closed before returning: committed on
/// success (including `.eof`), popped on error, so `frame` ends where it
/// started.
///
/// Errors: `error.InvalidSyntax` for a byte that starts no token, plus
/// whatever `nextString`, `nextNumber` or `nextIdentifier` report.
pub fn nextToken(self: *Self) Error!Token {
    const start = try self.pushFrame();
    errdefer self.popFrame();

    self.skipWhitespace();

    // Commit on end of input as well, otherwise every call made at the end
    // of the text would leave one more frame open.
    const c = self.anyChar() orelse return self.commit(Token{
        .type = .eof,
        .value = null,
        .start = start,
        .end = start,
    });

    // Position just past the single-byte symbol. `start + 1` is only right
    // when no whitespace was skipped.
    const symbol_end = self.currentPosition();

    const symbol_t: TokenType = switch (c) {
        '{' => .object_begin,
        '}' => .object_end,
        '[' => .array_begin,
        ']' => .array_end,
        ':' => .colon,
        ',' => blk: {
            self.skipWhitespace();
            break :blk TokenType.comma;
        },
        else => {
            // Rewind so the specialised parser sees the token from the start.
            self.rollback();

            const nested: Error!Token = if (c == '"')
                self.nextString()
            else if (std.ascii.isDigit(c) or c == '-')
                self.nextNumber()
            else if (std.ascii.isAlphabetic(c))
                self.nextIdentifier()
            else
                error.InvalidSyntax;

            // On failure the nested parser has already popped its own frame;
            // `try` then lets the `errdefer` above pop ours exactly once.
            const token = try nested;

            // On success the nested parser leaves its frame open: fold it
            // into ours, then fold ours into the caller's.
            self.commit({});
            return self.commit(token);
        },
    };

    return self.commit(Token{
        .type = symbol_t,
        .value = null,
        .start = start,
        .end = symbol_end,
    });
}

/// Parses a double-quoted string token and decodes its escape sequences.
///
/// Leading whitespace is skipped. The decoded bytes are allocated with the
/// tokenizer's allocator and handed to the caller in `Token.value.string`.
/// On success the frame opened here is left open for the caller to
/// `commit`; on error it is popped. `Token.start` is the position at which
/// the frame was opened.
///
/// A `\u` escape cut short by the end of input yields an `.eof` token
/// rather than an error.
///
/// Errors: `error.InvalidSyntax` when the input does not start with `"` or
/// the string is unterminated, `error.UnexpectedCharacter` for an unknown
/// escape or a raw control character, `error.BadNumber` for invalid `\u`
/// digits, `error.OutOfMemory` on allocation failure.
pub fn nextString(self: *Self) Error!Token {
    const start = try self.pushFrame();
    errdefer self.popFrame();

    self.skipWhitespace();

    // This function is public, so a missing quote is bad input rather than
    // an impossible state.
    self.matchChar('"') orelse return error.InvalidSyntax;

    var buffer: std.ArrayList(u8) = .init(self.allocator);
    // Releases the partial buffer on every exit. After a successful
    // `toOwnedSlice` the list is empty, so this frees nothing the caller owns.
    defer buffer.deinit();

    while (self.anyChar()) |c| {
        switch (c) {
            '"' => {
                return .{
                    .type = .string,
                    .value = .{ .string = try buffer.toOwnedSlice() },
                    .start = start,
                    .end = self.currentPosition(),
                };
            },
            '\\' => {
                // A backslash as the very last byte must not read past `text`.
                const escape = self.anyChar() orelse return error.InvalidSyntax;
                switch (escape) {
                    '"', '\\', '/' => try buffer.append(escape),
                    'b' => try buffer.append(0x8),
                    'f' => try buffer.append(0xC),
                    'n' => try buffer.append(0xA),
                    'r' => try buffer.append(0xD),
                    't' => try buffer.append(0x9),
                    'u' => {
                        var hex_digits: [4]u8 = undefined;
                        for (&hex_digits) |*digit| {
                            digit.* = self.anyChar() orelse return .{
                                .type = .eof,
                                .value = null,
                                .start = start,
                                .end = start + 1,
                            };
                        }
                        try buffer.appendSlice(try stringToUtf8(&hex_digits));
                    },
                    else => return error.UnexpectedCharacter,
                }
            },
            else => {
                if (std.ascii.isControl(c)) {
                    return error.UnexpectedCharacter;
                }
                try buffer.append(c);
            },
        }
    }

    // Ran out of input before the closing quote.
    return error.InvalidSyntax;
}

/// Pull-style view over a tokenizer: `next` yields tokens until the input
/// is exhausted or a token fails to parse.
pub const Iterator = struct {
    /// Tokenizer being driven; the iterator does not own it.
    tokenizer: *Self,

    /// Returns the next token, or `null` at end of input or when the
    /// tokenizer reports an error. Whitespace after the token is skipped
    /// before returning.
    pub fn next(it: *Iterator) ?Token {
        defer it.tokenizer.skipWhitespace();
        if (it.tokenizer.endOfInput()) {
            return null;
        }
        return it.tokenizer.nextToken() catch return null;
    }

    /// Rewinds the tokenizer to the start of its input.
    pub fn reset(it: *Iterator) void {
        // The position lives in the bottom frame of the stack; the tokenizer
        // has no separate `position` field.
        it.tokenizer.frame = 0;
        it.tokenizer.stack[0] = 0;
        it.tokenizer.max_position = 0;
    }
};

/// Returns an `Iterator` that drives this tokenizer.
/// The iterator keeps a pointer to `self`, so `self` must outlive it.
pub fn iterator(self: *Self) Iterator {
    return .{ .tokenizer = self };
}

/// Decodes the hexadecimal code point in `bytes` and encodes it as UTF-8.
///
/// The encoding is written over the beginning of `bytes` and the returned
/// slice aliases that buffer, so it stays valid for as long as the caller's
/// buffer does. The hex digits in `bytes` are overwritten.
///
/// Surrogate code points (0xD800–0xDFFF) are encoded as three bytes
/// without validation.
///
/// Errors: `error.BadNumber` when `bytes` is not valid hexadecimal or the
/// value exceeds 0x10FFFF, `error.BufferTooSmall` when the encoding does
/// not fit in `bytes`.
pub fn stringToUtf8(bytes: []u8) ![]u8 {
    const code_point = std.fmt.parseInt(u21, bytes, 16) catch {
        return error.BadNumber;
    };
    // `u21` can hold values up to 0x1FFFFF, which is beyond Unicode.
    if (code_point > 0x10FFFF) return error.BadNumber;

    const encoded_len: usize = if (code_point <= 0x7F)
        1
    else if (code_point <= 0x7FF)
        2
    else if (code_point <= 0xFFFF)
        3
    else
        4;

    if (encoded_len > bytes.len) return error.BufferTooSmall;

    // Parsing is finished, so the input buffer can be reused for the output.
    const out = bytes[0..encoded_len];
    switch (encoded_len) {
        1 => {
            out[0] = @as(u8, @intCast(code_point));
        },
        2 => {
            out[0] = 0xC0 | (@as(u8, @intCast(code_point >> 6)));
            out[1] = 0x80 | (@as(u8, @intCast(code_point & 0x3F)));
        },
        3 => {
            out[0] = 0xE0 | (@as(u8, @intCast(code_point >> 12)));
            out[1] = 0x80 | (@as(u8, @intCast((code_point >> 6) & 0x3F)));
            out[2] = 0x80 | (@as(u8, @intCast(code_point & 0x3F)));
        },
        4 => {
            out[0] = 0xF0 | (@as(u8, @intCast(code_point >> 18)));
            out[1] = 0x80 | (@as(u8, @intCast((code_point >> 12) & 0x3F)));
            out[2] = 0x80 | (@as(u8, @intCast((code_point >> 6) & 0x3F)));
            out[3] = 0x80 | (@as(u8, @intCast(code_point & 0x3F)));
        },
        else => unreachable, // encoded_len is always 1..4
    }

    return out;
}

/// Returns the length of the leading run of whitespace (space, tab, line
/// feed, carriage return) in `text`.
///
/// Full 16-byte chunks are classified with vector compares; the remaining
/// tail is scanned byte by byte.
pub fn skipWhitespaceSimd(text: []const u8) usize {
    const ChunkSize = 16;
    const Vec = @Vector(ChunkSize, u8);
    const Mask = std.meta.Int(.unsigned, ChunkSize);

    // Broadcast each whitespace byte, and the 0/1 flag values, to vectors.
    const space: Vec = @splat(' ');
    const tab: Vec = @splat('\t');
    const lf: Vec = @splat('\n');
    const cr: Vec = @splat('\r');
    const ones: Vec = @splat(1);
    const zeros: Vec = @splat(0);

    var j: usize = 0;
    const end = text.len;

    while (j + ChunkSize <= end) {
        const chunk: Vec = text[j..][0..ChunkSize].*;

        // Per lane: 1 if the byte is whitespace, 0 otherwise.
        const ws_flags = @select(u8, chunk == space, ones, zeros) |
            @select(u8, chunk == tab, ones, zeros) |
            @select(u8, chunk == lf, ones, zeros) |
            @select(u8, chunk == cr, ones, zeros);

        // One bit per lane, set where the byte is NOT whitespace.
        const non_ws: Mask = @bitCast(ws_flags == zeros);
        if (non_ws != 0) {
            return j + @ctz(non_ws);
        }

        // The whole chunk is whitespace: always move on, otherwise the loop
        // would never terminate.
        j += ChunkSize;
    }

    // Scalar processing for the remaining bytes.
    while (j < end) switch (text[j]) {
        ' ', '\t', '\n', '\r' => j += 1,
        else => break,
    };

    return j;
}